-rw-r--r--Documentation/00-INDEX2
-rw-r--r--Documentation/ABI/obsolete/proc-pid-oom_adj22
-rw-r--r--Documentation/cgroups/memory.txt90
-rw-r--r--Documentation/filesystems/proc.txt22
-rw-r--r--Documentation/memory.txt33
-rw-r--r--Documentation/prio_tree.txt107
-rw-r--r--Documentation/rbtree.txt209
-rw-r--r--Documentation/vm/unevictable-lru.txt14
-rw-r--r--MAINTAINERS8
-rw-r--r--arch/Kconfig3
-rw-r--r--arch/alpha/kernel/pci-sysfs.c2
-rw-r--r--arch/arm/Kconfig2
-rw-r--r--arch/arm/mm/fault-armv.c3
-rw-r--r--arch/arm/mm/fault.c1
-rw-r--r--arch/arm/mm/flush.c3
-rw-r--r--arch/arm64/Kconfig4
-rw-r--r--arch/avr32/mm/fault.c1
-rw-r--r--arch/blackfin/Kconfig1
-rw-r--r--arch/cris/Kconfig1
-rw-r--r--arch/cris/mm/fault.c1
-rw-r--r--arch/frv/Kconfig2
-rw-r--r--arch/h8300/Kconfig1
-rw-r--r--arch/hexagon/mm/vm_fault.c1
-rw-r--r--arch/ia64/include/asm/hugetlb.h4
-rw-r--r--arch/ia64/kernel/perfmon.c2
-rw-r--r--arch/ia64/mm/fault.c1
-rw-r--r--arch/ia64/mm/init.c4
-rw-r--r--arch/m32r/Kconfig1
-rw-r--r--arch/m68k/Kconfig2
-rw-r--r--arch/m68k/mm/fault.c1
-rw-r--r--arch/microblaze/Kconfig1
-rw-r--r--arch/microblaze/include/asm/atomic.h1
-rw-r--r--arch/microblaze/mm/fault.c1
-rw-r--r--arch/mips/Kconfig1
-rw-r--r--arch/mips/include/asm/hugetlb.h4
-rw-r--r--arch/mips/mm/fault.c1
-rw-r--r--arch/openrisc/mm/fault.c1
-rw-r--r--arch/parisc/kernel/cache.c3
-rw-r--r--arch/powerpc/Kconfig2
-rw-r--r--arch/powerpc/include/asm/atomic.h1
-rw-r--r--arch/powerpc/include/asm/hugetlb.h4
-rw-r--r--arch/powerpc/kvm/book3s_hv.c2
-rw-r--r--arch/powerpc/mm/fault.c1
-rw-r--r--arch/powerpc/oprofile/cell/spu_task_sync.c15
-rw-r--r--arch/powerpc/platforms/pseries/hotplug-memory.c13
-rw-r--r--arch/s390/Kconfig3
-rw-r--r--arch/s390/include/asm/hugetlb.h19
-rw-r--r--arch/s390/include/asm/pgtable.h210
-rw-r--r--arch/s390/include/asm/setup.h5
-rw-r--r--arch/s390/include/asm/tlb.h1
-rw-r--r--arch/s390/kernel/early.c2
-rw-r--r--arch/s390/mm/fault.c1
-rw-r--r--arch/s390/mm/gup.c11
-rw-r--r--arch/s390/mm/pgtable.c108
-rw-r--r--arch/sh/Kconfig3
-rw-r--r--arch/sh/include/asm/hugetlb.h6
-rw-r--r--arch/sh/mm/fault.c1
-rw-r--r--arch/sparc/Kconfig41
-rw-r--r--arch/sparc/include/asm/hugetlb.h9
-rw-r--r--arch/sparc/include/asm/mmu_64.h19
-rw-r--r--arch/sparc/include/asm/mmu_context_64.h2
-rw-r--r--arch/sparc/include/asm/page_64.h21
-rw-r--r--arch/sparc/include/asm/pgalloc_64.h56
-rw-r--r--arch/sparc/include/asm/pgtable_64.h253
-rw-r--r--arch/sparc/include/asm/tsb.h106
-rw-r--r--arch/sparc/kernel/pci.c2
-rw-r--r--arch/sparc/kernel/sun4v_tlb_miss.S2
-rw-r--r--arch/sparc/kernel/tsb.S9
-rw-r--r--arch/sparc/mm/fault_32.c1
-rw-r--r--arch/sparc/mm/fault_64.c5
-rw-r--r--arch/sparc/mm/hugetlbpage.c50
-rw-r--r--arch/sparc/mm/init_64.c314
-rw-r--r--arch/sparc/mm/tlb.c118
-rw-r--r--arch/sparc/mm/tsb.c40
-rw-r--r--arch/tile/Kconfig3
-rw-r--r--arch/tile/include/asm/hugetlb.h4
-rw-r--r--arch/tile/mm/elf.c19
-rw-r--r--arch/tile/mm/fault.c1
-rw-r--r--arch/um/Kconfig.common1
-rw-r--r--arch/um/kernel/trap.c1
-rw-r--r--arch/unicore32/kernel/process.c2
-rw-r--r--arch/x86/Kconfig5
-rw-r--r--arch/x86/include/asm/atomic.h24
-rw-r--r--arch/x86/include/asm/hugetlb.h4
-rw-r--r--arch/x86/include/asm/pgtable.h11
-rw-r--r--arch/x86/include/asm/pgtable_32.h1
-rw-r--r--arch/x86/include/asm/pgtable_64.h1
-rw-r--r--arch/x86/mm/fault.c1
-rw-r--r--arch/x86/mm/hugetlbpage.c3
-rw-r--r--arch/x86/mm/pat.c87
-rw-r--r--arch/x86/mm/pat_rbtree.c34
-rw-r--r--arch/x86/xen/mmu.c3
-rw-r--r--arch/xtensa/mm/fault.c1
-rw-r--r--drivers/base/memory.c40
-rw-r--r--drivers/char/mbcs.c2
-rw-r--r--drivers/char/mem.c2
-rw-r--r--drivers/char/mspec.c2
-rw-r--r--drivers/gpu/drm/drm_gem.c2
-rw-r--r--drivers/gpu/drm/drm_vm.c10
-rw-r--r--drivers/gpu/drm/exynos/exynos_drm_gem.c2
-rw-r--r--drivers/gpu/drm/gma500/framebuffer.c3
-rw-r--r--drivers/gpu/drm/ttm/ttm_bo_vm.c4
-rw-r--r--drivers/gpu/drm/udl/udl_fb.c2
-rw-r--r--drivers/infiniband/hw/ehca/ehca_uverbs.c4
-rw-r--r--drivers/infiniband/hw/ipath/ipath_file_ops.c2
-rw-r--r--drivers/infiniband/hw/qib/qib_file_ops.c2
-rw-r--r--drivers/media/pci/meye/meye.c2
-rw-r--r--drivers/media/platform/omap/omap_vout.c2
-rw-r--r--drivers/media/platform/vino.c2
-rw-r--r--drivers/media/usb/sn9c102/sn9c102_core.c3
-rw-r--r--drivers/media/usb/usbvision/usbvision-video.c3
-rw-r--r--drivers/media/v4l2-core/videobuf-dma-sg.c2
-rw-r--r--drivers/media/v4l2-core/videobuf-vmalloc.c2
-rw-r--r--drivers/media/v4l2-core/videobuf2-memops.c2
-rw-r--r--drivers/misc/carma/carma-fpga.c2
-rw-r--r--drivers/misc/sgi-gru/grufile.c5
-rw-r--r--drivers/mtd/mtdchar.c2
-rw-r--r--drivers/mtd/mtdcore.c6
-rw-r--r--drivers/oprofile/buffer_sync.c17
-rw-r--r--drivers/scsi/sg.c2
-rw-r--r--drivers/staging/android/ashmem.c1
-rw-r--r--drivers/staging/omapdrm/omap_gem_dmabuf.c2
-rw-r--r--drivers/staging/tidspbridge/rmgr/drv_interface.c2
-rw-r--r--drivers/uio/uio.c4
-rw-r--r--drivers/usb/mon/mon_bin.c2
-rw-r--r--drivers/video/68328fb.c2
-rw-r--r--drivers/video/aty/atyfb_base.c3
-rw-r--r--drivers/video/fb-puv3.c3
-rw-r--r--drivers/video/fb_defio.c2
-rw-r--r--drivers/video/fbmem.c3
-rw-r--r--drivers/video/gbefb.c2
-rw-r--r--drivers/video/omap2/omapfb/omapfb-main.c2
-rw-r--r--drivers/video/sbuslib.c5
-rw-r--r--drivers/video/smscufx.c1
-rw-r--r--drivers/video/udlfb.c1
-rw-r--r--drivers/video/vermilion/vermilion.c1
-rw-r--r--drivers/video/vfb.c1
-rw-r--r--drivers/xen/gntalloc.c2
-rw-r--r--drivers/xen/gntdev.c2
-rw-r--r--drivers/xen/privcmd.c3
-rw-r--r--fs/9p/vfs_file.c1
-rw-r--r--fs/binfmt_elf.c4
-rw-r--r--fs/binfmt_elf_fdpic.c2
-rw-r--r--fs/btrfs/file.c2
-rw-r--r--fs/ceph/addr.c2
-rw-r--r--fs/cifs/file.c1
-rw-r--r--fs/exec.c2
-rw-r--r--fs/ext4/file.c2
-rw-r--r--fs/fs-writeback.c7
-rw-r--r--fs/fuse/file.c1
-rw-r--r--fs/gfs2/file.c2
-rw-r--r--fs/hugetlbfs/inode.c11
-rw-r--r--fs/inode.c2
-rw-r--r--fs/jffs2/readinode.c13
-rw-r--r--fs/nfs/file.c1
-rw-r--r--fs/nilfs2/file.c2
-rw-r--r--fs/ocfs2/mmap.c2
-rw-r--r--fs/proc/base.c117
-rw-r--r--fs/proc/page.c8
-rw-r--r--fs/proc/proc_sysctl.c5
-rw-r--r--fs/proc/task_mmu.c2
-rw-r--r--fs/ubifs/file.c1
-rw-r--r--fs/xfs/xfs_file.c2
-rw-r--r--include/asm-generic/pgtable.h72
-rw-r--r--include/linux/atomic.h25
-rw-r--r--include/linux/compaction.h19
-rw-r--r--include/linux/fs.h8
-rw-r--r--include/linux/gfp.h9
-rw-r--r--include/linux/huge_mm.h3
-rw-r--r--include/linux/interval_tree.h27
-rw-r--r--include/linux/interval_tree_generic.h191
-rw-r--r--include/linux/memblock.h3
-rw-r--r--include/linux/memcontrol.h14
-rw-r--r--include/linux/memory_hotplug.h3
-rw-r--r--include/linux/mempolicy.h4
-rw-r--r--include/linux/mm.h140
-rw-r--r--include/linux/mm_types.h16
-rw-r--r--include/linux/mman.h1
-rw-r--r--include/linux/mmu_notifier.h60
-rw-r--r--include/linux/mmzone.h10
-rw-r--r--include/linux/oom.h11
-rw-r--r--include/linux/page-isolation.h7
-rw-r--r--include/linux/pageblock-flags.h19
-rw-r--r--include/linux/prio_tree.h120
-rw-r--r--include/linux/rbtree.h119
-rw-r--r--include/linux/rbtree_augmented.h223
-rw-r--r--include/linux/rmap.h36
-rw-r--r--include/linux/sched.h1
-rw-r--r--include/linux/swap.h2
-rw-r--r--include/linux/timerqueue.h2
-rw-r--r--include/linux/vm_event_item.h1
-rw-r--r--include/linux/vmstat.h12
-rw-r--r--include/trace/events/gfpflags.h1
-rw-r--r--init/Kconfig11
-rw-r--r--init/main.c2
-rw-r--r--ipc/mqueue.c3
-rw-r--r--kernel/auditsc.c13
-rw-r--r--kernel/cpu.c4
-rw-r--r--kernel/events/core.c2
-rw-r--r--kernel/events/uprobes.c8
-rw-r--r--kernel/fork.c32
-rw-r--r--kernel/sysctl.c3
-rw-r--r--lib/Kconfig.debug38
-rw-r--r--lib/Makefile7
-rw-r--r--lib/interval_tree.c10
-rw-r--r--lib/interval_tree_test_main.c105
-rw-r--r--lib/prio_tree.c466
-rw-r--r--lib/rbtree.c656
-rw-r--r--lib/rbtree_test.c234
-rw-r--r--mm/Kconfig3
-rw-r--r--mm/Makefile4
-rw-r--r--mm/bootmem.c10
-rw-r--r--mm/compaction.c562
-rw-r--r--mm/filemap.c6
-rw-r--r--mm/filemap_xip.c10
-rw-r--r--mm/fremap.c16
-rw-r--r--mm/huge_memory.c440
-rw-r--r--mm/hugetlb.c34
-rw-r--r--mm/internal.h52
-rw-r--r--mm/interval_tree.c112
-rw-r--r--mm/kmemleak.c100
-rw-r--r--mm/ksm.c40
-rw-r--r--mm/madvise.c8
-rw-r--r--mm/memblock.c5
-rw-r--r--mm/memcontrol.c22
-rw-r--r--mm/memory-failure.c8
-rw-r--r--mm/memory.c115
-rw-r--r--mm/memory_hotplug.c77
-rw-r--r--mm/mempolicy.c148
-rw-r--r--mm/mlock.c27
-rw-r--r--mm/mmap.c207
-rw-r--r--mm/mmu_notifier.c103
-rw-r--r--mm/mremap.c73
-rw-r--r--mm/nobootmem.c5
-rw-r--r--mm/nommu.c33
-rw-r--r--mm/oom_kill.c4
-rw-r--r--mm/page_alloc.c317
-rw-r--r--mm/page_isolation.c43
-rw-r--r--mm/pgtable-generic.c50
-rw-r--r--mm/prio_tree.c208
-rw-r--r--mm/rmap.c159
-rw-r--r--mm/shmem.c3
-rw-r--r--mm/swap.c13
-rw-r--r--mm/truncate.c3
-rw-r--r--mm/vmalloc.c5
-rw-r--r--mm/vmscan.c111
-rw-r--r--mm/vmstat.c14
-rw-r--r--net/ceph/osd_client.c1
-rw-r--r--security/selinux/selinuxfs.c2
-rw-r--r--security/tomoyo/util.c9
-rw-r--r--sound/core/pcm_native.c6
-rw-r--r--sound/usb/usx2y/us122l.c2
-rw-r--r--sound/usb/usx2y/usX2Yhwdep.c2
-rw-r--r--sound/usb/usx2y/usx2yhwdeppcm.c2
-rw-r--r--tools/perf/util/include/linux/rbtree.h1
255 files changed, 4976 insertions, 3540 deletions
diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX
index 49c051380daf..f54273e2ac97 100644
--- a/Documentation/00-INDEX
+++ b/Documentation/00-INDEX
@@ -270,8 +270,6 @@ preempt-locking.txt
270 - info on locking under a preemptive kernel. 270 - info on locking under a preemptive kernel.
271printk-formats.txt 271printk-formats.txt
272 - how to get printk format specifiers right 272 - how to get printk format specifiers right
273prio_tree.txt
274 - info on radix-priority-search-tree use for indexing vmas.
275ramoops.txt 273ramoops.txt
276 - documentation of the ramoops oops/panic logging module. 274 - documentation of the ramoops oops/panic logging module.
277rbtree.txt 275rbtree.txt
diff --git a/Documentation/ABI/obsolete/proc-pid-oom_adj b/Documentation/ABI/obsolete/proc-pid-oom_adj
deleted file mode 100644
index 9a3cb88ade47..000000000000
--- a/Documentation/ABI/obsolete/proc-pid-oom_adj
+++ /dev/null
@@ -1,22 +0,0 @@
1What: /proc/<pid>/oom_adj
2When: August 2012
3Why: /proc/<pid>/oom_adj allows userspace to influence the oom killer's
4 badness heuristic used to determine which task to kill when the kernel
5 is out of memory.
6
7 The badness heuristic has since been rewritten since the introduction of
8 this tunable such that its meaning is deprecated. The value was
9 implemented as a bitshift on a score generated by the badness()
10 function that did not have any precise units of measure. With the
11 rewrite, the score is given as a proportion of available memory to the
12 task allocating pages, so using a bitshift which grows the score
13 exponentially is, thus, impossible to tune with fine granularity.
14
15 A much more powerful interface, /proc/<pid>/oom_score_adj, was
16 introduced with the oom killer rewrite that allows users to increase or
17 decrease the badness score linearly. This interface will replace
18 /proc/<pid>/oom_adj.
19
20 A warning will be emitted to the kernel log if an application uses this
21 deprecated interface. After it is printed once, future warnings will be
22 suppressed until the kernel is rebooted.
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index 4372e6b8a353..c07f7b4fb88d 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -18,16 +18,16 @@ from the rest of the system. The article on LWN [12] mentions some probable
18uses of the memory controller. The memory controller can be used to 18uses of the memory controller. The memory controller can be used to
19 19
20a. Isolate an application or a group of applications 20a. Isolate an application or a group of applications
21 Memory hungry applications can be isolated and limited to a smaller 21 Memory-hungry applications can be isolated and limited to a smaller
22 amount of memory. 22 amount of memory.
23b. Create a cgroup with limited amount of memory, this can be used 23b. Create a cgroup with a limited amount of memory; this can be used
24 as a good alternative to booting with mem=XXXX. 24 as a good alternative to booting with mem=XXXX.
25c. Virtualization solutions can control the amount of memory they want 25c. Virtualization solutions can control the amount of memory they want
26 to assign to a virtual machine instance. 26 to assign to a virtual machine instance.
27d. A CD/DVD burner could control the amount of memory used by the 27d. A CD/DVD burner could control the amount of memory used by the
28 rest of the system to ensure that burning does not fail due to lack 28 rest of the system to ensure that burning does not fail due to lack
29 of available memory. 29 of available memory.
30e. There are several other use cases, find one or use the controller just 30e. There are several other use cases; find one or use the controller just
31 for fun (to learn and hack on the VM subsystem). 31 for fun (to learn and hack on the VM subsystem).
32 32
33Current Status: linux-2.6.34-mmotm(development version of 2010/April) 33Current Status: linux-2.6.34-mmotm(development version of 2010/April)
@@ -38,12 +38,12 @@ Features:
38 - optionally, memory+swap usage can be accounted and limited. 38 - optionally, memory+swap usage can be accounted and limited.
39 - hierarchical accounting 39 - hierarchical accounting
40 - soft limit 40 - soft limit
41 - moving(recharging) account at moving a task is selectable. 41 - moving (recharging) account at moving a task is selectable.
42 - usage threshold notifier 42 - usage threshold notifier
43 - oom-killer disable knob and oom-notifier 43 - oom-killer disable knob and oom-notifier
44 - Root cgroup has no limit controls. 44 - Root cgroup has no limit controls.
45 45
46 Kernel memory support is work in progress, and the current version provides 46 Kernel memory support is a work in progress, and the current version provides
47 basically functionality. (See Section 2.7) 47 basically functionality. (See Section 2.7)
48 48
49Brief summary of control files. 49Brief summary of control files.
@@ -144,9 +144,9 @@ Figure 1 shows the important aspects of the controller
1443. Each page has a pointer to the page_cgroup, which in turn knows the 1443. Each page has a pointer to the page_cgroup, which in turn knows the
145 cgroup it belongs to 145 cgroup it belongs to
146 146
147The accounting is done as follows: mem_cgroup_charge() is invoked to setup 147The accounting is done as follows: mem_cgroup_charge() is invoked to set up
148the necessary data structures and check if the cgroup that is being charged 148the necessary data structures and check if the cgroup that is being charged
149is over its limit. If it is then reclaim is invoked on the cgroup. 149is over its limit. If it is, then reclaim is invoked on the cgroup.
150More details can be found in the reclaim section of this document. 150More details can be found in the reclaim section of this document.
151If everything goes well, a page meta-data-structure called page_cgroup is 151If everything goes well, a page meta-data-structure called page_cgroup is
152updated. page_cgroup has its own LRU on cgroup. 152updated. page_cgroup has its own LRU on cgroup.
@@ -163,13 +163,13 @@ for earlier. A file page will be accounted for as Page Cache when it's
163inserted into inode (radix-tree). While it's mapped into the page tables of 163inserted into inode (radix-tree). While it's mapped into the page tables of
164processes, duplicate accounting is carefully avoided. 164processes, duplicate accounting is carefully avoided.
165 165
166A RSS page is unaccounted when it's fully unmapped. A PageCache page is 166An RSS page is unaccounted when it's fully unmapped. A PageCache page is
167unaccounted when it's removed from radix-tree. Even if RSS pages are fully 167unaccounted when it's removed from radix-tree. Even if RSS pages are fully
168unmapped (by kswapd), they may exist as SwapCache in the system until they 168unmapped (by kswapd), they may exist as SwapCache in the system until they
169are really freed. Such SwapCaches also also accounted. 169are really freed. Such SwapCaches are also accounted.
170A swapped-in page is not accounted until it's mapped. 170A swapped-in page is not accounted until it's mapped.
171 171
172Note: The kernel does swapin-readahead and read multiple swaps at once. 172Note: The kernel does swapin-readahead and reads multiple swaps at once.
173This means swapped-in pages may contain pages for other tasks than a task 173This means swapped-in pages may contain pages for other tasks than a task
174causing page fault. So, we avoid accounting at swap-in I/O. 174causing page fault. So, we avoid accounting at swap-in I/O.
175 175
@@ -209,7 +209,7 @@ memsw.limit_in_bytes.
209Example: Assume a system with 4G of swap. A task which allocates 6G of memory 209Example: Assume a system with 4G of swap. A task which allocates 6G of memory
210(by mistake) under 2G memory limitation will use all swap. 210(by mistake) under 2G memory limitation will use all swap.
211In this case, setting memsw.limit_in_bytes=3G will prevent bad use of swap. 211In this case, setting memsw.limit_in_bytes=3G will prevent bad use of swap.
212By using memsw limit, you can avoid system OOM which can be caused by swap 212By using the memsw limit, you can avoid system OOM which can be caused by swap
213shortage. 213shortage.
214 214
215* why 'memory+swap' rather than swap. 215* why 'memory+swap' rather than swap.
@@ -217,7 +217,7 @@ The global LRU(kswapd) can swap out arbitrary pages. Swap-out means
217to move account from memory to swap...there is no change in usage of 217to move account from memory to swap...there is no change in usage of
218memory+swap. In other words, when we want to limit the usage of swap without 218memory+swap. In other words, when we want to limit the usage of swap without
219affecting global LRU, memory+swap limit is better than just limiting swap from 219affecting global LRU, memory+swap limit is better than just limiting swap from
220OS point of view. 220an OS point of view.
221 221
222* What happens when a cgroup hits memory.memsw.limit_in_bytes 222* What happens when a cgroup hits memory.memsw.limit_in_bytes
223When a cgroup hits memory.memsw.limit_in_bytes, it's useless to do swap-out 223When a cgroup hits memory.memsw.limit_in_bytes, it's useless to do swap-out
@@ -236,7 +236,7 @@ an OOM routine is invoked to select and kill the bulkiest task in the
236cgroup. (See 10. OOM Control below.) 236cgroup. (See 10. OOM Control below.)
237 237
238The reclaim algorithm has not been modified for cgroups, except that 238The reclaim algorithm has not been modified for cgroups, except that
239pages that are selected for reclaiming come from the per cgroup LRU 239pages that are selected for reclaiming come from the per-cgroup LRU
240list. 240list.
241 241
242NOTE: Reclaim does not work for the root cgroup, since we cannot set any 242NOTE: Reclaim does not work for the root cgroup, since we cannot set any
@@ -316,7 +316,7 @@ We can check the usage:
316# cat /sys/fs/cgroup/memory/0/memory.usage_in_bytes 316# cat /sys/fs/cgroup/memory/0/memory.usage_in_bytes
3171216512 3171216512
318 318
319A successful write to this file does not guarantee a successful set of 319A successful write to this file does not guarantee a successful setting of
320this limit to the value written into the file. This can be due to a 320this limit to the value written into the file. This can be due to a
321number of factors, such as rounding up to page boundaries or the total 321number of factors, such as rounding up to page boundaries or the total
322availability of memory on the system. The user is required to re-read 322availability of memory on the system. The user is required to re-read
@@ -350,7 +350,7 @@ Trying usual test under memory controller is always helpful.
3504.1 Troubleshooting 3504.1 Troubleshooting
351 351
352Sometimes a user might find that the application under a cgroup is 352Sometimes a user might find that the application under a cgroup is
353terminated by OOM killer. There are several causes for this: 353terminated by the OOM killer. There are several causes for this:
354 354
3551. The cgroup limit is too low (just too low to do anything useful) 3551. The cgroup limit is too low (just too low to do anything useful)
3562. The user is using anonymous memory and swap is turned off or too low 3562. The user is using anonymous memory and swap is turned off or too low
@@ -358,7 +358,7 @@ terminated by OOM killer. There are several causes for this:
358A sync followed by echo 1 > /proc/sys/vm/drop_caches will help get rid of 358A sync followed by echo 1 > /proc/sys/vm/drop_caches will help get rid of
359some of the pages cached in the cgroup (page cache pages). 359some of the pages cached in the cgroup (page cache pages).
360 360
361To know what happens, disable OOM_Kill by 10. OOM Control(see below) and 361To know what happens, disabling OOM_Kill as per "10. OOM Control" (below) and
362seeing what happens will be helpful. 362seeing what happens will be helpful.
363 363
3644.2 Task migration 3644.2 Task migration
@@ -399,10 +399,10 @@ About use_hierarchy, see Section 6.
399 399
400 Almost all pages tracked by this memory cgroup will be unmapped and freed. 400 Almost all pages tracked by this memory cgroup will be unmapped and freed.
401 Some pages cannot be freed because they are locked or in-use. Such pages are 401 Some pages cannot be freed because they are locked or in-use. Such pages are
402 moved to parent(if use_hierarchy==1) or root (if use_hierarchy==0) and this 402 moved to parent (if use_hierarchy==1) or root (if use_hierarchy==0) and this
403 cgroup will be empty. 403 cgroup will be empty.
404 404
405 Typical use case of this interface is that calling this before rmdir(). 405 The typical use case for this interface is before calling rmdir().
406 Because rmdir() moves all pages to parent, some out-of-use page caches can be 406 Because rmdir() moves all pages to parent, some out-of-use page caches can be
407 moved to the parent. If you want to avoid that, force_empty will be useful. 407 moved to the parent. If you want to avoid that, force_empty will be useful.
408 408
@@ -486,7 +486,7 @@ You can reset failcnt by writing 0 to failcnt file.
486 486
487For efficiency, as other kernel components, memory cgroup uses some optimization 487For efficiency, as other kernel components, memory cgroup uses some optimization
488to avoid unnecessary cacheline false sharing. usage_in_bytes is affected by the 488to avoid unnecessary cacheline false sharing. usage_in_bytes is affected by the
489method and doesn't show 'exact' value of memory(and swap) usage, it's an fuzz 489method and doesn't show 'exact' value of memory (and swap) usage, it's a fuzz
490value for efficient access. (Of course, when necessary, it's synchronized.) 490value for efficient access. (Of course, when necessary, it's synchronized.)
491If you want to know more exact memory usage, you should use RSS+CACHE(+SWAP) 491If you want to know more exact memory usage, you should use RSS+CACHE(+SWAP)
492value in memory.stat(see 5.2). 492value in memory.stat(see 5.2).
@@ -496,8 +496,8 @@ value in memory.stat(see 5.2).
496This is similar to numa_maps but operates on a per-memcg basis. This is 496This is similar to numa_maps but operates on a per-memcg basis. This is
497useful for providing visibility into the numa locality information within 497useful for providing visibility into the numa locality information within
498an memcg since the pages are allowed to be allocated from any physical 498an memcg since the pages are allowed to be allocated from any physical
499node. One of the usecases is evaluating application performance by 499node. One of the use cases is evaluating application performance by
500combining this information with the application's cpu allocation. 500combining this information with the application's CPU allocation.
501 501
502We export "total", "file", "anon" and "unevictable" pages per-node for 502We export "total", "file", "anon" and "unevictable" pages per-node for
503each memcg. The ouput format of memory.numa_stat is: 503each memcg. The ouput format of memory.numa_stat is:
@@ -561,10 +561,10 @@ are pushed back to their soft limits. If the soft limit of each control
561group is very high, they are pushed back as much as possible to make 561group is very high, they are pushed back as much as possible to make
562sure that one control group does not starve the others of memory. 562sure that one control group does not starve the others of memory.
563 563
564Please note that soft limits is a best effort feature, it comes with 564Please note that soft limits is a best-effort feature; it comes with
565no guarantees, but it does its best to make sure that when memory is 565no guarantees, but it does its best to make sure that when memory is
566heavily contended for, memory is allocated based on the soft limit 566heavily contended for, memory is allocated based on the soft limit
567hints/setup. Currently soft limit based reclaim is setup such that 567hints/setup. Currently soft limit based reclaim is set up such that
568it gets invoked from balance_pgdat (kswapd). 568it gets invoked from balance_pgdat (kswapd).
569 569
5707.1 Interface 5707.1 Interface
@@ -592,7 +592,7 @@ page tables.
592 592
5938.1 Interface 5938.1 Interface
594 594
595This feature is disabled by default. It can be enabled(and disabled again) by 595This feature is disabled by default. It can be enabled (and disabled again) by
596writing to memory.move_charge_at_immigrate of the destination cgroup. 596writing to memory.move_charge_at_immigrate of the destination cgroup.
597 597
598If you want to enable it: 598If you want to enable it:
@@ -601,8 +601,8 @@ If you want to enable it:
601 601
602Note: Each bits of move_charge_at_immigrate has its own meaning about what type 602Note: Each bits of move_charge_at_immigrate has its own meaning about what type
603 of charges should be moved. See 8.2 for details. 603 of charges should be moved. See 8.2 for details.
604Note: Charges are moved only when you move mm->owner, IOW, a leader of a thread 604Note: Charges are moved only when you move mm->owner, in other words,
605 group. 605 a leader of a thread group.
606Note: If we cannot find enough space for the task in the destination cgroup, we 606Note: If we cannot find enough space for the task in the destination cgroup, we
607 try to make space by reclaiming memory. Task migration may fail if we 607 try to make space by reclaiming memory. Task migration may fail if we
608 cannot make enough space. 608 cannot make enough space.
@@ -612,25 +612,25 @@ And if you want disable it again:
612 612
613# echo 0 > memory.move_charge_at_immigrate 613# echo 0 > memory.move_charge_at_immigrate
614 614
6158.2 Type of charges which can be move 6158.2 Type of charges which can be moved
616 616
617Each bits of move_charge_at_immigrate has its own meaning about what type of 617Each bit in move_charge_at_immigrate has its own meaning about what type of
618charges should be moved. But in any cases, it must be noted that an account of 618charges should be moved. But in any case, it must be noted that an account of
619a page or a swap can be moved only when it is charged to the task's current(old) 619a page or a swap can be moved only when it is charged to the task's current
620memory cgroup. 620(old) memory cgroup.
621 621
622 bit | what type of charges would be moved ? 622 bit | what type of charges would be moved ?
623 -----+------------------------------------------------------------------------ 623 -----+------------------------------------------------------------------------
624 0 | A charge of an anonymous page(or swap of it) used by the target task. 624 0 | A charge of an anonymous page (or swap of it) used by the target task.
625 | You must enable Swap Extension(see 2.4) to enable move of swap charges. 625 | You must enable Swap Extension (see 2.4) to enable move of swap charges.
626 -----+------------------------------------------------------------------------ 626 -----+------------------------------------------------------------------------
627 1 | A charge of file pages(normal file, tmpfs file(e.g. ipc shared memory) 627 1 | A charge of file pages (normal file, tmpfs file (e.g. ipc shared memory)
628 | and swaps of tmpfs file) mmapped by the target task. Unlike the case of 628 | and swaps of tmpfs file) mmapped by the target task. Unlike the case of
629 | anonymous pages, file pages(and swaps) in the range mmapped by the task 629 | anonymous pages, file pages (and swaps) in the range mmapped by the task
630 | will be moved even if the task hasn't done page fault, i.e. they might 630 | will be moved even if the task hasn't done page fault, i.e. they might
631 | not be the task's "RSS", but other task's "RSS" that maps the same file. 631 | not be the task's "RSS", but other task's "RSS" that maps the same file.
632 | And mapcount of the page is ignored(the page can be moved even if 632 | And mapcount of the page is ignored (the page can be moved even if
633 | page_mapcount(page) > 1). You must enable Swap Extension(see 2.4) to 633 | page_mapcount(page) > 1). You must enable Swap Extension (see 2.4) to
634 | enable move of swap charges. 634 | enable move of swap charges.
635 635
6368.3 TODO 6368.3 TODO
@@ -640,11 +640,11 @@ memory cgroup.
640 640
6419. Memory thresholds 6419. Memory thresholds
642 642
643Memory cgroup implements memory thresholds using cgroups notification 643Memory cgroup implements memory thresholds using the cgroups notification
644API (see cgroups.txt). It allows to register multiple memory and memsw 644API (see cgroups.txt). It allows to register multiple memory and memsw
645thresholds and gets notifications when it crosses. 645thresholds and gets notifications when it crosses.
646 646
647To register a threshold application need: 647To register a threshold, an application must:
648- create an eventfd using eventfd(2); 648- create an eventfd using eventfd(2);
649- open memory.usage_in_bytes or memory.memsw.usage_in_bytes; 649- open memory.usage_in_bytes or memory.memsw.usage_in_bytes;
650- write string like "<event_fd> <fd of memory.usage_in_bytes> <threshold>" to 650- write string like "<event_fd> <fd of memory.usage_in_bytes> <threshold>" to
@@ -659,24 +659,24 @@ It's applicable for root and non-root cgroup.
659 659
660memory.oom_control file is for OOM notification and other controls. 660memory.oom_control file is for OOM notification and other controls.
661 661
662Memory cgroup implements OOM notifier using cgroup notification 662Memory cgroup implements OOM notifier using the cgroup notification
663API (See cgroups.txt). It allows to register multiple OOM notification 663API (See cgroups.txt). It allows to register multiple OOM notification
664delivery and gets notification when OOM happens. 664delivery and gets notification when OOM happens.
665 665
666To register a notifier, application need: 666To register a notifier, an application must:
667 - create an eventfd using eventfd(2) 667 - create an eventfd using eventfd(2)
668 - open memory.oom_control file 668 - open memory.oom_control file
669 - write string like "<event_fd> <fd of memory.oom_control>" to 669 - write string like "<event_fd> <fd of memory.oom_control>" to
670 cgroup.event_control 670 cgroup.event_control
671 671
672Application will be notified through eventfd when OOM happens. 672The application will be notified through eventfd when OOM happens.
673OOM notification doesn't work for root cgroup. 673OOM notification doesn't work for the root cgroup.
674 674
675You can disable OOM-killer by writing "1" to memory.oom_control file, as: 675You can disable the OOM-killer by writing "1" to memory.oom_control file, as:
676 676
677 #echo 1 > memory.oom_control 677 #echo 1 > memory.oom_control
678 678
679This operation is only allowed to the top cgroup of sub-hierarchy. 679This operation is only allowed to the top cgroup of a sub-hierarchy.
680If OOM-killer is disabled, tasks under cgroup will hang/sleep 680If OOM-killer is disabled, tasks under cgroup will hang/sleep
681in memory cgroup's OOM-waitqueue when they request accountable memory. 681in memory cgroup's OOM-waitqueue when they request accountable memory.
682 682
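[Editor's illustrative sketch, not part of the patch: the OOM-notification steps listed in section 10 above (create an eventfd, open memory.oom_control, write "<event_fd> <fd of memory.oom_control>" to cgroup.event_control) map directly onto a few system calls; the memory-threshold registration in section 9 follows the same pattern. The cgroup path below reuses the /sys/fs/cgroup/memory/0 example from earlier in this document and is an assumption; most error handling is trimmed.]

/*
 * Register for OOM notifications on a memcg, following the steps in
 * section 10: eventfd(2), open memory.oom_control, write
 * "<event_fd> <fd>" to cgroup.event_control, then block on the eventfd.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/eventfd.h>

int main(void)
{
    const char *cg = "/sys/fs/cgroup/memory/0";  /* assumed cgroup path */
    char path[256], buf[64];
    uint64_t count;
    int efd, ofd, cfd;

    efd = eventfd(0, 0);                         /* step 1: create an eventfd */

    snprintf(path, sizeof(path), "%s/memory.oom_control", cg);
    ofd = open(path, O_RDONLY);                  /* step 2: open memory.oom_control */

    snprintf(path, sizeof(path), "%s/cgroup.event_control", cg);
    cfd = open(path, O_WRONLY);

    if (efd < 0 || ofd < 0 || cfd < 0) {
        perror("setup");
        return 1;
    }

    snprintf(buf, sizeof(buf), "%d %d", efd, ofd);
    if (write(cfd, buf, strlen(buf)) < 0) {      /* step 3: register the pair */
        perror("cgroup.event_control");
        return 1;
    }

    if (read(efd, &count, sizeof(count)) > 0)    /* blocks until an OOM event */
        printf("memcg OOM event received\n");
    return 0;
}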
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index fb0a6aeb936c..a1793d670cd0 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -33,7 +33,7 @@ Table of Contents
33 2 Modifying System Parameters 33 2 Modifying System Parameters
34 34
35 3 Per-Process Parameters 35 3 Per-Process Parameters
36 3.1 /proc/<pid>/oom_adj & /proc/<pid>/oom_score_adj - Adjust the oom-killer 36 3.1 /proc/<pid>/oom_score_adj - Adjust the oom-killer
37 score 37 score
38 3.2 /proc/<pid>/oom_score - Display current oom-killer score 38 3.2 /proc/<pid>/oom_score - Display current oom-killer score
39 3.3 /proc/<pid>/io - Display the IO accounting fields 39 3.3 /proc/<pid>/io - Display the IO accounting fields
@@ -1320,10 +1320,10 @@ of the kernel.
1320CHAPTER 3: PER-PROCESS PARAMETERS 1320CHAPTER 3: PER-PROCESS PARAMETERS
1321------------------------------------------------------------------------------ 1321------------------------------------------------------------------------------
1322 1322
13233.1 /proc/<pid>/oom_adj & /proc/<pid>/oom_score_adj- Adjust the oom-killer score 13233.1 /proc/<pid>/oom_score_adj- Adjust the oom-killer score
1324-------------------------------------------------------------------------------- 1324--------------------------------------------------------------------------------
1325 1325
1326These file can be used to adjust the badness heuristic used to select which 1326This file can be used to adjust the badness heuristic used to select which
1327process gets killed in out of memory conditions. 1327process gets killed in out of memory conditions.
1328 1328
1329The badness heuristic assigns a value to each candidate task ranging from 0 1329The badness heuristic assigns a value to each candidate task ranging from 0
@@ -1361,22 +1361,10 @@ same system, cpuset, mempolicy, or memory controller resources to use at least
1361equivalent to discounting 50% of the task's allowed memory from being considered 1361equivalent to discounting 50% of the task's allowed memory from being considered
1362as scoring against the task. 1362as scoring against the task.
1363 1363
1364For backwards compatibility with previous kernels, /proc/<pid>/oom_adj may also
1365be used to tune the badness score. Its acceptable values range from -16
1366(OOM_ADJUST_MIN) to +15 (OOM_ADJUST_MAX) and a special value of -17
1367(OOM_DISABLE) to disable oom killing entirely for that task. Its value is
1368scaled linearly with /proc/<pid>/oom_score_adj.
1369
1370Writing to /proc/<pid>/oom_score_adj or /proc/<pid>/oom_adj will change the
1371other with its scaled value.
1372
1373The value of /proc/<pid>/oom_score_adj may be reduced no lower than the last 1364The value of /proc/<pid>/oom_score_adj may be reduced no lower than the last
1374value set by a CAP_SYS_RESOURCE process. To reduce the value any lower 1365value set by a CAP_SYS_RESOURCE process. To reduce the value any lower
1375requires CAP_SYS_RESOURCE. 1366requires CAP_SYS_RESOURCE.
1376 1367
1377NOTICE: /proc/<pid>/oom_adj is deprecated and will be removed, please see
1378Documentation/feature-removal-schedule.txt.
1379
1380Caveat: when a parent task is selected, the oom killer will sacrifice any first 1368Caveat: when a parent task is selected, the oom killer will sacrifice any first
1381generation children with separate address spaces instead, if possible. This 1369generation children with separate address spaces instead, if possible. This
1382avoids servers and important system daemons from being killed and loses the 1370avoids servers and important system daemons from being killed and loses the
@@ -1387,9 +1375,7 @@ minimal amount of work.
1387------------------------------------------------------------- 1375-------------------------------------------------------------
1388 1376
1389This file can be used to check the current score used by the oom-killer is for 1377This file can be used to check the current score used by the oom-killer is for
1390any given <pid>. Use it together with /proc/<pid>/oom_adj to tune which 1378any given <pid>.
1391process should be killed in an out-of-memory situation.
1392
1393 1379
13943.3 /proc/<pid>/io - Display the IO accounting fields 13803.3 /proc/<pid>/io - Display the IO accounting fields
1395------------------------------------------------------- 1381-------------------------------------------------------
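[Editor's illustrative sketch, not part of the patch: as described in section 3.1 above, a process can tune its own badness score by writing an integer to /proc/<pid>/oom_score_adj. In mainline kernels the accepted range is -1000 (OOM_SCORE_ADJ_MIN, never select) to +1000 (OOM_SCORE_ADJ_MAX); the value used below is only illustrative, and lowering the value below one previously set requires CAP_SYS_RESOURCE, as noted above.]

/* Make the calling task less attractive to the OOM killer. */
#include <stdio.h>

int main(void)
{
    FILE *f = fopen("/proc/self/oom_score_adj", "w");

    if (!f) {
        perror("oom_score_adj");
        return 1;
    }
    fprintf(f, "-500\n");   /* illustrative value within -1000..+1000 */
    fclose(f);

    /* The resulting score can then be read back from /proc/self/oom_score. */
    return 0;
}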
diff --git a/Documentation/memory.txt b/Documentation/memory.txt
deleted file mode 100644
index 802efe58647c..000000000000
--- a/Documentation/memory.txt
+++ /dev/null
@@ -1,33 +0,0 @@
1There are several classic problems related to memory on Linux
2systems.
3
4 1) There are some motherboards that will not cache above
5 a certain quantity of memory. If you have one of these
6 motherboards, your system will be SLOWER, not faster
7 as you add more memory. Consider exchanging your
8 motherboard.
9
10All of these problems can be addressed with the "mem=XXXM" boot option
11(where XXX is the size of RAM to use in megabytes).
12It can also tell Linux to use less memory than is actually installed.
13If you use "mem=" on a machine with PCI, consider using "memmap=" to avoid
14physical address space collisions.
15
16See the documentation of your boot loader (LILO, grub, loadlin, etc.) about
17how to pass options to the kernel.
18
19There are other memory problems which Linux cannot deal with. Random
20corruption of memory is usually a sign of serious hardware trouble.
21Try:
22
23 * Reducing memory settings in the BIOS to the most conservative
24 timings.
25
26 * Adding a cooling fan.
27
28 * Not overclocking your CPU.
29
30 * Having the memory tested in a memory tester or exchanged
31 with the vendor. Consider testing it with memtest86 yourself.
32
33 * Exchanging your CPU, cache, or motherboard for one that works.
diff --git a/Documentation/prio_tree.txt b/Documentation/prio_tree.txt
deleted file mode 100644
index 3aa68f9a117b..000000000000
--- a/Documentation/prio_tree.txt
+++ /dev/null
@@ -1,107 +0,0 @@
1The prio_tree.c code indexes vmas using 3 different indexes:
2 * heap_index = vm_pgoff + vm_size_in_pages : end_vm_pgoff
3 * radix_index = vm_pgoff : start_vm_pgoff
4 * size_index = vm_size_in_pages
5
6A regular radix-priority-search-tree indexes vmas using only heap_index and
7radix_index. The conditions for indexing are:
8 * ->heap_index >= ->left->heap_index &&
9 ->heap_index >= ->right->heap_index
10 * if (->heap_index == ->left->heap_index)
11 then ->radix_index < ->left->radix_index;
12 * if (->heap_index == ->right->heap_index)
13 then ->radix_index < ->right->radix_index;
14 * nodes are hashed to left or right subtree using radix_index
15 similar to a pure binary radix tree.
16
17A regular radix-priority-search-tree helps to store and query
18intervals (vmas). However, a regular radix-priority-search-tree is only
19suitable for storing vmas with different radix indices (vm_pgoff).
20
21Therefore, the prio_tree.c extends the regular radix-priority-search-tree
22to handle many vmas with the same vm_pgoff. Such vmas are handled in
232 different ways: 1) All vmas with the same radix _and_ heap indices are
24linked using vm_set.list, 2) if there are many vmas with the same radix
25index, but different heap indices and if the regular radix-priority-search
26tree cannot index them all, we build an overflow-sub-tree that indexes such
27vmas using heap and size indices instead of heap and radix indices. For
28example, in the figure below some vmas with vm_pgoff = 0 (zero) are
29indexed by regular radix-priority-search-tree whereas others are pushed
30into an overflow-subtree. Note that all vmas in an overflow-sub-tree have
31the same vm_pgoff (radix_index) and if necessary we build different
32overflow-sub-trees to handle each possible radix_index. For example,
33in figure we have 3 overflow-sub-trees corresponding to radix indices
340, 2, and 4.
35
36In the final tree the first few (prio_tree_root->index_bits) levels
37are indexed using heap and radix indices whereas the overflow-sub-trees below
38those levels (i.e. levels prio_tree_root->index_bits + 1 and higher) are
39indexed using heap and size indices. In overflow-sub-trees the size_index
40is used for hashing the nodes to appropriate places.
41
42Now, an example prio_tree:
43
44 vmas are represented [radix_index, size_index, heap_index]
45 i.e., [start_vm_pgoff, vm_size_in_pages, end_vm_pgoff]
46
47level prio_tree_root->index_bits = 3
48-----
49 _
50 0 [0,7,7] |
51 / \ |
52 ------------------ ------------ | Regular
53 / \ | radix priority
54 1 [1,6,7] [4,3,7] | search tree
55 / \ / \ |
56 ------- ----- ------ ----- | heap-and-radix
57 / \ / \ | indexed
58 2 [0,6,6] [2,5,7] [5,2,7] [6,1,7] |
59 / \ / \ / \ / \ |
60 3 [0,5,5] [1,5,6] [2,4,6] [3,4,7] [4,2,6] [5,1,6] [6,0,6] [7,0,7] |
61 / / / _
62 / / / _
63 4 [0,4,4] [2,3,5] [4,1,5] |
64 / / / |
65 5 [0,3,3] [2,2,4] [4,0,4] | Overflow-sub-trees
66 / / |
67 6 [0,2,2] [2,1,3] | heap-and-size
68 / / | indexed
69 7 [0,1,1] [2,0,2] |
70 / |
71 8 [0,0,0] |
72 _
73
74Note that we use prio_tree_root->index_bits to optimize the height
75of the heap-and-radix indexed tree. Since prio_tree_root->index_bits is
76set according to the maximum end_vm_pgoff mapped, we are sure that all
77bits (in vm_pgoff) above prio_tree_root->index_bits are 0 (zero). Therefore,
78we only use the first prio_tree_root->index_bits as radix_index.
79Whenever index_bits is increased in prio_tree_expand, we shuffle the tree
80to make sure that the first prio_tree_root->index_bits levels of the tree
81is indexed properly using heap and radix indices.
82
83We do not optimize the height of overflow-sub-trees using index_bits.
84The reason is: there can be many such overflow-sub-trees and all of
85them have to be suffled whenever the index_bits increases. This may involve
86walking the whole prio_tree in prio_tree_insert->prio_tree_expand code
87path which is not desirable. Hence, we do not optimize the height of the
88heap-and-size indexed overflow-sub-trees using prio_tree->index_bits.
89Instead the overflow sub-trees are indexed using full BITS_PER_LONG bits
90of size_index. This may lead to skewed sub-trees because most of the
91higher significant bits of the size_index are likely to be 0 (zero). In
92the example above, all 3 overflow-sub-trees are skewed. This may marginally
93affect the performance. However, processes rarely map many vmas with the
94same start_vm_pgoff but different end_vm_pgoffs. Therefore, we normally
95do not require overflow-sub-trees to index all vmas.
96
97From the above discussion it is clear that the maximum height of
98a prio_tree can be prio_tree_root->index_bits + BITS_PER_LONG.
99However, in most of the common cases we do not need overflow-sub-trees,
100so the tree height in the common cases will be prio_tree_root->index_bits.
101
102It is fair to mention here that the prio_tree_root->index_bits
103is increased on demand, however, the index_bits is not decreased when
104vmas are removed from the prio_tree. That's tricky to do. Hence, it's
105left as a home work problem.
106
107
diff --git a/Documentation/rbtree.txt b/Documentation/rbtree.txt
index 8d32d85a5234..61b6c48871a0 100644
--- a/Documentation/rbtree.txt
+++ b/Documentation/rbtree.txt
@@ -193,24 +193,55 @@ Example:
193Support for Augmented rbtrees 193Support for Augmented rbtrees
194----------------------------- 194-----------------------------
195 195
196Augmented rbtree is an rbtree with "some" additional data stored in each node. 196Augmented rbtree is an rbtree with "some" additional data stored in
197This data can be used to augment some new functionality to rbtree. 197each node, where the additional data for node N must be a function of
198Augmented rbtree is an optional feature built on top of basic rbtree 198the contents of all nodes in the subtree rooted at N. This data can
199infrastructure. An rbtree user who wants this feature will have to call the 199be used to augment some new functionality to rbtree. Augmented rbtree
200augmentation functions with the user provided augmentation callback 200is an optional feature built on top of basic rbtree infrastructure.
201when inserting and erasing nodes. 201An rbtree user who wants this feature will have to call the augmentation
202 202functions with the user provided augmentation callback when inserting
203On insertion, the user must call rb_augment_insert() once the new node is in 203and erasing nodes.
204place. This will cause the augmentation function callback to be called for 204
205each node between the new node and the root which has been affected by the 205C files implementing augmented rbtree manipulation must include
206insertion. 206<linux/rbtree_augmented.h> instead of <linus/rbtree.h>. Note that
207 207linux/rbtree_augmented.h exposes some rbtree implementations details
208When erasing a node, the user must call rb_augment_erase_begin() first to 208you are not expected to rely on; please stick to the documented APIs
209retrieve the deepest node on the rebalance path. Then, after erasing the 209there and do not include <linux/rbtree_augmented.h> from header files
210original node, the user must call rb_augment_erase_end() with the deepest 210either so as to minimize chances of your users accidentally relying on
211node found earlier. This will cause the augmentation function to be called 211such implementation details.
212for each affected node between the deepest node and the root. 212
213 213On insertion, the user must update the augmented information on the path
214leading to the inserted node, then call rb_link_node() as usual and
215rb_augment_inserted() instead of the usual rb_insert_color() call.
216If rb_augment_inserted() rebalances the rbtree, it will callback into
217a user provided function to update the augmented information on the
218affected subtrees.
219
220When erasing a node, the user must call rb_erase_augmented() instead of
221rb_erase(). rb_erase_augmented() calls back into user provided functions
222to updated the augmented information on affected subtrees.
223
224In both cases, the callbacks are provided through struct rb_augment_callbacks.
2253 callbacks must be defined:
226
227- A propagation callback, which updates the augmented value for a given
228 node and its ancestors, up to a given stop point (or NULL to update
229 all the way to the root).
230
231- A copy callback, which copies the augmented value for a given subtree
232 to a newly assigned subtree root.
233
234- A tree rotation callback, which copies the augmented value for a given
235 subtree to a newly assigned subtree root AND recomputes the augmented
236 information for the former subtree root.
237
238The compiled code for rb_erase_augmented() may inline the propagation and
239copy callbacks, which results in a large function, so each augmented rbtree
240user should have a single rb_erase_augmented() call site in order to limit
241compiled code size.
242
243
244Sample usage:
214 245
215Interval tree is an example of augmented rb tree. Reference - 246Interval tree is an example of augmented rb tree. Reference -
216"Introduction to Algorithms" by Cormen, Leiserson, Rivest and Stein. 247"Introduction to Algorithms" by Cormen, Leiserson, Rivest and Stein.
@@ -230,26 +261,132 @@ and its immediate children. And this will be used in O(log n) lookup
230for lowest match (lowest start address among all possible matches) 261for lowest match (lowest start address among all possible matches)
231with something like: 262with something like:
232 263
233find_lowest_match(lo, hi, node) 264struct interval_tree_node *
265interval_tree_first_match(struct rb_root *root,
266 unsigned long start, unsigned long last)
234{ 267{
235 lowest_match = NULL; 268 struct interval_tree_node *node;
236 while (node) { 269
237 if (max_hi(node->left) > lo) { 270 if (!root->rb_node)
238 // Lowest overlap if any must be on left side 271 return NULL;
239 node = node->left; 272 node = rb_entry(root->rb_node, struct interval_tree_node, rb);
240 } else if (overlap(lo, hi, node)) { 273
241 lowest_match = node; 274 while (true) {
242 break; 275 if (node->rb.rb_left) {
243 } else if (lo > node->lo) { 276 struct interval_tree_node *left =
244 // Lowest overlap if any must be on right side 277 rb_entry(node->rb.rb_left,
245 node = node->right; 278 struct interval_tree_node, rb);
246 } else { 279 if (left->__subtree_last >= start) {
247 break; 280 /*
281 * Some nodes in left subtree satisfy Cond2.
282 * Iterate to find the leftmost such node N.
283 * If it also satisfies Cond1, that's the match
284 * we are looking for. Otherwise, there is no
285 * matching interval as nodes to the right of N
286 * can't satisfy Cond1 either.
287 */
288 node = left;
289 continue;
290 }
248 } 291 }
292 if (node->start <= last) { /* Cond1 */
293 if (node->last >= start) /* Cond2 */
294 return node; /* node is leftmost match */
295 if (node->rb.rb_right) {
296 node = rb_entry(node->rb.rb_right,
297 struct interval_tree_node, rb);
298 if (node->__subtree_last >= start)
299 continue;
300 }
301 }
302 return NULL; /* No match */
303 }
304}
305
306Insertion/removal are defined using the following augmented callbacks:
307
308static inline unsigned long
309compute_subtree_last(struct interval_tree_node *node)
310{
311 unsigned long max = node->last, subtree_last;
312 if (node->rb.rb_left) {
313 subtree_last = rb_entry(node->rb.rb_left,
314 struct interval_tree_node, rb)->__subtree_last;
315 if (max < subtree_last)
316 max = subtree_last;
317 }
318 if (node->rb.rb_right) {
319 subtree_last = rb_entry(node->rb.rb_right,
320 struct interval_tree_node, rb)->__subtree_last;
321 if (max < subtree_last)
322 max = subtree_last;
323 }
324 return max;
325}
326
327static void augment_propagate(struct rb_node *rb, struct rb_node *stop)
328{
329 while (rb != stop) {
330 struct interval_tree_node *node =
331 rb_entry(rb, struct interval_tree_node, rb);
332 unsigned long subtree_last = compute_subtree_last(node);
333 if (node->__subtree_last == subtree_last)
334 break;
335 node->__subtree_last = subtree_last;
336 rb = rb_parent(&node->rb);
249 } 337 }
250 return lowest_match;
251} 338}
252 339
253Finding exact match will be to first find lowest match and then to follow 340static void augment_copy(struct rb_node *rb_old, struct rb_node *rb_new)
254successor nodes looking for exact match, until the start of a node is beyond 341{
255the hi value we are looking for. 342 struct interval_tree_node *old =
343 rb_entry(rb_old, struct interval_tree_node, rb);
344 struct interval_tree_node *new =
345 rb_entry(rb_new, struct interval_tree_node, rb);
346
347 new->__subtree_last = old->__subtree_last;
348}
349
350static void augment_rotate(struct rb_node *rb_old, struct rb_node *rb_new)
351{
352 struct interval_tree_node *old =
353 rb_entry(rb_old, struct interval_tree_node, rb);
354 struct interval_tree_node *new =
355 rb_entry(rb_new, struct interval_tree_node, rb);
356
357 new->__subtree_last = old->__subtree_last;
358 old->__subtree_last = compute_subtree_last(old);
359}
360
361static const struct rb_augment_callbacks augment_callbacks = {
362 augment_propagate, augment_copy, augment_rotate
363};
364
365void interval_tree_insert(struct interval_tree_node *node,
366 struct rb_root *root)
367{
368 struct rb_node **link = &root->rb_node, *rb_parent = NULL;
369 unsigned long start = node->start, last = node->last;
370 struct interval_tree_node *parent;
371
372 while (*link) {
373 rb_parent = *link;
374 parent = rb_entry(rb_parent, struct interval_tree_node, rb);
375 if (parent->__subtree_last < last)
376 parent->__subtree_last = last;
377 if (start < parent->start)
378 link = &parent->rb.rb_left;
379 else
380 link = &parent->rb.rb_right;
381 }
382
383 node->__subtree_last = last;
384 rb_link_node(&node->rb, rb_parent, link);
385 rb_insert_augmented(&node->rb, root, &augment_callbacks);
386}
387
388void interval_tree_remove(struct interval_tree_node *node,
389 struct rb_root *root)
390{
391 rb_erase_augmented(&node->rb, root, &augment_callbacks);
392}
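[Editor's illustrative sketch, not part of the patch: a short usage example for the helpers defined in the sample above. It assumes the struct interval_tree_node layout used throughout this example ({start, last, rb, __subtree_last}) and a kernel-style rb_root initialised with RB_ROOT; in the kernel tree the equivalent helpers are generated from include/linux/interval_tree_generic.h and live in lib/interval_tree.c, both added by this patch.]

static struct rb_root itree = RB_ROOT;

static void interval_tree_example(void)
{
	struct interval_tree_node a = { .start = 10, .last = 19 };
	struct interval_tree_node b = { .start = 30, .last = 39 };
	struct interval_tree_node *hit;

	interval_tree_insert(&a, &itree);
	interval_tree_insert(&b, &itree);

	/* Leftmost node overlapping [15, 34]; for this query that is 'a'. */
	hit = interval_tree_first_match(&itree, 15, 34);

	if (hit)				/* hit == &a here */
		interval_tree_remove(hit, &itree);
	interval_tree_remove(&b, &itree);	/* leave the tree empty again */
}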
diff --git a/Documentation/vm/unevictable-lru.txt b/Documentation/vm/unevictable-lru.txt
index fa206cccf89f..a68db7692ee8 100644
--- a/Documentation/vm/unevictable-lru.txt
+++ b/Documentation/vm/unevictable-lru.txt
@@ -197,12 +197,8 @@ the pages are also "rescued" from the unevictable list in the process of
197freeing them. 197freeing them.
198 198
199page_evictable() also checks for mlocked pages by testing an additional page 199page_evictable() also checks for mlocked pages by testing an additional page
200flag, PG_mlocked (as wrapped by PageMlocked()). If the page is NOT mlocked, 200flag, PG_mlocked (as wrapped by PageMlocked()), which is set when a page is
201and a non-NULL VMA is supplied, page_evictable() will check whether the VMA is 201faulted into a VM_LOCKED vma, or found in a vma being VM_LOCKED.
202VM_LOCKED via is_mlocked_vma(). is_mlocked_vma() will SetPageMlocked() and
203update the appropriate statistics if the vma is VM_LOCKED. This method allows
204efficient "culling" of pages in the fault path that are being faulted in to
205VM_LOCKED VMAs.
206 202
207 203
208VMSCAN'S HANDLING OF UNEVICTABLE PAGES 204VMSCAN'S HANDLING OF UNEVICTABLE PAGES
@@ -371,8 +367,8 @@ mlock_fixup() filters several classes of "special" VMAs:
371 mlock_fixup() will call make_pages_present() in the hugetlbfs VMA range to 367 mlock_fixup() will call make_pages_present() in the hugetlbfs VMA range to
372 allocate the huge pages and populate the ptes. 368 allocate the huge pages and populate the ptes.
373 369
3743) VMAs with VM_DONTEXPAND or VM_RESERVED are generally userspace mappings of 3703) VMAs with VM_DONTEXPAND are generally userspace mappings of kernel pages,
375 kernel pages, such as the VDSO page, relay channel pages, etc. These pages 371 such as the VDSO page, relay channel pages, etc. These pages
376 are inherently unevictable and are not managed on the LRU lists. 372 are inherently unevictable and are not managed on the LRU lists.
377 mlock_fixup() treats these VMAs the same as hugetlbfs VMAs. It calls 373 mlock_fixup() treats these VMAs the same as hugetlbfs VMAs. It calls
378 make_pages_present() to populate the ptes. 374 make_pages_present() to populate the ptes.
@@ -651,7 +647,7 @@ PAGE RECLAIM IN shrink_*_list()
651------------------------------- 647-------------------------------
652 648
653shrink_active_list() culls any obviously unevictable pages - i.e. 649shrink_active_list() culls any obviously unevictable pages - i.e.
654!page_evictable(page, NULL) - diverting these to the unevictable list. 650!page_evictable(page) - diverting these to the unevictable list.
655However, shrink_active_list() only sees unevictable pages that made it onto the 651However, shrink_active_list() only sees unevictable pages that made it onto the
656active/inactive lru lists. Note that these pages do not have PageUnevictable 652active/inactive lru lists. Note that these pages do not have PageUnevictable
657set - otherwise they would be on the unevictable list and shrink_active_list 653set - otherwise they would be on the unevictable list and shrink_active_list
diff --git a/MAINTAINERS b/MAINTAINERS
index ab98a99bee92..eae3cd86831e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7039,6 +7039,14 @@ S: Maintained
7039F: Documentation/svga.txt 7039F: Documentation/svga.txt
7040F: arch/x86/boot/video* 7040F: arch/x86/boot/video*
7041 7041
7042SWIOTLB SUBSYSTEM
7043M: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
7044L: linux-kernel@vger.kernel.org
7045S: Supported
7046F: lib/swiotlb.c
7047F: arch/*/kernel/pci-swiotlb.c
7048F: include/linux/swiotlb.h
7049
7042SYSV FILESYSTEM 7050SYSV FILESYSTEM
7043M: Christoph Hellwig <hch@infradead.org> 7051M: Christoph Hellwig <hch@infradead.org>
7044S: Maintained 7052S: Maintained
diff --git a/arch/Kconfig b/arch/Kconfig
index a62965d057f6..550cce4dd648 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -313,4 +313,7 @@ config HAVE_IRQ_TIME_ACCOUNTING
313 Archs need to ensure they use a high enough resolution clock to 313 Archs need to ensure they use a high enough resolution clock to
314 support irq time accounting and then call enable_sched_clock_irqtime(). 314 support irq time accounting and then call enable_sched_clock_irqtime().
315 315
316config HAVE_ARCH_TRANSPARENT_HUGEPAGE
317 bool
318
316source "kernel/gcov/Kconfig" 319source "kernel/gcov/Kconfig"
diff --git a/arch/alpha/kernel/pci-sysfs.c b/arch/alpha/kernel/pci-sysfs.c
index 53649c7d0068..b51f7b4818cd 100644
--- a/arch/alpha/kernel/pci-sysfs.c
+++ b/arch/alpha/kernel/pci-sysfs.c
@@ -26,7 +26,7 @@ static int hose_mmap_page_range(struct pci_controller *hose,
26 base = sparse ? hose->sparse_io_base : hose->dense_io_base; 26 base = sparse ? hose->sparse_io_base : hose->dense_io_base;
27 27
28 vma->vm_pgoff += base >> PAGE_SHIFT; 28 vma->vm_pgoff += base >> PAGE_SHIFT;
-	vma->vm_flags |= (VM_IO | VM_RESERVED);
+	vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP;
30 30
31 return io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, 31 return io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
32 vma->vm_end - vma->vm_start, 32 vma->vm_end - vma->vm_start,
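This is the first of many hunks in the series that retire VM_RESERVED in favour of spelling out what the caller actually wanted: VM_DONTEXPAND (no growth via mremap) plus VM_DONTDUMP (skip in core dumps). A user-space sketch of the flag rewrite, with made-up bit values rather than the kernel's:

#include <stdio.h>

/* Illustrative flag values only; the kernel's VM_* bits differ. */
#define VM_IO         0x0001UL
#define VM_RESERVED   0x0002UL	/* legacy bit being removed */
#define VM_DONTEXPAND 0x0004UL	/* refuse mremap() expansion */
#define VM_DONTDUMP   0x0008UL	/* omit from core dumps */

static unsigned long convert_reserved(unsigned long vm_flags)
{
	if (vm_flags & VM_RESERVED) {
		vm_flags &= ~VM_RESERVED;
		vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
	}
	return vm_flags;
}

int main(void)
{
	unsigned long flags = VM_IO | VM_RESERVED;

	printf("before %#lx after %#lx\n", flags, convert_reserved(flags));
	return 0;
}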
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 6d2f7f5c0036..2867a7742306 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -25,6 +25,7 @@ config ARM
25 select HAVE_FUNCTION_GRAPH_TRACER if (!THUMB2_KERNEL) 25 select HAVE_FUNCTION_GRAPH_TRACER if (!THUMB2_KERNEL)
26 select ARCH_BINFMT_ELF_RANDOMIZE_PIE 26 select ARCH_BINFMT_ELF_RANDOMIZE_PIE
27 select HAVE_GENERIC_DMA_COHERENT 27 select HAVE_GENERIC_DMA_COHERENT
28 select HAVE_DEBUG_KMEMLEAK
28 select HAVE_KERNEL_GZIP 29 select HAVE_KERNEL_GZIP
29 select HAVE_KERNEL_LZO 30 select HAVE_KERNEL_LZO
30 select HAVE_KERNEL_LZMA 31 select HAVE_KERNEL_LZMA
@@ -39,6 +40,7 @@ config ARM
39 select HARDIRQS_SW_RESEND 40 select HARDIRQS_SW_RESEND
40 select GENERIC_IRQ_PROBE 41 select GENERIC_IRQ_PROBE
41 select GENERIC_IRQ_SHOW 42 select GENERIC_IRQ_SHOW
43 select HAVE_UID16
42 select ARCH_WANT_IPC_PARSE_VERSION 44 select ARCH_WANT_IPC_PARSE_VERSION
43 select HARDIRQS_SW_RESEND 45 select HARDIRQS_SW_RESEND
44 select CPU_PM if (SUSPEND || CPU_IDLE) 46 select CPU_PM if (SUSPEND || CPU_IDLE)
diff --git a/arch/arm/mm/fault-armv.c b/arch/arm/mm/fault-armv.c
index 7599e2625c7d..2a5907b5c8d2 100644
--- a/arch/arm/mm/fault-armv.c
+++ b/arch/arm/mm/fault-armv.c
@@ -134,7 +134,6 @@ make_coherent(struct address_space *mapping, struct vm_area_struct *vma,
134{ 134{
135 struct mm_struct *mm = vma->vm_mm; 135 struct mm_struct *mm = vma->vm_mm;
136 struct vm_area_struct *mpnt; 136 struct vm_area_struct *mpnt;
137 struct prio_tree_iter iter;
138 unsigned long offset; 137 unsigned long offset;
139 pgoff_t pgoff; 138 pgoff_t pgoff;
140 int aliases = 0; 139 int aliases = 0;
@@ -147,7 +146,7 @@ make_coherent(struct address_space *mapping, struct vm_area_struct *vma,
147 * cache coherency. 146 * cache coherency.
148 */ 147 */
149 flush_dcache_mmap_lock(mapping); 148 flush_dcache_mmap_lock(mapping);
-	vma_prio_tree_foreach(mpnt, &iter, &mapping->i_mmap, pgoff, pgoff) {
+	vma_interval_tree_foreach(mpnt, &mapping->i_mmap, pgoff, pgoff) {
151 /* 150 /*
152 * If this VMA is not in our MM, we can ignore it. 151 * If this VMA is not in our MM, we can ignore it.
153 * Note that we intentionally mask out the VMA 152 * Note that we intentionally mask out the VMA
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index c3bd83450227..5dbf13f954f6 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -336,6 +336,7 @@ retry:
336 /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk 336 /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
337 * of starvation. */ 337 * of starvation. */
338 flags &= ~FAULT_FLAG_ALLOW_RETRY; 338 flags &= ~FAULT_FLAG_ALLOW_RETRY;
339 flags |= FAULT_FLAG_TRIED;
339 goto retry; 340 goto retry;
340 } 341 }
341 } 342 }
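Every fault handler touched by this series gets the same two-line addition: once handle_mm_fault() returns VM_FAULT_RETRY, the arch code clears FAULT_FLAG_ALLOW_RETRY and sets FAULT_FLAG_TRIED before looping, so there is at most one retry and the second pass is recognisable as such. A user-space sketch of that control flow, using illustrative flag values:

#include <stdio.h>

/* Illustrative values only; the kernel's FAULT_FLAG_ and VM_FAULT_ bits differ. */
#define FAULT_FLAG_ALLOW_RETRY	0x04
#define FAULT_FLAG_TRIED	0x08
#define VM_FAULT_RETRY		0x01

/* Fake fault handler: asks for one retry, then succeeds. */
static int handle_mm_fault_stub(unsigned int flags)
{
	static int calls;

	(void)flags;
	return calls++ == 0 ? VM_FAULT_RETRY : 0;
}

int main(void)
{
	unsigned int flags = FAULT_FLAG_ALLOW_RETRY;
	int fault;

retry:
	fault = handle_mm_fault_stub(flags);
	if ((fault & VM_FAULT_RETRY) && (flags & FAULT_FLAG_ALLOW_RETRY)) {
		/* At most one retry: drop ALLOW_RETRY, record that we tried. */
		flags &= ~FAULT_FLAG_ALLOW_RETRY;
		flags |= FAULT_FLAG_TRIED;
		goto retry;
	}
	printf("done: fault=%#x flags=%#x\n", fault, flags);
	return 0;
}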
diff --git a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c
index 40ca11ed6e5f..1c8f7f564175 100644
--- a/arch/arm/mm/flush.c
+++ b/arch/arm/mm/flush.c
@@ -196,7 +196,6 @@ static void __flush_dcache_aliases(struct address_space *mapping, struct page *p
196{ 196{
197 struct mm_struct *mm = current->active_mm; 197 struct mm_struct *mm = current->active_mm;
198 struct vm_area_struct *mpnt; 198 struct vm_area_struct *mpnt;
199 struct prio_tree_iter iter;
200 pgoff_t pgoff; 199 pgoff_t pgoff;
201 200
202 /* 201 /*
@@ -208,7 +207,7 @@ static void __flush_dcache_aliases(struct address_space *mapping, struct page *p
208 pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 207 pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
209 208
210 flush_dcache_mmap_lock(mapping); 209 flush_dcache_mmap_lock(mapping);
-	vma_prio_tree_foreach(mpnt, &iter, &mapping->i_mmap, pgoff, pgoff) {
+	vma_interval_tree_foreach(mpnt, &mapping->i_mmap, pgoff, pgoff) {
212 unsigned long offset; 211 unsigned long offset;
213 212
214 /* 213 /*
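The prio-tree to interval-tree conversion here (and in fault-armv.c above and parisc below) only changes the iterator: vma_interval_tree_foreach() visits every VMA whose file range overlaps the queried page offset and no longer needs a struct prio_tree_iter. A naive user-space stand-in for that overlap query; the kernel uses an augmented rbtree, not a linear scan:

#include <stdio.h>
#include <stddef.h>

/* Each "vma" maps a range of page offsets of the file; the flush paths want
 * every vma that overlaps the single page offset being flushed. */
struct fake_vma {
	unsigned long pgoff_start;
	unsigned long pgoff_last;
	const char *name;
};

int main(void)
{
	struct fake_vma vmas[] = {
		{  0, 15, "mapping A" },
		{  8, 23, "mapping B" },
		{ 32, 47, "mapping C" },
	};
	unsigned long pgoff = 10;	/* page being flushed */
	size_t i;

	for (i = 0; i < sizeof(vmas) / sizeof(vmas[0]); i++)
		if (vmas[i].pgoff_start <= pgoff && pgoff <= vmas[i].pgoff_last)
			printf("flush alias in %s\n", vmas[i].name);
	return 0;
}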
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 767ba5685454..7ff68c946073 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -10,6 +10,8 @@ config ARM64
10 select GENERIC_TIME_VSYSCALL 10 select GENERIC_TIME_VSYSCALL
11 select HARDIRQS_SW_RESEND 11 select HARDIRQS_SW_RESEND
12 select HAVE_ARCH_TRACEHOOK 12 select HAVE_ARCH_TRACEHOOK
13 select HAVE_DEBUG_BUGVERBOSE
14 select HAVE_DEBUG_KMEMLEAK
13 select HAVE_DMA_API_DEBUG 15 select HAVE_DMA_API_DEBUG
14 select HAVE_DMA_ATTRS 16 select HAVE_DMA_ATTRS
15 select HAVE_GENERIC_DMA_COHERENT 17 select HAVE_GENERIC_DMA_COHERENT
@@ -26,6 +28,7 @@ config ARM64
26 select PERF_USE_VMALLOC 28 select PERF_USE_VMALLOC
27 select RTC_LIB 29 select RTC_LIB
28 select SPARSE_IRQ 30 select SPARSE_IRQ
31 select SYSCTL_EXCEPTION_TRACE
29 help 32 help
30 ARM 64-bit (AArch64) Linux support. 33 ARM 64-bit (AArch64) Linux support.
31 34
@@ -193,6 +196,7 @@ config COMPAT
193 bool "Kernel support for 32-bit EL0" 196 bool "Kernel support for 32-bit EL0"
194 depends on !ARM64_64K_PAGES 197 depends on !ARM64_64K_PAGES
195 select COMPAT_BINFMT_ELF 198 select COMPAT_BINFMT_ELF
199 select HAVE_UID16
196 help 200 help
197 This option enables support for a 32-bit EL0 running under a 64-bit 201 This option enables support for a 32-bit EL0 running under a 64-bit
198 kernel at EL1. AArch32-specific components such as system calls, 202 kernel at EL1. AArch32-specific components such as system calls,
diff --git a/arch/avr32/mm/fault.c b/arch/avr32/mm/fault.c
index b92e60958617..b2f2d2d66849 100644
--- a/arch/avr32/mm/fault.c
+++ b/arch/avr32/mm/fault.c
@@ -152,6 +152,7 @@ good_area:
152 tsk->min_flt++; 152 tsk->min_flt++;
153 if (fault & VM_FAULT_RETRY) { 153 if (fault & VM_FAULT_RETRY) {
154 flags &= ~FAULT_FLAG_ALLOW_RETRY; 154 flags &= ~FAULT_FLAG_ALLOW_RETRY;
155 flags |= FAULT_FLAG_TRIED;
155 156
156 /* 157 /*
157 * No need to up_read(&mm->mmap_sem) as we would have 158 * No need to up_read(&mm->mmap_sem) as we would have
diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig
index 99224c4eb86b..ccd9193932b2 100644
--- a/arch/blackfin/Kconfig
+++ b/arch/blackfin/Kconfig
@@ -33,6 +33,7 @@ config BLACKFIN
33 select HAVE_PERF_EVENTS 33 select HAVE_PERF_EVENTS
34 select ARCH_HAVE_CUSTOM_GPIO_H 34 select ARCH_HAVE_CUSTOM_GPIO_H
35 select ARCH_WANT_OPTIONAL_GPIOLIB 35 select ARCH_WANT_OPTIONAL_GPIOLIB
36 select HAVE_UID16
36 select ARCH_WANT_IPC_PARSE_VERSION 37 select ARCH_WANT_IPC_PARSE_VERSION
37 select HAVE_GENERIC_HARDIRQS 38 select HAVE_GENERIC_HARDIRQS
38 select GENERIC_ATOMIC64 39 select GENERIC_ATOMIC64
diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig
index 72bd5ae50a89..a118163b04ee 100644
--- a/arch/cris/Kconfig
+++ b/arch/cris/Kconfig
@@ -42,6 +42,7 @@ config CRIS
42 select HAVE_IDE 42 select HAVE_IDE
43 select GENERIC_ATOMIC64 43 select GENERIC_ATOMIC64
44 select HAVE_GENERIC_HARDIRQS 44 select HAVE_GENERIC_HARDIRQS
45 select HAVE_UID16
45 select ARCH_WANT_IPC_PARSE_VERSION 46 select ARCH_WANT_IPC_PARSE_VERSION
46 select GENERIC_IRQ_SHOW 47 select GENERIC_IRQ_SHOW
47 select GENERIC_IOMAP 48 select GENERIC_IOMAP
diff --git a/arch/cris/mm/fault.c b/arch/cris/mm/fault.c
index 45fd542cf173..73312ab6c696 100644
--- a/arch/cris/mm/fault.c
+++ b/arch/cris/mm/fault.c
@@ -186,6 +186,7 @@ retry:
186 tsk->min_flt++; 186 tsk->min_flt++;
187 if (fault & VM_FAULT_RETRY) { 187 if (fault & VM_FAULT_RETRY) {
188 flags &= ~FAULT_FLAG_ALLOW_RETRY; 188 flags &= ~FAULT_FLAG_ALLOW_RETRY;
189 flags |= FAULT_FLAG_TRIED;
189 190
190 /* 191 /*
191 * No need to up_read(&mm->mmap_sem) as we would 192 * No need to up_read(&mm->mmap_sem) as we would
diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig
index 971c0a19facb..9d262645f667 100644
--- a/arch/frv/Kconfig
+++ b/arch/frv/Kconfig
@@ -5,8 +5,10 @@ config FRV
5 select HAVE_ARCH_TRACEHOOK 5 select HAVE_ARCH_TRACEHOOK
6 select HAVE_IRQ_WORK 6 select HAVE_IRQ_WORK
7 select HAVE_PERF_EVENTS 7 select HAVE_PERF_EVENTS
8 select HAVE_UID16
8 select HAVE_GENERIC_HARDIRQS 9 select HAVE_GENERIC_HARDIRQS
9 select GENERIC_IRQ_SHOW 10 select GENERIC_IRQ_SHOW
11 select HAVE_DEBUG_BUGVERBOSE
10 select ARCH_HAVE_NMI_SAFE_CMPXCHG 12 select ARCH_HAVE_NMI_SAFE_CMPXCHG
11 select GENERIC_CPU_DEVICES 13 select GENERIC_CPU_DEVICES
12 select ARCH_WANT_IPC_PARSE_VERSION 14 select ARCH_WANT_IPC_PARSE_VERSION
diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig
index 5e8a0d9a09ce..90462eb23d02 100644
--- a/arch/h8300/Kconfig
+++ b/arch/h8300/Kconfig
@@ -3,6 +3,7 @@ config H8300
3 default y 3 default y
4 select HAVE_IDE 4 select HAVE_IDE
5 select HAVE_GENERIC_HARDIRQS 5 select HAVE_GENERIC_HARDIRQS
6 select HAVE_UID16
6 select ARCH_WANT_IPC_PARSE_VERSION 7 select ARCH_WANT_IPC_PARSE_VERSION
7 select GENERIC_IRQ_SHOW 8 select GENERIC_IRQ_SHOW
8 select GENERIC_CPU_DEVICES 9 select GENERIC_CPU_DEVICES
diff --git a/arch/hexagon/mm/vm_fault.c b/arch/hexagon/mm/vm_fault.c
index 06695cc4fe58..513b74cb397e 100644
--- a/arch/hexagon/mm/vm_fault.c
+++ b/arch/hexagon/mm/vm_fault.c
@@ -113,6 +113,7 @@ good_area:
113 current->min_flt++; 113 current->min_flt++;
114 if (fault & VM_FAULT_RETRY) { 114 if (fault & VM_FAULT_RETRY) {
115 flags &= ~FAULT_FLAG_ALLOW_RETRY; 115 flags &= ~FAULT_FLAG_ALLOW_RETRY;
116 flags |= FAULT_FLAG_TRIED;
116 goto retry; 117 goto retry;
117 } 118 }
118 } 119 }
diff --git a/arch/ia64/include/asm/hugetlb.h b/arch/ia64/include/asm/hugetlb.h
index da55c63728e0..94eaa5bd5d0c 100644
--- a/arch/ia64/include/asm/hugetlb.h
+++ b/arch/ia64/include/asm/hugetlb.h
@@ -77,4 +77,8 @@ static inline void arch_release_hugepage(struct page *page)
77{ 77{
78} 78}
79 79
80static inline void arch_clear_hugepage_flags(struct page *page)
81{
82}
83
80#endif /* _ASM_IA64_HUGETLB_H */ 84#endif /* _ASM_IA64_HUGETLB_H */
diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index f388b4e18a37..ea39eba61ef5 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -2307,7 +2307,7 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t
2307 */ 2307 */
2308 vma->vm_mm = mm; 2308 vma->vm_mm = mm;
2309 vma->vm_file = get_file(filp); 2309 vma->vm_file = get_file(filp);
-	vma->vm_flags = VM_READ| VM_MAYREAD |VM_RESERVED;
+	vma->vm_flags = VM_READ|VM_MAYREAD|VM_DONTEXPAND|VM_DONTDUMP;
2311 vma->vm_page_prot = PAGE_READONLY; /* XXX may need to change */ 2311 vma->vm_page_prot = PAGE_READONLY; /* XXX may need to change */
2312 2312
2313 /* 2313 /*
diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c
index 8443daf4f515..6cf0341f978e 100644
--- a/arch/ia64/mm/fault.c
+++ b/arch/ia64/mm/fault.c
@@ -184,6 +184,7 @@ retry:
184 current->min_flt++; 184 current->min_flt++;
185 if (fault & VM_FAULT_RETRY) { 185 if (fault & VM_FAULT_RETRY) {
186 flags &= ~FAULT_FLAG_ALLOW_RETRY; 186 flags &= ~FAULT_FLAG_ALLOW_RETRY;
187 flags |= FAULT_FLAG_TRIED;
187 188
188 /* No need to up_read(&mm->mmap_sem) as we would 189 /* No need to up_read(&mm->mmap_sem) as we would
189 * have already released it in __lock_page_or_retry 190 * have already released it in __lock_page_or_retry
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index 0eab454867a2..acd5b68e8871 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -138,7 +138,8 @@ ia64_init_addr_space (void)
138 vma->vm_mm = current->mm; 138 vma->vm_mm = current->mm;
139 vma->vm_end = PAGE_SIZE; 139 vma->vm_end = PAGE_SIZE;
140 vma->vm_page_prot = __pgprot(pgprot_val(PAGE_READONLY) | _PAGE_MA_NAT); 140 vma->vm_page_prot = __pgprot(pgprot_val(PAGE_READONLY) | _PAGE_MA_NAT);
-	vma->vm_flags = VM_READ | VM_MAYREAD | VM_IO | VM_RESERVED;
+	vma->vm_flags = VM_READ | VM_MAYREAD | VM_IO |
+			VM_DONTEXPAND | VM_DONTDUMP;
142 down_write(&current->mm->mmap_sem); 143 down_write(&current->mm->mmap_sem);
143 if (insert_vm_struct(current->mm, vma)) { 144 if (insert_vm_struct(current->mm, vma)) {
144 up_write(&current->mm->mmap_sem); 145 up_write(&current->mm->mmap_sem);
@@ -636,6 +637,7 @@ mem_init (void)
636 637
637 high_memory = __va(max_low_pfn * PAGE_SIZE); 638 high_memory = __va(max_low_pfn * PAGE_SIZE);
638 639
640 reset_zone_present_pages();
639 for_each_online_pgdat(pgdat) 641 for_each_online_pgdat(pgdat)
640 if (pgdat->bdata->node_bootmem_map) 642 if (pgdat->bdata->node_bootmem_map)
641 totalram_pages += free_all_bootmem_node(pgdat); 643 totalram_pages += free_all_bootmem_node(pgdat);
diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig
index 49498bbb9616..e875fc3ce9cb 100644
--- a/arch/m32r/Kconfig
+++ b/arch/m32r/Kconfig
@@ -8,6 +8,7 @@ config M32R
8 select HAVE_KERNEL_BZIP2 8 select HAVE_KERNEL_BZIP2
9 select HAVE_KERNEL_LZMA 9 select HAVE_KERNEL_LZMA
10 select ARCH_WANT_IPC_PARSE_VERSION 10 select ARCH_WANT_IPC_PARSE_VERSION
11 select HAVE_DEBUG_BUGVERBOSE
11 select HAVE_GENERIC_HARDIRQS 12 select HAVE_GENERIC_HARDIRQS
12 select GENERIC_IRQ_PROBE 13 select GENERIC_IRQ_PROBE
13 select GENERIC_IRQ_SHOW 14 select GENERIC_IRQ_SHOW
diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig
index b22df9410dce..dae1e7e16a37 100644
--- a/arch/m68k/Kconfig
+++ b/arch/m68k/Kconfig
@@ -3,9 +3,11 @@ config M68K
3 default y 3 default y
4 select HAVE_IDE 4 select HAVE_IDE
5 select HAVE_AOUT if MMU 5 select HAVE_AOUT if MMU
6 select HAVE_DEBUG_BUGVERBOSE
6 select HAVE_GENERIC_HARDIRQS 7 select HAVE_GENERIC_HARDIRQS
7 select GENERIC_IRQ_SHOW 8 select GENERIC_IRQ_SHOW
8 select GENERIC_ATOMIC64 9 select GENERIC_ATOMIC64
10 select HAVE_UID16
9 select ARCH_HAVE_NMI_SAFE_CMPXCHG if RMW_INSNS 11 select ARCH_HAVE_NMI_SAFE_CMPXCHG if RMW_INSNS
10 select GENERIC_CPU_DEVICES 12 select GENERIC_CPU_DEVICES
11 select GENERIC_STRNCPY_FROM_USER if MMU 13 select GENERIC_STRNCPY_FROM_USER if MMU
diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c
index aeebbb7b30f0..a563727806bf 100644
--- a/arch/m68k/mm/fault.c
+++ b/arch/m68k/mm/fault.c
@@ -170,6 +170,7 @@ good_area:
170 /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk 170 /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
171 * of starvation. */ 171 * of starvation. */
172 flags &= ~FAULT_FLAG_ALLOW_RETRY; 172 flags &= ~FAULT_FLAG_ALLOW_RETRY;
173 flags |= FAULT_FLAG_TRIED;
173 174
174 /* 175 /*
175 * No need to up_read(&mm->mmap_sem) as we would 176 * No need to up_read(&mm->mmap_sem) as we would
diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
index 6133bed2b855..53fd94ab60f0 100644
--- a/arch/microblaze/Kconfig
+++ b/arch/microblaze/Kconfig
@@ -16,6 +16,7 @@ config MICROBLAZE
16 select OF 16 select OF
17 select OF_EARLY_FLATTREE 17 select OF_EARLY_FLATTREE
18 select ARCH_WANT_IPC_PARSE_VERSION 18 select ARCH_WANT_IPC_PARSE_VERSION
19 select HAVE_DEBUG_KMEMLEAK
19 select IRQ_DOMAIN 20 select IRQ_DOMAIN
20 select HAVE_GENERIC_HARDIRQS 21 select HAVE_GENERIC_HARDIRQS
21 select GENERIC_IRQ_PROBE 22 select GENERIC_IRQ_PROBE
diff --git a/arch/microblaze/include/asm/atomic.h b/arch/microblaze/include/asm/atomic.h
index 472d8bf726df..42ac382a09da 100644
--- a/arch/microblaze/include/asm/atomic.h
+++ b/arch/microblaze/include/asm/atomic.h
@@ -22,5 +22,6 @@ static inline int atomic_dec_if_positive(atomic_t *v)
22 22
23 return res; 23 return res;
24} 24}
25#define atomic_dec_if_positive atomic_dec_if_positive
25 26
26#endif /* _ASM_MICROBLAZE_ATOMIC_H */ 27#endif /* _ASM_MICROBLAZE_ATOMIC_H */
diff --git a/arch/microblaze/mm/fault.c b/arch/microblaze/mm/fault.c
index eb365d6795fa..714b35a9c4f7 100644
--- a/arch/microblaze/mm/fault.c
+++ b/arch/microblaze/mm/fault.c
@@ -233,6 +233,7 @@ good_area:
233 current->min_flt++; 233 current->min_flt++;
234 if (fault & VM_FAULT_RETRY) { 234 if (fault & VM_FAULT_RETRY) {
235 flags &= ~FAULT_FLAG_ALLOW_RETRY; 235 flags &= ~FAULT_FLAG_ALLOW_RETRY;
236 flags |= FAULT_FLAG_TRIED;
236 237
237 /* 238 /*
238 * No need to up_read(&mm->mmap_sem) as we would 239 * No need to up_read(&mm->mmap_sem) as we would
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 4cd538b42a3f..35453eaeffb5 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -17,6 +17,7 @@ config MIPS
17 select HAVE_FUNCTION_GRAPH_TRACER 17 select HAVE_FUNCTION_GRAPH_TRACER
18 select HAVE_KPROBES 18 select HAVE_KPROBES
19 select HAVE_KRETPROBES 19 select HAVE_KRETPROBES
20 select HAVE_DEBUG_KMEMLEAK
20 select ARCH_BINFMT_ELF_RANDOMIZE_PIE 21 select ARCH_BINFMT_ELF_RANDOMIZE_PIE
21 select RTC_LIB if !MACH_LOONGSON 22 select RTC_LIB if !MACH_LOONGSON
22 select GENERIC_ATOMIC64 if !64BIT 23 select GENERIC_ATOMIC64 if !64BIT
diff --git a/arch/mips/include/asm/hugetlb.h b/arch/mips/include/asm/hugetlb.h
index 58d36889f09b..bd94946a18f3 100644
--- a/arch/mips/include/asm/hugetlb.h
+++ b/arch/mips/include/asm/hugetlb.h
@@ -112,4 +112,8 @@ static inline void arch_release_hugepage(struct page *page)
112{ 112{
113} 113}
114 114
115static inline void arch_clear_hugepage_flags(struct page *page)
116{
117}
118
115#endif /* __ASM_HUGETLB_H */ 119#endif /* __ASM_HUGETLB_H */
diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c
index 7a19957735e9..ddcec1e1a0cd 100644
--- a/arch/mips/mm/fault.c
+++ b/arch/mips/mm/fault.c
@@ -171,6 +171,7 @@ good_area:
171 } 171 }
172 if (fault & VM_FAULT_RETRY) { 172 if (fault & VM_FAULT_RETRY) {
173 flags &= ~FAULT_FLAG_ALLOW_RETRY; 173 flags &= ~FAULT_FLAG_ALLOW_RETRY;
174 flags |= FAULT_FLAG_TRIED;
174 175
175 /* 176 /*
176 * No need to up_read(&mm->mmap_sem) as we would 177 * No need to up_read(&mm->mmap_sem) as we would
diff --git a/arch/openrisc/mm/fault.c b/arch/openrisc/mm/fault.c
index 40f850e9766c..e2bfafce66c5 100644
--- a/arch/openrisc/mm/fault.c
+++ b/arch/openrisc/mm/fault.c
@@ -183,6 +183,7 @@ good_area:
183 tsk->min_flt++; 183 tsk->min_flt++;
184 if (fault & VM_FAULT_RETRY) { 184 if (fault & VM_FAULT_RETRY) {
185 flags &= ~FAULT_FLAG_ALLOW_RETRY; 185 flags &= ~FAULT_FLAG_ALLOW_RETRY;
186 flags |= FAULT_FLAG_TRIED;
186 187
187 /* No need to up_read(&mm->mmap_sem) as we would 188 /* No need to up_read(&mm->mmap_sem) as we would
188 * have already released it in __lock_page_or_retry 189 * have already released it in __lock_page_or_retry
diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c
index 9d181890a7e3..48e16dc20102 100644
--- a/arch/parisc/kernel/cache.c
+++ b/arch/parisc/kernel/cache.c
@@ -276,7 +276,6 @@ void flush_dcache_page(struct page *page)
276{ 276{
277 struct address_space *mapping = page_mapping(page); 277 struct address_space *mapping = page_mapping(page);
278 struct vm_area_struct *mpnt; 278 struct vm_area_struct *mpnt;
279 struct prio_tree_iter iter;
280 unsigned long offset; 279 unsigned long offset;
281 unsigned long addr, old_addr = 0; 280 unsigned long addr, old_addr = 0;
282 pgoff_t pgoff; 281 pgoff_t pgoff;
@@ -299,7 +298,7 @@ void flush_dcache_page(struct page *page)
299 * to flush one address here for them all to become coherent */ 298 * to flush one address here for them all to become coherent */
300 299
301 flush_dcache_mmap_lock(mapping); 300 flush_dcache_mmap_lock(mapping);
-	vma_prio_tree_foreach(mpnt, &iter, &mapping->i_mmap, pgoff, pgoff) {
+	vma_interval_tree_foreach(mpnt, &mapping->i_mmap, pgoff, pgoff) {
303 offset = (pgoff - mpnt->vm_pgoff) << PAGE_SHIFT; 302 offset = (pgoff - mpnt->vm_pgoff) << PAGE_SHIFT;
304 addr = mpnt->vm_start + offset; 303 addr = mpnt->vm_start + offset;
305 304
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 4ce0be32d153..df7edb887a04 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -99,6 +99,7 @@ config PPC
99 select HAVE_DYNAMIC_FTRACE 99 select HAVE_DYNAMIC_FTRACE
100 select HAVE_FUNCTION_TRACER 100 select HAVE_FUNCTION_TRACER
101 select HAVE_FUNCTION_GRAPH_TRACER 101 select HAVE_FUNCTION_GRAPH_TRACER
102 select SYSCTL_EXCEPTION_TRACE
102 select ARCH_WANT_OPTIONAL_GPIOLIB 103 select ARCH_WANT_OPTIONAL_GPIOLIB
103 select HAVE_IDE 104 select HAVE_IDE
104 select HAVE_IOREMAP_PROT 105 select HAVE_IOREMAP_PROT
@@ -113,6 +114,7 @@ config PPC
113 select HAVE_DMA_API_DEBUG 114 select HAVE_DMA_API_DEBUG
114 select USE_GENERIC_SMP_HELPERS if SMP 115 select USE_GENERIC_SMP_HELPERS if SMP
115 select HAVE_OPROFILE 116 select HAVE_OPROFILE
117 select HAVE_DEBUG_KMEMLEAK
116 select HAVE_SYSCALL_WRAPPERS if PPC64 118 select HAVE_SYSCALL_WRAPPERS if PPC64
117 select GENERIC_ATOMIC64 if PPC32 119 select GENERIC_ATOMIC64 if PPC32
118 select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE 120 select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
diff --git a/arch/powerpc/include/asm/atomic.h b/arch/powerpc/include/asm/atomic.h
index da29032ae38f..e3b1d41c89be 100644
--- a/arch/powerpc/include/asm/atomic.h
+++ b/arch/powerpc/include/asm/atomic.h
@@ -268,6 +268,7 @@ static __inline__ int atomic_dec_if_positive(atomic_t *v)
268 268
269 return t; 269 return t;
270} 270}
271#define atomic_dec_if_positive atomic_dec_if_positive
271 272
272#define smp_mb__before_atomic_dec() smp_mb() 273#define smp_mb__before_atomic_dec() smp_mb()
273#define smp_mb__after_atomic_dec() smp_mb() 274#define smp_mb__after_atomic_dec() smp_mb()
diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h
index dfdb95bc59a5..62e11a32c4c2 100644
--- a/arch/powerpc/include/asm/hugetlb.h
+++ b/arch/powerpc/include/asm/hugetlb.h
@@ -151,6 +151,10 @@ static inline void arch_release_hugepage(struct page *page)
151{ 151{
152} 152}
153 153
154static inline void arch_clear_hugepage_flags(struct page *page)
155{
156}
157
154#else /* ! CONFIG_HUGETLB_PAGE */ 158#else /* ! CONFIG_HUGETLB_PAGE */
155static inline void flush_hugetlb_page(struct vm_area_struct *vma, 159static inline void flush_hugetlb_page(struct vm_area_struct *vma,
156 unsigned long vmaddr) 160 unsigned long vmaddr)
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 83e929e66f9d..721d4603a235 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -1183,7 +1183,7 @@ static const struct vm_operations_struct kvm_rma_vm_ops = {
1183 1183
1184static int kvm_rma_mmap(struct file *file, struct vm_area_struct *vma) 1184static int kvm_rma_mmap(struct file *file, struct vm_area_struct *vma)
1185{ 1185{
-	vma->vm_flags |= VM_RESERVED;
+	vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
1187 vma->vm_ops = &kvm_rma_vm_ops; 1187 vma->vm_ops = &kvm_rma_vm_ops;
1188 return 0; 1188 return 0;
1189} 1189}
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 5495ebe983a2..0a6b28336eb0 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -451,6 +451,7 @@ good_area:
451 /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk 451 /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
452 * of starvation. */ 452 * of starvation. */
453 flags &= ~FAULT_FLAG_ALLOW_RETRY; 453 flags &= ~FAULT_FLAG_ALLOW_RETRY;
454 flags |= FAULT_FLAG_TRIED;
454 goto retry; 455 goto retry;
455 } 456 }
456 } 457 }
diff --git a/arch/powerpc/oprofile/cell/spu_task_sync.c b/arch/powerpc/oprofile/cell/spu_task_sync.c
index 642fca137ccb..28f1af2db1f5 100644
--- a/arch/powerpc/oprofile/cell/spu_task_sync.c
+++ b/arch/powerpc/oprofile/cell/spu_task_sync.c
@@ -304,7 +304,7 @@ static inline unsigned long fast_get_dcookie(struct path *path)
304 return cookie; 304 return cookie;
305} 305}
306 306
-/* Look up the dcookie for the task's first VM_EXECUTABLE mapping,
+/* Look up the dcookie for the task's mm->exe_file,
308 * which corresponds loosely to "application name". Also, determine 308 * which corresponds loosely to "application name". Also, determine
309 * the offset for the SPU ELF object. If computed offset is 309 * the offset for the SPU ELF object. If computed offset is
310 * non-zero, it implies an embedded SPU object; otherwise, it's a 310 * non-zero, it implies an embedded SPU object; otherwise, it's a
@@ -321,7 +321,6 @@ get_exec_dcookie_and_offset(struct spu *spu, unsigned int *offsetp,
321{ 321{
322 unsigned long app_cookie = 0; 322 unsigned long app_cookie = 0;
323 unsigned int my_offset = 0; 323 unsigned int my_offset = 0;
324 struct file *app = NULL;
325 struct vm_area_struct *vma; 324 struct vm_area_struct *vma;
326 struct mm_struct *mm = spu->mm; 325 struct mm_struct *mm = spu->mm;
327 326
@@ -330,16 +329,10 @@ get_exec_dcookie_and_offset(struct spu *spu, unsigned int *offsetp,
330 329
331 down_read(&mm->mmap_sem); 330 down_read(&mm->mmap_sem);
332 331
-	for (vma = mm->mmap; vma; vma = vma->vm_next) {
-		if (!vma->vm_file)
-			continue;
-		if (!(vma->vm_flags & VM_EXECUTABLE))
-			continue;
-		app_cookie = fast_get_dcookie(&vma->vm_file->f_path);
+	if (mm->exe_file) {
+		app_cookie = fast_get_dcookie(&mm->exe_file->f_path);
 		pr_debug("got dcookie for %s\n",
-			 vma->vm_file->f_dentry->d_name.name);
-		app = vma->vm_file;
-		break;
+			 mm->exe_file->f_dentry->d_name.name);
 	}
344 337
345 for (vma = mm->mmap; vma; vma = vma->vm_next) { 338 for (vma = mm->mmap; vma; vma = vma->vm_next) {
diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
index 11d8e0544ac0..dc0a035e63bb 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -77,7 +77,8 @@ static int pseries_remove_memblock(unsigned long base, unsigned int memblock_siz
77{ 77{
78 unsigned long start, start_pfn; 78 unsigned long start, start_pfn;
79 struct zone *zone; 79 struct zone *zone;
-	int ret;
+	int i, ret;
+	int sections_to_remove;
81 82
82 start_pfn = base >> PAGE_SHIFT; 83 start_pfn = base >> PAGE_SHIFT;
83 84
@@ -97,9 +98,13 @@ static int pseries_remove_memblock(unsigned long base, unsigned int memblock_siz
97 * to sysfs "state" file and we can't remove sysfs entries 98 * to sysfs "state" file and we can't remove sysfs entries
98 * while writing to it. So we have to defer it to here. 99 * while writing to it. So we have to defer it to here.
99 */ 100 */
-	ret = __remove_pages(zone, start_pfn, memblock_size >> PAGE_SHIFT);
-	if (ret)
-		return ret;
+	sections_to_remove = (memblock_size >> PAGE_SHIFT) / PAGES_PER_SECTION;
+	for (i = 0; i < sections_to_remove; i++) {
+		unsigned long pfn = start_pfn + i * PAGES_PER_SECTION;
+		ret = __remove_pages(zone, pfn, PAGES_PER_SECTION);
+		if (ret)
+			return ret;
+	}
103 108
104 /* 109 /*
105 * Update memory regions for memory remove 110 * Update memory regions for memory remove
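The new loop removes the memblock one memory section at a time instead of in a single __remove_pages() call. A user-space sketch of the same index arithmetic, with made-up section and block sizes:

#include <stdio.h>

#define PAGE_SHIFT		12
#define PAGES_PER_SECTION	(1UL << 12)	/* 16MB sections; illustrative */

int main(void)
{
	unsigned long memblock_size = 256UL << 20;	/* 256MB memblock */
	unsigned long start_pfn = 0x100000;
	unsigned long sections = (memblock_size >> PAGE_SHIFT) / PAGES_PER_SECTION;
	unsigned long i;

	for (i = 0; i < sections; i++) {
		unsigned long pfn = start_pfn + i * PAGES_PER_SECTION;

		/* in the kernel: __remove_pages(zone, pfn, PAGES_PER_SECTION) */
		printf("remove section %lu: pfn %#lx-%#lx\n",
		       i, pfn, pfn + PAGES_PER_SECTION - 1);
	}
	return 0;
}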
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index c8af429991d9..ceff7aef2477 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -68,6 +68,7 @@ config S390
68 select HAVE_FTRACE_MCOUNT_RECORD 68 select HAVE_FTRACE_MCOUNT_RECORD
69 select HAVE_C_RECORDMCOUNT 69 select HAVE_C_RECORDMCOUNT
70 select HAVE_SYSCALL_TRACEPOINTS 70 select HAVE_SYSCALL_TRACEPOINTS
71 select SYSCTL_EXCEPTION_TRACE
71 select HAVE_DYNAMIC_FTRACE 72 select HAVE_DYNAMIC_FTRACE
72 select HAVE_FUNCTION_GRAPH_TRACER 73 select HAVE_FUNCTION_GRAPH_TRACER
73 select HAVE_REGS_AND_STACK_ACCESS_API 74 select HAVE_REGS_AND_STACK_ACCESS_API
@@ -80,6 +81,7 @@ config S390
80 select HAVE_IRQ_WORK 81 select HAVE_IRQ_WORK
81 select HAVE_PERF_EVENTS 82 select HAVE_PERF_EVENTS
82 select ARCH_HAVE_NMI_SAFE_CMPXCHG 83 select ARCH_HAVE_NMI_SAFE_CMPXCHG
84 select HAVE_DEBUG_KMEMLEAK
83 select HAVE_KERNEL_GZIP 85 select HAVE_KERNEL_GZIP
84 select HAVE_KERNEL_BZIP2 86 select HAVE_KERNEL_BZIP2
85 select HAVE_KERNEL_LZMA 87 select HAVE_KERNEL_LZMA
@@ -126,6 +128,7 @@ config S390
126 select ARCH_INLINE_WRITE_UNLOCK_BH 128 select ARCH_INLINE_WRITE_UNLOCK_BH
127 select ARCH_INLINE_WRITE_UNLOCK_IRQ 129 select ARCH_INLINE_WRITE_UNLOCK_IRQ
128 select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE 130 select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
131 select HAVE_UID16 if 32BIT
129 select ARCH_WANT_IPC_PARSE_VERSION 132 select ARCH_WANT_IPC_PARSE_VERSION
130 select GENERIC_SMP_IDLE_THREAD 133 select GENERIC_SMP_IDLE_THREAD
131 select GENERIC_TIME_VSYSCALL 134 select GENERIC_TIME_VSYSCALL
diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h
index 2d6e6e380564..593753ee07f3 100644
--- a/arch/s390/include/asm/hugetlb.h
+++ b/arch/s390/include/asm/hugetlb.h
@@ -33,6 +33,7 @@ static inline int prepare_hugepage_range(struct file *file,
33} 33}
34 34
35#define hugetlb_prefault_arch_hook(mm) do { } while (0) 35#define hugetlb_prefault_arch_hook(mm) do { } while (0)
36#define arch_clear_hugepage_flags(page) do { } while (0)
36 37
37int arch_prepare_hugepage(struct page *page); 38int arch_prepare_hugepage(struct page *page);
38void arch_release_hugepage(struct page *page); 39void arch_release_hugepage(struct page *page);
@@ -77,23 +78,6 @@ static inline void __pmd_csp(pmd_t *pmdp)
77 " csp %1,%3" 78 " csp %1,%3"
78 : "=m" (*pmdp) 79 : "=m" (*pmdp)
79 : "d" (reg2), "d" (reg3), "d" (reg4), "m" (*pmdp) : "cc"); 80 : "d" (reg2), "d" (reg3), "d" (reg4), "m" (*pmdp) : "cc");
80 pmd_val(*pmdp) = _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY;
81}
82
83static inline void __pmd_idte(unsigned long address, pmd_t *pmdp)
84{
85 unsigned long sto = (unsigned long) pmdp -
86 pmd_index(address) * sizeof(pmd_t);
87
88 if (!(pmd_val(*pmdp) & _SEGMENT_ENTRY_INV)) {
89 asm volatile(
90 " .insn rrf,0xb98e0000,%2,%3,0,0"
91 : "=m" (*pmdp)
92 : "m" (*pmdp), "a" (sto),
93 "a" ((address & HPAGE_MASK))
94 );
95 }
96 pmd_val(*pmdp) = _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY;
97} 81}
98 82
99static inline void huge_ptep_invalidate(struct mm_struct *mm, 83static inline void huge_ptep_invalidate(struct mm_struct *mm,
@@ -105,6 +89,7 @@ static inline void huge_ptep_invalidate(struct mm_struct *mm,
105 __pmd_idte(address, pmdp); 89 __pmd_idte(address, pmdp);
106 else 90 else
107 __pmd_csp(pmdp); 91 __pmd_csp(pmdp);
92 pmd_val(*pmdp) = _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY;
108} 93}
109 94
110static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, 95static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 6bd7d7483017..979fe3dc0788 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -42,6 +42,7 @@ extern void fault_init(void);
42 * tables contain all the necessary information. 42 * tables contain all the necessary information.
43 */ 43 */
44#define update_mmu_cache(vma, address, ptep) do { } while (0) 44#define update_mmu_cache(vma, address, ptep) do { } while (0)
45#define update_mmu_cache_pmd(vma, address, ptep) do { } while (0)
45 46
46/* 47/*
47 * ZERO_PAGE is a global shared page that is always zero; used 48 * ZERO_PAGE is a global shared page that is always zero; used
@@ -347,6 +348,12 @@ extern struct page *vmemmap;
347 348
348#define _SEGMENT_ENTRY_LARGE 0x400 /* STE-format control, large page */ 349#define _SEGMENT_ENTRY_LARGE 0x400 /* STE-format control, large page */
349#define _SEGMENT_ENTRY_CO 0x100 /* change-recording override */ 350#define _SEGMENT_ENTRY_CO 0x100 /* change-recording override */
351#define _SEGMENT_ENTRY_SPLIT_BIT 0 /* THP splitting bit number */
352#define _SEGMENT_ENTRY_SPLIT (1UL << _SEGMENT_ENTRY_SPLIT_BIT)
353
354/* Set of bits not changed in pmd_modify */
355#define _SEGMENT_CHG_MASK (_SEGMENT_ENTRY_ORIGIN | _SEGMENT_ENTRY_LARGE \
356 | _SEGMENT_ENTRY_SPLIT | _SEGMENT_ENTRY_CO)
350 357
351/* Page status table bits for virtualization */ 358/* Page status table bits for virtualization */
352#define RCP_ACC_BITS 0xf000000000000000UL 359#define RCP_ACC_BITS 0xf000000000000000UL
@@ -506,6 +513,30 @@ static inline int pmd_bad(pmd_t pmd)
506 return (pmd_val(pmd) & mask) != _SEGMENT_ENTRY; 513 return (pmd_val(pmd) & mask) != _SEGMENT_ENTRY;
507} 514}
508 515
516#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
517extern void pmdp_splitting_flush(struct vm_area_struct *vma,
518 unsigned long addr, pmd_t *pmdp);
519
520#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
521extern int pmdp_set_access_flags(struct vm_area_struct *vma,
522 unsigned long address, pmd_t *pmdp,
523 pmd_t entry, int dirty);
524
525#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
526extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
527 unsigned long address, pmd_t *pmdp);
528
529#define __HAVE_ARCH_PMD_WRITE
530static inline int pmd_write(pmd_t pmd)
531{
532 return (pmd_val(pmd) & _SEGMENT_ENTRY_RO) == 0;
533}
534
535static inline int pmd_young(pmd_t pmd)
536{
537 return 0;
538}
539
509static inline int pte_none(pte_t pte) 540static inline int pte_none(pte_t pte)
510{ 541{
511 return (pte_val(pte) & _PAGE_INVALID) && !(pte_val(pte) & _PAGE_SWT); 542 return (pte_val(pte) & _PAGE_INVALID) && !(pte_val(pte) & _PAGE_SWT);
@@ -1159,6 +1190,185 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address)
1159#define pte_offset_map(pmd, address) pte_offset_kernel(pmd, address) 1190#define pte_offset_map(pmd, address) pte_offset_kernel(pmd, address)
1160#define pte_unmap(pte) do { } while (0) 1191#define pte_unmap(pte) do { } while (0)
1161 1192
1193static inline void __pmd_idte(unsigned long address, pmd_t *pmdp)
1194{
1195 unsigned long sto = (unsigned long) pmdp -
1196 pmd_index(address) * sizeof(pmd_t);
1197
1198 if (!(pmd_val(*pmdp) & _SEGMENT_ENTRY_INV)) {
1199 asm volatile(
1200 " .insn rrf,0xb98e0000,%2,%3,0,0"
1201 : "=m" (*pmdp)
1202 : "m" (*pmdp), "a" (sto),
1203 "a" ((address & HPAGE_MASK))
1204 : "cc"
1205 );
1206 }
1207}
1208
1209#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1210#define __HAVE_ARCH_PGTABLE_DEPOSIT
1211extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable);
1212
1213#define __HAVE_ARCH_PGTABLE_WITHDRAW
1214extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm);
1215
1216static inline int pmd_trans_splitting(pmd_t pmd)
1217{
1218 return pmd_val(pmd) & _SEGMENT_ENTRY_SPLIT;
1219}
1220
1221static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
1222 pmd_t *pmdp, pmd_t entry)
1223{
1224 *pmdp = entry;
1225}
1226
1227static inline unsigned long massage_pgprot_pmd(pgprot_t pgprot)
1228{
1229 unsigned long pgprot_pmd = 0;
1230
1231 if (pgprot_val(pgprot) & _PAGE_INVALID) {
1232 if (pgprot_val(pgprot) & _PAGE_SWT)
1233 pgprot_pmd |= _HPAGE_TYPE_NONE;
1234 pgprot_pmd |= _SEGMENT_ENTRY_INV;
1235 }
1236 if (pgprot_val(pgprot) & _PAGE_RO)
1237 pgprot_pmd |= _SEGMENT_ENTRY_RO;
1238 return pgprot_pmd;
1239}
1240
1241static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
1242{
1243 pmd_val(pmd) &= _SEGMENT_CHG_MASK;
1244 pmd_val(pmd) |= massage_pgprot_pmd(newprot);
1245 return pmd;
1246}
1247
1248static inline pmd_t pmd_mkhuge(pmd_t pmd)
1249{
1250 pmd_val(pmd) |= _SEGMENT_ENTRY_LARGE;
1251 return pmd;
1252}
1253
1254static inline pmd_t pmd_mkwrite(pmd_t pmd)
1255{
1256 pmd_val(pmd) &= ~_SEGMENT_ENTRY_RO;
1257 return pmd;
1258}
1259
1260static inline pmd_t pmd_wrprotect(pmd_t pmd)
1261{
1262 pmd_val(pmd) |= _SEGMENT_ENTRY_RO;
1263 return pmd;
1264}
1265
1266static inline pmd_t pmd_mkdirty(pmd_t pmd)
1267{
1268 /* No dirty bit in the segment table entry. */
1269 return pmd;
1270}
1271
1272static inline pmd_t pmd_mkold(pmd_t pmd)
1273{
1274 /* No referenced bit in the segment table entry. */
1275 return pmd;
1276}
1277
1278static inline pmd_t pmd_mkyoung(pmd_t pmd)
1279{
1280 /* No referenced bit in the segment table entry. */
1281 return pmd;
1282}
1283
1284#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
1285static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
1286 unsigned long address, pmd_t *pmdp)
1287{
1288 unsigned long pmd_addr = pmd_val(*pmdp) & HPAGE_MASK;
1289 long tmp, rc;
1290 int counter;
1291
1292 rc = 0;
1293 if (MACHINE_HAS_RRBM) {
1294 counter = PTRS_PER_PTE >> 6;
1295 asm volatile(
1296 "0: .insn rre,0xb9ae0000,%0,%3\n" /* rrbm */
1297 " ogr %1,%0\n"
1298 " la %3,0(%4,%3)\n"
1299 " brct %2,0b\n"
1300 : "=&d" (tmp), "+&d" (rc), "+d" (counter),
1301 "+a" (pmd_addr)
1302 : "a" (64 * 4096UL) : "cc");
1303 rc = !!rc;
1304 } else {
1305 counter = PTRS_PER_PTE;
1306 asm volatile(
1307 "0: rrbe 0,%2\n"
1308 " la %2,0(%3,%2)\n"
1309 " brc 12,1f\n"
1310 " lhi %0,1\n"
1311 "1: brct %1,0b\n"
1312 : "+d" (rc), "+d" (counter), "+a" (pmd_addr)
1313 : "a" (4096UL) : "cc");
1314 }
1315 return rc;
1316}
1317
1318#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
1319static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
1320 unsigned long address, pmd_t *pmdp)
1321{
1322 pmd_t pmd = *pmdp;
1323
1324 __pmd_idte(address, pmdp);
1325 pmd_clear(pmdp);
1326 return pmd;
1327}
1328
1329#define __HAVE_ARCH_PMDP_CLEAR_FLUSH
1330static inline pmd_t pmdp_clear_flush(struct vm_area_struct *vma,
1331 unsigned long address, pmd_t *pmdp)
1332{
1333 return pmdp_get_and_clear(vma->vm_mm, address, pmdp);
1334}
1335
1336#define __HAVE_ARCH_PMDP_INVALIDATE
1337static inline void pmdp_invalidate(struct vm_area_struct *vma,
1338 unsigned long address, pmd_t *pmdp)
1339{
1340 __pmd_idte(address, pmdp);
1341}
1342
1343static inline pmd_t mk_pmd_phys(unsigned long physpage, pgprot_t pgprot)
1344{
1345 pmd_t __pmd;
1346 pmd_val(__pmd) = physpage + massage_pgprot_pmd(pgprot);
1347 return __pmd;
1348}
1349
1350#define pfn_pmd(pfn, pgprot) mk_pmd_phys(__pa((pfn) << PAGE_SHIFT), (pgprot))
1351#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot))
1352
1353static inline int pmd_trans_huge(pmd_t pmd)
1354{
1355 return pmd_val(pmd) & _SEGMENT_ENTRY_LARGE;
1356}
1357
1358static inline int has_transparent_hugepage(void)
1359{
1360 return MACHINE_HAS_HPAGE ? 1 : 0;
1361}
1362
1363static inline unsigned long pmd_pfn(pmd_t pmd)
1364{
1365 if (pmd_trans_huge(pmd))
1366 return pmd_val(pmd) >> HPAGE_SHIFT;
1367 else
1368 return pmd_val(pmd) >> PAGE_SHIFT;
1369}
1370#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1371
1162/* 1372/*
1163 * 31 bit swap entry format: 1373 * 31 bit swap entry format:
1164 * A page-table entry has some bits we have to treat in a special way. 1374 * A page-table entry has some bits we have to treat in a special way.
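Most of the helpers added above are plain bit manipulation on the segment-table entry; pmd_modify(), for instance, keeps everything covered by _SEGMENT_CHG_MASK and recomputes only the protection-derived bits. A toy version with invented bit values (not the real s390 encodings):

#include <stdio.h>

/* Invented segment-entry bit values; the real s390 encodings differ. */
#define SEG_ORIGIN	0xfffff000UL
#define SEG_LARGE	0x400UL
#define SEG_SPLIT	0x001UL
#define SEG_CO		0x100UL
#define SEG_RO		0x200UL

#define SEG_CHG_MASK	(SEG_ORIGIN | SEG_LARGE | SEG_SPLIT | SEG_CO)

/* Same shape as the new pmd_modify(): keep everything in the change mask,
 * replace the protection-derived bits. */
static unsigned long toy_pmd_modify(unsigned long pmd, unsigned long newprot)
{
	pmd &= SEG_CHG_MASK;
	pmd |= newprot;
	return pmd;
}

int main(void)
{
	unsigned long pmd = 0x12345000UL | SEG_LARGE | SEG_RO;

	pmd = toy_pmd_modify(pmd, 0);	/* e.g. make the mapping writable */
	printf("origin kept %#lx, still large %d, RO cleared %d\n",
	       pmd & SEG_ORIGIN, !!(pmd & SEG_LARGE), !(pmd & SEG_RO));
	return 0;
}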
diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h
index 87b47ca954f1..8cfd731a18d8 100644
--- a/arch/s390/include/asm/setup.h
+++ b/arch/s390/include/asm/setup.h
@@ -81,6 +81,7 @@ extern unsigned int s390_user_mode;
81#define MACHINE_FLAG_SPP (1UL << 13) 81#define MACHINE_FLAG_SPP (1UL << 13)
82#define MACHINE_FLAG_TOPOLOGY (1UL << 14) 82#define MACHINE_FLAG_TOPOLOGY (1UL << 14)
83#define MACHINE_FLAG_TE (1UL << 15) 83#define MACHINE_FLAG_TE (1UL << 15)
84#define MACHINE_FLAG_RRBM (1UL << 16)
84 85
85#define MACHINE_IS_VM (S390_lowcore.machine_flags & MACHINE_FLAG_VM) 86#define MACHINE_IS_VM (S390_lowcore.machine_flags & MACHINE_FLAG_VM)
86#define MACHINE_IS_KVM (S390_lowcore.machine_flags & MACHINE_FLAG_KVM) 87#define MACHINE_IS_KVM (S390_lowcore.machine_flags & MACHINE_FLAG_KVM)
@@ -99,7 +100,8 @@ extern unsigned int s390_user_mode;
99#define MACHINE_HAS_PFMF (0) 100#define MACHINE_HAS_PFMF (0)
100#define MACHINE_HAS_SPP (0) 101#define MACHINE_HAS_SPP (0)
101#define MACHINE_HAS_TOPOLOGY (0) 102#define MACHINE_HAS_TOPOLOGY (0)
102#define MACHINE_HAS_TE (0) 103#define MACHINE_HAS_TE (0)
104#define MACHINE_HAS_RRBM (0)
103#else /* CONFIG_64BIT */ 105#else /* CONFIG_64BIT */
104#define MACHINE_HAS_IEEE (1) 106#define MACHINE_HAS_IEEE (1)
105#define MACHINE_HAS_CSP (1) 107#define MACHINE_HAS_CSP (1)
@@ -112,6 +114,7 @@ extern unsigned int s390_user_mode;
112#define MACHINE_HAS_SPP (S390_lowcore.machine_flags & MACHINE_FLAG_SPP) 114#define MACHINE_HAS_SPP (S390_lowcore.machine_flags & MACHINE_FLAG_SPP)
113#define MACHINE_HAS_TOPOLOGY (S390_lowcore.machine_flags & MACHINE_FLAG_TOPOLOGY) 115#define MACHINE_HAS_TOPOLOGY (S390_lowcore.machine_flags & MACHINE_FLAG_TOPOLOGY)
114#define MACHINE_HAS_TE (S390_lowcore.machine_flags & MACHINE_FLAG_TE) 116#define MACHINE_HAS_TE (S390_lowcore.machine_flags & MACHINE_FLAG_TE)
117#define MACHINE_HAS_RRBM (S390_lowcore.machine_flags & MACHINE_FLAG_RRBM)
115#endif /* CONFIG_64BIT */ 118#endif /* CONFIG_64BIT */
116 119
117#define ZFCPDUMP_HSA_SIZE (32UL<<20) 120#define ZFCPDUMP_HSA_SIZE (32UL<<20)
diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h
index 06e5acbc84bd..b75d7d686684 100644
--- a/arch/s390/include/asm/tlb.h
+++ b/arch/s390/include/asm/tlb.h
@@ -137,6 +137,7 @@ static inline void pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
137#define tlb_start_vma(tlb, vma) do { } while (0) 137#define tlb_start_vma(tlb, vma) do { } while (0)
138#define tlb_end_vma(tlb, vma) do { } while (0) 138#define tlb_end_vma(tlb, vma) do { } while (0)
139#define tlb_remove_tlb_entry(tlb, ptep, addr) do { } while (0) 139#define tlb_remove_tlb_entry(tlb, ptep, addr) do { } while (0)
140#define tlb_remove_pmd_tlb_entry(tlb, pmdp, addr) do { } while (0)
140#define tlb_migrate_finish(mm) do { } while (0) 141#define tlb_migrate_finish(mm) do { } while (0)
141 142
142#endif /* _S390_TLB_H */ 143#endif /* _S390_TLB_H */
diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c
index 7f4717675c19..00d114445068 100644
--- a/arch/s390/kernel/early.c
+++ b/arch/s390/kernel/early.c
@@ -388,6 +388,8 @@ static __init void detect_machine_facilities(void)
388 S390_lowcore.machine_flags |= MACHINE_FLAG_SPP; 388 S390_lowcore.machine_flags |= MACHINE_FLAG_SPP;
389 if (test_facility(50) && test_facility(73)) 389 if (test_facility(50) && test_facility(73))
390 S390_lowcore.machine_flags |= MACHINE_FLAG_TE; 390 S390_lowcore.machine_flags |= MACHINE_FLAG_TE;
391 if (test_facility(66))
392 S390_lowcore.machine_flags |= MACHINE_FLAG_RRBM;
391#endif 393#endif
392} 394}
393 395
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index ac9122ca1152..04ad4001a289 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -367,6 +367,7 @@ retry:
367 /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk 367 /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
368 * of starvation. */ 368 * of starvation. */
369 flags &= ~FAULT_FLAG_ALLOW_RETRY; 369 flags &= ~FAULT_FLAG_ALLOW_RETRY;
370 flags |= FAULT_FLAG_TRIED;
370 down_read(&mm->mmap_sem); 371 down_read(&mm->mmap_sem);
371 goto retry; 372 goto retry;
372 } 373 }
diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c
index eeaf8023851f..60acb93a4680 100644
--- a/arch/s390/mm/gup.c
+++ b/arch/s390/mm/gup.c
@@ -115,7 +115,16 @@ static inline int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr,
115 pmd = *pmdp; 115 pmd = *pmdp;
116 barrier(); 116 barrier();
117 next = pmd_addr_end(addr, end); 117 next = pmd_addr_end(addr, end);
-		if (pmd_none(pmd))
+		/*
+		 * The pmd_trans_splitting() check below explains why
+		 * pmdp_splitting_flush() has to serialize with
+		 * smp_call_function() against our disabled IRQs, to stop
+		 * this gup-fast code from running while we set the
+		 * splitting bit in the pmd. Returning zero will take
+		 * the slow path that will call wait_split_huge_page()
+		 * if the pmd is still in splitting state.
+		 */
+		if (pmd_none(pmd) || pmd_trans_splitting(pmd))
119 return 0; 128 return 0;
120 if (unlikely(pmd_huge(pmd))) { 129 if (unlikely(pmd_huge(pmd))) {
121 if (!gup_huge_pmd(pmdp, pmd, addr, next, 130 if (!gup_huge_pmd(pmdp, pmd, addr, next,
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index b402991e43d7..c8188a18af05 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -787,6 +787,30 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
787 tlb_table_flush(tlb); 787 tlb_table_flush(tlb);
788} 788}
789 789
790#ifdef CONFIG_TRANSPARENT_HUGEPAGE
791void thp_split_vma(struct vm_area_struct *vma)
792{
793 unsigned long addr;
794 struct page *page;
795
796 for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
797 page = follow_page(vma, addr, FOLL_SPLIT);
798 }
799}
800
801void thp_split_mm(struct mm_struct *mm)
802{
803 struct vm_area_struct *vma = mm->mmap;
804
805 while (vma != NULL) {
806 thp_split_vma(vma);
807 vma->vm_flags &= ~VM_HUGEPAGE;
808 vma->vm_flags |= VM_NOHUGEPAGE;
809 vma = vma->vm_next;
810 }
811}
812#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
813
790/* 814/*
791 * switch on pgstes for its userspace process (for kvm) 815 * switch on pgstes for its userspace process (for kvm)
792 */ 816 */
@@ -824,6 +848,12 @@ int s390_enable_sie(void)
824 if (!mm) 848 if (!mm)
825 return -ENOMEM; 849 return -ENOMEM;
826 850
851#ifdef CONFIG_TRANSPARENT_HUGEPAGE
852 /* split thp mappings and disable thp for future mappings */
853 thp_split_mm(mm);
854 mm->def_flags |= VM_NOHUGEPAGE;
855#endif
856
827 /* Now lets check again if something happened */ 857 /* Now lets check again if something happened */
828 task_lock(tsk); 858 task_lock(tsk);
829 if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 || 859 if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
@@ -866,3 +896,81 @@ bool kernel_page_present(struct page *page)
866 return cc == 0; 896 return cc == 0;
867} 897}
868#endif /* CONFIG_HIBERNATION && CONFIG_DEBUG_PAGEALLOC */ 898#endif /* CONFIG_HIBERNATION && CONFIG_DEBUG_PAGEALLOC */
899
900#ifdef CONFIG_TRANSPARENT_HUGEPAGE
901int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address,
902 pmd_t *pmdp)
903{
904 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
905 /* No need to flush TLB
906 * On s390 reference bits are in storage key and never in TLB */
907 return pmdp_test_and_clear_young(vma, address, pmdp);
908}
909
910int pmdp_set_access_flags(struct vm_area_struct *vma,
911 unsigned long address, pmd_t *pmdp,
912 pmd_t entry, int dirty)
913{
914 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
915
916 if (pmd_same(*pmdp, entry))
917 return 0;
918 pmdp_invalidate(vma, address, pmdp);
919 set_pmd_at(vma->vm_mm, address, pmdp, entry);
920 return 1;
921}
922
923static void pmdp_splitting_flush_sync(void *arg)
924{
925 /* Simply deliver the interrupt */
926}
927
928void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
929 pmd_t *pmdp)
930{
931 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
932 if (!test_and_set_bit(_SEGMENT_ENTRY_SPLIT_BIT,
933 (unsigned long *) pmdp)) {
934 /* need to serialize against gup-fast (IRQ disabled) */
935 smp_call_function(pmdp_splitting_flush_sync, NULL, 1);
936 }
937}
938
939void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable)
940{
941 struct list_head *lh = (struct list_head *) pgtable;
942
943 assert_spin_locked(&mm->page_table_lock);
944
945 /* FIFO */
946 if (!mm->pmd_huge_pte)
947 INIT_LIST_HEAD(lh);
948 else
949 list_add(lh, (struct list_head *) mm->pmd_huge_pte);
950 mm->pmd_huge_pte = pgtable;
951}
952
953pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm)
954{
955 struct list_head *lh;
956 pgtable_t pgtable;
957 pte_t *ptep;
958
959 assert_spin_locked(&mm->page_table_lock);
960
961 /* FIFO */
962 pgtable = mm->pmd_huge_pte;
963 lh = (struct list_head *) pgtable;
964 if (list_empty(lh))
965 mm->pmd_huge_pte = NULL;
966 else {
967 mm->pmd_huge_pte = (pgtable_t) lh->next;
968 list_del(lh);
969 }
970 ptep = (pte_t *) pgtable;
971 pte_val(*ptep) = _PAGE_TYPE_EMPTY;
972 ptep++;
973 pte_val(*ptep) = _PAGE_TYPE_EMPTY;
974 return pgtable;
975}
976#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
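pgtable_trans_huge_deposit() and pgtable_trans_huge_withdraw() above park the pre-split page tables on a list threaded through the tables themselves, with mm->pmd_huge_pte pointing at the most recent deposit. A self-contained user-space replica of that list bookkeeping, with the few needed <linux/list.h> helpers inlined:

#include <stdio.h>

/* Minimal doubly linked circular list, mirroring <linux/list.h>. */
struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }
static void list_add(struct list_head *new, struct list_head *head)
{
	new->next = head->next; new->prev = head;
	head->next->prev = new; head->next = new;
}
static void list_del(struct list_head *e)
{
	e->prev->next = e->next; e->next->prev = e->prev;
}
static int list_empty(const struct list_head *h) { return h->next == h; }

/* A "deposited page table": the list_head lives inside the object itself,
 * just as the kernel threads it through the pte page. */
struct pgtable { struct list_head lru; int id; };

static struct pgtable *pmd_huge_pte;	/* the per-mm slot in the kernel */

static void deposit(struct pgtable *pg)
{
	if (!pmd_huge_pte)
		INIT_LIST_HEAD(&pg->lru);
	else
		list_add(&pg->lru, &pmd_huge_pte->lru);
	pmd_huge_pte = pg;
}

static struct pgtable *withdraw(void)
{
	struct pgtable *pg = pmd_huge_pte;

	if (!pg)
		return NULL;
	if (list_empty(&pg->lru))
		pmd_huge_pte = NULL;
	else {
		/* lru is the first member, so the pointers coincide */
		pmd_huge_pte = (struct pgtable *)pg->lru.next;
		list_del(&pg->lru);
	}
	return pg;
}

int main(void)
{
	struct pgtable a = { .id = 1 }, b = { .id = 2 }, c = { .id = 3 };
	struct pgtable *pg;

	deposit(&a); deposit(&b); deposit(&c);
	while ((pg = withdraw()))
		printf("withdrew table %d\n", pg->id);
	return 0;
}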
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 36f5141e8041..3b3e27a3ff2c 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -13,14 +13,17 @@ config SUPERH
13 select HAVE_DMA_ATTRS 13 select HAVE_DMA_ATTRS
14 select HAVE_IRQ_WORK 14 select HAVE_IRQ_WORK
15 select HAVE_PERF_EVENTS 15 select HAVE_PERF_EVENTS
16 select HAVE_DEBUG_BUGVERBOSE
16 select ARCH_HAVE_CUSTOM_GPIO_H 17 select ARCH_HAVE_CUSTOM_GPIO_H
17 select ARCH_HAVE_NMI_SAFE_CMPXCHG if (GUSA_RB || CPU_SH4A) 18 select ARCH_HAVE_NMI_SAFE_CMPXCHG if (GUSA_RB || CPU_SH4A)
18 select PERF_USE_VMALLOC 19 select PERF_USE_VMALLOC
20 select HAVE_DEBUG_KMEMLEAK
19 select HAVE_KERNEL_GZIP 21 select HAVE_KERNEL_GZIP
20 select HAVE_KERNEL_BZIP2 22 select HAVE_KERNEL_BZIP2
21 select HAVE_KERNEL_LZMA 23 select HAVE_KERNEL_LZMA
22 select HAVE_KERNEL_XZ 24 select HAVE_KERNEL_XZ
23 select HAVE_KERNEL_LZO 25 select HAVE_KERNEL_LZO
26 select HAVE_UID16
24 select ARCH_WANT_IPC_PARSE_VERSION 27 select ARCH_WANT_IPC_PARSE_VERSION
25 select HAVE_SYSCALL_TRACEPOINTS 28 select HAVE_SYSCALL_TRACEPOINTS
26 select HAVE_REGS_AND_STACK_ACCESS_API 29 select HAVE_REGS_AND_STACK_ACCESS_API
diff --git a/arch/sh/include/asm/hugetlb.h b/arch/sh/include/asm/hugetlb.h
index 967068fb79ac..b3808c7d67b2 100644
--- a/arch/sh/include/asm/hugetlb.h
+++ b/arch/sh/include/asm/hugetlb.h
@@ -1,6 +1,7 @@
1#ifndef _ASM_SH_HUGETLB_H 1#ifndef _ASM_SH_HUGETLB_H
2#define _ASM_SH_HUGETLB_H 2#define _ASM_SH_HUGETLB_H
3 3
4#include <asm/cacheflush.h>
4#include <asm/page.h> 5#include <asm/page.h>
5 6
6 7
@@ -89,4 +90,9 @@ static inline void arch_release_hugepage(struct page *page)
89{ 90{
90} 91}
91 92
93static inline void arch_clear_hugepage_flags(struct page *page)
94{
95 clear_bit(PG_dcache_clean, &page->flags);
96}
97
92#endif /* _ASM_SH_HUGETLB_H */ 98#endif /* _ASM_SH_HUGETLB_H */
diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c
index 3bdc1ad9a341..cbbdcad8fcb3 100644
--- a/arch/sh/mm/fault.c
+++ b/arch/sh/mm/fault.c
@@ -504,6 +504,7 @@ good_area:
504 } 504 }
505 if (fault & VM_FAULT_RETRY) { 505 if (fault & VM_FAULT_RETRY) {
506 flags &= ~FAULT_FLAG_ALLOW_RETRY; 506 flags &= ~FAULT_FLAG_ALLOW_RETRY;
507 flags |= FAULT_FLAG_TRIED;
507 508
508 /* 509 /*
509 * No need to up_read(&mm->mmap_sem) as we would 510 * No need to up_read(&mm->mmap_sem) as we would
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 67f1f6f5f4e1..91c780c973ba 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -18,6 +18,7 @@ config SPARC
18 select HAVE_OPROFILE 18 select HAVE_OPROFILE
19 select HAVE_ARCH_KGDB if !SMP || SPARC64 19 select HAVE_ARCH_KGDB if !SMP || SPARC64
20 select HAVE_ARCH_TRACEHOOK 20 select HAVE_ARCH_TRACEHOOK
21 select SYSCTL_EXCEPTION_TRACE
21 select ARCH_WANT_OPTIONAL_GPIOLIB 22 select ARCH_WANT_OPTIONAL_GPIOLIB
22 select RTC_CLASS 23 select RTC_CLASS
23 select RTC_DRV_M48T59 24 select RTC_DRV_M48T59
@@ -32,6 +33,7 @@ config SPARC
32 select GENERIC_PCI_IOMAP 33 select GENERIC_PCI_IOMAP
33 select HAVE_NMI_WATCHDOG if SPARC64 34 select HAVE_NMI_WATCHDOG if SPARC64
34 select HAVE_BPF_JIT 35 select HAVE_BPF_JIT
36 select HAVE_DEBUG_BUGVERBOSE
35 select GENERIC_SMP_IDLE_THREAD 37 select GENERIC_SMP_IDLE_THREAD
36 select GENERIC_CMOS_UPDATE 38 select GENERIC_CMOS_UPDATE
37 select GENERIC_CLOCKEVENTS 39 select GENERIC_CLOCKEVENTS
@@ -42,6 +44,7 @@ config SPARC32
42 def_bool !64BIT 44 def_bool !64BIT
43 select GENERIC_ATOMIC64 45 select GENERIC_ATOMIC64
44 select CLZ_TAB 46 select CLZ_TAB
47 select HAVE_UID16
45 48
46config SPARC64 49config SPARC64
47 def_bool 64BIT 50 def_bool 64BIT
@@ -59,6 +62,7 @@ config SPARC64
59 select HAVE_DYNAMIC_FTRACE 62 select HAVE_DYNAMIC_FTRACE
60 select HAVE_FTRACE_MCOUNT_RECORD 63 select HAVE_FTRACE_MCOUNT_RECORD
61 select HAVE_SYSCALL_TRACEPOINTS 64 select HAVE_SYSCALL_TRACEPOINTS
65 select HAVE_DEBUG_KMEMLEAK
62 select RTC_DRV_CMOS 66 select RTC_DRV_CMOS
63 select RTC_DRV_BQ4802 67 select RTC_DRV_BQ4802
64 select RTC_DRV_SUN4V 68 select RTC_DRV_SUN4V
@@ -226,25 +230,6 @@ config EARLYFB
226 help 230 help
227 Say Y here to enable a faster early framebuffer boot console. 231 Say Y here to enable a faster early framebuffer boot console.
228 232
229choice
230 prompt "Kernel page size" if SPARC64
231 default SPARC64_PAGE_SIZE_8KB
232
233config SPARC64_PAGE_SIZE_8KB
234 bool "8KB"
235 help
236 This lets you select the page size of the kernel.
237
238 8KB and 64KB work quite well, since SPARC ELF sections
239 provide for up to 64KB alignment.
240
241 If you don't know what to do, choose 8KB.
242
243config SPARC64_PAGE_SIZE_64KB
244 bool "64KB"
245
246endchoice
247
248config SECCOMP 233config SECCOMP
249 bool "Enable seccomp to safely compute untrusted bytecode" 234 bool "Enable seccomp to safely compute untrusted bytecode"
250 depends on SPARC64 && PROC_FS 235 depends on SPARC64 && PROC_FS
@@ -316,23 +301,6 @@ config GENERIC_LOCKBREAK
316 default y 301 default y
317 depends on SPARC64 && SMP && PREEMPT 302 depends on SPARC64 && SMP && PREEMPT
318 303
319choice
320 prompt "SPARC64 Huge TLB Page Size"
321 depends on SPARC64 && HUGETLB_PAGE
322 default HUGETLB_PAGE_SIZE_4MB
323
324config HUGETLB_PAGE_SIZE_4MB
325 bool "4MB"
326
327config HUGETLB_PAGE_SIZE_512K
328 bool "512K"
329
330config HUGETLB_PAGE_SIZE_64K
331 depends on !SPARC64_PAGE_SIZE_64KB
332 bool "64K"
333
334endchoice
335
336config NUMA 304config NUMA
337 bool "NUMA support" 305 bool "NUMA support"
338 depends on SPARC64 && SMP 306 depends on SPARC64 && SMP
@@ -571,6 +539,7 @@ config COMPAT
571 depends on SPARC64 539 depends on SPARC64
572 default y 540 default y
573 select COMPAT_BINFMT_ELF 541 select COMPAT_BINFMT_ELF
542 select HAVE_UID16
574 select ARCH_WANT_OLD_COMPAT_IPC 543 select ARCH_WANT_OLD_COMPAT_IPC
575 544
576config SYSVIPC_COMPAT 545config SYSVIPC_COMPAT
diff --git a/arch/sparc/include/asm/hugetlb.h b/arch/sparc/include/asm/hugetlb.h
index 177061064ee6..8c5eed6d267f 100644
--- a/arch/sparc/include/asm/hugetlb.h
+++ b/arch/sparc/include/asm/hugetlb.h
@@ -10,7 +10,10 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
10pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, 10pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
11 pte_t *ptep); 11 pte_t *ptep);
12 12
-void hugetlb_prefault_arch_hook(struct mm_struct *mm);
+static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm)
+{
+	hugetlb_setup(mm);
+}
14 17
15static inline int is_hugepage_only_range(struct mm_struct *mm, 18static inline int is_hugepage_only_range(struct mm_struct *mm,
16 unsigned long addr, 19 unsigned long addr,
@@ -82,4 +85,8 @@ static inline void arch_release_hugepage(struct page *page)
82{ 85{
83} 86}
84 87
88static inline void arch_clear_hugepage_flags(struct page *page)
89{
90}
91
85#endif /* _ASM_SPARC64_HUGETLB_H */ 92#endif /* _ASM_SPARC64_HUGETLB_H */
diff --git a/arch/sparc/include/asm/mmu_64.h b/arch/sparc/include/asm/mmu_64.h
index 9067dc500535..76092c4dd277 100644
--- a/arch/sparc/include/asm/mmu_64.h
+++ b/arch/sparc/include/asm/mmu_64.h
@@ -30,22 +30,8 @@
30#define CTX_PGSZ_MASK ((CTX_PGSZ_BITS << CTX_PGSZ0_SHIFT) | \ 30#define CTX_PGSZ_MASK ((CTX_PGSZ_BITS << CTX_PGSZ0_SHIFT) | \
31 (CTX_PGSZ_BITS << CTX_PGSZ1_SHIFT)) 31 (CTX_PGSZ_BITS << CTX_PGSZ1_SHIFT))
32 32
33#if defined(CONFIG_SPARC64_PAGE_SIZE_8KB)
 #define CTX_PGSZ_BASE	CTX_PGSZ_8KB
-#elif defined(CONFIG_SPARC64_PAGE_SIZE_64KB)
+#define CTX_PGSZ_HUGE	CTX_PGSZ_4MB
36#define CTX_PGSZ_BASE CTX_PGSZ_64KB
37#else
38#error No page size specified in kernel configuration
39#endif
40
41#if defined(CONFIG_HUGETLB_PAGE_SIZE_4MB)
42#define CTX_PGSZ_HUGE CTX_PGSZ_4MB
43#elif defined(CONFIG_HUGETLB_PAGE_SIZE_512K)
44#define CTX_PGSZ_HUGE CTX_PGSZ_512KB
45#elif defined(CONFIG_HUGETLB_PAGE_SIZE_64K)
46#define CTX_PGSZ_HUGE CTX_PGSZ_64KB
47#endif
48
49#define CTX_PGSZ_KERN CTX_PGSZ_4MB 35#define CTX_PGSZ_KERN CTX_PGSZ_4MB
50 36
51/* Thus, when running on UltraSPARC-III+ and later, we use the following 37/* Thus, when running on UltraSPARC-III+ and later, we use the following
@@ -96,7 +82,7 @@ struct tsb_config {
96 82
97#define MM_TSB_BASE 0 83#define MM_TSB_BASE 0
98 84
-#ifdef CONFIG_HUGETLB_PAGE
+#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
100#define MM_TSB_HUGE 1 86#define MM_TSB_HUGE 1
101#define MM_NUM_TSBS 2 87#define MM_NUM_TSBS 2
102#else 88#else
@@ -107,6 +93,7 @@ typedef struct {
107 spinlock_t lock; 93 spinlock_t lock;
108 unsigned long sparc64_ctx_val; 94 unsigned long sparc64_ctx_val;
109 unsigned long huge_pte_count; 95 unsigned long huge_pte_count;
96 struct page *pgtable_page;
110 struct tsb_config tsb_block[MM_NUM_TSBS]; 97 struct tsb_config tsb_block[MM_NUM_TSBS];
111 struct hv_tsb_descr tsb_descr[MM_NUM_TSBS]; 98 struct hv_tsb_descr tsb_descr[MM_NUM_TSBS];
112} mm_context_t; 99} mm_context_t;
diff --git a/arch/sparc/include/asm/mmu_context_64.h b/arch/sparc/include/asm/mmu_context_64.h
index a97fd085cebe..9191ca62ed9c 100644
--- a/arch/sparc/include/asm/mmu_context_64.h
+++ b/arch/sparc/include/asm/mmu_context_64.h
@@ -36,7 +36,7 @@ static inline void tsb_context_switch(struct mm_struct *mm)
36{ 36{
37 __tsb_context_switch(__pa(mm->pgd), 37 __tsb_context_switch(__pa(mm->pgd),
38 &mm->context.tsb_block[0], 38 &mm->context.tsb_block[0],
-#ifdef CONFIG_HUGETLB_PAGE
+#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
40 (mm->context.tsb_block[1].tsb ? 40 (mm->context.tsb_block[1].tsb ?
41 &mm->context.tsb_block[1] : 41 &mm->context.tsb_block[1] :
42 NULL) 42 NULL)
diff --git a/arch/sparc/include/asm/page_64.h b/arch/sparc/include/asm/page_64.h
index f0d09b401036..4b39f74d6ca0 100644
--- a/arch/sparc/include/asm/page_64.h
+++ b/arch/sparc/include/asm/page_64.h
@@ -3,13 +3,7 @@
3 3
4#include <linux/const.h> 4#include <linux/const.h>
5 5
6#if defined(CONFIG_SPARC64_PAGE_SIZE_8KB)
7#define PAGE_SHIFT 13 6#define PAGE_SHIFT 13
8#elif defined(CONFIG_SPARC64_PAGE_SIZE_64KB)
9#define PAGE_SHIFT 16
10#else
11#error No page size specified in kernel configuration
12#endif
13 7
14#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) 8#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
15#define PAGE_MASK (~(PAGE_SIZE-1)) 9#define PAGE_MASK (~(PAGE_SIZE-1))
@@ -21,15 +15,9 @@
21#define DCACHE_ALIASING_POSSIBLE 15#define DCACHE_ALIASING_POSSIBLE
22#endif 16#endif
23 17
24#if defined(CONFIG_HUGETLB_PAGE_SIZE_4MB)
25#define HPAGE_SHIFT 22 18#define HPAGE_SHIFT 22
26#elif defined(CONFIG_HUGETLB_PAGE_SIZE_512K)
27#define HPAGE_SHIFT 19
28#elif defined(CONFIG_HUGETLB_PAGE_SIZE_64K)
29#define HPAGE_SHIFT 16
30#endif
31 19
32#ifdef CONFIG_HUGETLB_PAGE 20#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
33#define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT) 21#define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT)
34#define HPAGE_MASK (~(HPAGE_SIZE - 1UL)) 22#define HPAGE_MASK (~(HPAGE_SIZE - 1UL))
35#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT) 23#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
@@ -38,6 +26,11 @@
38 26
39#ifndef __ASSEMBLY__ 27#ifndef __ASSEMBLY__
40 28
29#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
30struct mm_struct;
31extern void hugetlb_setup(struct mm_struct *mm);
32#endif
33
41#define WANT_PAGE_VIRTUAL 34#define WANT_PAGE_VIRTUAL
42 35
43extern void _clear_page(void *page); 36extern void _clear_page(void *page);
@@ -98,7 +91,7 @@ typedef unsigned long pgprot_t;
98 91
99#endif /* (STRICT_MM_TYPECHECKS) */ 92#endif /* (STRICT_MM_TYPECHECKS) */
100 93
101typedef struct page *pgtable_t; 94typedef pte_t *pgtable_t;
102 95
103#define TASK_UNMAPPED_BASE (test_thread_flag(TIF_32BIT) ? \ 96#define TASK_UNMAPPED_BASE (test_thread_flag(TIF_32BIT) ? \
104 (_AC(0x0000000070000000,UL)) : \ 97 (_AC(0x0000000070000000,UL)) : \
diff --git a/arch/sparc/include/asm/pgalloc_64.h b/arch/sparc/include/asm/pgalloc_64.h
index 40b2d7a7023d..bcfe063bce23 100644
--- a/arch/sparc/include/asm/pgalloc_64.h
+++ b/arch/sparc/include/asm/pgalloc_64.h
@@ -38,51 +38,20 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
38 kmem_cache_free(pgtable_cache, pmd); 38 kmem_cache_free(pgtable_cache, pmd);
39} 39}
40 40
41static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, 41extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm,
42 unsigned long address) 42 unsigned long address);
43{ 43extern pgtable_t pte_alloc_one(struct mm_struct *mm,
44 return (pte_t *)__get_free_page(GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO); 44 unsigned long address);
45} 45extern void pte_free_kernel(struct mm_struct *mm, pte_t *pte);
46 46extern void pte_free(struct mm_struct *mm, pgtable_t ptepage);
47static inline pgtable_t pte_alloc_one(struct mm_struct *mm,
48 unsigned long address)
49{
50 struct page *page;
51 pte_t *pte;
52
53 pte = pte_alloc_one_kernel(mm, address);
54 if (!pte)
55 return NULL;
56 page = virt_to_page(pte);
57 pgtable_page_ctor(page);
58 return page;
59}
60
61static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
62{
63 free_page((unsigned long)pte);
64}
65
66static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage)
67{
68 pgtable_page_dtor(ptepage);
69 __free_page(ptepage);
70}
71 47
72#define pmd_populate_kernel(MM, PMD, PTE) pmd_set(PMD, PTE) 48#define pmd_populate_kernel(MM, PMD, PTE) pmd_set(MM, PMD, PTE)
73#define pmd_populate(MM,PMD,PTE_PAGE) \ 49#define pmd_populate(MM, PMD, PTE) pmd_set(MM, PMD, PTE)
74 pmd_populate_kernel(MM,PMD,page_address(PTE_PAGE)) 50#define pmd_pgtable(PMD) ((pte_t *)__pmd_page(PMD))
75#define pmd_pgtable(pmd) pmd_page(pmd)
76 51
77#define check_pgt_cache() do { } while (0) 52#define check_pgt_cache() do { } while (0)
78 53
79static inline void pgtable_free(void *table, bool is_page) 54extern void pgtable_free(void *table, bool is_page);
80{
81 if (is_page)
82 free_page((unsigned long)table);
83 else
84 kmem_cache_free(pgtable_cache, table);
85}
86 55
87#ifdef CONFIG_SMP 56#ifdef CONFIG_SMP
88 57
@@ -113,11 +82,10 @@ static inline void pgtable_free_tlb(struct mmu_gather *tlb, void *table, bool is
113} 82}
114#endif /* !CONFIG_SMP */ 83#endif /* !CONFIG_SMP */
115 84
116static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *ptepage, 85static inline void __pte_free_tlb(struct mmu_gather *tlb, pte_t *pte,
117 unsigned long address) 86 unsigned long address)
118{ 87{
119 pgtable_page_dtor(ptepage); 88 pgtable_free_tlb(tlb, pte, true);
120 pgtable_free_tlb(tlb, page_address(ptepage), true);
121} 89}
122 90
123#define __pmd_free_tlb(tlb, pmd, addr) \ 91#define __pmd_free_tlb(tlb, pmd, addr) \
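The PTE allocators move out of line because they now come from the shared half-page cache added to arch/sparc/mm/init_64.c later in this patch, and pgtable_t becomes a pte_t * rather than a struct page *. A hedged sketch of how a caller would use the new interface follows; the helper name is invented for illustration, and the locking plus pmd_none() recheck performed by the generic code are omitted.

	/* Illustrative only: allocate a PTE table and hook it into a PMD
	 * with the interfaces declared above.
	 */
	static int install_pte_table_example(struct mm_struct *mm, pmd_t *pmd,
					     unsigned long address)
	{
		pgtable_t new_pte = pte_alloc_one(mm, address);	/* now a pte_t *, not a page */

		if (!new_pte)
			return -ENOMEM;
		pmd_populate(mm, pmd, new_pte);	/* expands to pmd_set(mm, pmd, new_pte) */
		return 0;
	}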
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index 61210db139fb..95515f1e7cef 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -45,40 +45,59 @@
45 45
46#define vmemmap ((struct page *)VMEMMAP_BASE) 46#define vmemmap ((struct page *)VMEMMAP_BASE)
47 47
48/* XXX All of this needs to be rethought so we can take advantage
49 * XXX cheetah's full 64-bit virtual address space, ie. no more hole
50 * XXX in the middle like on spitfire. -DaveM
51 */
52/*
53 * Given a virtual address, the lowest PAGE_SHIFT bits determine offset
54 * into the page; the next higher PAGE_SHIFT-3 bits determine the pte#
55 * in the proper pagetable (the -3 is from the 8 byte ptes, and each page
56 * table is a single page long). The next higher PMD_BITS determine pmd#
57 * in the proper pmdtable (where we must have PMD_BITS <= (PAGE_SHIFT-2)
58 * since the pmd entries are 4 bytes, and each pmd page is a single page
59 * long). Finally, the higher few bits determine pgde#.
60 */
61
62/* PMD_SHIFT determines the size of the area a second-level page 48/* PMD_SHIFT determines the size of the area a second-level page
63 * table can map 49 * table can map
64 */ 50 */
65#define PMD_SHIFT (PAGE_SHIFT + (PAGE_SHIFT-3)) 51#define PMD_SHIFT (PAGE_SHIFT + (PAGE_SHIFT-4))
66#define PMD_SIZE (_AC(1,UL) << PMD_SHIFT) 52#define PMD_SIZE (_AC(1,UL) << PMD_SHIFT)
67#define PMD_MASK (~(PMD_SIZE-1)) 53#define PMD_MASK (~(PMD_SIZE-1))
68#define PMD_BITS (PAGE_SHIFT - 2) 54#define PMD_BITS (PAGE_SHIFT - 2)
69 55
70/* PGDIR_SHIFT determines what a third-level page table entry can map */ 56/* PGDIR_SHIFT determines what a third-level page table entry can map */
71#define PGDIR_SHIFT (PAGE_SHIFT + (PAGE_SHIFT-3) + PMD_BITS) 57#define PGDIR_SHIFT (PAGE_SHIFT + (PAGE_SHIFT-4) + PMD_BITS)
72#define PGDIR_SIZE (_AC(1,UL) << PGDIR_SHIFT) 58#define PGDIR_SIZE (_AC(1,UL) << PGDIR_SHIFT)
73#define PGDIR_MASK (~(PGDIR_SIZE-1)) 59#define PGDIR_MASK (~(PGDIR_SIZE-1))
74#define PGDIR_BITS (PAGE_SHIFT - 2) 60#define PGDIR_BITS (PAGE_SHIFT - 2)
75 61
62#if (PGDIR_SHIFT + PGDIR_BITS) != 44
63#error Page table parameters do not cover virtual address space properly.
64#endif
65
66#if (PMD_SHIFT != HPAGE_SHIFT)
67#error PMD_SHIFT must equal HPAGE_SHIFT for transparent huge pages.
68#endif
69
70/* PMDs point to PTE tables which are 4K aligned. */
71#define PMD_PADDR _AC(0xfffffffe,UL)
72#define PMD_PADDR_SHIFT _AC(11,UL)
73
74#ifdef CONFIG_TRANSPARENT_HUGEPAGE
75#define PMD_ISHUGE _AC(0x00000001,UL)
76
77/* This is the PMD layout when PMD_ISHUGE is set. With 4MB huge
78 * pages, this frees up a bunch of bits in the layout that we can
79 * use for the protection settings and software metadata.
80 */
81#define PMD_HUGE_PADDR _AC(0xfffff800,UL)
82#define PMD_HUGE_PROTBITS _AC(0x000007ff,UL)
83#define PMD_HUGE_PRESENT _AC(0x00000400,UL)
84#define PMD_HUGE_WRITE _AC(0x00000200,UL)
85#define PMD_HUGE_DIRTY _AC(0x00000100,UL)
86#define PMD_HUGE_ACCESSED _AC(0x00000080,UL)
87#define PMD_HUGE_EXEC _AC(0x00000040,UL)
88#define PMD_HUGE_SPLITTING _AC(0x00000020,UL)
89#endif
90
91/* PGDs point to PMD tables which are 8K aligned. */
92#define PGD_PADDR _AC(0xfffffffc,UL)
93#define PGD_PADDR_SHIFT _AC(11,UL)
94
76#ifndef __ASSEMBLY__ 95#ifndef __ASSEMBLY__
77 96
78#include <linux/sched.h> 97#include <linux/sched.h>
79 98
80/* Entries per page directory level. */ 99/* Entries per page directory level. */
81#define PTRS_PER_PTE (1UL << (PAGE_SHIFT-3)) 100#define PTRS_PER_PTE (1UL << (PAGE_SHIFT-4))
82#define PTRS_PER_PMD (1UL << PMD_BITS) 101#define PTRS_PER_PMD (1UL << PMD_BITS)
83#define PTRS_PER_PGD (1UL << PGDIR_BITS) 102#define PTRS_PER_PGD (1UL << PGDIR_BITS)
84 103
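The PMD is now a 32-bit value in one of two formats: a regular PMD holds the physical address of a 4K-aligned PTE table shifted right by PMD_PADDR_SHIFT, while a huge PMD keeps the 4MB page's physical address in PMD_HUGE_PADDR and uses the low bits for protection and software state. The decode below is an illustration only, mirroring the pmd_pfn() and __pmd_page() helpers defined later in this header; PMD_HUGE_PADDR exists only under CONFIG_TRANSPARENT_HUGEPAGE.

	/* Illustration, not part of the patch: decode the two PMD formats. */
	static unsigned long pmd_pte_table_paddr(pmd_t pmd)	/* regular PMD */
	{
		return (unsigned long) pmd_val(pmd) << PMD_PADDR_SHIFT;
	}

	static unsigned long pmd_huge_page_paddr(pmd_t pmd)	/* huge PMD */
	{
		/* Mask off the low 11 protection/software bits first. */
		return ((unsigned long) pmd_val(pmd) & PMD_HUGE_PADDR) << PMD_PADDR_SHIFT;
	}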
@@ -160,26 +179,11 @@
160#define _PAGE_SZ8K_4V _AC(0x0000000000000000,UL) /* 8K Page */ 179#define _PAGE_SZ8K_4V _AC(0x0000000000000000,UL) /* 8K Page */
161#define _PAGE_SZALL_4V _AC(0x0000000000000007,UL) /* All pgsz bits */ 180#define _PAGE_SZALL_4V _AC(0x0000000000000007,UL) /* All pgsz bits */
162 181
163#if PAGE_SHIFT == 13
164#define _PAGE_SZBITS_4U _PAGE_SZ8K_4U 182#define _PAGE_SZBITS_4U _PAGE_SZ8K_4U
165#define _PAGE_SZBITS_4V _PAGE_SZ8K_4V 183#define _PAGE_SZBITS_4V _PAGE_SZ8K_4V
166#elif PAGE_SHIFT == 16
167#define _PAGE_SZBITS_4U _PAGE_SZ64K_4U
168#define _PAGE_SZBITS_4V _PAGE_SZ64K_4V
169#else
170#error Wrong PAGE_SHIFT specified
171#endif
172 184
173#if defined(CONFIG_HUGETLB_PAGE_SIZE_4MB)
174#define _PAGE_SZHUGE_4U _PAGE_SZ4MB_4U 185#define _PAGE_SZHUGE_4U _PAGE_SZ4MB_4U
175#define _PAGE_SZHUGE_4V _PAGE_SZ4MB_4V 186#define _PAGE_SZHUGE_4V _PAGE_SZ4MB_4V
176#elif defined(CONFIG_HUGETLB_PAGE_SIZE_512K)
177#define _PAGE_SZHUGE_4U _PAGE_SZ512K_4U
178#define _PAGE_SZHUGE_4V _PAGE_SZ512K_4V
179#elif defined(CONFIG_HUGETLB_PAGE_SIZE_64K)
180#define _PAGE_SZHUGE_4U _PAGE_SZ64K_4U
181#define _PAGE_SZHUGE_4V _PAGE_SZ64K_4V
182#endif
183 187
184/* These are actually filled in at boot time by sun4{u,v}_pgprot_init() */ 188/* These are actually filled in at boot time by sun4{u,v}_pgprot_init() */
185#define __P000 __pgprot(0) 189#define __P000 __pgprot(0)
@@ -218,7 +222,6 @@ extern unsigned long _PAGE_CACHE;
218 222
219extern unsigned long pg_iobits; 223extern unsigned long pg_iobits;
220extern unsigned long _PAGE_ALL_SZ_BITS; 224extern unsigned long _PAGE_ALL_SZ_BITS;
221extern unsigned long _PAGE_SZBITS;
222 225
223extern struct page *mem_map_zero; 226extern struct page *mem_map_zero;
224#define ZERO_PAGE(vaddr) (mem_map_zero) 227#define ZERO_PAGE(vaddr) (mem_map_zero)
@@ -231,25 +234,25 @@ extern struct page *mem_map_zero;
231static inline pte_t pfn_pte(unsigned long pfn, pgprot_t prot) 234static inline pte_t pfn_pte(unsigned long pfn, pgprot_t prot)
232{ 235{
233 unsigned long paddr = pfn << PAGE_SHIFT; 236 unsigned long paddr = pfn << PAGE_SHIFT;
234 unsigned long sz_bits; 237
235 238 BUILD_BUG_ON(_PAGE_SZBITS_4U != 0UL || _PAGE_SZBITS_4V != 0UL);
236 sz_bits = 0UL; 239 return __pte(paddr | pgprot_val(prot));
237 if (_PAGE_SZBITS_4U != 0UL || _PAGE_SZBITS_4V != 0UL) {
238 __asm__ __volatile__(
239 "\n661: sethi %%uhi(%1), %0\n"
240 " sllx %0, 32, %0\n"
241 " .section .sun4v_2insn_patch, \"ax\"\n"
242 " .word 661b\n"
243 " mov %2, %0\n"
244 " nop\n"
245 " .previous\n"
246 : "=r" (sz_bits)
247 : "i" (_PAGE_SZBITS_4U), "i" (_PAGE_SZBITS_4V));
248 }
249 return __pte(paddr | sz_bits | pgprot_val(prot));
250} 240}
251#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) 241#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
252 242
243#ifdef CONFIG_TRANSPARENT_HUGEPAGE
244extern pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot);
245#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot))
246
247extern pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot);
248
249static inline pmd_t pmd_mkhuge(pmd_t pmd)
250{
251 /* Do nothing, mk_pmd() does this part. */
252 return pmd;
253}
254#endif
255
253/* This one can be done with two shifts. */ 256/* This one can be done with two shifts. */
254static inline unsigned long pte_pfn(pte_t pte) 257static inline unsigned long pte_pfn(pte_t pte)
255{ 258{
@@ -286,6 +289,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t prot)
286 * Note: We encode this into 3 sun4v 2-insn patch sequences. 289 * Note: We encode this into 3 sun4v 2-insn patch sequences.
287 */ 290 */
288 291
292 BUILD_BUG_ON(_PAGE_SZBITS_4U != 0UL || _PAGE_SZBITS_4V != 0UL);
289 __asm__ __volatile__( 293 __asm__ __volatile__(
290 "\n661: sethi %%uhi(%2), %1\n" 294 "\n661: sethi %%uhi(%2), %1\n"
291 " sethi %%hi(%2), %0\n" 295 " sethi %%hi(%2), %0\n"
@@ -307,10 +311,10 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t prot)
307 : "=r" (mask), "=r" (tmp) 311 : "=r" (mask), "=r" (tmp)
308 : "i" (_PAGE_PADDR_4U | _PAGE_MODIFIED_4U | _PAGE_ACCESSED_4U | 312 : "i" (_PAGE_PADDR_4U | _PAGE_MODIFIED_4U | _PAGE_ACCESSED_4U |
309 _PAGE_CP_4U | _PAGE_CV_4U | _PAGE_E_4U | _PAGE_PRESENT_4U | 313 _PAGE_CP_4U | _PAGE_CV_4U | _PAGE_E_4U | _PAGE_PRESENT_4U |
310 _PAGE_SZBITS_4U | _PAGE_SPECIAL), 314 _PAGE_SPECIAL),
311 "i" (_PAGE_PADDR_4V | _PAGE_MODIFIED_4V | _PAGE_ACCESSED_4V | 315 "i" (_PAGE_PADDR_4V | _PAGE_MODIFIED_4V | _PAGE_ACCESSED_4V |
312 _PAGE_CP_4V | _PAGE_CV_4V | _PAGE_E_4V | _PAGE_PRESENT_4V | 316 _PAGE_CP_4V | _PAGE_CV_4V | _PAGE_E_4V | _PAGE_PRESENT_4V |
313 _PAGE_SZBITS_4V | _PAGE_SPECIAL)); 317 _PAGE_SPECIAL));
314 318
315 return __pte((pte_val(pte) & mask) | (pgprot_val(prot) & ~mask)); 319 return __pte((pte_val(pte) & mask) | (pgprot_val(prot) & ~mask));
316} 320}
@@ -618,19 +622,130 @@ static inline unsigned long pte_special(pte_t pte)
618 return pte_val(pte) & _PAGE_SPECIAL; 622 return pte_val(pte) & _PAGE_SPECIAL;
619} 623}
620 624
621#define pmd_set(pmdp, ptep) \ 625#ifdef CONFIG_TRANSPARENT_HUGEPAGE
622 (pmd_val(*(pmdp)) = (__pa((unsigned long) (ptep)) >> 11UL)) 626static inline int pmd_young(pmd_t pmd)
627{
628 return pmd_val(pmd) & PMD_HUGE_ACCESSED;
629}
630
631static inline int pmd_write(pmd_t pmd)
632{
633 return pmd_val(pmd) & PMD_HUGE_WRITE;
634}
635
636static inline unsigned long pmd_pfn(pmd_t pmd)
637{
638 unsigned long val = pmd_val(pmd) & PMD_HUGE_PADDR;
639
640 return val >> (PAGE_SHIFT - PMD_PADDR_SHIFT);
641}
642
643static inline int pmd_large(pmd_t pmd)
644{
645 return (pmd_val(pmd) & (PMD_ISHUGE | PMD_HUGE_PRESENT)) ==
646 (PMD_ISHUGE | PMD_HUGE_PRESENT);
647}
648
649static inline int pmd_trans_splitting(pmd_t pmd)
650{
651 return (pmd_val(pmd) & (PMD_ISHUGE|PMD_HUGE_SPLITTING)) ==
652 (PMD_ISHUGE|PMD_HUGE_SPLITTING);
653}
654
655static inline int pmd_trans_huge(pmd_t pmd)
656{
657 return pmd_val(pmd) & PMD_ISHUGE;
658}
659
660#define has_transparent_hugepage() 1
661
662static inline pmd_t pmd_mkold(pmd_t pmd)
663{
664 pmd_val(pmd) &= ~PMD_HUGE_ACCESSED;
665 return pmd;
666}
667
668static inline pmd_t pmd_wrprotect(pmd_t pmd)
669{
670 pmd_val(pmd) &= ~PMD_HUGE_WRITE;
671 return pmd;
672}
673
674static inline pmd_t pmd_mkdirty(pmd_t pmd)
675{
676 pmd_val(pmd) |= PMD_HUGE_DIRTY;
677 return pmd;
678}
679
680static inline pmd_t pmd_mkyoung(pmd_t pmd)
681{
682 pmd_val(pmd) |= PMD_HUGE_ACCESSED;
683 return pmd;
684}
685
686static inline pmd_t pmd_mkwrite(pmd_t pmd)
687{
688 pmd_val(pmd) |= PMD_HUGE_WRITE;
689 return pmd;
690}
691
692static inline pmd_t pmd_mknotpresent(pmd_t pmd)
693{
694 pmd_val(pmd) &= ~PMD_HUGE_PRESENT;
695 return pmd;
696}
697
698static inline pmd_t pmd_mksplitting(pmd_t pmd)
699{
700 pmd_val(pmd) |= PMD_HUGE_SPLITTING;
701 return pmd;
702}
703
704extern pgprot_t pmd_pgprot(pmd_t entry);
705#endif
706
707static inline int pmd_present(pmd_t pmd)
708{
709 return pmd_val(pmd) != 0U;
710}
711
712#define pmd_none(pmd) (!pmd_val(pmd))
713
714#ifdef CONFIG_TRANSPARENT_HUGEPAGE
715extern void set_pmd_at(struct mm_struct *mm, unsigned long addr,
716 pmd_t *pmdp, pmd_t pmd);
717#else
718static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
719 pmd_t *pmdp, pmd_t pmd)
720{
721 *pmdp = pmd;
722}
723#endif
724
725static inline void pmd_set(struct mm_struct *mm, pmd_t *pmdp, pte_t *ptep)
726{
727 unsigned long val = __pa((unsigned long) (ptep)) >> PMD_PADDR_SHIFT;
728
729 pmd_val(*pmdp) = val;
730}
731
623#define pud_set(pudp, pmdp) \ 732#define pud_set(pudp, pmdp) \
624 (pud_val(*(pudp)) = (__pa((unsigned long) (pmdp)) >> 11UL)) 733 (pud_val(*(pudp)) = (__pa((unsigned long) (pmdp)) >> PGD_PADDR_SHIFT))
625#define __pmd_page(pmd) \ 734static inline unsigned long __pmd_page(pmd_t pmd)
626 ((unsigned long) __va((((unsigned long)pmd_val(pmd))<<11UL))) 735{
736 unsigned long paddr = (unsigned long) pmd_val(pmd);
737#ifdef CONFIG_TRANSPARENT_HUGEPAGE
738 if (pmd_val(pmd) & PMD_ISHUGE)
739 paddr &= PMD_HUGE_PADDR;
740#endif
741 paddr <<= PMD_PADDR_SHIFT;
742 return ((unsigned long) __va(paddr));
743}
627#define pmd_page(pmd) virt_to_page((void *)__pmd_page(pmd)) 744#define pmd_page(pmd) virt_to_page((void *)__pmd_page(pmd))
628#define pud_page_vaddr(pud) \ 745#define pud_page_vaddr(pud) \
629 ((unsigned long) __va((((unsigned long)pud_val(pud))<<11UL))) 746 ((unsigned long) __va((((unsigned long)pud_val(pud))<<PGD_PADDR_SHIFT)))
630#define pud_page(pud) virt_to_page((void *)pud_page_vaddr(pud)) 747#define pud_page(pud) virt_to_page((void *)pud_page_vaddr(pud))
631#define pmd_none(pmd) (!pmd_val(pmd))
632#define pmd_bad(pmd) (0) 748#define pmd_bad(pmd) (0)
633#define pmd_present(pmd) (pmd_val(pmd) != 0U)
634#define pmd_clear(pmdp) (pmd_val(*(pmdp)) = 0U) 749#define pmd_clear(pmdp) (pmd_val(*(pmdp)) = 0U)
635#define pud_none(pud) (!pud_val(pud)) 750#define pud_none(pud) (!pud_val(pud))
636#define pud_bad(pud) (0) 751#define pud_bad(pud) (0)
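Taken together, the helpers above give generic transparent-hugepage code what it needs to build and install a huge PMD. The sketch below shows the rough shape of that sequence; it is illustrative only (the function name is invented, locking and the pgtable deposit done by mm/huge_memory.c are omitted), and update_mmu_cache_pmd() is declared further down in this header.

	/* Sketch only: building and installing a huge PMD with the helpers above. */
	static void install_huge_page_example(struct vm_area_struct *vma,
					      unsigned long haddr, pmd_t *pmdp,
					      struct page *page)
	{
		pmd_t entry = mk_pmd(page, vma->vm_page_prot);

		entry = pmd_mkhuge(entry);	/* no-op here; mk_pmd() already sets PMD_ISHUGE */
		entry = pmd_mkdirty(pmd_mkwrite(entry));
		set_pmd_at(vma->vm_mm, haddr, pmdp, entry);
		update_mmu_cache_pmd(vma, haddr, pmdp);
	}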
@@ -664,6 +779,16 @@ static inline unsigned long pte_special(pte_t pte)
664extern void tlb_batch_add(struct mm_struct *mm, unsigned long vaddr, 779extern void tlb_batch_add(struct mm_struct *mm, unsigned long vaddr,
665 pte_t *ptep, pte_t orig, int fullmm); 780 pte_t *ptep, pte_t orig, int fullmm);
666 781
782#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
783static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
784 unsigned long addr,
785 pmd_t *pmdp)
786{
787 pmd_t pmd = *pmdp;
788 set_pmd_at(mm, addr, pmdp, __pmd(0U));
789 return pmd;
790}
791
667static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, 792static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
668 pte_t *ptep, pte_t pte, int fullmm) 793 pte_t *ptep, pte_t pte, int fullmm)
669{ 794{
@@ -719,6 +844,16 @@ extern void mmu_info(struct seq_file *);
719 844
720struct vm_area_struct; 845struct vm_area_struct;
721extern void update_mmu_cache(struct vm_area_struct *, unsigned long, pte_t *); 846extern void update_mmu_cache(struct vm_area_struct *, unsigned long, pte_t *);
847#ifdef CONFIG_TRANSPARENT_HUGEPAGE
848extern void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
849 pmd_t *pmd);
850
851#define __HAVE_ARCH_PGTABLE_DEPOSIT
852extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable);
853
854#define __HAVE_ARCH_PGTABLE_WITHDRAW
855extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm);
856#endif
722 857
723/* Encode and de-code a swap entry */ 858/* Encode and de-code a swap entry */
724#define __swp_type(entry) (((entry).val >> PAGE_SHIFT) & 0xffUL) 859#define __swp_type(entry) (((entry).val >> PAGE_SHIFT) & 0xffUL)
diff --git a/arch/sparc/include/asm/tsb.h b/arch/sparc/include/asm/tsb.h
index 1a8afd1ad04f..b4c258de4443 100644
--- a/arch/sparc/include/asm/tsb.h
+++ b/arch/sparc/include/asm/tsb.h
@@ -147,20 +147,96 @@ extern struct tsb_phys_patch_entry __tsb_phys_patch, __tsb_phys_patch_end;
147 brz,pn REG1, FAIL_LABEL; \ 147 brz,pn REG1, FAIL_LABEL; \
148 sllx VADDR, 64 - (PMD_SHIFT + PMD_BITS), REG2; \ 148 sllx VADDR, 64 - (PMD_SHIFT + PMD_BITS), REG2; \
149 srlx REG2, 64 - PAGE_SHIFT, REG2; \ 149 srlx REG2, 64 - PAGE_SHIFT, REG2; \
150 sllx REG1, 11, REG1; \ 150 sllx REG1, PGD_PADDR_SHIFT, REG1; \
151 andn REG2, 0x3, REG2; \ 151 andn REG2, 0x3, REG2; \
152 lduwa [REG1 + REG2] ASI_PHYS_USE_EC, REG1; \ 152 lduwa [REG1 + REG2] ASI_PHYS_USE_EC, REG1; \
153 brz,pn REG1, FAIL_LABEL; \ 153 brz,pn REG1, FAIL_LABEL; \
154 sllx VADDR, 64 - PMD_SHIFT, REG2; \ 154 sllx VADDR, 64 - PMD_SHIFT, REG2; \
155 srlx REG2, 64 - PAGE_SHIFT, REG2; \ 155 srlx REG2, 64 - (PAGE_SHIFT - 1), REG2; \
156 sllx REG1, 11, REG1; \ 156 sllx REG1, PMD_PADDR_SHIFT, REG1; \
157 andn REG2, 0x7, REG2; \ 157 andn REG2, 0x7, REG2; \
158 add REG1, REG2, REG1; 158 add REG1, REG2, REG1;
159 159
160 /* Do a user page table walk in MMU globals. Leaves physical PTE 160 /* This macro exists only to make the PMD translator below easier
161 * pointer in REG1. Jumps to FAIL_LABEL on early page table walk 161 * to read. It hides the ELF section switch for the sun4v code
162 * termination. Physical base of page tables is in PHYS_PGD which 162 * patching.
163 * will not be modified. 163 */
164#define OR_PTE_BIT(REG, NAME) \
165661: or REG, _PAGE_##NAME##_4U, REG; \
166 .section .sun4v_1insn_patch, "ax"; \
167 .word 661b; \
168 or REG, _PAGE_##NAME##_4V, REG; \
169 .previous;
170
171 /* Load into REG the PTE value for VALID, CACHE, and SZHUGE. */
172#define BUILD_PTE_VALID_SZHUGE_CACHE(REG) \
173661: sethi %uhi(_PAGE_VALID|_PAGE_SZHUGE_4U), REG; \
174 .section .sun4v_1insn_patch, "ax"; \
175 .word 661b; \
176 sethi %uhi(_PAGE_VALID), REG; \
177 .previous; \
178 sllx REG, 32, REG; \
179661: or REG, _PAGE_CP_4U|_PAGE_CV_4U, REG; \
180 .section .sun4v_1insn_patch, "ax"; \
181 .word 661b; \
182 or REG, _PAGE_CP_4V|_PAGE_CV_4V|_PAGE_SZHUGE_4V, REG; \
183 .previous;
184
185 /* The PMD has been loaded into REG1; interpret the value to see
186 * whether it is a HUGE PMD or a normal one. If it is not valid,
187 * jump to FAIL_LABEL. If it is a HUGE PMD that translates to a
188 * valid PTE, branch to PTE_LABEL.
189 *
190 * We translate the PMD by hand, one bit at a time,
191 * constructing the huge PTE.
192 *
193 * So we construct the PTE in REG2 as follows:
194 *
195 * 1) Extract the PMD PFN from REG1 and place it into REG2.
196 *
197 * 2) Translate the PMD protection bits in REG1 into REG2, one bit
198 * at a time, using andcc tests on REG1 and ORs into REG2.
199 *
200 * Only two bits to be concerned with here, EXEC and WRITE.
201 * Now REG1 is freed up and we can use it as a temporary.
202 *
203 * 3) Construct the VALID, CACHE, and page size PTE bits in
204 * REG1, OR with REG2 to form final PTE.
205 */
206#ifdef CONFIG_TRANSPARENT_HUGEPAGE
207#define USER_PGTABLE_CHECK_PMD_HUGE(VADDR, REG1, REG2, FAIL_LABEL, PTE_LABEL) \
208 brz,pn REG1, FAIL_LABEL; \
209 andcc REG1, PMD_ISHUGE, %g0; \
210 be,pt %xcc, 700f; \
211 and REG1, PMD_HUGE_PRESENT|PMD_HUGE_ACCESSED, REG2; \
212 cmp REG2, PMD_HUGE_PRESENT|PMD_HUGE_ACCESSED; \
213 bne,pn %xcc, FAIL_LABEL; \
214 andn REG1, PMD_HUGE_PROTBITS, REG2; \
215 sllx REG2, PMD_PADDR_SHIFT, REG2; \
216 /* REG2 now holds PFN << PAGE_SHIFT */ \
217 andcc REG1, PMD_HUGE_EXEC, %g0; \
218 bne,a,pt %xcc, 1f; \
219 OR_PTE_BIT(REG2, EXEC); \
2201: andcc REG1, PMD_HUGE_WRITE, %g0; \
221 bne,a,pt %xcc, 1f; \
222 OR_PTE_BIT(REG2, W); \
223 /* REG1 can now be clobbered, build final PTE */ \
2241: BUILD_PTE_VALID_SZHUGE_CACHE(REG1); \
225 ba,pt %xcc, PTE_LABEL; \
226 or REG1, REG2, REG1; \
227700:
228#else
229#define USER_PGTABLE_CHECK_PMD_HUGE(VADDR, REG1, REG2, FAIL_LABEL, PTE_LABEL) \
230 brz,pn REG1, FAIL_LABEL; \
231 nop;
232#endif
233
234 /* Do a user page table walk in MMU globals. Leaves the final,
235 * valid PTE value in REG1. Jumps to FAIL_LABEL on early
236 * page table walk termination or if the PTE is not valid.
237 *
238 * Physical base of page tables is in PHYS_PGD which will not
239 * be modified.
164 * 240 *
165 * VADDR will not be clobbered, but REG1 and REG2 will. 241 * VADDR will not be clobbered, but REG1 and REG2 will.
166 */ 242 */
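For readability, here is a C rendering of the huge-PMD check that USER_PGTABLE_CHECK_PMD_HUGE performs above. It is not part of the patch: it uses the sun4v PTE bit names throughout, whereas the real macro patches between the _4U and _4V encodings at boot, and the two early-exit cases (normal PMD vs. fault) are collapsed into comments.

	static unsigned long pmd_huge_to_pte(unsigned long pmd)
	{
		unsigned long pte;

		if (!pmd)
			return 0;	/* empty PMD: take the fault */
		if (!(pmd & PMD_ISHUGE))
			return 0;	/* normal PMD: fall through to the PTE table walk */
		if ((pmd & (PMD_HUGE_PRESENT | PMD_HUGE_ACCESSED)) !=
		    (PMD_HUGE_PRESENT | PMD_HUGE_ACCESSED))
			return 0;	/* take the fault so C code can fix things up */

		/* 1) Physical address of the 4MB page. */
		pte = (pmd & ~PMD_HUGE_PROTBITS) << PMD_PADDR_SHIFT;

		/* 2) Translate the two interesting protection bits. */
		if (pmd & PMD_HUGE_EXEC)
			pte |= _PAGE_EXEC_4V;
		if (pmd & PMD_HUGE_WRITE)
			pte |= _PAGE_W_4V;

		/* 3) Valid, cacheable, huge page size. */
		pte |= _PAGE_VALID | _PAGE_CP_4V | _PAGE_CV_4V | _PAGE_SZHUGE_4V;

		return pte;
	}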
@@ -172,15 +248,19 @@ extern struct tsb_phys_patch_entry __tsb_phys_patch, __tsb_phys_patch_end;
172 brz,pn REG1, FAIL_LABEL; \ 248 brz,pn REG1, FAIL_LABEL; \
173 sllx VADDR, 64 - (PMD_SHIFT + PMD_BITS), REG2; \ 249 sllx VADDR, 64 - (PMD_SHIFT + PMD_BITS), REG2; \
174 srlx REG2, 64 - PAGE_SHIFT, REG2; \ 250 srlx REG2, 64 - PAGE_SHIFT, REG2; \
175 sllx REG1, 11, REG1; \ 251 sllx REG1, PGD_PADDR_SHIFT, REG1; \
176 andn REG2, 0x3, REG2; \ 252 andn REG2, 0x3, REG2; \
177 lduwa [REG1 + REG2] ASI_PHYS_USE_EC, REG1; \ 253 lduwa [REG1 + REG2] ASI_PHYS_USE_EC, REG1; \
178 brz,pn REG1, FAIL_LABEL; \ 254 USER_PGTABLE_CHECK_PMD_HUGE(VADDR, REG1, REG2, FAIL_LABEL, 800f) \
179 sllx VADDR, 64 - PMD_SHIFT, REG2; \ 255 sllx VADDR, 64 - PMD_SHIFT, REG2; \
180 srlx REG2, 64 - PAGE_SHIFT, REG2; \ 256 srlx REG2, 64 - (PAGE_SHIFT - 1), REG2; \
181 sllx REG1, 11, REG1; \ 257 sllx REG1, PMD_PADDR_SHIFT, REG1; \
182 andn REG2, 0x7, REG2; \ 258 andn REG2, 0x7, REG2; \
183 add REG1, REG2, REG1; 259 add REG1, REG2, REG1; \
260 ldxa [REG1] ASI_PHYS_USE_EC, REG1; \
261 brgez,pn REG1, FAIL_LABEL; \
262 nop; \
263800:
184 264
185/* Lookup a OBP mapping on VADDR in the prom_trans[] table at TL>0. 265/* Lookup a OBP mapping on VADDR in the prom_trans[] table at TL>0.
186 * If no entry is found, FAIL_LABEL will be branched to. On success 266 * If no entry is found, FAIL_LABEL will be branched to. On success
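The walker itself now loads the PTE and rejects invalid entries, which is why the separate "Load and check PTE" step disappears from tsb.S below. Assuming, as on sparc64, that _PAGE_VALID is the sign bit of the 64-bit PTE, the added brgez,pn check is equivalent to:

	if ((long) pte >= 0)	/* _PAGE_VALID (bit 63) clear */
		goto fail;	/* not a valid mapping */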
diff --git a/arch/sparc/kernel/pci.c b/arch/sparc/kernel/pci.c
index acc8c838ff72..75b31bcdeadf 100644
--- a/arch/sparc/kernel/pci.c
+++ b/arch/sparc/kernel/pci.c
@@ -779,7 +779,7 @@ static int __pci_mmap_make_offset(struct pci_dev *pdev,
779static void __pci_mmap_set_flags(struct pci_dev *dev, struct vm_area_struct *vma, 779static void __pci_mmap_set_flags(struct pci_dev *dev, struct vm_area_struct *vma,
780 enum pci_mmap_state mmap_state) 780 enum pci_mmap_state mmap_state)
781{ 781{
782 vma->vm_flags |= (VM_IO | VM_RESERVED); 782 vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP;
783} 783}
784 784
785/* Set vm_page_prot of VMA, as appropriate for this architecture, for a pci 785/* Set vm_page_prot of VMA, as appropriate for this architecture, for a pci
diff --git a/arch/sparc/kernel/sun4v_tlb_miss.S b/arch/sparc/kernel/sun4v_tlb_miss.S
index e1fbf8c75787..bde867fd71e8 100644
--- a/arch/sparc/kernel/sun4v_tlb_miss.S
+++ b/arch/sparc/kernel/sun4v_tlb_miss.S
@@ -176,7 +176,7 @@ sun4v_tsb_miss_common:
176 176
177 sub %g2, TRAP_PER_CPU_FAULT_INFO, %g2 177 sub %g2, TRAP_PER_CPU_FAULT_INFO, %g2
178 178
179#ifdef CONFIG_HUGETLB_PAGE 179#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
180 mov SCRATCHPAD_UTSBREG2, %g5 180 mov SCRATCHPAD_UTSBREG2, %g5
181 ldxa [%g5] ASI_SCRATCHPAD, %g5 181 ldxa [%g5] ASI_SCRATCHPAD, %g5
182 cmp %g5, -1 182 cmp %g5, -1
diff --git a/arch/sparc/kernel/tsb.S b/arch/sparc/kernel/tsb.S
index db15d123f054..d4bdc7a62375 100644
--- a/arch/sparc/kernel/tsb.S
+++ b/arch/sparc/kernel/tsb.S
@@ -49,7 +49,7 @@ tsb_miss_page_table_walk:
49 /* Before committing to a full page table walk, 49 /* Before committing to a full page table walk,
50 * check the huge page TSB. 50 * check the huge page TSB.
51 */ 51 */
52#ifdef CONFIG_HUGETLB_PAGE 52#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
53 53
54661: ldx [%g7 + TRAP_PER_CPU_TSB_HUGE], %g5 54661: ldx [%g7 + TRAP_PER_CPU_TSB_HUGE], %g5
55 nop 55 nop
@@ -110,12 +110,9 @@ tsb_miss_page_table_walk:
110tsb_miss_page_table_walk_sun4v_fastpath: 110tsb_miss_page_table_walk_sun4v_fastpath:
111 USER_PGTABLE_WALK_TL1(%g4, %g7, %g5, %g2, tsb_do_fault) 111 USER_PGTABLE_WALK_TL1(%g4, %g7, %g5, %g2, tsb_do_fault)
112 112
113 /* Load and check PTE. */ 113 /* Valid PTE is now in %g5. */
114 ldxa [%g5] ASI_PHYS_USE_EC, %g5
115 brgez,pn %g5, tsb_do_fault
116 nop
117 114
118#ifdef CONFIG_HUGETLB_PAGE 115#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
119661: sethi %uhi(_PAGE_SZALL_4U), %g7 116661: sethi %uhi(_PAGE_SZALL_4U), %g7
120 sllx %g7, 32, %g7 117 sllx %g7, 32, %g7
121 .section .sun4v_2insn_patch, "ax" 118 .section .sun4v_2insn_patch, "ax"
diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c
index 77ac917be152..e98bfda205a2 100644
--- a/arch/sparc/mm/fault_32.c
+++ b/arch/sparc/mm/fault_32.c
@@ -265,6 +265,7 @@ good_area:
265 } 265 }
266 if (fault & VM_FAULT_RETRY) { 266 if (fault & VM_FAULT_RETRY) {
267 flags &= ~FAULT_FLAG_ALLOW_RETRY; 267 flags &= ~FAULT_FLAG_ALLOW_RETRY;
268 flags |= FAULT_FLAG_TRIED;
268 269
269 /* No need to up_read(&mm->mmap_sem) as we would 270 /* No need to up_read(&mm->mmap_sem) as we would
270 * have already released it in __lock_page_or_retry 271 * have already released it in __lock_page_or_retry
diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c
index 1fe0429b6314..2976dba1ebaf 100644
--- a/arch/sparc/mm/fault_64.c
+++ b/arch/sparc/mm/fault_64.c
@@ -452,6 +452,7 @@ good_area:
452 } 452 }
453 if (fault & VM_FAULT_RETRY) { 453 if (fault & VM_FAULT_RETRY) {
454 flags &= ~FAULT_FLAG_ALLOW_RETRY; 454 flags &= ~FAULT_FLAG_ALLOW_RETRY;
455 flags |= FAULT_FLAG_TRIED;
455 456
456 /* No need to up_read(&mm->mmap_sem) as we would 457 /* No need to up_read(&mm->mmap_sem) as we would
457 * have already released it in __lock_page_or_retry 458 * have already released it in __lock_page_or_retry
@@ -464,13 +465,13 @@ good_area:
464 up_read(&mm->mmap_sem); 465 up_read(&mm->mmap_sem);
465 466
466 mm_rss = get_mm_rss(mm); 467 mm_rss = get_mm_rss(mm);
467#ifdef CONFIG_HUGETLB_PAGE 468#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
468 mm_rss -= (mm->context.huge_pte_count * (HPAGE_SIZE / PAGE_SIZE)); 469 mm_rss -= (mm->context.huge_pte_count * (HPAGE_SIZE / PAGE_SIZE));
469#endif 470#endif
470 if (unlikely(mm_rss > 471 if (unlikely(mm_rss >
471 mm->context.tsb_block[MM_TSB_BASE].tsb_rss_limit)) 472 mm->context.tsb_block[MM_TSB_BASE].tsb_rss_limit))
472 tsb_grow(mm, MM_TSB_BASE, mm_rss); 473 tsb_grow(mm, MM_TSB_BASE, mm_rss);
473#ifdef CONFIG_HUGETLB_PAGE 474#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
474 mm_rss = mm->context.huge_pte_count; 475 mm_rss = mm->context.huge_pte_count;
475 if (unlikely(mm_rss > 476 if (unlikely(mm_rss >
476 mm->context.tsb_block[MM_TSB_HUGE].tsb_rss_limit)) 477 mm->context.tsb_block[MM_TSB_HUGE].tsb_rss_limit))
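The RSS adjustment above is easiest to see with the now-fixed page sizes: each huge mapping is accounted in the RSS as HPAGE_SIZE / PAGE_SIZE base pages, so the base TSB is sized only on the 8K-mapped portion while the huge TSB is sized on huge_pte_count directly. A worked example with illustrative numbers (the constants follow from PAGE_SHIFT = 13 and HPAGE_SHIFT = 22 earlier in this series):

	/*
	 * HPAGE_SIZE / PAGE_SIZE = 4MB / 8KB = 512
	 *
	 * For a process with get_mm_rss(mm) == 10752 pages, of which
	 * huge_pte_count == 20 are 4MB mappings:
	 *
	 *   base TSB sized for 10752 - 20 * 512 = 512 8K pages
	 *   huge TSB sized for 20 huge pages
	 */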
diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c
index 07e14535375c..f76f83d5ac63 100644
--- a/arch/sparc/mm/hugetlbpage.c
+++ b/arch/sparc/mm/hugetlbpage.c
@@ -303,53 +303,3 @@ struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
303{ 303{
304 return NULL; 304 return NULL;
305} 305}
306
307static void context_reload(void *__data)
308{
309 struct mm_struct *mm = __data;
310
311 if (mm == current->mm)
312 load_secondary_context(mm);
313}
314
315void hugetlb_prefault_arch_hook(struct mm_struct *mm)
316{
317 struct tsb_config *tp = &mm->context.tsb_block[MM_TSB_HUGE];
318
319 if (likely(tp->tsb != NULL))
320 return;
321
322 tsb_grow(mm, MM_TSB_HUGE, 0);
323 tsb_context_switch(mm);
324 smp_tsb_sync(mm);
325
326 /* On UltraSPARC-III+ and later, configure the second half of
327 * the Data-TLB for huge pages.
328 */
329 if (tlb_type == cheetah_plus) {
330 unsigned long ctx;
331
332 spin_lock(&ctx_alloc_lock);
333 ctx = mm->context.sparc64_ctx_val;
334 ctx &= ~CTX_PGSZ_MASK;
335 ctx |= CTX_PGSZ_BASE << CTX_PGSZ0_SHIFT;
336 ctx |= CTX_PGSZ_HUGE << CTX_PGSZ1_SHIFT;
337
338 if (ctx != mm->context.sparc64_ctx_val) {
339 /* When changing the page size fields, we
340 * must perform a context flush so that no
341 * stale entries match. This flush must
342 * occur with the original context register
343 * settings.
344 */
345 do_flush_tlb_mm(mm);
346
347 /* Reload the context register of all processors
348 * also executing in this address space.
349 */
350 mm->context.sparc64_ctx_val = ctx;
351 on_each_cpu(context_reload, mm, 0);
352 }
353 spin_unlock(&ctx_alloc_lock);
354 }
355}
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 7a9b788c6ced..9e28a118e6a4 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -276,7 +276,6 @@ static inline void tsb_insert(struct tsb *ent, unsigned long tag, unsigned long
276} 276}
277 277
278unsigned long _PAGE_ALL_SZ_BITS __read_mostly; 278unsigned long _PAGE_ALL_SZ_BITS __read_mostly;
279unsigned long _PAGE_SZBITS __read_mostly;
280 279
281static void flush_dcache(unsigned long pfn) 280static void flush_dcache(unsigned long pfn)
282{ 281{
@@ -307,12 +306,24 @@ static void flush_dcache(unsigned long pfn)
307 } 306 }
308} 307}
309 308
309/* mm->context.lock must be held */
310static void __update_mmu_tsb_insert(struct mm_struct *mm, unsigned long tsb_index,
311 unsigned long tsb_hash_shift, unsigned long address,
312 unsigned long tte)
313{
314 struct tsb *tsb = mm->context.tsb_block[tsb_index].tsb;
315 unsigned long tag;
316
317 tsb += ((address >> tsb_hash_shift) &
318 (mm->context.tsb_block[tsb_index].tsb_nentries - 1UL));
319 tag = (address >> 22UL);
320 tsb_insert(tsb, tag, tte);
321}
322
310void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) 323void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep)
311{ 324{
325 unsigned long tsb_index, tsb_hash_shift, flags;
312 struct mm_struct *mm; 326 struct mm_struct *mm;
313 struct tsb *tsb;
314 unsigned long tag, flags;
315 unsigned long tsb_index, tsb_hash_shift;
316 pte_t pte = *ptep; 327 pte_t pte = *ptep;
317 328
318 if (tlb_type != hypervisor) { 329 if (tlb_type != hypervisor) {
@@ -329,7 +340,7 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *
329 340
330 spin_lock_irqsave(&mm->context.lock, flags); 341 spin_lock_irqsave(&mm->context.lock, flags);
331 342
332#ifdef CONFIG_HUGETLB_PAGE 343#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
333 if (mm->context.tsb_block[MM_TSB_HUGE].tsb != NULL) { 344 if (mm->context.tsb_block[MM_TSB_HUGE].tsb != NULL) {
334 if ((tlb_type == hypervisor && 345 if ((tlb_type == hypervisor &&
335 (pte_val(pte) & _PAGE_SZALL_4V) == _PAGE_SZHUGE_4V) || 346 (pte_val(pte) & _PAGE_SZALL_4V) == _PAGE_SZHUGE_4V) ||
@@ -341,11 +352,8 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *
341 } 352 }
342#endif 353#endif
343 354
344 tsb = mm->context.tsb_block[tsb_index].tsb; 355 __update_mmu_tsb_insert(mm, tsb_index, tsb_hash_shift,
345 tsb += ((address >> tsb_hash_shift) & 356 address, pte_val(pte));
346 (mm->context.tsb_block[tsb_index].tsb_nentries - 1UL));
347 tag = (address >> 22UL);
348 tsb_insert(tsb, tag, pte_val(pte));
349 357
350 spin_unlock_irqrestore(&mm->context.lock, flags); 358 spin_unlock_irqrestore(&mm->context.lock, flags);
351} 359}
@@ -2275,8 +2283,7 @@ static void __init sun4u_pgprot_init(void)
2275 __ACCESS_BITS_4U | _PAGE_E_4U); 2283 __ACCESS_BITS_4U | _PAGE_E_4U);
2276 2284
2277#ifdef CONFIG_DEBUG_PAGEALLOC 2285#ifdef CONFIG_DEBUG_PAGEALLOC
2278 kern_linear_pte_xor[0] = (_PAGE_VALID | _PAGE_SZBITS_4U) ^ 2286 kern_linear_pte_xor[0] = _PAGE_VALID ^ 0xfffff80000000000UL;
2279 0xfffff80000000000UL;
2280#else 2287#else
2281 kern_linear_pte_xor[0] = (_PAGE_VALID | _PAGE_SZ4MB_4U) ^ 2288 kern_linear_pte_xor[0] = (_PAGE_VALID | _PAGE_SZ4MB_4U) ^
2282 0xfffff80000000000UL; 2289 0xfffff80000000000UL;
@@ -2287,7 +2294,6 @@ static void __init sun4u_pgprot_init(void)
2287 for (i = 1; i < 4; i++) 2294 for (i = 1; i < 4; i++)
2288 kern_linear_pte_xor[i] = kern_linear_pte_xor[0]; 2295 kern_linear_pte_xor[i] = kern_linear_pte_xor[0];
2289 2296
2290 _PAGE_SZBITS = _PAGE_SZBITS_4U;
2291 _PAGE_ALL_SZ_BITS = (_PAGE_SZ4MB_4U | _PAGE_SZ512K_4U | 2297 _PAGE_ALL_SZ_BITS = (_PAGE_SZ4MB_4U | _PAGE_SZ512K_4U |
2292 _PAGE_SZ64K_4U | _PAGE_SZ8K_4U | 2298 _PAGE_SZ64K_4U | _PAGE_SZ8K_4U |
2293 _PAGE_SZ32MB_4U | _PAGE_SZ256MB_4U); 2299 _PAGE_SZ32MB_4U | _PAGE_SZ256MB_4U);
@@ -2324,8 +2330,7 @@ static void __init sun4v_pgprot_init(void)
2324 _PAGE_CACHE = _PAGE_CACHE_4V; 2330 _PAGE_CACHE = _PAGE_CACHE_4V;
2325 2331
2326#ifdef CONFIG_DEBUG_PAGEALLOC 2332#ifdef CONFIG_DEBUG_PAGEALLOC
2327 kern_linear_pte_xor[0] = (_PAGE_VALID | _PAGE_SZBITS_4V) ^ 2333 kern_linear_pte_xor[0] = _PAGE_VALID ^ 0xfffff80000000000UL;
2328 0xfffff80000000000UL;
2329#else 2334#else
2330 kern_linear_pte_xor[0] = (_PAGE_VALID | _PAGE_SZ4MB_4V) ^ 2335 kern_linear_pte_xor[0] = (_PAGE_VALID | _PAGE_SZ4MB_4V) ^
2331 0xfffff80000000000UL; 2336 0xfffff80000000000UL;
@@ -2339,7 +2344,6 @@ static void __init sun4v_pgprot_init(void)
2339 pg_iobits = (_PAGE_VALID | _PAGE_PRESENT_4V | __DIRTY_BITS_4V | 2344 pg_iobits = (_PAGE_VALID | _PAGE_PRESENT_4V | __DIRTY_BITS_4V |
2340 __ACCESS_BITS_4V | _PAGE_E_4V); 2345 __ACCESS_BITS_4V | _PAGE_E_4V);
2341 2346
2342 _PAGE_SZBITS = _PAGE_SZBITS_4V;
2343 _PAGE_ALL_SZ_BITS = (_PAGE_SZ16GB_4V | _PAGE_SZ2GB_4V | 2347 _PAGE_ALL_SZ_BITS = (_PAGE_SZ16GB_4V | _PAGE_SZ2GB_4V |
2344 _PAGE_SZ256MB_4V | _PAGE_SZ32MB_4V | 2348 _PAGE_SZ256MB_4V | _PAGE_SZ32MB_4V |
2345 _PAGE_SZ4MB_4V | _PAGE_SZ512K_4V | 2349 _PAGE_SZ4MB_4V | _PAGE_SZ512K_4V |
@@ -2472,3 +2476,281 @@ void __flush_tlb_all(void)
2472 __asm__ __volatile__("wrpr %0, 0, %%pstate" 2476 __asm__ __volatile__("wrpr %0, 0, %%pstate"
2473 : : "r" (pstate)); 2477 : : "r" (pstate));
2474} 2478}
2479
2480static pte_t *get_from_cache(struct mm_struct *mm)
2481{
2482 struct page *page;
2483 pte_t *ret;
2484
2485 spin_lock(&mm->page_table_lock);
2486 page = mm->context.pgtable_page;
2487 ret = NULL;
2488 if (page) {
2489 void *p = page_address(page);
2490
2491 mm->context.pgtable_page = NULL;
2492
2493 ret = (pte_t *) (p + (PAGE_SIZE / 2));
2494 }
2495 spin_unlock(&mm->page_table_lock);
2496
2497 return ret;
2498}
2499
2500static struct page *__alloc_for_cache(struct mm_struct *mm)
2501{
2502 struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK |
2503 __GFP_REPEAT | __GFP_ZERO);
2504
2505 if (page) {
2506 spin_lock(&mm->page_table_lock);
2507 if (!mm->context.pgtable_page) {
2508 atomic_set(&page->_count, 2);
2509 mm->context.pgtable_page = page;
2510 }
2511 spin_unlock(&mm->page_table_lock);
2512 }
2513 return page;
2514}
2515
2516pte_t *pte_alloc_one_kernel(struct mm_struct *mm,
2517 unsigned long address)
2518{
2519 struct page *page;
2520 pte_t *pte;
2521
2522 pte = get_from_cache(mm);
2523 if (pte)
2524 return pte;
2525
2526 page = __alloc_for_cache(mm);
2527 if (page)
2528 pte = (pte_t *) page_address(page);
2529
2530 return pte;
2531}
2532
2533pgtable_t pte_alloc_one(struct mm_struct *mm,
2534 unsigned long address)
2535{
2536 struct page *page;
2537 pte_t *pte;
2538
2539 pte = get_from_cache(mm);
2540 if (pte)
2541 return pte;
2542
2543 page = __alloc_for_cache(mm);
2544 if (page) {
2545 pgtable_page_ctor(page);
2546 pte = (pte_t *) page_address(page);
2547 }
2548
2549 return pte;
2550}
2551
2552void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
2553{
2554 struct page *page = virt_to_page(pte);
2555 if (put_page_testzero(page))
2556 free_hot_cold_page(page, 0);
2557}
2558
2559static void __pte_free(pgtable_t pte)
2560{
2561 struct page *page = virt_to_page(pte);
2562 if (put_page_testzero(page)) {
2563 pgtable_page_dtor(page);
2564 free_hot_cold_page(page, 0);
2565 }
2566}
2567
2568void pte_free(struct mm_struct *mm, pgtable_t pte)
2569{
2570 __pte_free(pte);
2571}
2572
2573void pgtable_free(void *table, bool is_page)
2574{
2575 if (is_page)
2576 __pte_free(table);
2577 else
2578 kmem_cache_free(pgtable_cache, table);
2579}
2580
2581#ifdef CONFIG_TRANSPARENT_HUGEPAGE
2582static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot, bool for_modify)
2583{
2584 if (pgprot_val(pgprot) & _PAGE_VALID)
2585 pmd_val(pmd) |= PMD_HUGE_PRESENT;
2586 if (tlb_type == hypervisor) {
2587 if (pgprot_val(pgprot) & _PAGE_WRITE_4V)
2588 pmd_val(pmd) |= PMD_HUGE_WRITE;
2589 if (pgprot_val(pgprot) & _PAGE_EXEC_4V)
2590 pmd_val(pmd) |= PMD_HUGE_EXEC;
2591
2592 if (!for_modify) {
2593 if (pgprot_val(pgprot) & _PAGE_ACCESSED_4V)
2594 pmd_val(pmd) |= PMD_HUGE_ACCESSED;
2595 if (pgprot_val(pgprot) & _PAGE_MODIFIED_4V)
2596 pmd_val(pmd) |= PMD_HUGE_DIRTY;
2597 }
2598 } else {
2599 if (pgprot_val(pgprot) & _PAGE_WRITE_4U)
2600 pmd_val(pmd) |= PMD_HUGE_WRITE;
2601 if (pgprot_val(pgprot) & _PAGE_EXEC_4U)
2602 pmd_val(pmd) |= PMD_HUGE_EXEC;
2603
2604 if (!for_modify) {
2605 if (pgprot_val(pgprot) & _PAGE_ACCESSED_4U)
2606 pmd_val(pmd) |= PMD_HUGE_ACCESSED;
2607 if (pgprot_val(pgprot) & _PAGE_MODIFIED_4U)
2608 pmd_val(pmd) |= PMD_HUGE_DIRTY;
2609 }
2610 }
2611
2612 return pmd;
2613}
2614
2615pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
2616{
2617 pmd_t pmd;
2618
2619 pmd_val(pmd) = (page_nr << ((PAGE_SHIFT - PMD_PADDR_SHIFT)));
2620 pmd_val(pmd) |= PMD_ISHUGE;
2621 pmd = pmd_set_protbits(pmd, pgprot, false);
2622 return pmd;
2623}
2624
2625pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
2626{
2627 pmd_val(pmd) &= ~(PMD_HUGE_PRESENT |
2628 PMD_HUGE_WRITE |
2629 PMD_HUGE_EXEC);
2630 pmd = pmd_set_protbits(pmd, newprot, true);
2631 return pmd;
2632}
2633
2634pgprot_t pmd_pgprot(pmd_t entry)
2635{
2636 unsigned long pte = 0;
2637
2638 if (pmd_val(entry) & PMD_HUGE_PRESENT)
2639 pte |= _PAGE_VALID;
2640
2641 if (tlb_type == hypervisor) {
2642 if (pmd_val(entry) & PMD_HUGE_PRESENT)
2643 pte |= _PAGE_PRESENT_4V;
2644 if (pmd_val(entry) & PMD_HUGE_EXEC)
2645 pte |= _PAGE_EXEC_4V;
2646 if (pmd_val(entry) & PMD_HUGE_WRITE)
2647 pte |= _PAGE_W_4V;
2648 if (pmd_val(entry) & PMD_HUGE_ACCESSED)
2649 pte |= _PAGE_ACCESSED_4V;
2650 if (pmd_val(entry) & PMD_HUGE_DIRTY)
2651 pte |= _PAGE_MODIFIED_4V;
2652 pte |= _PAGE_CP_4V|_PAGE_CV_4V;
2653 } else {
2654 if (pmd_val(entry) & PMD_HUGE_PRESENT)
2655 pte |= _PAGE_PRESENT_4U;
2656 if (pmd_val(entry) & PMD_HUGE_EXEC)
2657 pte |= _PAGE_EXEC_4U;
2658 if (pmd_val(entry) & PMD_HUGE_WRITE)
2659 pte |= _PAGE_W_4U;
2660 if (pmd_val(entry) & PMD_HUGE_ACCESSED)
2661 pte |= _PAGE_ACCESSED_4U;
2662 if (pmd_val(entry) & PMD_HUGE_DIRTY)
2663 pte |= _PAGE_MODIFIED_4U;
2664 pte |= _PAGE_CP_4U|_PAGE_CV_4U;
2665 }
2666
2667 return __pgprot(pte);
2668}
2669
2670void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
2671 pmd_t *pmd)
2672{
2673 unsigned long pte, flags;
2674 struct mm_struct *mm;
2675 pmd_t entry = *pmd;
2676 pgprot_t prot;
2677
2678 if (!pmd_large(entry) || !pmd_young(entry))
2679 return;
2680
2681 pte = (pmd_val(entry) & ~PMD_HUGE_PROTBITS);
2682 pte <<= PMD_PADDR_SHIFT;
2683 pte |= _PAGE_VALID;
2684
2685 prot = pmd_pgprot(entry);
2686
2687 if (tlb_type == hypervisor)
2688 pgprot_val(prot) |= _PAGE_SZHUGE_4V;
2689 else
2690 pgprot_val(prot) |= _PAGE_SZHUGE_4U;
2691
2692 pte |= pgprot_val(prot);
2693
2694 mm = vma->vm_mm;
2695
2696 spin_lock_irqsave(&mm->context.lock, flags);
2697
2698 if (mm->context.tsb_block[MM_TSB_HUGE].tsb != NULL)
2699 __update_mmu_tsb_insert(mm, MM_TSB_HUGE, HPAGE_SHIFT,
2700 addr, pte);
2701
2702 spin_unlock_irqrestore(&mm->context.lock, flags);
2703}
2704#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
2705
2706#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
2707static void context_reload(void *__data)
2708{
2709 struct mm_struct *mm = __data;
2710
2711 if (mm == current->mm)
2712 load_secondary_context(mm);
2713}
2714
2715void hugetlb_setup(struct mm_struct *mm)
2716{
2717 struct tsb_config *tp = &mm->context.tsb_block[MM_TSB_HUGE];
2718
2719 if (likely(tp->tsb != NULL))
2720 return;
2721
2722 tsb_grow(mm, MM_TSB_HUGE, 0);
2723 tsb_context_switch(mm);
2724 smp_tsb_sync(mm);
2725
2726 /* On UltraSPARC-III+ and later, configure the second half of
2727 * the Data-TLB for huge pages.
2728 */
2729 if (tlb_type == cheetah_plus) {
2730 unsigned long ctx;
2731
2732 spin_lock(&ctx_alloc_lock);
2733 ctx = mm->context.sparc64_ctx_val;
2734 ctx &= ~CTX_PGSZ_MASK;
2735 ctx |= CTX_PGSZ_BASE << CTX_PGSZ0_SHIFT;
2736 ctx |= CTX_PGSZ_HUGE << CTX_PGSZ1_SHIFT;
2737
2738 if (ctx != mm->context.sparc64_ctx_val) {
2739 /* When changing the page size fields, we
2740 * must perform a context flush so that no
2741 * stale entries match. This flush must
2742 * occur with the original context register
2743 * settings.
2744 */
2745 do_flush_tlb_mm(mm);
2746
2747 /* Reload the context register of all processors
2748 * also executing in this address space.
2749 */
2750 mm->context.sparc64_ctx_val = ctx;
2751 on_each_cpu(context_reload, mm, 0);
2752 }
2753 spin_unlock(&ctx_alloc_lock);
2754 }
2755}
2756#endif
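get_from_cache() and __alloc_for_cache() above implement the split-page scheme the new PMD layout requires: PTE tables are now 4K, half a base page, so each allocated 8K page serves two tables. On the common, uncontended path the lifecycle of one page looks like this (illustrative summary, not additional code in the patch):

	/*
	 * page = __alloc_for_cache(mm);    page refcount 2, cached in
	 *                                  mm->context.pgtable_page
	 * pte0 = page_address(page);       first 4K table, handed out now
	 * pte1 = get_from_cache(mm);       second half, pte0 + PAGE_SIZE / 2
	 *
	 * Each pte_free()/pte_free_kernel() drops one reference; the page
	 * goes back to the allocator only once both halves are freed, and
	 * destroy_context() drops the reference of a still-cached half.
	 */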
diff --git a/arch/sparc/mm/tlb.c b/arch/sparc/mm/tlb.c
index b1f279cd00bf..3e8fec391fe0 100644
--- a/arch/sparc/mm/tlb.c
+++ b/arch/sparc/mm/tlb.c
@@ -43,16 +43,37 @@ void flush_tlb_pending(void)
43 put_cpu_var(tlb_batch); 43 put_cpu_var(tlb_batch);
44} 44}
45 45
46void tlb_batch_add(struct mm_struct *mm, unsigned long vaddr, 46static void tlb_batch_add_one(struct mm_struct *mm, unsigned long vaddr,
47 pte_t *ptep, pte_t orig, int fullmm) 47 bool exec)
48{ 48{
49 struct tlb_batch *tb = &get_cpu_var(tlb_batch); 49 struct tlb_batch *tb = &get_cpu_var(tlb_batch);
50 unsigned long nr; 50 unsigned long nr;
51 51
52 vaddr &= PAGE_MASK; 52 vaddr &= PAGE_MASK;
53 if (pte_exec(orig)) 53 if (exec)
54 vaddr |= 0x1UL; 54 vaddr |= 0x1UL;
55 55
56 nr = tb->tlb_nr;
57
58 if (unlikely(nr != 0 && mm != tb->mm)) {
59 flush_tlb_pending();
60 nr = 0;
61 }
62
63 if (nr == 0)
64 tb->mm = mm;
65
66 tb->vaddrs[nr] = vaddr;
67 tb->tlb_nr = ++nr;
68 if (nr >= TLB_BATCH_NR)
69 flush_tlb_pending();
70
71 put_cpu_var(tlb_batch);
72}
73
74void tlb_batch_add(struct mm_struct *mm, unsigned long vaddr,
75 pte_t *ptep, pte_t orig, int fullmm)
76{
56 if (tlb_type != hypervisor && 77 if (tlb_type != hypervisor &&
57 pte_dirty(orig)) { 78 pte_dirty(orig)) {
58 unsigned long paddr, pfn = pte_pfn(orig); 79 unsigned long paddr, pfn = pte_pfn(orig);
@@ -77,26 +98,91 @@ void tlb_batch_add(struct mm_struct *mm, unsigned long vaddr,
77 } 98 }
78 99
79no_cache_flush: 100no_cache_flush:
101 if (!fullmm)
102 tlb_batch_add_one(mm, vaddr, pte_exec(orig));
103}
104
105#ifdef CONFIG_TRANSPARENT_HUGEPAGE
106static void tlb_batch_pmd_scan(struct mm_struct *mm, unsigned long vaddr,
107 pmd_t pmd, bool exec)
108{
109 unsigned long end;
110 pte_t *pte;
111
112 pte = pte_offset_map(&pmd, vaddr);
113 end = vaddr + HPAGE_SIZE;
114 while (vaddr < end) {
115 if (pte_val(*pte) & _PAGE_VALID)
116 tlb_batch_add_one(mm, vaddr, exec);
117 pte++;
118 vaddr += PAGE_SIZE;
119 }
120 pte_unmap(pte);
121}
80 122
81 if (fullmm) { 123void set_pmd_at(struct mm_struct *mm, unsigned long addr,
82 put_cpu_var(tlb_batch); 124 pmd_t *pmdp, pmd_t pmd)
125{
126 pmd_t orig = *pmdp;
127
128 *pmdp = pmd;
129
130 if (mm == &init_mm)
83 return; 131 return;
132
133 if ((pmd_val(pmd) ^ pmd_val(orig)) & PMD_ISHUGE) {
134 if (pmd_val(pmd) & PMD_ISHUGE)
135 mm->context.huge_pte_count++;
136 else
137 mm->context.huge_pte_count--;
138 if (mm->context.huge_pte_count == 1)
139 hugetlb_setup(mm);
84 } 140 }
85 141
86 nr = tb->tlb_nr; 142 if (!pmd_none(orig)) {
143 bool exec = ((pmd_val(orig) & PMD_HUGE_EXEC) != 0);
87 144
88 if (unlikely(nr != 0 && mm != tb->mm)) { 145 addr &= HPAGE_MASK;
89 flush_tlb_pending(); 146 if (pmd_val(orig) & PMD_ISHUGE)
90 nr = 0; 147 tlb_batch_add_one(mm, addr, exec);
148 else
149 tlb_batch_pmd_scan(mm, addr, orig, exec);
91 } 150 }
151}
92 152
93 if (nr == 0) 153void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable)
94 tb->mm = mm; 154{
155 struct list_head *lh = (struct list_head *) pgtable;
95 156
96 tb->vaddrs[nr] = vaddr; 157 assert_spin_locked(&mm->page_table_lock);
97 tb->tlb_nr = ++nr;
98 if (nr >= TLB_BATCH_NR)
99 flush_tlb_pending();
100 158
101 put_cpu_var(tlb_batch); 159 /* FIFO */
160 if (!mm->pmd_huge_pte)
161 INIT_LIST_HEAD(lh);
162 else
163 list_add(lh, (struct list_head *) mm->pmd_huge_pte);
164 mm->pmd_huge_pte = pgtable;
165}
166
167pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm)
168{
169 struct list_head *lh;
170 pgtable_t pgtable;
171
172 assert_spin_locked(&mm->page_table_lock);
173
174 /* FIFO */
175 pgtable = mm->pmd_huge_pte;
176 lh = (struct list_head *) pgtable;
177 if (list_empty(lh))
178 mm->pmd_huge_pte = NULL;
179 else {
180 mm->pmd_huge_pte = (pgtable_t) lh->next;
181 list_del(lh);
182 }
183 pte_val(pgtable[0]) = 0;
184 pte_val(pgtable[1]) = 0;
185
186 return pgtable;
102} 187}
188#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
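pgtable_trans_huge_deposit() and pgtable_trans_huge_withdraw() above keep the pre-allocated PTE table of a huge mapping on a per-mm list, threading a struct list_head through the otherwise unused table memory so that a later split can obtain a table without allocating. A hedged sketch of how the pair is used (the wrapper is invented for illustration; the real callers live in mm/huge_memory.c and hold page_table_lock, as the assert_spin_locked() calls require):

	/* Sketch only: deposit when a huge PMD is installed, withdraw when
	 * it is split back into PTRS_PER_PTE (512) normal PTEs.  Caller
	 * must hold mm->page_table_lock.
	 */
	static void deposit_withdraw_example(struct mm_struct *mm, pgtable_t pgtable)
	{
		pgtable_trans_huge_deposit(mm, pgtable);	/* park the spare PTE table */

		/* ... later, at split time ... */
		pgtable = pgtable_trans_huge_withdraw(mm);
		/* withdraw cleared the first two entries used for list linkage */
	}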
diff --git a/arch/sparc/mm/tsb.c b/arch/sparc/mm/tsb.c
index c52add79b83d..7f6474347491 100644
--- a/arch/sparc/mm/tsb.c
+++ b/arch/sparc/mm/tsb.c
@@ -78,7 +78,7 @@ void flush_tsb_user(struct tlb_batch *tb)
78 base = __pa(base); 78 base = __pa(base);
79 __flush_tsb_one(tb, PAGE_SHIFT, base, nentries); 79 __flush_tsb_one(tb, PAGE_SHIFT, base, nentries);
80 80
81#ifdef CONFIG_HUGETLB_PAGE 81#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
82 if (mm->context.tsb_block[MM_TSB_HUGE].tsb) { 82 if (mm->context.tsb_block[MM_TSB_HUGE].tsb) {
83 base = (unsigned long) mm->context.tsb_block[MM_TSB_HUGE].tsb; 83 base = (unsigned long) mm->context.tsb_block[MM_TSB_HUGE].tsb;
84 nentries = mm->context.tsb_block[MM_TSB_HUGE].tsb_nentries; 84 nentries = mm->context.tsb_block[MM_TSB_HUGE].tsb_nentries;
@@ -90,29 +90,12 @@ void flush_tsb_user(struct tlb_batch *tb)
90 spin_unlock_irqrestore(&mm->context.lock, flags); 90 spin_unlock_irqrestore(&mm->context.lock, flags);
91} 91}
92 92
93#if defined(CONFIG_SPARC64_PAGE_SIZE_8KB)
94#define HV_PGSZ_IDX_BASE HV_PGSZ_IDX_8K 93#define HV_PGSZ_IDX_BASE HV_PGSZ_IDX_8K
95#define HV_PGSZ_MASK_BASE HV_PGSZ_MASK_8K 94#define HV_PGSZ_MASK_BASE HV_PGSZ_MASK_8K
96#elif defined(CONFIG_SPARC64_PAGE_SIZE_64KB)
97#define HV_PGSZ_IDX_BASE HV_PGSZ_IDX_64K
98#define HV_PGSZ_MASK_BASE HV_PGSZ_MASK_64K
99#else
100#error Broken base page size setting...
101#endif
102 95
103#ifdef CONFIG_HUGETLB_PAGE 96#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
104#if defined(CONFIG_HUGETLB_PAGE_SIZE_64K)
105#define HV_PGSZ_IDX_HUGE HV_PGSZ_IDX_64K
106#define HV_PGSZ_MASK_HUGE HV_PGSZ_MASK_64K
107#elif defined(CONFIG_HUGETLB_PAGE_SIZE_512K)
108#define HV_PGSZ_IDX_HUGE HV_PGSZ_IDX_512K
109#define HV_PGSZ_MASK_HUGE HV_PGSZ_MASK_512K
110#elif defined(CONFIG_HUGETLB_PAGE_SIZE_4MB)
111#define HV_PGSZ_IDX_HUGE HV_PGSZ_IDX_4MB 97#define HV_PGSZ_IDX_HUGE HV_PGSZ_IDX_4MB
112#define HV_PGSZ_MASK_HUGE HV_PGSZ_MASK_4MB 98#define HV_PGSZ_MASK_HUGE HV_PGSZ_MASK_4MB
113#else
114#error Broken huge page size setting...
115#endif
116#endif 99#endif
117 100
118static void setup_tsb_params(struct mm_struct *mm, unsigned long tsb_idx, unsigned long tsb_bytes) 101static void setup_tsb_params(struct mm_struct *mm, unsigned long tsb_idx, unsigned long tsb_bytes)
@@ -207,7 +190,7 @@ static void setup_tsb_params(struct mm_struct *mm, unsigned long tsb_idx, unsign
207 case MM_TSB_BASE: 190 case MM_TSB_BASE:
208 hp->pgsz_idx = HV_PGSZ_IDX_BASE; 191 hp->pgsz_idx = HV_PGSZ_IDX_BASE;
209 break; 192 break;
210#ifdef CONFIG_HUGETLB_PAGE 193#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
211 case MM_TSB_HUGE: 194 case MM_TSB_HUGE:
212 hp->pgsz_idx = HV_PGSZ_IDX_HUGE; 195 hp->pgsz_idx = HV_PGSZ_IDX_HUGE;
213 break; 196 break;
@@ -222,7 +205,7 @@ static void setup_tsb_params(struct mm_struct *mm, unsigned long tsb_idx, unsign
222 case MM_TSB_BASE: 205 case MM_TSB_BASE:
223 hp->pgsz_mask = HV_PGSZ_MASK_BASE; 206 hp->pgsz_mask = HV_PGSZ_MASK_BASE;
224 break; 207 break;
225#ifdef CONFIG_HUGETLB_PAGE 208#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
226 case MM_TSB_HUGE: 209 case MM_TSB_HUGE:
227 hp->pgsz_mask = HV_PGSZ_MASK_HUGE; 210 hp->pgsz_mask = HV_PGSZ_MASK_HUGE;
228 break; 211 break;
@@ -444,7 +427,7 @@ retry_tsb_alloc:
444 427
445int init_new_context(struct task_struct *tsk, struct mm_struct *mm) 428int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
446{ 429{
447#ifdef CONFIG_HUGETLB_PAGE 430#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
448 unsigned long huge_pte_count; 431 unsigned long huge_pte_count;
449#endif 432#endif
450 unsigned int i; 433 unsigned int i;
@@ -453,7 +436,7 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
453 436
454 mm->context.sparc64_ctx_val = 0UL; 437 mm->context.sparc64_ctx_val = 0UL;
455 438
456#ifdef CONFIG_HUGETLB_PAGE 439#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
457 /* We reset it to zero because the fork() page copying 440 /* We reset it to zero because the fork() page copying
458 * will re-increment the counters as the parent PTEs are 441 * will re-increment the counters as the parent PTEs are
459 * copied into the child address space. 442 * copied into the child address space.
@@ -462,6 +445,8 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
462 mm->context.huge_pte_count = 0; 445 mm->context.huge_pte_count = 0;
463#endif 446#endif
464 447
448 mm->context.pgtable_page = NULL;
449
465 /* copy_mm() copies over the parent's mm_struct before calling 450 /* copy_mm() copies over the parent's mm_struct before calling
466 * us, so we need to zero out the TSB pointer or else tsb_grow() 451 * us, so we need to zero out the TSB pointer or else tsb_grow()
467 * will be confused and think there is an older TSB to free up. 452 * will be confused and think there is an older TSB to free up.
@@ -474,7 +459,7 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
474 */ 459 */
475 tsb_grow(mm, MM_TSB_BASE, get_mm_rss(mm)); 460 tsb_grow(mm, MM_TSB_BASE, get_mm_rss(mm));
476 461
477#ifdef CONFIG_HUGETLB_PAGE 462#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
478 if (unlikely(huge_pte_count)) 463 if (unlikely(huge_pte_count))
479 tsb_grow(mm, MM_TSB_HUGE, huge_pte_count); 464 tsb_grow(mm, MM_TSB_HUGE, huge_pte_count);
480#endif 465#endif
@@ -500,10 +485,17 @@ static void tsb_destroy_one(struct tsb_config *tp)
500void destroy_context(struct mm_struct *mm) 485void destroy_context(struct mm_struct *mm)
501{ 486{
502 unsigned long flags, i; 487 unsigned long flags, i;
488 struct page *page;
503 489
504 for (i = 0; i < MM_NUM_TSBS; i++) 490 for (i = 0; i < MM_NUM_TSBS; i++)
505 tsb_destroy_one(&mm->context.tsb_block[i]); 491 tsb_destroy_one(&mm->context.tsb_block[i]);
506 492
493 page = mm->context.pgtable_page;
494 if (page && put_page_testzero(page)) {
495 pgtable_page_dtor(page);
496 free_hot_cold_page(page, 0);
497 }
498
507 spin_lock_irqsave(&ctx_alloc_lock, flags); 499 spin_lock_irqsave(&ctx_alloc_lock, flags);
508 500
509 if (CTX_VALID(mm->context)) { 501 if (CTX_VALID(mm->context)) {
diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig
index c9a3c1fe7297..dc46490adca0 100644
--- a/arch/tile/Kconfig
+++ b/arch/tile/Kconfig
@@ -7,12 +7,15 @@ config TILE
7 select HAVE_DMA_API_DEBUG 7 select HAVE_DMA_API_DEBUG
8 select HAVE_KVM if !TILEGX 8 select HAVE_KVM if !TILEGX
9 select GENERIC_FIND_FIRST_BIT 9 select GENERIC_FIND_FIRST_BIT
10 select SYSCTL_EXCEPTION_TRACE
10 select USE_GENERIC_SMP_HELPERS 11 select USE_GENERIC_SMP_HELPERS
11 select CC_OPTIMIZE_FOR_SIZE 12 select CC_OPTIMIZE_FOR_SIZE
13 select HAVE_DEBUG_KMEMLEAK
12 select HAVE_GENERIC_HARDIRQS 14 select HAVE_GENERIC_HARDIRQS
13 select GENERIC_IRQ_PROBE 15 select GENERIC_IRQ_PROBE
14 select GENERIC_PENDING_IRQ if SMP 16 select GENERIC_PENDING_IRQ if SMP
15 select GENERIC_IRQ_SHOW 17 select GENERIC_IRQ_SHOW
18 select HAVE_DEBUG_BUGVERBOSE
16 select HAVE_SYSCALL_WRAPPERS if TILEGX 19 select HAVE_SYSCALL_WRAPPERS if TILEGX
17 select SYS_HYPERVISOR 20 select SYS_HYPERVISOR
18 select ARCH_HAVE_NMI_SAFE_CMPXCHG 21 select ARCH_HAVE_NMI_SAFE_CMPXCHG
diff --git a/arch/tile/include/asm/hugetlb.h b/arch/tile/include/asm/hugetlb.h
index b2042380a5aa..0f885af2b621 100644
--- a/arch/tile/include/asm/hugetlb.h
+++ b/arch/tile/include/asm/hugetlb.h
@@ -106,6 +106,10 @@ static inline void arch_release_hugepage(struct page *page)
106{ 106{
107} 107}
108 108
109static inline void arch_clear_hugepage_flags(struct page *page)
110{
111}
112
109#ifdef CONFIG_HUGETLB_SUPER_PAGES 113#ifdef CONFIG_HUGETLB_SUPER_PAGES
110static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, 114static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
111 struct page *page, int writable) 115 struct page *page, int writable)
diff --git a/arch/tile/mm/elf.c b/arch/tile/mm/elf.c
index 758b6038c2b7..3cfa98bf9125 100644
--- a/arch/tile/mm/elf.c
+++ b/arch/tile/mm/elf.c
@@ -36,19 +36,14 @@ static void sim_notify_exec(const char *binary_name)
36 } while (c); 36 } while (c);
37} 37}
38 38
39static int notify_exec(void) 39static int notify_exec(struct mm_struct *mm)
40{ 40{
41 int retval = 0; /* failure */ 41 int retval = 0; /* failure */
42 struct vm_area_struct *vma = current->mm->mmap; 42
43 while (vma) { 43 if (mm->exe_file) {
44 if ((vma->vm_flags & VM_EXECUTABLE) && vma->vm_file)
45 break;
46 vma = vma->vm_next;
47 }
48 if (vma) {
49 char *buf = (char *) __get_free_page(GFP_KERNEL); 44 char *buf = (char *) __get_free_page(GFP_KERNEL);
50 if (buf) { 45 if (buf) {
51 char *path = d_path(&vma->vm_file->f_path, 46 char *path = d_path(&mm->exe_file->f_path,
52 buf, PAGE_SIZE); 47 buf, PAGE_SIZE);
53 if (!IS_ERR(path)) { 48 if (!IS_ERR(path)) {
54 sim_notify_exec(path); 49 sim_notify_exec(path);
@@ -106,16 +101,16 @@ int arch_setup_additional_pages(struct linux_binprm *bprm,
106 unsigned long vdso_base; 101 unsigned long vdso_base;
107 int retval = 0; 102 int retval = 0;
108 103
104 down_write(&mm->mmap_sem);
105
109 /* 106 /*
110 * Notify the simulator that an exec just occurred. 107 * Notify the simulator that an exec just occurred.
111 * If we can't find the filename of the mapping, just use 108 * If we can't find the filename of the mapping, just use
112 * whatever was passed as the linux_binprm filename. 109 * whatever was passed as the linux_binprm filename.
113 */ 110 */
114 if (!notify_exec()) 111 if (!notify_exec(mm))
115 sim_notify_exec(bprm->filename); 112 sim_notify_exec(bprm->filename);
116 113
117 down_write(&mm->mmap_sem);
118
119 /* 114 /*
120 * MAYWRITE to allow gdb to COW and set breakpoints 115 * MAYWRITE to allow gdb to COW and set breakpoints
121 */ 116 */
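The elf.c change above stops scanning the VMA list for a VM_EXECUTABLE mapping and reads mm->exe_file instead, taking mmap_sem before the lookup. From user space the closest equivalent of "ask the kernel which executable backs this mm" is readlink() on /proc/self/exe; the short sketch below only illustrates that idea, not the in-kernel d_path() call.

#include <limits.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char path[PATH_MAX];
	/* /proc/<pid>/exe is the user-visible face of mm->exe_file. */
	ssize_t n = readlink("/proc/self/exe", path, sizeof(path) - 1);

	if (n < 0) {
		perror("readlink");
		return 1;
	}
	path[n] = '\0';
	printf("executable: %s\n", path);
	return 0;
}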
diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c
index 84ce7abbf5af..fe811fa5f1b9 100644
--- a/arch/tile/mm/fault.c
+++ b/arch/tile/mm/fault.c
@@ -454,6 +454,7 @@ good_area:
454 tsk->min_flt++; 454 tsk->min_flt++;
455 if (fault & VM_FAULT_RETRY) { 455 if (fault & VM_FAULT_RETRY) {
456 flags &= ~FAULT_FLAG_ALLOW_RETRY; 456 flags &= ~FAULT_FLAG_ALLOW_RETRY;
457 flags |= FAULT_FLAG_TRIED;
457 458
458 /* 459 /*
459 * No need to up_read(&mm->mmap_sem) as we would 460 * No need to up_read(&mm->mmap_sem) as we would
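Several fault handlers in this series (tile, um, x86, xtensa) gain the same two lines: after the first VM_FAULT_RETRY the handler clears FAULT_FLAG_ALLOW_RETRY and sets FAULT_FLAG_TRIED, so the second pass is distinguishable from the first and cannot retry indefinitely. A minimal stand-alone sketch of that one-retry flag dance, with made-up flag values and a fake fault function:

#include <stdio.h>

/* Hypothetical flag values, purely for the demo. */
#define FLAG_ALLOW_RETRY  0x01
#define FLAG_TRIED        0x02
#define RESULT_RETRY      0x10

/* Fake "fault handler": asks for a retry only on the first attempt. */
static int handle_fault(unsigned int flags)
{
	if (flags & FLAG_ALLOW_RETRY)
		return RESULT_RETRY;
	return 0; /* second attempt succeeds */
}

int main(void)
{
	unsigned int flags = FLAG_ALLOW_RETRY;
	int result;

retry:
	result = handle_fault(flags);
	if (result & RESULT_RETRY) {
		/* Same shape as the patched handlers: one retry, clearly marked. */
		flags &= ~FLAG_ALLOW_RETRY;
		flags |= FLAG_TRIED;
		goto retry;
	}
	printf("final flags: %#x\n", flags);
	return 0;
}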
diff --git a/arch/um/Kconfig.common b/arch/um/Kconfig.common
index cb837c223922..648121b037d5 100644
--- a/arch/um/Kconfig.common
+++ b/arch/um/Kconfig.common
@@ -7,6 +7,7 @@ config UML
7 bool 7 bool
8 default y 8 default y
9 select HAVE_GENERIC_HARDIRQS 9 select HAVE_GENERIC_HARDIRQS
10 select HAVE_UID16
10 select GENERIC_IRQ_SHOW 11 select GENERIC_IRQ_SHOW
11 select GENERIC_CPU_DEVICES 12 select GENERIC_CPU_DEVICES
12 select GENERIC_IO 13 select GENERIC_IO
diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c
index 0353b98ae35a..0f00e9c82080 100644
--- a/arch/um/kernel/trap.c
+++ b/arch/um/kernel/trap.c
@@ -89,6 +89,7 @@ good_area:
89 current->min_flt++; 89 current->min_flt++;
90 if (fault & VM_FAULT_RETRY) { 90 if (fault & VM_FAULT_RETRY) {
91 flags &= ~FAULT_FLAG_ALLOW_RETRY; 91 flags &= ~FAULT_FLAG_ALLOW_RETRY;
92 flags |= FAULT_FLAG_TRIED;
92 93
93 goto retry; 94 goto retry;
94 } 95 }
diff --git a/arch/unicore32/kernel/process.c b/arch/unicore32/kernel/process.c
index b6f0458c3143..b008586dad75 100644
--- a/arch/unicore32/kernel/process.c
+++ b/arch/unicore32/kernel/process.c
@@ -380,7 +380,7 @@ int vectors_user_mapping(void)
380 return install_special_mapping(mm, 0xffff0000, PAGE_SIZE, 380 return install_special_mapping(mm, 0xffff0000, PAGE_SIZE,
381 VM_READ | VM_EXEC | 381 VM_READ | VM_EXEC |
382 VM_MAYREAD | VM_MAYEXEC | 382 VM_MAYREAD | VM_MAYEXEC |
383 VM_RESERVED, 383 VM_DONTEXPAND | VM_DONTDUMP,
384 NULL); 384 NULL);
385} 385}
386 386
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b72777ff32a9..1ae94bcae5d9 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -10,6 +10,7 @@ config X86_32
10 def_bool y 10 def_bool y
11 depends on !64BIT 11 depends on !64BIT
12 select CLKSRC_I8253 12 select CLKSRC_I8253
13 select HAVE_UID16
13 14
14config X86_64 15config X86_64
15 def_bool y 16 def_bool y
@@ -46,6 +47,7 @@ config X86
46 select HAVE_FUNCTION_GRAPH_FP_TEST 47 select HAVE_FUNCTION_GRAPH_FP_TEST
47 select HAVE_FUNCTION_TRACE_MCOUNT_TEST 48 select HAVE_FUNCTION_TRACE_MCOUNT_TEST
48 select HAVE_SYSCALL_TRACEPOINTS 49 select HAVE_SYSCALL_TRACEPOINTS
50 select SYSCTL_EXCEPTION_TRACE
49 select HAVE_KVM 51 select HAVE_KVM
50 select HAVE_ARCH_KGDB 52 select HAVE_ARCH_KGDB
51 select HAVE_ARCH_TRACEHOOK 53 select HAVE_ARCH_TRACEHOOK
@@ -65,6 +67,7 @@ config X86
65 select HAVE_PERF_EVENTS_NMI 67 select HAVE_PERF_EVENTS_NMI
66 select HAVE_PERF_REGS 68 select HAVE_PERF_REGS
67 select HAVE_PERF_USER_STACK_DUMP 69 select HAVE_PERF_USER_STACK_DUMP
70 select HAVE_DEBUG_KMEMLEAK
68 select ANON_INODES 71 select ANON_INODES
69 select HAVE_ALIGNED_STRUCT_PAGE if SLUB && !M386 72 select HAVE_ALIGNED_STRUCT_PAGE if SLUB && !M386
70 select HAVE_CMPXCHG_LOCAL if !M386 73 select HAVE_CMPXCHG_LOCAL if !M386
@@ -85,6 +88,7 @@ config X86
85 select IRQ_FORCED_THREADING 88 select IRQ_FORCED_THREADING
86 select USE_GENERIC_SMP_HELPERS if SMP 89 select USE_GENERIC_SMP_HELPERS if SMP
87 select HAVE_BPF_JIT if X86_64 90 select HAVE_BPF_JIT if X86_64
91 select HAVE_ARCH_TRANSPARENT_HUGEPAGE
88 select CLKEVT_I8253 92 select CLKEVT_I8253
89 select ARCH_HAVE_NMI_SAFE_CMPXCHG 93 select ARCH_HAVE_NMI_SAFE_CMPXCHG
90 select GENERIC_IOMAP 94 select GENERIC_IOMAP
@@ -2168,6 +2172,7 @@ config IA32_EMULATION
2168 bool "IA32 Emulation" 2172 bool "IA32 Emulation"
2169 depends on X86_64 2173 depends on X86_64
2170 select COMPAT_BINFMT_ELF 2174 select COMPAT_BINFMT_ELF
2175 select HAVE_UID16
2171 ---help--- 2176 ---help---
2172 Include code to run legacy 32-bit programs under a 2177 Include code to run legacy 32-bit programs under a
2173 64-bit kernel. You should likely turn this on, unless you're 2178 64-bit kernel. You should likely turn this on, unless you're
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index 250b8774c158..b6c3b821acf6 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -240,30 +240,6 @@ static inline int __atomic_add_unless(atomic_t *v, int a, int u)
240 return c; 240 return c;
241} 241}
242 242
243
244/*
245 * atomic_dec_if_positive - decrement by 1 if old value positive
246 * @v: pointer of type atomic_t
247 *
248 * The function returns the old value of *v minus 1, even if
249 * the atomic variable, v, was not decremented.
250 */
251static inline int atomic_dec_if_positive(atomic_t *v)
252{
253 int c, old, dec;
254 c = atomic_read(v);
255 for (;;) {
256 dec = c - 1;
257 if (unlikely(dec < 0))
258 break;
259 old = atomic_cmpxchg((v), c, dec);
260 if (likely(old == c))
261 break;
262 c = old;
263 }
264 return dec;
265}
266
267/** 243/**
268 * atomic_inc_short - increment of a short integer 244 * atomic_inc_short - increment of a short integer
269 * @v: pointer to type int 245 * @v: pointer to type int
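The x86-private atomic_dec_if_positive() above is deleted here, presumably in favour of a generic helper; either way, the logic is just a compare-and-swap loop that refuses to take the value below zero. A user-space rendering of the same loop with C11 atomics (not the kernel's atomic_t API) looks like this:

#include <stdatomic.h>
#include <stdio.h>

/* Decrement *v by 1 only if the old value is positive.
 * Returns the old value minus 1 even when no decrement happened,
 * matching the semantics documented in the removed helper. */
static int dec_if_positive(atomic_int *v)
{
	int c = atomic_load(v);

	for (;;) {
		int dec = c - 1;

		if (dec < 0)
			return dec;               /* would go negative: give up */
		if (atomic_compare_exchange_weak(v, &c, dec))
			return dec;               /* swapped successfully */
		/* c was reloaded by the failed CAS; try again */
	}
}

int main(void)
{
	atomic_int v = 1;

	printf("%d\n", dec_if_positive(&v));  /* 0: decremented */
	printf("%d\n", dec_if_positive(&v));  /* -1: left at 0  */
	return 0;
}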
diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h
index 439a9acc132d..bdd35dbd0605 100644
--- a/arch/x86/include/asm/hugetlb.h
+++ b/arch/x86/include/asm/hugetlb.h
@@ -90,4 +90,8 @@ static inline void arch_release_hugepage(struct page *page)
90{ 90{
91} 91}
92 92
93static inline void arch_clear_hugepage_flags(struct page *page)
94{
95}
96
93#endif /* _ASM_X86_HUGETLB_H */ 97#endif /* _ASM_X86_HUGETLB_H */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index fc9948465293..a1f780d45f76 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -146,8 +146,7 @@ static inline unsigned long pmd_pfn(pmd_t pmd)
146 146
147static inline int pmd_large(pmd_t pte) 147static inline int pmd_large(pmd_t pte)
148{ 148{
149 return (pmd_flags(pte) & (_PAGE_PSE | _PAGE_PRESENT)) == 149 return pmd_flags(pte) & _PAGE_PSE;
150 (_PAGE_PSE | _PAGE_PRESENT);
151} 150}
152 151
153#ifdef CONFIG_TRANSPARENT_HUGEPAGE 152#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -415,7 +414,13 @@ static inline int pte_hidden(pte_t pte)
415 414
416static inline int pmd_present(pmd_t pmd) 415static inline int pmd_present(pmd_t pmd)
417{ 416{
418 return pmd_flags(pmd) & _PAGE_PRESENT; 417 /*
418 * Checking for _PAGE_PSE is needed too because
419 * split_huge_page will temporarily clear the present bit (but
420 * the _PAGE_PSE flag will remain set at all times while the
421 * _PAGE_PRESENT bit is clear).
422 */
423 return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE);
419} 424}
420 425
421static inline int pmd_none(pmd_t pmd) 426static inline int pmd_none(pmd_t pmd)
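The pmd_present() change is subtle: while split_huge_page() is tearing a huge page apart it temporarily clears _PAGE_PRESENT but leaves _PAGE_PSE set, so present-ness must be derived from the union of the three bits. The toy program below reproduces just that bit test, with invented mask values, to show why the old check would have reported "not present" mid-split.

#include <stdio.h>

/* Invented stand-ins for the x86 PTE flag bits. */
#define F_PRESENT   0x001UL
#define F_PROTNONE  0x100UL
#define F_PSE       0x080UL

static int old_pmd_present(unsigned long flags)
{
	return (flags & F_PRESENT) != 0;
}

static int new_pmd_present(unsigned long flags)
{
	return (flags & (F_PRESENT | F_PROTNONE | F_PSE)) != 0;
}

int main(void)
{
	/* A huge pmd mid split_huge_page(): PSE kept, PRESENT cleared. */
	unsigned long splitting = F_PSE;

	printf("old check: %d (wrongly 'not present')\n", old_pmd_present(splitting));
	printf("new check: %d (still counted as present)\n", new_pmd_present(splitting));
	return 0;
}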
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index 0c92113c4cb6..8faa215a503e 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -71,6 +71,7 @@ do { \
71 * tables contain all the necessary information. 71 * tables contain all the necessary information.
72 */ 72 */
73#define update_mmu_cache(vma, address, ptep) do { } while (0) 73#define update_mmu_cache(vma, address, ptep) do { } while (0)
74#define update_mmu_cache_pmd(vma, address, pmd) do { } while (0)
74 75
75#endif /* !__ASSEMBLY__ */ 76#endif /* !__ASSEMBLY__ */
76 77
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 8251be02301e..47356f9df82e 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -143,6 +143,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
143#define pte_unmap(pte) ((void)(pte))/* NOP */ 143#define pte_unmap(pte) ((void)(pte))/* NOP */
144 144
145#define update_mmu_cache(vma, address, ptep) do { } while (0) 145#define update_mmu_cache(vma, address, ptep) do { } while (0)
146#define update_mmu_cache_pmd(vma, address, pmd) do { } while (0)
146 147
147/* Encode and de-code a swap entry */ 148/* Encode and de-code a swap entry */
148#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE 149#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index a530b230e7d7..8e13ecb41bee 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1220,6 +1220,7 @@ good_area:
1220 /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk 1220 /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
1221 * of starvation. */ 1221 * of starvation. */
1222 flags &= ~FAULT_FLAG_ALLOW_RETRY; 1222 flags &= ~FAULT_FLAG_ALLOW_RETRY;
1223 flags |= FAULT_FLAG_TRIED;
1223 goto retry; 1224 goto retry;
1224 } 1225 }
1225 } 1226 }
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index b91e48512425..937bff5cdaa7 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -71,7 +71,6 @@ huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
71 struct address_space *mapping = vma->vm_file->f_mapping; 71 struct address_space *mapping = vma->vm_file->f_mapping;
72 pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + 72 pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
73 vma->vm_pgoff; 73 vma->vm_pgoff;
74 struct prio_tree_iter iter;
75 struct vm_area_struct *svma; 74 struct vm_area_struct *svma;
76 unsigned long saddr; 75 unsigned long saddr;
77 pte_t *spte = NULL; 76 pte_t *spte = NULL;
@@ -81,7 +80,7 @@ huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
81 return (pte_t *)pmd_alloc(mm, pud, addr); 80 return (pte_t *)pmd_alloc(mm, pud, addr);
82 81
83 mutex_lock(&mapping->i_mmap_mutex); 82 mutex_lock(&mapping->i_mmap_mutex);
84 vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) { 83 vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
85 if (svma == vma) 84 if (svma == vma)
86 continue; 85 continue;
87 86
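The hugetlbpage.c hunk swaps the prio-tree walk for vma_interval_tree_foreach(), but the question being asked of the data structure is unchanged: which mappings of this file overlap a given page-offset range? The interval tree answers it in O(log n + k); the naive O(n) scan below only illustrates the query itself, with invented names and data.

#include <stdio.h>

struct range {
	unsigned long start, last;   /* inclusive page offsets */
};

/* The query the interval tree answers: which ranges cover [start, last]?
 * A linear scan, standing in for the tree walk. */
static void for_each_overlap(const struct range *r, int n,
			     unsigned long start, unsigned long last)
{
	for (int i = 0; i < n; i++)
		if (r[i].start <= last && r[i].last >= start)
			printf("range %d [%lu, %lu] overlaps\n",
			       i, r[i].start, r[i].last);
}

int main(void)
{
	struct range mappings[] = {
		{ 0, 15 }, { 10, 20 }, { 32, 40 },
	};

	/* Same shape as the huge_pmd_share() lookup: a single page index. */
	for_each_overlap(mappings, 3, 12, 12);
	return 0;
}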
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 3d68ef6d2266..0eb572eda406 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -664,20 +664,20 @@ static void free_pfn_range(u64 paddr, unsigned long size)
664} 664}
665 665
666/* 666/*
667 * track_pfn_vma_copy is called when vma that is covering the pfnmap gets 667 * track_pfn_copy is called when vma that is covering the pfnmap gets
668 * copied through copy_page_range(). 668 * copied through copy_page_range().
669 * 669 *
670 * If the vma has a linear pfn mapping for the entire range, we get the prot 670 * If the vma has a linear pfn mapping for the entire range, we get the prot
671 * from pte and reserve the entire vma range with single reserve_pfn_range call. 671 * from pte and reserve the entire vma range with single reserve_pfn_range call.
672 */ 672 */
673int track_pfn_vma_copy(struct vm_area_struct *vma) 673int track_pfn_copy(struct vm_area_struct *vma)
674{ 674{
675 resource_size_t paddr; 675 resource_size_t paddr;
676 unsigned long prot; 676 unsigned long prot;
677 unsigned long vma_size = vma->vm_end - vma->vm_start; 677 unsigned long vma_size = vma->vm_end - vma->vm_start;
678 pgprot_t pgprot; 678 pgprot_t pgprot;
679 679
680 if (is_linear_pfn_mapping(vma)) { 680 if (vma->vm_flags & VM_PAT) {
681 /* 681 /*
682 * reserve the whole chunk covered by vma. We need the 682 * reserve the whole chunk covered by vma. We need the
683 * starting address and protection from pte. 683 * starting address and protection from pte.
@@ -694,31 +694,59 @@ int track_pfn_vma_copy(struct vm_area_struct *vma)
694} 694}
695 695
696/* 696/*
697 * track_pfn_vma_new is called when a _new_ pfn mapping is being established
698 * for physical range indicated by pfn and size.
699 *
700 * prot is passed in as a parameter for the new mapping. If the vma has a 697 * prot is passed in as a parameter for the new mapping. If the vma has a
701 * linear pfn mapping for the entire range reserve the entire vma range with 698 * linear pfn mapping for the entire range reserve the entire vma range with
702 * single reserve_pfn_range call. 699 * single reserve_pfn_range call.
703 */ 700 */
704int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot, 701int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
705 unsigned long pfn, unsigned long size) 702 unsigned long pfn, unsigned long addr, unsigned long size)
706{ 703{
704 resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT;
707 unsigned long flags; 705 unsigned long flags;
708 resource_size_t paddr;
709 unsigned long vma_size = vma->vm_end - vma->vm_start;
710 706
711 if (is_linear_pfn_mapping(vma)) { 707 /* reserve the whole chunk starting from paddr */
712 /* reserve the whole chunk starting from vm_pgoff */ 708 if (addr == vma->vm_start && size == (vma->vm_end - vma->vm_start)) {
713 paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; 709 int ret;
714 return reserve_pfn_range(paddr, vma_size, prot, 0); 710
711 ret = reserve_pfn_range(paddr, size, prot, 0);
712 if (!ret)
713 vma->vm_flags |= VM_PAT;
714 return ret;
715 } 715 }
716 716
717 if (!pat_enabled) 717 if (!pat_enabled)
718 return 0; 718 return 0;
719 719
720 /* for vm_insert_pfn and friends, we set prot based on lookup */ 720 /*
721 flags = lookup_memtype(pfn << PAGE_SHIFT); 721 * For anything smaller than the vma size we set prot based on the
722 * lookup.
723 */
724 flags = lookup_memtype(paddr);
725
726 /* Check memtype for the remaining pages */
727 while (size > PAGE_SIZE) {
728 size -= PAGE_SIZE;
729 paddr += PAGE_SIZE;
730 if (flags != lookup_memtype(paddr))
731 return -EINVAL;
732 }
733
734 *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) |
735 flags);
736
737 return 0;
738}
739
740int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
741 unsigned long pfn)
742{
743 unsigned long flags;
744
745 if (!pat_enabled)
746 return 0;
747
748 /* Set prot based on lookup */
749 flags = lookup_memtype((resource_size_t)pfn << PAGE_SHIFT);
722 *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | 750 *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) |
723 flags); 751 flags);
724 752
@@ -726,22 +754,31 @@ int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot,
726} 754}
727 755
728/* 756/*
729 * untrack_pfn_vma is called while unmapping a pfnmap for a region. 757 * untrack_pfn is called while unmapping a pfnmap for a region.
730 * untrack can be called for a specific region indicated by pfn and size or 758 * untrack can be called for a specific region indicated by pfn and size or
731 * can be for the entire vma (in which case size can be zero). 759 * can be for the entire vma (in which case pfn, size are zero).
732 */ 760 */
733void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, 761void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
734 unsigned long size) 762 unsigned long size)
735{ 763{
736 resource_size_t paddr; 764 resource_size_t paddr;
737 unsigned long vma_size = vma->vm_end - vma->vm_start; 765 unsigned long prot;
738 766
739 if (is_linear_pfn_mapping(vma)) { 767 if (!(vma->vm_flags & VM_PAT))
740 /* free the whole chunk starting from vm_pgoff */
741 paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
742 free_pfn_range(paddr, vma_size);
743 return; 768 return;
769
770 /* free the chunk starting from pfn or the whole chunk */
771 paddr = (resource_size_t)pfn << PAGE_SHIFT;
772 if (!paddr && !size) {
773 if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) {
774 WARN_ON_ONCE(1);
775 return;
776 }
777
778 size = vma->vm_end - vma->vm_start;
744 } 779 }
780 free_pfn_range(paddr, size);
781 vma->vm_flags &= ~VM_PAT;
745} 782}
746 783
747pgprot_t pgprot_writecombine(pgprot_t prot) 784pgprot_t pgprot_writecombine(pgprot_t prot)
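track_pfn_remap() above has two paths: if the remap covers the whole VMA it reserves the range and tags the VMA with VM_PAT, otherwise it looks up the memory type of the first page and walks the rest of the range page by page, failing if any page disagrees. The page-walk half is just a uniformity check; a self-contained sketch over an array of fake per-page types:

#include <stdio.h>

#define PAGE_COUNT 8

/* Fake per-page cache attribute table, standing in for lookup_memtype(). */
static int page_type[PAGE_COUNT] = { 1, 1, 1, 1, 2, 2, 2, 2 };

static int lookup_type(unsigned long pfn)
{
	return page_type[pfn % PAGE_COUNT];
}

/* Return the shared type of [pfn, pfn + npages) or -1 if the pages disagree,
 * mirroring the "check memtype for the remaining pages" loop. */
static int range_type(unsigned long pfn, unsigned long npages)
{
	int type = lookup_type(pfn);

	while (--npages) {
		pfn++;
		if (lookup_type(pfn) != type)
			return -1;
	}
	return type;
}

int main(void)
{
	printf("pages 0-3: %d\n", range_type(0, 4));   /* uniform: 1 */
	printf("pages 2-5: %d\n", range_type(2, 4));   /* mixed: -1  */
	return 0;
}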
diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c
index 8acaddd0fb21..415f6c4ced36 100644
--- a/arch/x86/mm/pat_rbtree.c
+++ b/arch/x86/mm/pat_rbtree.c
@@ -12,7 +12,7 @@
12#include <linux/debugfs.h> 12#include <linux/debugfs.h>
13#include <linux/kernel.h> 13#include <linux/kernel.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/rbtree.h> 15#include <linux/rbtree_augmented.h>
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/gfp.h> 17#include <linux/gfp.h>
18 18
@@ -54,29 +54,24 @@ static u64 get_subtree_max_end(struct rb_node *node)
54 return ret; 54 return ret;
55} 55}
56 56
57/* Update 'subtree_max_end' for a node, based on node and its children */ 57static u64 compute_subtree_max_end(struct memtype *data)
58static void memtype_rb_augment_cb(struct rb_node *node, void *__unused)
59{ 58{
60 struct memtype *data; 59 u64 max_end = data->end, child_max_end;
61 u64 max_end, child_max_end;
62
63 if (!node)
64 return;
65 60
66 data = container_of(node, struct memtype, rb); 61 child_max_end = get_subtree_max_end(data->rb.rb_right);
67 max_end = data->end;
68
69 child_max_end = get_subtree_max_end(node->rb_right);
70 if (child_max_end > max_end) 62 if (child_max_end > max_end)
71 max_end = child_max_end; 63 max_end = child_max_end;
72 64
73 child_max_end = get_subtree_max_end(node->rb_left); 65 child_max_end = get_subtree_max_end(data->rb.rb_left);
74 if (child_max_end > max_end) 66 if (child_max_end > max_end)
75 max_end = child_max_end; 67 max_end = child_max_end;
76 68
77 data->subtree_max_end = max_end; 69 return max_end;
78} 70}
79 71
72RB_DECLARE_CALLBACKS(static, memtype_rb_augment_cb, struct memtype, rb,
73 u64, subtree_max_end, compute_subtree_max_end)
74
80/* Find the first (lowest start addr) overlapping range from rb tree */ 75/* Find the first (lowest start addr) overlapping range from rb tree */
81static struct memtype *memtype_rb_lowest_match(struct rb_root *root, 76static struct memtype *memtype_rb_lowest_match(struct rb_root *root,
82 u64 start, u64 end) 77 u64 start, u64 end)
@@ -179,15 +174,17 @@ static void memtype_rb_insert(struct rb_root *root, struct memtype *newdata)
179 struct memtype *data = container_of(*node, struct memtype, rb); 174 struct memtype *data = container_of(*node, struct memtype, rb);
180 175
181 parent = *node; 176 parent = *node;
177 if (data->subtree_max_end < newdata->end)
178 data->subtree_max_end = newdata->end;
182 if (newdata->start <= data->start) 179 if (newdata->start <= data->start)
183 node = &((*node)->rb_left); 180 node = &((*node)->rb_left);
184 else if (newdata->start > data->start) 181 else if (newdata->start > data->start)
185 node = &((*node)->rb_right); 182 node = &((*node)->rb_right);
186 } 183 }
187 184
185 newdata->subtree_max_end = newdata->end;
188 rb_link_node(&newdata->rb, parent, node); 186 rb_link_node(&newdata->rb, parent, node);
189 rb_insert_color(&newdata->rb, root); 187 rb_insert_augmented(&newdata->rb, root, &memtype_rb_augment_cb);
190 rb_augment_insert(&newdata->rb, memtype_rb_augment_cb, NULL);
191} 188}
192 189
193int rbt_memtype_check_insert(struct memtype *new, unsigned long *ret_type) 190int rbt_memtype_check_insert(struct memtype *new, unsigned long *ret_type)
@@ -209,16 +206,13 @@ int rbt_memtype_check_insert(struct memtype *new, unsigned long *ret_type)
209 206
210struct memtype *rbt_memtype_erase(u64 start, u64 end) 207struct memtype *rbt_memtype_erase(u64 start, u64 end)
211{ 208{
212 struct rb_node *deepest;
213 struct memtype *data; 209 struct memtype *data;
214 210
215 data = memtype_rb_exact_match(&memtype_rbroot, start, end); 211 data = memtype_rb_exact_match(&memtype_rbroot, start, end);
216 if (!data) 212 if (!data)
217 goto out; 213 goto out;
218 214
219 deepest = rb_augment_erase_begin(&data->rb); 215 rb_erase_augmented(&data->rb, &memtype_rbroot, &memtype_rb_augment_cb);
220 rb_erase(&data->rb, &memtype_rbroot);
221 rb_augment_erase_end(deepest, memtype_rb_augment_cb, NULL);
222out: 216out:
223 return data; 217 return data;
224} 218}
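The pat_rbtree.c conversion keeps the same augmented value, subtree_max_end: every node caches the largest interval end found anywhere in its subtree, and RB_DECLARE_CALLBACKS() now generates the augment callbacks from the small compute_subtree_max_end() shown above. Outside the kernel the computation is just "my own end, or the larger of my children's cached maxima"; a sketch over a hand-built binary tree, with no rebalancing:

#include <stdio.h>

struct node {
	unsigned long start, end;
	unsigned long subtree_max_end;   /* cached augmented value */
	struct node *left, *right;
};

static unsigned long subtree_max(const struct node *n)
{
	return n ? n->subtree_max_end : 0;
}

/* Same job as compute_subtree_max_end(): the node's own end,
 * bumped up by whatever the children already cache. */
static unsigned long compute_max_end(const struct node *n)
{
	unsigned long max = n->end;

	if (subtree_max(n->left) > max)
		max = subtree_max(n->left);
	if (subtree_max(n->right) > max)
		max = subtree_max(n->right);
	return max;
}

int main(void)
{
	struct node a = { 0, 10, 10, NULL, NULL };
	struct node c = { 30, 90, 90, NULL, NULL };
	struct node b = { 20, 40, 0, &a, &c };   /* root */

	b.subtree_max_end = compute_max_end(&b);
	printf("root subtree_max_end = %lu\n", b.subtree_max_end);  /* 90 */
	return 0;
}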
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 5a16824cc2b3..fd28d86fe3d2 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -2451,8 +2451,7 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
2451 2451
2452 prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP); 2452 prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP);
2453 2453
2454 BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_RESERVED | VM_IO)) == 2454 BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO)));
2455 (VM_PFNMAP | VM_RESERVED | VM_IO)));
2456 2455
2457 rmd.mfn = mfn; 2456 rmd.mfn = mfn;
2458 rmd.prot = prot; 2457 rmd.prot = prot;
diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c
index 5a74c53bc69c..2c2f710ed1dc 100644
--- a/arch/xtensa/mm/fault.c
+++ b/arch/xtensa/mm/fault.c
@@ -126,6 +126,7 @@ good_area:
126 current->min_flt++; 126 current->min_flt++;
127 if (fault & VM_FAULT_RETRY) { 127 if (fault & VM_FAULT_RETRY) {
128 flags &= ~FAULT_FLAG_ALLOW_RETRY; 128 flags &= ~FAULT_FLAG_ALLOW_RETRY;
129 flags |= FAULT_FLAG_TRIED;
129 130
130 /* No need to up_read(&mm->mmap_sem) as we would 131 /* No need to up_read(&mm->mmap_sem) as we would
131 * have already released it in __lock_page_or_retry 132 * have already released it in __lock_page_or_retry
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 7dda4f790f00..86c88216a503 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -248,26 +248,23 @@ static bool pages_correctly_reserved(unsigned long start_pfn,
248static int 248static int
249memory_block_action(unsigned long phys_index, unsigned long action) 249memory_block_action(unsigned long phys_index, unsigned long action)
250{ 250{
251 unsigned long start_pfn, start_paddr; 251 unsigned long start_pfn;
252 unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; 252 unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
253 struct page *first_page; 253 struct page *first_page;
254 int ret; 254 int ret;
255 255
256 first_page = pfn_to_page(phys_index << PFN_SECTION_SHIFT); 256 first_page = pfn_to_page(phys_index << PFN_SECTION_SHIFT);
257 start_pfn = page_to_pfn(first_page);
257 258
258 switch (action) { 259 switch (action) {
259 case MEM_ONLINE: 260 case MEM_ONLINE:
260 start_pfn = page_to_pfn(first_page);
261
262 if (!pages_correctly_reserved(start_pfn, nr_pages)) 261 if (!pages_correctly_reserved(start_pfn, nr_pages))
263 return -EBUSY; 262 return -EBUSY;
264 263
265 ret = online_pages(start_pfn, nr_pages); 264 ret = online_pages(start_pfn, nr_pages);
266 break; 265 break;
267 case MEM_OFFLINE: 266 case MEM_OFFLINE:
268 start_paddr = page_to_pfn(first_page) << PAGE_SHIFT; 267 ret = offline_pages(start_pfn, nr_pages);
269 ret = remove_memory(start_paddr,
270 nr_pages << PAGE_SHIFT);
271 break; 268 break;
272 default: 269 default:
273 WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: " 270 WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
@@ -278,13 +275,11 @@ memory_block_action(unsigned long phys_index, unsigned long action)
278 return ret; 275 return ret;
279} 276}
280 277
281static int memory_block_change_state(struct memory_block *mem, 278static int __memory_block_change_state(struct memory_block *mem,
282 unsigned long to_state, unsigned long from_state_req) 279 unsigned long to_state, unsigned long from_state_req)
283{ 280{
284 int ret = 0; 281 int ret = 0;
285 282
286 mutex_lock(&mem->state_mutex);
287
288 if (mem->state != from_state_req) { 283 if (mem->state != from_state_req) {
289 ret = -EINVAL; 284 ret = -EINVAL;
290 goto out; 285 goto out;
@@ -312,10 +307,20 @@ static int memory_block_change_state(struct memory_block *mem,
312 break; 307 break;
313 } 308 }
314out: 309out:
315 mutex_unlock(&mem->state_mutex);
316 return ret; 310 return ret;
317} 311}
318 312
313static int memory_block_change_state(struct memory_block *mem,
314 unsigned long to_state, unsigned long from_state_req)
315{
316 int ret;
317
318 mutex_lock(&mem->state_mutex);
319 ret = __memory_block_change_state(mem, to_state, from_state_req);
320 mutex_unlock(&mem->state_mutex);
321
322 return ret;
323}
319static ssize_t 324static ssize_t
320store_mem_state(struct device *dev, 325store_mem_state(struct device *dev,
321 struct device_attribute *attr, const char *buf, size_t count) 326 struct device_attribute *attr, const char *buf, size_t count)
@@ -656,6 +661,21 @@ int unregister_memory_section(struct mem_section *section)
656} 661}
657 662
658/* 663/*
664 * offline one memory block. If the memory block has been offlined, do nothing.
665 */
666int offline_memory_block(struct memory_block *mem)
667{
668 int ret = 0;
669
670 mutex_lock(&mem->state_mutex);
671 if (mem->state != MEM_OFFLINE)
672 ret = __memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
673 mutex_unlock(&mem->state_mutex);
674
675 return ret;
676}
677
678/*
659 * Initialize the sysfs support for memory devices... 679 * Initialize the sysfs support for memory devices...
660 */ 680 */
661int __init memory_dev_init(void) 681int __init memory_dev_init(void)
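The memory.c refactor is a standard "split out the unlocked worker" move: memory_block_change_state() keeps taking state_mutex, while the new __memory_block_change_state() assumes the caller holds it, so offline_memory_block() can check mem->state and change it under a single lock acquisition. The same shape in portable C with pthreads, names invented for the demo:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t state_mutex = PTHREAD_MUTEX_INITIALIZER;
static int state = 1;                 /* 1 = online, 0 = offline */

/* Caller must hold state_mutex, like __memory_block_change_state(). */
static int __change_state(int to_state, int from_state_req)
{
	if (state != from_state_req)
		return -1;
	state = to_state;
	return 0;
}

/* Locked wrapper, like memory_block_change_state(). */
static int change_state(int to_state, int from_state_req)
{
	int ret;

	pthread_mutex_lock(&state_mutex);
	ret = __change_state(to_state, from_state_req);
	pthread_mutex_unlock(&state_mutex);
	return ret;
}

/* Check-then-change under one lock, like offline_memory_block(). */
static int offline_block(void)
{
	int ret = 0;

	pthread_mutex_lock(&state_mutex);
	if (state != 0)
		ret = __change_state(0, 1);
	pthread_mutex_unlock(&state_mutex);
	return ret;
}

int main(void)
{
	printf("offline: %d\n", offline_block());       /* 0: transitioned */
	printf("online:  %d\n", change_state(1, 0));    /* 0: back online  */
	return 0;
}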
diff --git a/drivers/char/mbcs.c b/drivers/char/mbcs.c
index 0c7d340b9ab9..f74e892711dd 100644
--- a/drivers/char/mbcs.c
+++ b/drivers/char/mbcs.c
@@ -507,7 +507,7 @@ static int mbcs_gscr_mmap(struct file *fp, struct vm_area_struct *vma)
507 507
508 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); 508 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
509 509
510 /* Remap-pfn-range will mark the range VM_IO and VM_RESERVED */ 510 /* Remap-pfn-range will mark the range VM_IO */
511 if (remap_pfn_range(vma, 511 if (remap_pfn_range(vma,
512 vma->vm_start, 512 vma->vm_start,
513 __pa(soft->gscr_addr) >> PAGE_SHIFT, 513 __pa(soft->gscr_addr) >> PAGE_SHIFT,
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index e5eedfa24c91..0537903c985b 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -322,7 +322,7 @@ static int mmap_mem(struct file *file, struct vm_area_struct *vma)
322 322
323 vma->vm_ops = &mmap_mem_ops; 323 vma->vm_ops = &mmap_mem_ops;
324 324
325 /* Remap-pfn-range will mark the range VM_IO and VM_RESERVED */ 325 /* Remap-pfn-range will mark the range VM_IO */
326 if (remap_pfn_range(vma, 326 if (remap_pfn_range(vma,
327 vma->vm_start, 327 vma->vm_start,
328 vma->vm_pgoff, 328 vma->vm_pgoff,
diff --git a/drivers/char/mspec.c b/drivers/char/mspec.c
index 845f97fd1832..e1f60f968fdd 100644
--- a/drivers/char/mspec.c
+++ b/drivers/char/mspec.c
@@ -286,7 +286,7 @@ mspec_mmap(struct file *file, struct vm_area_struct *vma,
286 atomic_set(&vdata->refcnt, 1); 286 atomic_set(&vdata->refcnt, 1);
287 vma->vm_private_data = vdata; 287 vma->vm_private_data = vdata;
288 288
289 vma->vm_flags |= (VM_IO | VM_RESERVED | VM_PFNMAP | VM_DONTEXPAND); 289 vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
290 if (vdata->type == MSPEC_FETCHOP || vdata->type == MSPEC_UNCACHED) 290 if (vdata->type == MSPEC_FETCHOP || vdata->type == MSPEC_UNCACHED)
291 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); 291 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
292 vma->vm_ops = &mspec_vm_ops; 292 vma->vm_ops = &mspec_vm_ops;
diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c
index 92177d5aedee..24efae464e2c 100644
--- a/drivers/gpu/drm/drm_gem.c
+++ b/drivers/gpu/drm/drm_gem.c
@@ -706,7 +706,7 @@ int drm_gem_mmap(struct file *filp, struct vm_area_struct *vma)
706 goto out_unlock; 706 goto out_unlock;
707 } 707 }
708 708
709 vma->vm_flags |= VM_RESERVED | VM_IO | VM_PFNMAP | VM_DONTEXPAND; 709 vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
710 vma->vm_ops = obj->dev->driver->gem_vm_ops; 710 vma->vm_ops = obj->dev->driver->gem_vm_ops;
711 vma->vm_private_data = map->handle; 711 vma->vm_private_data = map->handle;
712 vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags)); 712 vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags));
diff --git a/drivers/gpu/drm/drm_vm.c b/drivers/gpu/drm/drm_vm.c
index 23a824e6a22a..db7bd292410b 100644
--- a/drivers/gpu/drm/drm_vm.c
+++ b/drivers/gpu/drm/drm_vm.c
@@ -514,8 +514,7 @@ static int drm_mmap_dma(struct file *filp, struct vm_area_struct *vma)
514 514
515 vma->vm_ops = &drm_vm_dma_ops; 515 vma->vm_ops = &drm_vm_dma_ops;
516 516
517 vma->vm_flags |= VM_RESERVED; /* Don't swap */ 517 vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
518 vma->vm_flags |= VM_DONTEXPAND;
519 518
520 drm_vm_open_locked(dev, vma); 519 drm_vm_open_locked(dev, vma);
521 return 0; 520 return 0;
@@ -643,21 +642,16 @@ int drm_mmap_locked(struct file *filp, struct vm_area_struct *vma)
643 case _DRM_SHM: 642 case _DRM_SHM:
644 vma->vm_ops = &drm_vm_shm_ops; 643 vma->vm_ops = &drm_vm_shm_ops;
645 vma->vm_private_data = (void *)map; 644 vma->vm_private_data = (void *)map;
646 /* Don't let this area swap. Change when
647 DRM_KERNEL advisory is supported. */
648 vma->vm_flags |= VM_RESERVED;
649 break; 645 break;
650 case _DRM_SCATTER_GATHER: 646 case _DRM_SCATTER_GATHER:
651 vma->vm_ops = &drm_vm_sg_ops; 647 vma->vm_ops = &drm_vm_sg_ops;
652 vma->vm_private_data = (void *)map; 648 vma->vm_private_data = (void *)map;
653 vma->vm_flags |= VM_RESERVED;
654 vma->vm_page_prot = drm_dma_prot(map->type, vma); 649 vma->vm_page_prot = drm_dma_prot(map->type, vma);
655 break; 650 break;
656 default: 651 default:
657 return -EINVAL; /* This should never happen. */ 652 return -EINVAL; /* This should never happen. */
658 } 653 }
659 vma->vm_flags |= VM_RESERVED; /* Don't swap */ 654 vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
660 vma->vm_flags |= VM_DONTEXPAND;
661 655
662 drm_vm_open_locked(dev, vma); 656 drm_vm_open_locked(dev, vma);
663 return 0; 657 return 0;
diff --git a/drivers/gpu/drm/exynos/exynos_drm_gem.c b/drivers/gpu/drm/exynos/exynos_drm_gem.c
index fcdbe46914f7..d2545560664f 100644
--- a/drivers/gpu/drm/exynos/exynos_drm_gem.c
+++ b/drivers/gpu/drm/exynos/exynos_drm_gem.c
@@ -500,7 +500,7 @@ static int exynos_drm_gem_mmap_buffer(struct file *filp,
500 500
501 DRM_DEBUG_KMS("%s\n", __FILE__); 501 DRM_DEBUG_KMS("%s\n", __FILE__);
502 502
503 vma->vm_flags |= (VM_IO | VM_RESERVED); 503 vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP;
504 504
505 update_vm_cache_attr(exynos_gem_obj, vma); 505 update_vm_cache_attr(exynos_gem_obj, vma);
506 506
diff --git a/drivers/gpu/drm/gma500/framebuffer.c b/drivers/gpu/drm/gma500/framebuffer.c
index 884ba73ac6ce..afded54dbb10 100644
--- a/drivers/gpu/drm/gma500/framebuffer.c
+++ b/drivers/gpu/drm/gma500/framebuffer.c
@@ -178,8 +178,7 @@ static int psbfb_mmap(struct fb_info *info, struct vm_area_struct *vma)
178 */ 178 */
179 vma->vm_ops = &psbfb_vm_ops; 179 vma->vm_ops = &psbfb_vm_ops;
180 vma->vm_private_data = (void *)psbfb; 180 vma->vm_private_data = (void *)psbfb;
181 vma->vm_flags |= VM_RESERVED | VM_IO | 181 vma->vm_flags |= VM_IO | VM_MIXEDMAP | VM_DONTEXPAND | VM_DONTDUMP;
182 VM_MIXEDMAP | VM_DONTEXPAND;
183 return 0; 182 return 0;
184} 183}
185 184
diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c
index a877813571a4..3ba72dbdc4bd 100644
--- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
+++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
@@ -285,7 +285,7 @@ int ttm_bo_mmap(struct file *filp, struct vm_area_struct *vma,
285 */ 285 */
286 286
287 vma->vm_private_data = bo; 287 vma->vm_private_data = bo;
288 vma->vm_flags |= VM_RESERVED | VM_IO | VM_MIXEDMAP | VM_DONTEXPAND; 288 vma->vm_flags |= VM_IO | VM_MIXEDMAP | VM_DONTEXPAND | VM_DONTDUMP;
289 return 0; 289 return 0;
290out_unref: 290out_unref:
291 ttm_bo_unref(&bo); 291 ttm_bo_unref(&bo);
@@ -300,7 +300,7 @@ int ttm_fbdev_mmap(struct vm_area_struct *vma, struct ttm_buffer_object *bo)
300 300
301 vma->vm_ops = &ttm_bo_vm_ops; 301 vma->vm_ops = &ttm_bo_vm_ops;
302 vma->vm_private_data = ttm_bo_reference(bo); 302 vma->vm_private_data = ttm_bo_reference(bo);
303 vma->vm_flags |= VM_RESERVED | VM_IO | VM_MIXEDMAP | VM_DONTEXPAND; 303 vma->vm_flags |= VM_IO | VM_MIXEDMAP | VM_DONTEXPAND;
304 return 0; 304 return 0;
305} 305}
306EXPORT_SYMBOL(ttm_fbdev_mmap); 306EXPORT_SYMBOL(ttm_fbdev_mmap);
diff --git a/drivers/gpu/drm/udl/udl_fb.c b/drivers/gpu/drm/udl/udl_fb.c
index 67df842fbb33..69a2b16f42a6 100644
--- a/drivers/gpu/drm/udl/udl_fb.c
+++ b/drivers/gpu/drm/udl/udl_fb.c
@@ -243,7 +243,7 @@ static int udl_fb_mmap(struct fb_info *info, struct vm_area_struct *vma)
243 size = 0; 243 size = 0;
244 } 244 }
245 245
246 vma->vm_flags |= VM_RESERVED; /* avoid to swap out this VMA */ 246 /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */
247 return 0; 247 return 0;
248} 248}
249 249
diff --git a/drivers/infiniband/hw/ehca/ehca_uverbs.c b/drivers/infiniband/hw/ehca/ehca_uverbs.c
index 45ee89b65c23..1a1d5d99fcf9 100644
--- a/drivers/infiniband/hw/ehca/ehca_uverbs.c
+++ b/drivers/infiniband/hw/ehca/ehca_uverbs.c
@@ -117,7 +117,7 @@ static int ehca_mmap_fw(struct vm_area_struct *vma, struct h_galpas *galpas,
117 physical = galpas->user.fw_handle; 117 physical = galpas->user.fw_handle;
118 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); 118 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
119 ehca_gen_dbg("vsize=%llx physical=%llx", vsize, physical); 119 ehca_gen_dbg("vsize=%llx physical=%llx", vsize, physical);
120 /* VM_IO | VM_RESERVED are set by remap_pfn_range() */ 120 /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */
121 ret = remap_4k_pfn(vma, vma->vm_start, physical >> EHCA_PAGESHIFT, 121 ret = remap_4k_pfn(vma, vma->vm_start, physical >> EHCA_PAGESHIFT,
122 vma->vm_page_prot); 122 vma->vm_page_prot);
123 if (unlikely(ret)) { 123 if (unlikely(ret)) {
@@ -139,7 +139,7 @@ static int ehca_mmap_queue(struct vm_area_struct *vma, struct ipz_queue *queue,
139 u64 start, ofs; 139 u64 start, ofs;
140 struct page *page; 140 struct page *page;
141 141
142 vma->vm_flags |= VM_RESERVED; 142 vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
143 start = vma->vm_start; 143 start = vma->vm_start;
144 for (ofs = 0; ofs < queue->queue_length; ofs += PAGE_SIZE) { 144 for (ofs = 0; ofs < queue->queue_length; ofs += PAGE_SIZE) {
145 u64 virt_addr = (u64)ipz_qeit_calc(queue, ofs); 145 u64 virt_addr = (u64)ipz_qeit_calc(queue, ofs);
diff --git a/drivers/infiniband/hw/ipath/ipath_file_ops.c b/drivers/infiniband/hw/ipath/ipath_file_ops.c
index 736d9edbdbe7..3eb7e454849b 100644
--- a/drivers/infiniband/hw/ipath/ipath_file_ops.c
+++ b/drivers/infiniband/hw/ipath/ipath_file_ops.c
@@ -1225,7 +1225,7 @@ static int mmap_kvaddr(struct vm_area_struct *vma, u64 pgaddr,
1225 1225
1226 vma->vm_pgoff = (unsigned long) addr >> PAGE_SHIFT; 1226 vma->vm_pgoff = (unsigned long) addr >> PAGE_SHIFT;
1227 vma->vm_ops = &ipath_file_vm_ops; 1227 vma->vm_ops = &ipath_file_vm_ops;
1228 vma->vm_flags |= VM_RESERVED | VM_DONTEXPAND; 1228 vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
1229 ret = 1; 1229 ret = 1;
1230 1230
1231bail: 1231bail:
diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c
index faa44cb08071..959a5c4ff812 100644
--- a/drivers/infiniband/hw/qib/qib_file_ops.c
+++ b/drivers/infiniband/hw/qib/qib_file_ops.c
@@ -971,7 +971,7 @@ static int mmap_kvaddr(struct vm_area_struct *vma, u64 pgaddr,
971 971
972 vma->vm_pgoff = (unsigned long) addr >> PAGE_SHIFT; 972 vma->vm_pgoff = (unsigned long) addr >> PAGE_SHIFT;
973 vma->vm_ops = &qib_file_vm_ops; 973 vma->vm_ops = &qib_file_vm_ops;
974 vma->vm_flags |= VM_RESERVED | VM_DONTEXPAND; 974 vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
975 ret = 1; 975 ret = 1;
976 976
977bail: 977bail:
diff --git a/drivers/media/pci/meye/meye.c b/drivers/media/pci/meye/meye.c
index 7bc775219f97..e5a76da86081 100644
--- a/drivers/media/pci/meye/meye.c
+++ b/drivers/media/pci/meye/meye.c
@@ -1647,7 +1647,7 @@ static int meye_mmap(struct file *file, struct vm_area_struct *vma)
1647 1647
1648 vma->vm_ops = &meye_vm_ops; 1648 vma->vm_ops = &meye_vm_ops;
1649 vma->vm_flags &= ~VM_IO; /* not I/O memory */ 1649 vma->vm_flags &= ~VM_IO; /* not I/O memory */
1650 vma->vm_flags |= VM_RESERVED; /* avoid to swap out this VMA */ 1650 vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
1651 vma->vm_private_data = (void *) (offset / gbufsize); 1651 vma->vm_private_data = (void *) (offset / gbufsize);
1652 meye_vm_open(vma); 1652 meye_vm_open(vma);
1653 1653
diff --git a/drivers/media/platform/omap/omap_vout.c b/drivers/media/platform/omap/omap_vout.c
index 66ac21d466af..134016f0e660 100644
--- a/drivers/media/platform/omap/omap_vout.c
+++ b/drivers/media/platform/omap/omap_vout.c
@@ -911,7 +911,7 @@ static int omap_vout_mmap(struct file *file, struct vm_area_struct *vma)
911 911
912 q->bufs[i]->baddr = vma->vm_start; 912 q->bufs[i]->baddr = vma->vm_start;
913 913
914 vma->vm_flags |= VM_RESERVED; 914 vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
915 vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); 915 vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
916 vma->vm_ops = &omap_vout_vm_ops; 916 vma->vm_ops = &omap_vout_vm_ops;
917 vma->vm_private_data = (void *) vout; 917 vma->vm_private_data = (void *) vout;
diff --git a/drivers/media/platform/vino.c b/drivers/media/platform/vino.c
index 790d96cffeea..70b0bf4b2900 100644
--- a/drivers/media/platform/vino.c
+++ b/drivers/media/platform/vino.c
@@ -3950,7 +3950,7 @@ found:
3950 3950
3951 fb->map_count = 1; 3951 fb->map_count = 1;
3952 3952
3953 vma->vm_flags |= VM_DONTEXPAND | VM_RESERVED; 3953 vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
3954 vma->vm_flags &= ~VM_IO; 3954 vma->vm_flags &= ~VM_IO;
3955 vma->vm_private_data = fb; 3955 vma->vm_private_data = fb;
3956 vma->vm_file = file; 3956 vma->vm_file = file;
diff --git a/drivers/media/usb/sn9c102/sn9c102_core.c b/drivers/media/usb/sn9c102/sn9c102_core.c
index 19ea780b16ff..5bfc8e2f018f 100644
--- a/drivers/media/usb/sn9c102/sn9c102_core.c
+++ b/drivers/media/usb/sn9c102/sn9c102_core.c
@@ -2126,8 +2126,7 @@ static int sn9c102_mmap(struct file* filp, struct vm_area_struct *vma)
2126 return -EINVAL; 2126 return -EINVAL;
2127 } 2127 }
2128 2128
2129 vma->vm_flags |= VM_IO; 2129 vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP;
2130 vma->vm_flags |= VM_RESERVED;
2131 2130
2132 pos = cam->frame[i].bufmem; 2131 pos = cam->frame[i].bufmem;
2133 while (size > 0) { /* size is page-aligned */ 2132 while (size > 0) { /* size is page-aligned */
diff --git a/drivers/media/usb/usbvision/usbvision-video.c b/drivers/media/usb/usbvision/usbvision-video.c
index f67018ed3795..5c36a57e6590 100644
--- a/drivers/media/usb/usbvision/usbvision-video.c
+++ b/drivers/media/usb/usbvision/usbvision-video.c
@@ -1108,8 +1108,7 @@ static int usbvision_mmap(struct file *file, struct vm_area_struct *vma)
1108 } 1108 }
1109 1109
1110 /* VM_IO is eventually going to replace PageReserved altogether */ 1110 /* VM_IO is eventually going to replace PageReserved altogether */
1111 vma->vm_flags |= VM_IO; 1111 vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP;
1112 vma->vm_flags |= VM_RESERVED; /* avoid to swap out this VMA */
1113 1112
1114 pos = usbvision->frame[i].data; 1113 pos = usbvision->frame[i].data;
1115 while (size > 0) { 1114 while (size > 0) {
diff --git a/drivers/media/v4l2-core/videobuf-dma-sg.c b/drivers/media/v4l2-core/videobuf-dma-sg.c
index f300deafd268..828e7c10bd70 100644
--- a/drivers/media/v4l2-core/videobuf-dma-sg.c
+++ b/drivers/media/v4l2-core/videobuf-dma-sg.c
@@ -582,7 +582,7 @@ static int __videobuf_mmap_mapper(struct videobuf_queue *q,
582 map->count = 1; 582 map->count = 1;
583 map->q = q; 583 map->q = q;
584 vma->vm_ops = &videobuf_vm_ops; 584 vma->vm_ops = &videobuf_vm_ops;
585 vma->vm_flags |= VM_DONTEXPAND | VM_RESERVED; 585 vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
586 vma->vm_flags &= ~VM_IO; /* using shared anonymous pages */ 586 vma->vm_flags &= ~VM_IO; /* using shared anonymous pages */
587 vma->vm_private_data = map; 587 vma->vm_private_data = map;
588 dprintk(1, "mmap %p: q=%p %08lx-%08lx pgoff %08lx bufs %d-%d\n", 588 dprintk(1, "mmap %p: q=%p %08lx-%08lx pgoff %08lx bufs %d-%d\n",
diff --git a/drivers/media/v4l2-core/videobuf-vmalloc.c b/drivers/media/v4l2-core/videobuf-vmalloc.c
index df142580e44c..2ff7fcc77b11 100644
--- a/drivers/media/v4l2-core/videobuf-vmalloc.c
+++ b/drivers/media/v4l2-core/videobuf-vmalloc.c
@@ -270,7 +270,7 @@ static int __videobuf_mmap_mapper(struct videobuf_queue *q,
270 } 270 }
271 271
272 vma->vm_ops = &videobuf_vm_ops; 272 vma->vm_ops = &videobuf_vm_ops;
273 vma->vm_flags |= VM_DONTEXPAND | VM_RESERVED; 273 vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
274 vma->vm_private_data = map; 274 vma->vm_private_data = map;
275 275
276 dprintk(1, "mmap %p: q=%p %08lx-%08lx (%lx) pgoff %08lx buf %d\n", 276 dprintk(1, "mmap %p: q=%p %08lx-%08lx (%lx) pgoff %08lx buf %d\n",
diff --git a/drivers/media/v4l2-core/videobuf2-memops.c b/drivers/media/v4l2-core/videobuf2-memops.c
index 504cd4cbe29e..051ea3571b20 100644
--- a/drivers/media/v4l2-core/videobuf2-memops.c
+++ b/drivers/media/v4l2-core/videobuf2-memops.c
@@ -163,7 +163,7 @@ int vb2_mmap_pfn_range(struct vm_area_struct *vma, unsigned long paddr,
163 return ret; 163 return ret;
164 } 164 }
165 165
166 vma->vm_flags |= VM_DONTEXPAND | VM_RESERVED; 166 vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
167 vma->vm_private_data = priv; 167 vma->vm_private_data = priv;
168 vma->vm_ops = vm_ops; 168 vma->vm_ops = vm_ops;
169 169
diff --git a/drivers/misc/carma/carma-fpga.c b/drivers/misc/carma/carma-fpga.c
index 0c43297ed9ac..8835eabb3b87 100644
--- a/drivers/misc/carma/carma-fpga.c
+++ b/drivers/misc/carma/carma-fpga.c
@@ -1243,8 +1243,6 @@ static int data_mmap(struct file *filp, struct vm_area_struct *vma)
1243 return -EINVAL; 1243 return -EINVAL;
1244 } 1244 }
1245 1245
1246 /* IO memory (stop cacheing) */
1247 vma->vm_flags |= VM_IO | VM_RESERVED;
1248 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); 1246 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
1249 1247
1250 return io_remap_pfn_range(vma, vma->vm_start, addr, vsize, 1248 return io_remap_pfn_range(vma, vma->vm_start, addr, vsize,
diff --git a/drivers/misc/sgi-gru/grufile.c b/drivers/misc/sgi-gru/grufile.c
index ecafa4ba238b..492c8cac69ac 100644
--- a/drivers/misc/sgi-gru/grufile.c
+++ b/drivers/misc/sgi-gru/grufile.c
@@ -108,9 +108,8 @@ static int gru_file_mmap(struct file *file, struct vm_area_struct *vma)
108 vma->vm_end & (GRU_GSEG_PAGESIZE - 1)) 108 vma->vm_end & (GRU_GSEG_PAGESIZE - 1))
109 return -EINVAL; 109 return -EINVAL;
110 110
111 vma->vm_flags |= 111 vma->vm_flags |= VM_IO | VM_PFNMAP | VM_LOCKED |
112 (VM_IO | VM_DONTCOPY | VM_LOCKED | VM_DONTEXPAND | VM_PFNMAP | 112 VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
113 VM_RESERVED);
114 vma->vm_page_prot = PAGE_SHARED; 113 vma->vm_page_prot = PAGE_SHARED;
115 vma->vm_ops = &gru_vm_ops; 114 vma->vm_ops = &gru_vm_ops;
116 115
diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c
index a6e74514e662..73ae81a629f2 100644
--- a/drivers/mtd/mtdchar.c
+++ b/drivers/mtd/mtdchar.c
@@ -1182,7 +1182,7 @@ static int mtdchar_mmap(struct file *file, struct vm_area_struct *vma)
1182 return -EINVAL; 1182 return -EINVAL;
1183 if (set_vm_offset(vma, off) < 0) 1183 if (set_vm_offset(vma, off) < 0)
1184 return -EINVAL; 1184 return -EINVAL;
1185 vma->vm_flags |= VM_IO | VM_RESERVED; 1185 vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP;
1186 1186
1187#ifdef pgprot_noncached 1187#ifdef pgprot_noncached
1188 if (file->f_flags & O_DSYNC || off >= __pa(high_memory)) 1188 if (file->f_flags & O_DSYNC || off >= __pa(high_memory))
diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c
index 575730744fdb..b9adff543f5f 100644
--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -1056,8 +1056,7 @@ EXPORT_SYMBOL_GPL(mtd_writev);
1056 * until the request succeeds or until the allocation size falls below 1056 * until the request succeeds or until the allocation size falls below
1057 * the system page size. This attempts to make sure it does not adversely 1057 * the system page size. This attempts to make sure it does not adversely
1058 * impact system performance, so when allocating more than one page, we 1058 * impact system performance, so when allocating more than one page, we
1059 * ask the memory allocator to avoid re-trying, swapping, writing back 1059 * ask the memory allocator to avoid re-trying.
1060 * or performing I/O.
1061 * 1060 *
1062 * Note, this function also makes sure that the allocated buffer is aligned to 1061 * Note, this function also makes sure that the allocated buffer is aligned to
1063 * the MTD device's min. I/O unit, i.e. the "mtd->writesize" value. 1062 * the MTD device's min. I/O unit, i.e. the "mtd->writesize" value.
@@ -1071,8 +1070,7 @@ EXPORT_SYMBOL_GPL(mtd_writev);
1071 */ 1070 */
1072void *mtd_kmalloc_up_to(const struct mtd_info *mtd, size_t *size) 1071void *mtd_kmalloc_up_to(const struct mtd_info *mtd, size_t *size)
1073{ 1072{
1074 gfp_t flags = __GFP_NOWARN | __GFP_WAIT | 1073 gfp_t flags = __GFP_NOWARN | __GFP_WAIT | __GFP_NORETRY;
1075 __GFP_NORETRY | __GFP_NO_KSWAPD;
1076 size_t min_alloc = max_t(size_t, mtd->writesize, PAGE_SIZE); 1074 size_t min_alloc = max_t(size_t, mtd->writesize, PAGE_SIZE);
1077 void *kbuf; 1075 void *kbuf;
1078 1076
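mtd_kmalloc_up_to(), whose gfp flags are trimmed above, keeps its overall strategy: ask for the largest convenient buffer without retrying hard, and fall back to smaller sizes (never below one page) when the allocator refuses. A user-space sketch of that shrink-until-it-fits loop, without the kernel's gfp machinery or the writesize alignment:

#include <stdio.h>
#include <stdlib.h>

#define MIN_ALLOC 4096UL    /* stand-in for max(writesize, PAGE_SIZE) */

/* Try to allocate *size bytes; on failure shrink toward MIN_ALLOC.
 * On return *size holds the size actually requested last. */
static void *alloc_up_to(size_t *size)
{
	while (*size > MIN_ALLOC) {
		void *buf = malloc(*size);

		if (buf)
			return buf;
		*size >>= 1;                     /* back off and retry smaller */
		if (*size < MIN_ALLOC)
			*size = MIN_ALLOC;
	}
	return malloc(*size);                    /* last resort: minimum size */
}

int main(void)
{
	size_t size = 1UL << 20;                 /* ask for 1 MiB */
	void *buf = alloc_up_to(&size);

	printf("got %zu bytes at %p\n", buf ? size : 0, buf);
	free(buf);
	return 0;
}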
diff --git a/drivers/oprofile/buffer_sync.c b/drivers/oprofile/buffer_sync.c
index f34b5b29fb95..d93b2b6b1f7a 100644
--- a/drivers/oprofile/buffer_sync.c
+++ b/drivers/oprofile/buffer_sync.c
@@ -216,7 +216,7 @@ static inline unsigned long fast_get_dcookie(struct path *path)
216} 216}
217 217
218 218
219/* Look up the dcookie for the task's first VM_EXECUTABLE mapping, 219/* Look up the dcookie for the task's mm->exe_file,
220 * which corresponds loosely to "application name". This is 220 * which corresponds loosely to "application name". This is
221 * not strictly necessary but allows oprofile to associate 221 * not strictly necessary but allows oprofile to associate
222 * shared-library samples with particular applications 222 * shared-library samples with particular applications
@@ -224,21 +224,10 @@ static inline unsigned long fast_get_dcookie(struct path *path)
224static unsigned long get_exec_dcookie(struct mm_struct *mm) 224static unsigned long get_exec_dcookie(struct mm_struct *mm)
225{ 225{
226 unsigned long cookie = NO_COOKIE; 226 unsigned long cookie = NO_COOKIE;
227 struct vm_area_struct *vma;
228
229 if (!mm)
230 goto out;
231 227
232 for (vma = mm->mmap; vma; vma = vma->vm_next) { 228 if (mm && mm->exe_file)
233 if (!vma->vm_file) 229 cookie = fast_get_dcookie(&mm->exe_file->f_path);
234 continue;
235 if (!(vma->vm_flags & VM_EXECUTABLE))
236 continue;
237 cookie = fast_get_dcookie(&vma->vm_file->f_path);
238 break;
239 }
240 230
241out:
242 return cookie; 231 return cookie;
243} 232}
244 233
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 9c5c5f2b3962..be2c9a6561ff 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -1257,7 +1257,7 @@ sg_mmap(struct file *filp, struct vm_area_struct *vma)
1257 } 1257 }
1258 1258
1259 sfp->mmap_called = 1; 1259 sfp->mmap_called = 1;
1260 vma->vm_flags |= VM_RESERVED; 1260 vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
1261 vma->vm_private_data = sfp; 1261 vma->vm_private_data = sfp;
1262 vma->vm_ops = &sg_mmap_vm_ops; 1262 vma->vm_ops = &sg_mmap_vm_ops;
1263 return 0; 1263 return 0;
diff --git a/drivers/staging/android/ashmem.c b/drivers/staging/android/ashmem.c
index 94a740d2883d..634b9ae713e0 100644
--- a/drivers/staging/android/ashmem.c
+++ b/drivers/staging/android/ashmem.c
@@ -332,7 +332,6 @@ static int ashmem_mmap(struct file *file, struct vm_area_struct *vma)
332 if (vma->vm_file) 332 if (vma->vm_file)
333 fput(vma->vm_file); 333 fput(vma->vm_file);
334 vma->vm_file = asma->file; 334 vma->vm_file = asma->file;
335 vma->vm_flags |= VM_CAN_NONLINEAR;
336 335
337out: 336out:
338 mutex_unlock(&ashmem_mutex); 337 mutex_unlock(&ashmem_mutex);
diff --git a/drivers/staging/omapdrm/omap_gem_dmabuf.c b/drivers/staging/omapdrm/omap_gem_dmabuf.c
index 42728e0cc194..c6f3ef6f57b9 100644
--- a/drivers/staging/omapdrm/omap_gem_dmabuf.c
+++ b/drivers/staging/omapdrm/omap_gem_dmabuf.c
@@ -160,7 +160,7 @@ static int omap_gem_dmabuf_mmap(struct dma_buf *buffer,
160 goto out_unlock; 160 goto out_unlock;
161 } 161 }
162 162
163 vma->vm_flags |= VM_RESERVED | VM_IO | VM_PFNMAP | VM_DONTEXPAND; 163 vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
164 vma->vm_ops = obj->dev->driver->gem_vm_ops; 164 vma->vm_ops = obj->dev->driver->gem_vm_ops;
165 vma->vm_private_data = obj; 165 vma->vm_private_data = obj;
166 vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags)); 166 vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags));
diff --git a/drivers/staging/tidspbridge/rmgr/drv_interface.c b/drivers/staging/tidspbridge/rmgr/drv_interface.c
index bddea1d3b2c3..701a11ac676d 100644
--- a/drivers/staging/tidspbridge/rmgr/drv_interface.c
+++ b/drivers/staging/tidspbridge/rmgr/drv_interface.c
@@ -261,7 +261,7 @@ static int bridge_mmap(struct file *filp, struct vm_area_struct *vma)
261{ 261{
262 u32 status; 262 u32 status;
263 263
264 vma->vm_flags |= VM_RESERVED | VM_IO; 264 /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */
265 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); 265 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
266 266
267 dev_dbg(bridge, "%s: vm filp %p start %lx end %lx page_prot %ulx " 267 dev_dbg(bridge, "%s: vm filp %p start %lx end %lx page_prot %ulx "
diff --git a/drivers/uio/uio.c b/drivers/uio/uio.c
index a783d533a1a6..5110f367f1f1 100644
--- a/drivers/uio/uio.c
+++ b/drivers/uio/uio.c
@@ -653,8 +653,6 @@ static int uio_mmap_physical(struct vm_area_struct *vma)
653 if (mi < 0) 653 if (mi < 0)
654 return -EINVAL; 654 return -EINVAL;
655 655
656 vma->vm_flags |= VM_IO | VM_RESERVED;
657
658 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); 656 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
659 657
660 return remap_pfn_range(vma, 658 return remap_pfn_range(vma,
@@ -666,7 +664,7 @@ static int uio_mmap_physical(struct vm_area_struct *vma)
666 664
667static int uio_mmap_logical(struct vm_area_struct *vma) 665static int uio_mmap_logical(struct vm_area_struct *vma)
668{ 666{
669 vma->vm_flags |= VM_RESERVED; 667 vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
670 vma->vm_ops = &uio_vm_ops; 668 vma->vm_ops = &uio_vm_ops;
671 uio_vma_open(vma); 669 uio_vma_open(vma);
672 return 0; 670 return 0;
diff --git a/drivers/usb/mon/mon_bin.c b/drivers/usb/mon/mon_bin.c
index 91cd85076a44..9a62e89d6dc0 100644
--- a/drivers/usb/mon/mon_bin.c
+++ b/drivers/usb/mon/mon_bin.c
@@ -1247,7 +1247,7 @@ static int mon_bin_mmap(struct file *filp, struct vm_area_struct *vma)
1247{ 1247{
1248 /* don't do anything here: "fault" will set up page table entries */ 1248 /* don't do anything here: "fault" will set up page table entries */
1249 vma->vm_ops = &mon_bin_vm_ops; 1249 vma->vm_ops = &mon_bin_vm_ops;
1250 vma->vm_flags |= VM_RESERVED; 1250 vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
1251 vma->vm_private_data = filp->private_data; 1251 vma->vm_private_data = filp->private_data;
1252 mon_bin_vma_open(vma); 1252 mon_bin_vma_open(vma);
1253 return 0; 1253 return 0;
diff --git a/drivers/video/68328fb.c b/drivers/video/68328fb.c
index a425d65d5ba2..fa44fbed397d 100644
--- a/drivers/video/68328fb.c
+++ b/drivers/video/68328fb.c
@@ -400,7 +400,7 @@ static int mc68x328fb_mmap(struct fb_info *info, struct vm_area_struct *vma)
400#ifndef MMU 400#ifndef MMU
401 /* this is uClinux (no MMU) specific code */ 401 /* this is uClinux (no MMU) specific code */
402 402
403 vma->vm_flags |= VM_RESERVED; 403 vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
404 vma->vm_start = videomemory; 404 vma->vm_start = videomemory;
405 405
406 return 0; 406 return 0;
diff --git a/drivers/video/aty/atyfb_base.c b/drivers/video/aty/atyfb_base.c
index 3f2e8c13f1ca..868932f904ef 100644
--- a/drivers/video/aty/atyfb_base.c
+++ b/drivers/video/aty/atyfb_base.c
@@ -1942,8 +1942,7 @@ static int atyfb_mmap(struct fb_info *info, struct vm_area_struct *vma)
1942 off = vma->vm_pgoff << PAGE_SHIFT; 1942 off = vma->vm_pgoff << PAGE_SHIFT;
1943 size = vma->vm_end - vma->vm_start; 1943 size = vma->vm_end - vma->vm_start;
1944 1944
1945 /* To stop the swapper from even considering these pages. */ 1945 /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */
1946 vma->vm_flags |= (VM_IO | VM_RESERVED);
1947 1946
1948 if (((vma->vm_pgoff == 0) && (size == info->fix.smem_len)) || 1947 if (((vma->vm_pgoff == 0) && (size == info->fix.smem_len)) ||
1949 ((off == info->fix.smem_len) && (size == PAGE_SIZE))) 1948 ((off == info->fix.smem_len) && (size == PAGE_SIZE)))
diff --git a/drivers/video/fb-puv3.c b/drivers/video/fb-puv3.c
index 60a787fa32cf..7d106f1f4906 100644
--- a/drivers/video/fb-puv3.c
+++ b/drivers/video/fb-puv3.c
@@ -653,9 +653,8 @@ int unifb_mmap(struct fb_info *info,
653 vma->vm_page_prot)) 653 vma->vm_page_prot))
654 return -EAGAIN; 654 return -EAGAIN;
655 655
656 vma->vm_flags |= VM_RESERVED; /* avoid to swap out this VMA */ 656 /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */
657 return 0; 657 return 0;
658
659} 658}
660 659
661static struct fb_ops unifb_ops = { 660static struct fb_ops unifb_ops = {
diff --git a/drivers/video/fb_defio.c b/drivers/video/fb_defio.c
index 64cda560c488..88cad6b8b479 100644
--- a/drivers/video/fb_defio.c
+++ b/drivers/video/fb_defio.c
@@ -166,7 +166,7 @@ static const struct address_space_operations fb_deferred_io_aops = {
166static int fb_deferred_io_mmap(struct fb_info *info, struct vm_area_struct *vma) 166static int fb_deferred_io_mmap(struct fb_info *info, struct vm_area_struct *vma)
167{ 167{
168 vma->vm_ops = &fb_deferred_io_vm_ops; 168 vma->vm_ops = &fb_deferred_io_vm_ops;
169 vma->vm_flags |= ( VM_RESERVED | VM_DONTEXPAND ); 169 vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
170 if (!(info->flags & FBINFO_VIRTFB)) 170 if (!(info->flags & FBINFO_VIRTFB))
171 vma->vm_flags |= VM_IO; 171 vma->vm_flags |= VM_IO;
172 vma->vm_private_data = info; 172 vma->vm_private_data = info;
diff --git a/drivers/video/fbmem.c b/drivers/video/fbmem.c
index 0dff12a1daef..3ff0105a496a 100644
--- a/drivers/video/fbmem.c
+++ b/drivers/video/fbmem.c
@@ -1410,8 +1410,7 @@ fb_mmap(struct file *file, struct vm_area_struct * vma)
1410 return -EINVAL; 1410 return -EINVAL;
1411 off += start; 1411 off += start;
1412 vma->vm_pgoff = off >> PAGE_SHIFT; 1412 vma->vm_pgoff = off >> PAGE_SHIFT;
1413 /* This is an IO map - tell maydump to skip this VMA */ 1413 /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by io_remap_pfn_range()*/
1414 vma->vm_flags |= VM_IO | VM_RESERVED;
1415 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); 1414 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
1416 fb_pgprotect(file, vma, off); 1415 fb_pgprotect(file, vma, off);
1417 if (io_remap_pfn_range(vma, vma->vm_start, off >> PAGE_SHIFT, 1416 if (io_remap_pfn_range(vma, vma->vm_start, off >> PAGE_SHIFT,
diff --git a/drivers/video/gbefb.c b/drivers/video/gbefb.c
index 7e7b7a9ba274..05e2a8a99d8f 100644
--- a/drivers/video/gbefb.c
+++ b/drivers/video/gbefb.c
@@ -1024,7 +1024,7 @@ static int gbefb_mmap(struct fb_info *info,
1024 pgprot_val(vma->vm_page_prot) = 1024 pgprot_val(vma->vm_page_prot) =
1025 pgprot_fb(pgprot_val(vma->vm_page_prot)); 1025 pgprot_fb(pgprot_val(vma->vm_page_prot));
1026 1026
1027 vma->vm_flags |= VM_IO | VM_RESERVED; 1027 /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */
1028 1028
1029 /* look for the starting tile */ 1029 /* look for the starting tile */
1030 tile = &gbe_tiles.cpu[offset >> TILE_SHIFT]; 1030 tile = &gbe_tiles.cpu[offset >> TILE_SHIFT];
diff --git a/drivers/video/omap2/omapfb/omapfb-main.c b/drivers/video/omap2/omapfb/omapfb-main.c
index 3c39aa8de928..15373f4aee19 100644
--- a/drivers/video/omap2/omapfb/omapfb-main.c
+++ b/drivers/video/omap2/omapfb/omapfb-main.c
@@ -1128,7 +1128,7 @@ static int omapfb_mmap(struct fb_info *fbi, struct vm_area_struct *vma)
1128 DBG("user mmap region start %lx, len %d, off %lx\n", start, len, off); 1128 DBG("user mmap region start %lx, len %d, off %lx\n", start, len, off);
1129 1129
1130 vma->vm_pgoff = off >> PAGE_SHIFT; 1130 vma->vm_pgoff = off >> PAGE_SHIFT;
1131 vma->vm_flags |= VM_IO | VM_RESERVED; 1131 /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */
1132 vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); 1132 vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
1133 vma->vm_ops = &mmap_user_ops; 1133 vma->vm_ops = &mmap_user_ops;
1134 vma->vm_private_data = rg; 1134 vma->vm_private_data = rg;
diff --git a/drivers/video/sbuslib.c b/drivers/video/sbuslib.c
index 3c1de981a18c..296afae442f4 100644
--- a/drivers/video/sbuslib.c
+++ b/drivers/video/sbuslib.c
@@ -57,9 +57,8 @@ int sbusfb_mmap_helper(struct sbus_mmap_map *map,
57 57
58 off = vma->vm_pgoff << PAGE_SHIFT; 58 off = vma->vm_pgoff << PAGE_SHIFT;
59 59
60 /* To stop the swapper from even considering these pages */ 60 /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */
61 vma->vm_flags |= (VM_IO | VM_RESERVED); 61
62
63 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); 62 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
64 63
65 /* Each page, see which map applies */ 64 /* Each page, see which map applies */
diff --git a/drivers/video/smscufx.c b/drivers/video/smscufx.c
index 5533a32c6ca1..97bd6620c364 100644
--- a/drivers/video/smscufx.c
+++ b/drivers/video/smscufx.c
@@ -803,7 +803,6 @@ static int ufx_ops_mmap(struct fb_info *info, struct vm_area_struct *vma)
803 size = 0; 803 size = 0;
804 } 804 }
805 805
806 vma->vm_flags |= VM_RESERVED; /* avoid to swap out this VMA */
807 return 0; 806 return 0;
808} 807}
809 808
diff --git a/drivers/video/udlfb.c b/drivers/video/udlfb.c
index 8af64148294b..f45eba3d6150 100644
--- a/drivers/video/udlfb.c
+++ b/drivers/video/udlfb.c
@@ -345,7 +345,6 @@ static int dlfb_ops_mmap(struct fb_info *info, struct vm_area_struct *vma)
345 size = 0; 345 size = 0;
346 } 346 }
347 347
348 vma->vm_flags |= VM_RESERVED; /* avoid to swap out this VMA */
349 return 0; 348 return 0;
350} 349}
351 350
diff --git a/drivers/video/vermilion/vermilion.c b/drivers/video/vermilion/vermilion.c
index 970e43d13f52..89aef343e295 100644
--- a/drivers/video/vermilion/vermilion.c
+++ b/drivers/video/vermilion/vermilion.c
@@ -1018,7 +1018,6 @@ static int vmlfb_mmap(struct fb_info *info, struct vm_area_struct *vma)
1018 offset += vinfo->vram_start; 1018 offset += vinfo->vram_start;
1019 pgprot_val(vma->vm_page_prot) |= _PAGE_PCD; 1019 pgprot_val(vma->vm_page_prot) |= _PAGE_PCD;
1020 pgprot_val(vma->vm_page_prot) &= ~_PAGE_PWT; 1020 pgprot_val(vma->vm_page_prot) &= ~_PAGE_PWT;
1021 vma->vm_flags |= VM_RESERVED | VM_IO;
1022 if (remap_pfn_range(vma, vma->vm_start, offset >> PAGE_SHIFT, 1021 if (remap_pfn_range(vma, vma->vm_start, offset >> PAGE_SHIFT,
1023 size, vma->vm_page_prot)) 1022 size, vma->vm_page_prot))
1024 return -EAGAIN; 1023 return -EAGAIN;
diff --git a/drivers/video/vfb.c b/drivers/video/vfb.c
index 501a922aa9dc..c7f692525b88 100644
--- a/drivers/video/vfb.c
+++ b/drivers/video/vfb.c
@@ -439,7 +439,6 @@ static int vfb_mmap(struct fb_info *info,
439 size = 0; 439 size = 0;
440 } 440 }
441 441
442 vma->vm_flags |= VM_RESERVED; /* avoid to swap out this VMA */
443 return 0; 442 return 0;
444 443
445} 444}
diff --git a/drivers/xen/gntalloc.c b/drivers/xen/gntalloc.c
index 934985d14c24..4097987b330e 100644
--- a/drivers/xen/gntalloc.c
+++ b/drivers/xen/gntalloc.c
@@ -535,7 +535,7 @@ static int gntalloc_mmap(struct file *filp, struct vm_area_struct *vma)
535 535
536 vma->vm_private_data = vm_priv; 536 vma->vm_private_data = vm_priv;
537 537
538 vma->vm_flags |= VM_RESERVED | VM_DONTEXPAND; 538 vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
539 539
540 vma->vm_ops = &gntalloc_vmops; 540 vma->vm_ops = &gntalloc_vmops;
541 541
diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
index 5df9fd847b2e..610bfc6be177 100644
--- a/drivers/xen/gntdev.c
+++ b/drivers/xen/gntdev.c
@@ -720,7 +720,7 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
720 720
721 vma->vm_ops = &gntdev_vmops; 721 vma->vm_ops = &gntdev_vmops;
722 722
723 vma->vm_flags |= VM_RESERVED|VM_DONTEXPAND; 723 vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
724 724
725 if (use_ptemod) 725 if (use_ptemod)
726 vma->vm_flags |= VM_DONTCOPY; 726 vma->vm_flags |= VM_DONTCOPY;
diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
index ef6389580b8c..8adb9cc267f9 100644
--- a/drivers/xen/privcmd.c
+++ b/drivers/xen/privcmd.c
@@ -455,7 +455,8 @@ static int privcmd_mmap(struct file *file, struct vm_area_struct *vma)
455{ 455{
456 /* DONTCOPY is essential for Xen because copy_page_range doesn't know 456 /* DONTCOPY is essential for Xen because copy_page_range doesn't know
457 * how to recreate these mappings */ 457 * how to recreate these mappings */
458 vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY | VM_PFNMAP; 458 vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTCOPY |
459 VM_DONTEXPAND | VM_DONTDUMP;
459 vma->vm_ops = &privcmd_vm_ops; 460 vma->vm_ops = &privcmd_vm_ops;
460 vma->vm_private_data = NULL; 461 vma->vm_private_data = NULL;
461 462
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index dd6f7ee1e312..c2483e97beee 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -738,6 +738,7 @@ v9fs_cached_file_write(struct file *filp, const char __user * data,
738static const struct vm_operations_struct v9fs_file_vm_ops = { 738static const struct vm_operations_struct v9fs_file_vm_ops = {
739 .fault = filemap_fault, 739 .fault = filemap_fault,
740 .page_mkwrite = v9fs_vm_page_mkwrite, 740 .page_mkwrite = v9fs_vm_page_mkwrite,
741 .remap_pages = generic_file_remap_pages,
741}; 742};
742 743
743 744
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 28a64e769527..e800dec958c3 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1123,7 +1123,7 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma,
1123 if (always_dump_vma(vma)) 1123 if (always_dump_vma(vma))
1124 goto whole; 1124 goto whole;
1125 1125
1126 if (vma->vm_flags & VM_NODUMP) 1126 if (vma->vm_flags & VM_DONTDUMP)
1127 return 0; 1127 return 0;
1128 1128
1129 /* Hugetlb memory check */ 1129 /* Hugetlb memory check */
@@ -1135,7 +1135,7 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma,
1135 } 1135 }
1136 1136
1137 /* Do not dump I/O mapped devices or special mappings */ 1137 /* Do not dump I/O mapped devices or special mappings */
1138 if (vma->vm_flags & (VM_IO | VM_RESERVED)) 1138 if (vma->vm_flags & VM_IO)
1139 return 0; 1139 return 0;
1140 1140
1141 /* By default, dump shared memory if mapped from an anonymous file. */ 1141 /* By default, dump shared memory if mapped from an anonymous file. */
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 08d812b32282..262db114ff01 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1205,7 +1205,7 @@ static int maydump(struct vm_area_struct *vma, unsigned long mm_flags)
1205 int dump_ok; 1205 int dump_ok;
1206 1206
1207 /* Do not dump I/O mapped devices or special mappings */ 1207 /* Do not dump I/O mapped devices or special mappings */
1208 if (vma->vm_flags & (VM_IO | VM_RESERVED)) { 1208 if (vma->vm_flags & VM_IO) {
1209 kdcore("%08lx: %08lx: no (IO)", vma->vm_start, vma->vm_flags); 1209 kdcore("%08lx: %08lx: no (IO)", vma->vm_start, vma->vm_flags);
1210 return 0; 1210 return 0;
1211 } 1211 }
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 5caf285c6e4d..f6b40e86121b 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1599,6 +1599,7 @@ out:
1599static const struct vm_operations_struct btrfs_file_vm_ops = { 1599static const struct vm_operations_struct btrfs_file_vm_ops = {
1600 .fault = filemap_fault, 1600 .fault = filemap_fault,
1601 .page_mkwrite = btrfs_page_mkwrite, 1601 .page_mkwrite = btrfs_page_mkwrite,
1602 .remap_pages = generic_file_remap_pages,
1602}; 1603};
1603 1604
1604static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) 1605static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
@@ -1610,7 +1611,6 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
1610 1611
1611 file_accessed(filp); 1612 file_accessed(filp);
1612 vma->vm_ops = &btrfs_file_vm_ops; 1613 vma->vm_ops = &btrfs_file_vm_ops;
1613 vma->vm_flags |= VM_CAN_NONLINEAR;
1614 1614
1615 return 0; 1615 return 0;
1616} 1616}
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 22b6e4583faa..6690269f5dde 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1224,6 +1224,7 @@ out:
1224static struct vm_operations_struct ceph_vmops = { 1224static struct vm_operations_struct ceph_vmops = {
1225 .fault = filemap_fault, 1225 .fault = filemap_fault,
1226 .page_mkwrite = ceph_page_mkwrite, 1226 .page_mkwrite = ceph_page_mkwrite,
1227 .remap_pages = generic_file_remap_pages,
1227}; 1228};
1228 1229
1229int ceph_mmap(struct file *file, struct vm_area_struct *vma) 1230int ceph_mmap(struct file *file, struct vm_area_struct *vma)
@@ -1234,6 +1235,5 @@ int ceph_mmap(struct file *file, struct vm_area_struct *vma)
1234 return -ENOEXEC; 1235 return -ENOEXEC;
1235 file_accessed(file); 1236 file_accessed(file);
1236 vma->vm_ops = &ceph_vmops; 1237 vma->vm_ops = &ceph_vmops;
1237 vma->vm_flags |= VM_CAN_NONLINEAR;
1238 return 0; 1238 return 0;
1239} 1239}
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 7d7bbdc4c8e7..edb25b4bbb95 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -3003,6 +3003,7 @@ cifs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
3003static struct vm_operations_struct cifs_file_vm_ops = { 3003static struct vm_operations_struct cifs_file_vm_ops = {
3004 .fault = filemap_fault, 3004 .fault = filemap_fault,
3005 .page_mkwrite = cifs_page_mkwrite, 3005 .page_mkwrite = cifs_page_mkwrite,
3006 .remap_pages = generic_file_remap_pages,
3006}; 3007};
3007 3008
3008int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma) 3009int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma)
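The filesystem hunks in this series all make the same swap: nonlinear-mapping support is now advertised by supplying a .remap_pages method in the vm_operations_struct instead of setting VM_CAN_NONLINEAR in ->mmap(). The resulting pattern, sketched for a hypothetical myfs:

static const struct vm_operations_struct myfs_file_vm_ops = {
	.fault		= filemap_fault,
	.page_mkwrite	= myfs_page_mkwrite,	/* hypothetical helper */
	.remap_pages	= generic_file_remap_pages,
};

static int myfs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	file_accessed(file);
	vma->vm_ops = &myfs_file_vm_ops;
	/* no vma->vm_flags |= VM_CAN_NONLINEAR any more */
	return 0;
}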
diff --git a/fs/exec.c b/fs/exec.c
index 19f4fb80cd17..4f2bebc276c5 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -603,7 +603,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
603 * process cleanup to remove whatever mess we made. 603 * process cleanup to remove whatever mess we made.
604 */ 604 */
605 if (length != move_page_tables(vma, old_start, 605 if (length != move_page_tables(vma, old_start,
606 vma, new_start, length)) 606 vma, new_start, length, false))
607 return -ENOMEM; 607 return -ENOMEM;
608 608
609 lru_add_drain(); 609 lru_add_drain();
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index ca6f07afe601..bf3966bccd34 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -207,6 +207,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
207static const struct vm_operations_struct ext4_file_vm_ops = { 207static const struct vm_operations_struct ext4_file_vm_ops = {
208 .fault = filemap_fault, 208 .fault = filemap_fault,
209 .page_mkwrite = ext4_page_mkwrite, 209 .page_mkwrite = ext4_page_mkwrite,
210 .remap_pages = generic_file_remap_pages,
210}; 211};
211 212
212static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) 213static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
@@ -217,7 +218,6 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
217 return -ENOEXEC; 218 return -ENOEXEC;
218 file_accessed(file); 219 file_accessed(file);
219 vma->vm_ops = &ext4_file_vm_ops; 220 vma->vm_ops = &ext4_file_vm_ops;
220 vma->vm_flags |= VM_CAN_NONLINEAR;
221 return 0; 221 return 0;
222} 222}
223 223
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 8e1d7b9e4a33..401b6c6248ae 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -439,8 +439,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
439 * setting I_SYNC flag and calling inode_sync_complete() to clear it. 439 * setting I_SYNC flag and calling inode_sync_complete() to clear it.
440 */ 440 */
441static int 441static int
442__writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, 442__writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
443 struct writeback_control *wbc)
444{ 443{
445 struct address_space *mapping = inode->i_mapping; 444 struct address_space *mapping = inode->i_mapping;
446 long nr_to_write = wbc->nr_to_write; 445 long nr_to_write = wbc->nr_to_write;
@@ -527,7 +526,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
527 inode->i_state |= I_SYNC; 526 inode->i_state |= I_SYNC;
528 spin_unlock(&inode->i_lock); 527 spin_unlock(&inode->i_lock);
529 528
530 ret = __writeback_single_inode(inode, wb, wbc); 529 ret = __writeback_single_inode(inode, wbc);
531 530
532 spin_lock(&wb->list_lock); 531 spin_lock(&wb->list_lock);
533 spin_lock(&inode->i_lock); 532 spin_lock(&inode->i_lock);
@@ -670,7 +669,7 @@ static long writeback_sb_inodes(struct super_block *sb,
670 * We use I_SYNC to pin the inode in memory. While it is set 669 * We use I_SYNC to pin the inode in memory. While it is set
671 * evict_inode() will wait so the inode cannot be freed. 670 * evict_inode() will wait so the inode cannot be freed.
672 */ 671 */
673 __writeback_single_inode(inode, wb, &wbc); 672 __writeback_single_inode(inode, &wbc);
674 673
675 work->nr_pages -= write_chunk - wbc.nr_to_write; 674 work->nr_pages -= write_chunk - wbc.nr_to_write;
676 wrote += write_chunk - wbc.nr_to_write; 675 wrote += write_chunk - wbc.nr_to_write;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index aba15f1b7ad2..78d2837bc940 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1379,6 +1379,7 @@ static const struct vm_operations_struct fuse_file_vm_ops = {
1379 .close = fuse_vma_close, 1379 .close = fuse_vma_close,
1380 .fault = filemap_fault, 1380 .fault = filemap_fault,
1381 .page_mkwrite = fuse_page_mkwrite, 1381 .page_mkwrite = fuse_page_mkwrite,
1382 .remap_pages = generic_file_remap_pages,
1382}; 1383};
1383 1384
1384static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) 1385static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 30e21997a1a1..0def0504afc1 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -492,6 +492,7 @@ out:
492static const struct vm_operations_struct gfs2_vm_ops = { 492static const struct vm_operations_struct gfs2_vm_ops = {
493 .fault = filemap_fault, 493 .fault = filemap_fault,
494 .page_mkwrite = gfs2_page_mkwrite, 494 .page_mkwrite = gfs2_page_mkwrite,
495 .remap_pages = generic_file_remap_pages,
495}; 496};
496 497
497/** 498/**
@@ -526,7 +527,6 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
526 return error; 527 return error;
527 } 528 }
528 vma->vm_ops = &gfs2_vm_ops; 529 vma->vm_ops = &gfs2_vm_ops;
529 vma->vm_flags |= VM_CAN_NONLINEAR;
530 530
531 return 0; 531 return 0;
532} 532}
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 9460120a5170..c5bc355d8243 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -110,7 +110,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
110 * way when do_mmap_pgoff unwinds (may be important on powerpc 110 * way when do_mmap_pgoff unwinds (may be important on powerpc
111 * and ia64). 111 * and ia64).
112 */ 112 */
113 vma->vm_flags |= VM_HUGETLB | VM_RESERVED; 113 vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND | VM_DONTDUMP;
114 vma->vm_ops = &hugetlb_vm_ops; 114 vma->vm_ops = &hugetlb_vm_ops;
115 115
116 if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) 116 if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
@@ -397,17 +397,16 @@ static void hugetlbfs_evict_inode(struct inode *inode)
397} 397}
398 398
399static inline void 399static inline void
400hugetlb_vmtruncate_list(struct prio_tree_root *root, pgoff_t pgoff) 400hugetlb_vmtruncate_list(struct rb_root *root, pgoff_t pgoff)
401{ 401{
402 struct vm_area_struct *vma; 402 struct vm_area_struct *vma;
403 struct prio_tree_iter iter;
404 403
405 vma_prio_tree_foreach(vma, &iter, root, pgoff, ULONG_MAX) { 404 vma_interval_tree_foreach(vma, root, pgoff, ULONG_MAX) {
406 unsigned long v_offset; 405 unsigned long v_offset;
407 406
408 /* 407 /*
409 * Can the expression below overflow on 32-bit arches? 408 * Can the expression below overflow on 32-bit arches?
410 * No, because the prio_tree returns us only those vmas 409 * No, because the interval tree returns us only those vmas
411 * which overlap the truncated area starting at pgoff, 410 * which overlap the truncated area starting at pgoff,
412 * and no vma on a 32-bit arch can span beyond the 4GB. 411 * and no vma on a 32-bit arch can span beyond the 4GB.
413 */ 412 */
@@ -432,7 +431,7 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
432 431
433 i_size_write(inode, offset); 432 i_size_write(inode, offset);
434 mutex_lock(&mapping->i_mmap_mutex); 433 mutex_lock(&mapping->i_mmap_mutex);
435 if (!prio_tree_empty(&mapping->i_mmap)) 434 if (!RB_EMPTY_ROOT(&mapping->i_mmap))
436 hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); 435 hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
437 mutex_unlock(&mapping->i_mmap_mutex); 436 mutex_unlock(&mapping->i_mmap_mutex);
438 truncate_hugepages(inode, offset); 437 truncate_hugepages(inode, offset);
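With i_mmap converted from a prio_tree to an rb_root-based interval tree, callers drop the separate iterator object and query with a [first, last] page-offset range directly. A minimal sketch of walking every vma that maps a given file range (the helper name is illustrative):

static void for_each_mapping_vma(struct address_space *mapping,
				 pgoff_t first, pgoff_t last)
{
	struct vm_area_struct *vma;

	mutex_lock(&mapping->i_mmap_mutex);
	vma_interval_tree_foreach(vma, &mapping->i_mmap, first, last) {
		/* vma maps some part of file pages [first, last] */
	}
	mutex_unlock(&mapping->i_mmap_mutex);
}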
diff --git a/fs/inode.c b/fs/inode.c
index ac8d904b3f16..b03c71957246 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -348,7 +348,7 @@ void address_space_init_once(struct address_space *mapping)
348 mutex_init(&mapping->i_mmap_mutex); 348 mutex_init(&mapping->i_mmap_mutex);
349 INIT_LIST_HEAD(&mapping->private_list); 349 INIT_LIST_HEAD(&mapping->private_list);
350 spin_lock_init(&mapping->private_lock); 350 spin_lock_init(&mapping->private_lock);
351 INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap); 351 mapping->i_mmap = RB_ROOT;
352 INIT_LIST_HEAD(&mapping->i_mmap_nonlinear); 352 INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
353} 353}
354EXPORT_SYMBOL(address_space_init_once); 354EXPORT_SYMBOL(address_space_init_once);
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index 1ea349fff68b..ae81b01e6fd7 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -394,8 +394,11 @@ static int jffs2_add_tn_to_tree(struct jffs2_sb_info *c,
394} 394}
395 395
396/* Trivial function to remove the last node in the tree. Which by definition 396/* Trivial function to remove the last node in the tree. Which by definition
397 has no right-hand -- so can be removed just by making its only child (if 397 has no right-hand child — so can be removed just by making its left-hand
398 any) take its place under its parent. */ 398 child (if any) take its place under its parent. Since this is only done
399 when we're consuming the whole tree, there's no need to use rb_erase()
400 and let it worry about adjusting colours and balancing the tree. That
401 would just be a waste of time. */
399static void eat_last(struct rb_root *root, struct rb_node *node) 402static void eat_last(struct rb_root *root, struct rb_node *node)
400{ 403{
401 struct rb_node *parent = rb_parent(node); 404 struct rb_node *parent = rb_parent(node);
@@ -412,12 +415,12 @@ static void eat_last(struct rb_root *root, struct rb_node *node)
412 link = &parent->rb_right; 415 link = &parent->rb_right;
413 416
414 *link = node->rb_left; 417 *link = node->rb_left;
415 /* Colour doesn't matter now. Only the parent pointer. */
416 if (node->rb_left) 418 if (node->rb_left)
417 node->rb_left->rb_parent_color = node->rb_parent_color; 419 node->rb_left->__rb_parent_color = node->__rb_parent_color;
418} 420}
419 421
420/* We put this in reverse order, so we can just use eat_last */ 422/* We put the version tree in reverse order, so we can use the same eat_last()
423 function that we use to consume the tmpnode tree (tn_root). */
421static void ver_insert(struct rb_root *ver_root, struct jffs2_tmp_dnode_info *tn) 424static void ver_insert(struct rb_root *ver_root, struct jffs2_tmp_dnode_info *tn)
422{ 425{
423 struct rb_node **link = &ver_root->rb_node; 426 struct rb_node **link = &ver_root->rb_node;
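Because the rightmost node of an rb-tree never has a right-hand child, eat_last() pairs naturally with rb_last() to consume a whole tree without any rebalancing work. A sketch of that teardown loop (the rb field name and free helper are assumed from the jffs2 code, not shown in this hunk):

	struct rb_node *rb;

	while ((rb = rb_last(root)) != NULL) {
		struct jffs2_tmp_dnode_info *tn =
			rb_entry(rb, struct jffs2_tmp_dnode_info, rb);

		eat_last(root, rb);		/* unlink without rebalancing */
		jffs2_free_tmp_dnode_info(tn);	/* assumed free helper */
	}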
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 6a7fcab7ecb3..f692be97676d 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -578,6 +578,7 @@ out:
578static const struct vm_operations_struct nfs_file_vm_ops = { 578static const struct vm_operations_struct nfs_file_vm_ops = {
579 .fault = filemap_fault, 579 .fault = filemap_fault,
580 .page_mkwrite = nfs_vm_page_mkwrite, 580 .page_mkwrite = nfs_vm_page_mkwrite,
581 .remap_pages = generic_file_remap_pages,
581}; 582};
582 583
583static int nfs_need_sync_write(struct file *filp, struct inode *inode) 584static int nfs_need_sync_write(struct file *filp, struct inode *inode)
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 5b387a4c293e..16f35f7423c5 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -135,13 +135,13 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
135static const struct vm_operations_struct nilfs_file_vm_ops = { 135static const struct vm_operations_struct nilfs_file_vm_ops = {
136 .fault = filemap_fault, 136 .fault = filemap_fault,
137 .page_mkwrite = nilfs_page_mkwrite, 137 .page_mkwrite = nilfs_page_mkwrite,
138 .remap_pages = generic_file_remap_pages,
138}; 139};
139 140
140static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma) 141static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma)
141{ 142{
142 file_accessed(file); 143 file_accessed(file);
143 vma->vm_ops = &nilfs_file_vm_ops; 144 vma->vm_ops = &nilfs_file_vm_ops;
144 vma->vm_flags |= VM_CAN_NONLINEAR;
145 return 0; 145 return 0;
146} 146}
147 147
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index d150372fd81d..47a87dda54ce 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -173,6 +173,7 @@ out:
173static const struct vm_operations_struct ocfs2_file_vm_ops = { 173static const struct vm_operations_struct ocfs2_file_vm_ops = {
174 .fault = ocfs2_fault, 174 .fault = ocfs2_fault,
175 .page_mkwrite = ocfs2_page_mkwrite, 175 .page_mkwrite = ocfs2_page_mkwrite,
176 .remap_pages = generic_file_remap_pages,
176}; 177};
177 178
178int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) 179int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
@@ -188,7 +189,6 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
188 ocfs2_inode_unlock(file->f_dentry->d_inode, lock_level); 189 ocfs2_inode_unlock(file->f_dentry->d_inode, lock_level);
189out: 190out:
190 vma->vm_ops = &ocfs2_file_vm_ops; 191 vma->vm_ops = &ocfs2_file_vm_ops;
191 vma->vm_flags |= VM_CAN_NONLINEAR;
192 return 0; 192 return 0;
193} 193}
194 194
diff --git a/fs/proc/base.c b/fs/proc/base.c
index d295af993677..ef5c84be66f9 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -873,111 +873,6 @@ static const struct file_operations proc_environ_operations = {
873 .release = mem_release, 873 .release = mem_release,
874}; 874};
875 875
876static ssize_t oom_adjust_read(struct file *file, char __user *buf,
877 size_t count, loff_t *ppos)
878{
879 struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
880 char buffer[PROC_NUMBUF];
881 size_t len;
882 int oom_adjust = OOM_DISABLE;
883 unsigned long flags;
884
885 if (!task)
886 return -ESRCH;
887
888 if (lock_task_sighand(task, &flags)) {
889 oom_adjust = task->signal->oom_adj;
890 unlock_task_sighand(task, &flags);
891 }
892
893 put_task_struct(task);
894
895 len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust);
896
897 return simple_read_from_buffer(buf, count, ppos, buffer, len);
898}
899
900static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
901 size_t count, loff_t *ppos)
902{
903 struct task_struct *task;
904 char buffer[PROC_NUMBUF];
905 int oom_adjust;
906 unsigned long flags;
907 int err;
908
909 memset(buffer, 0, sizeof(buffer));
910 if (count > sizeof(buffer) - 1)
911 count = sizeof(buffer) - 1;
912 if (copy_from_user(buffer, buf, count)) {
913 err = -EFAULT;
914 goto out;
915 }
916
917 err = kstrtoint(strstrip(buffer), 0, &oom_adjust);
918 if (err)
919 goto out;
920 if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) &&
921 oom_adjust != OOM_DISABLE) {
922 err = -EINVAL;
923 goto out;
924 }
925
926 task = get_proc_task(file->f_path.dentry->d_inode);
927 if (!task) {
928 err = -ESRCH;
929 goto out;
930 }
931
932 task_lock(task);
933 if (!task->mm) {
934 err = -EINVAL;
935 goto err_task_lock;
936 }
937
938 if (!lock_task_sighand(task, &flags)) {
939 err = -ESRCH;
940 goto err_task_lock;
941 }
942
943 if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) {
944 err = -EACCES;
945 goto err_sighand;
946 }
947
948 /*
949 * Warn that /proc/pid/oom_adj is deprecated, see
950 * Documentation/feature-removal-schedule.txt.
951 */
952 printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
953 current->comm, task_pid_nr(current), task_pid_nr(task),
954 task_pid_nr(task));
955 task->signal->oom_adj = oom_adjust;
956 /*
957 * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum
958 * value is always attainable.
959 */
960 if (task->signal->oom_adj == OOM_ADJUST_MAX)
961 task->signal->oom_score_adj = OOM_SCORE_ADJ_MAX;
962 else
963 task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) /
964 -OOM_DISABLE;
965 trace_oom_score_adj_update(task);
966err_sighand:
967 unlock_task_sighand(task, &flags);
968err_task_lock:
969 task_unlock(task);
970 put_task_struct(task);
971out:
972 return err < 0 ? err : count;
973}
974
975static const struct file_operations proc_oom_adjust_operations = {
976 .read = oom_adjust_read,
977 .write = oom_adjust_write,
978 .llseek = generic_file_llseek,
979};
980
981static ssize_t oom_score_adj_read(struct file *file, char __user *buf, 876static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
982 size_t count, loff_t *ppos) 877 size_t count, loff_t *ppos)
983{ 878{
@@ -1051,15 +946,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
1051 if (has_capability_noaudit(current, CAP_SYS_RESOURCE)) 946 if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
1052 task->signal->oom_score_adj_min = oom_score_adj; 947 task->signal->oom_score_adj_min = oom_score_adj;
1053 trace_oom_score_adj_update(task); 948 trace_oom_score_adj_update(task);
1054 /* 949
1055 * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is
1056 * always attainable.
1057 */
1058 if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
1059 task->signal->oom_adj = OOM_DISABLE;
1060 else
1061 task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) /
1062 OOM_SCORE_ADJ_MAX;
1063err_sighand: 950err_sighand:
1064 unlock_task_sighand(task, &flags); 951 unlock_task_sighand(task, &flags);
1065err_task_lock: 952err_task_lock:
@@ -2710,7 +2597,6 @@ static const struct pid_entry tgid_base_stuff[] = {
2710 REG("cgroup", S_IRUGO, proc_cgroup_operations), 2597 REG("cgroup", S_IRUGO, proc_cgroup_operations),
2711#endif 2598#endif
2712 INF("oom_score", S_IRUGO, proc_oom_score), 2599 INF("oom_score", S_IRUGO, proc_oom_score),
2713 REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations),
2714 REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), 2600 REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
2715#ifdef CONFIG_AUDITSYSCALL 2601#ifdef CONFIG_AUDITSYSCALL
2716 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), 2602 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
@@ -3077,7 +2963,6 @@ static const struct pid_entry tid_base_stuff[] = {
3077 REG("cgroup", S_IRUGO, proc_cgroup_operations), 2963 REG("cgroup", S_IRUGO, proc_cgroup_operations),
3078#endif 2964#endif
3079 INF("oom_score", S_IRUGO, proc_oom_score), 2965 INF("oom_score", S_IRUGO, proc_oom_score),
3080 REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations),
3081 REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), 2966 REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
3082#ifdef CONFIG_AUDITSYSCALL 2967#ifdef CONFIG_AUDITSYSCALL
3083 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), 2968 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 7fcd0d60a968..b8730d9ebaee 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -115,7 +115,13 @@ u64 stable_page_flags(struct page *page)
115 u |= 1 << KPF_COMPOUND_TAIL; 115 u |= 1 << KPF_COMPOUND_TAIL;
116 if (PageHuge(page)) 116 if (PageHuge(page))
117 u |= 1 << KPF_HUGE; 117 u |= 1 << KPF_HUGE;
118 else if (PageTransCompound(page)) 118 /*
119 * PageTransCompound can be true for non-huge compound pages (slab
120 * pages or pages allocated by drivers with __GFP_COMP) because it
121 * just checks PG_head/PG_tail, so we need to check PageLRU to make
122 * sure a given page is a thp, not a non-huge compound page.
123 */
124 else if (PageTransCompound(page) && PageLRU(compound_trans_head(page)))
119 u |= 1 << KPF_THP; 125 u |= 1 << KPF_THP;
120 126
121 /* 127 /*
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index dcd56f84db7e..a781bdf06694 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -142,6 +142,7 @@ static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry)
142 } 142 }
143 143
144 rb_link_node(node, parent, p); 144 rb_link_node(node, parent, p);
145 rb_insert_color(node, &head->parent->root);
145 return 0; 146 return 0;
146} 147}
147 148
@@ -168,10 +169,8 @@ static void init_header(struct ctl_table_header *head,
168 head->node = node; 169 head->node = node;
169 if (node) { 170 if (node) {
170 struct ctl_table *entry; 171 struct ctl_table *entry;
171 for (entry = table; entry->procname; entry++, node++) { 172 for (entry = table; entry->procname; entry++, node++)
172 rb_init_node(&node->node);
173 node->header = head; 173 node->header = head;
174 }
175 } 174 }
176} 175}
177 176
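The one-line addition above restores the second half of the standard rbtree insertion idiom: rb_link_node() splices the node in as a leaf, and rb_insert_color() then recolours and rebalances; without it the tree degrades into an unbalanced binary tree. In isolation the idiom looks like this (struct knode and its key are generic, illustrative names):

struct knode {
	struct rb_node	node;
	unsigned long	key;
};

static void knode_insert(struct rb_root *root, struct knode *new)
{
	struct rb_node **p = &root->rb_node, *parent = NULL;

	while (*p) {
		struct knode *cur = rb_entry(*p, struct knode, node);

		parent = *p;
		p = new->key < cur->key ? &(*p)->rb_left : &(*p)->rb_right;
	}
	rb_link_node(&new->node, parent, p);	/* attach as a leaf        */
	rb_insert_color(&new->node, root);	/* then rebalance/recolour */
}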
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 4540b8f76f16..79827ce03e3b 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -54,7 +54,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
54 "VmPTE:\t%8lu kB\n" 54 "VmPTE:\t%8lu kB\n"
55 "VmSwap:\t%8lu kB\n", 55 "VmSwap:\t%8lu kB\n",
56 hiwater_vm << (PAGE_SHIFT-10), 56 hiwater_vm << (PAGE_SHIFT-10),
57 (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10), 57 total_vm << (PAGE_SHIFT-10),
58 mm->locked_vm << (PAGE_SHIFT-10), 58 mm->locked_vm << (PAGE_SHIFT-10),
59 mm->pinned_vm << (PAGE_SHIFT-10), 59 mm->pinned_vm << (PAGE_SHIFT-10),
60 hiwater_rss << (PAGE_SHIFT-10), 60 hiwater_rss << (PAGE_SHIFT-10),
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index ff48c5a85309..5bc77817f382 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1536,6 +1536,7 @@ out_unlock:
1536static const struct vm_operations_struct ubifs_file_vm_ops = { 1536static const struct vm_operations_struct ubifs_file_vm_ops = {
1537 .fault = filemap_fault, 1537 .fault = filemap_fault,
1538 .page_mkwrite = ubifs_vm_page_mkwrite, 1538 .page_mkwrite = ubifs_vm_page_mkwrite,
1539 .remap_pages = generic_file_remap_pages,
1539}; 1540};
1540 1541
1541static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma) 1542static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 1eaeb8be3aae..aa473fa640a2 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -940,7 +940,6 @@ xfs_file_mmap(
940 struct vm_area_struct *vma) 940 struct vm_area_struct *vma)
941{ 941{
942 vma->vm_ops = &xfs_file_vm_ops; 942 vma->vm_ops = &xfs_file_vm_ops;
943 vma->vm_flags |= VM_CAN_NONLINEAR;
944 943
945 file_accessed(filp); 944 file_accessed(filp);
946 return 0; 945 return 0;
@@ -1443,4 +1442,5 @@ const struct file_operations xfs_dir_file_operations = {
1443static const struct vm_operations_struct xfs_file_vm_ops = { 1442static const struct vm_operations_struct xfs_file_vm_ops = {
1444 .fault = filemap_fault, 1443 .fault = filemap_fault,
1445 .page_mkwrite = xfs_vm_page_mkwrite, 1444 .page_mkwrite = xfs_vm_page_mkwrite,
1445 .remap_pages = generic_file_remap_pages,
1446}; 1446};
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index ff4947b7a976..b36ce40bd1c6 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -87,7 +87,7 @@ static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
87 pmd_t *pmdp) 87 pmd_t *pmdp)
88{ 88{
89 pmd_t pmd = *pmdp; 89 pmd_t pmd = *pmdp;
90 pmd_clear(mm, address, pmdp); 90 pmd_clear(pmdp);
91 return pmd; 91 return pmd;
92} 92}
93#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 93#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -162,6 +162,19 @@ extern void pmdp_splitting_flush(struct vm_area_struct *vma,
162 unsigned long address, pmd_t *pmdp); 162 unsigned long address, pmd_t *pmdp);
163#endif 163#endif
164 164
165#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
166extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable);
167#endif
168
169#ifndef __HAVE_ARCH_PGTABLE_WITHDRAW
170extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm);
171#endif
172
173#ifndef __HAVE_ARCH_PMDP_INVALIDATE
174extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
175 pmd_t *pmdp);
176#endif
177
165#ifndef __HAVE_ARCH_PTE_SAME 178#ifndef __HAVE_ARCH_PTE_SAME
166static inline int pte_same(pte_t pte_a, pte_t pte_b) 179static inline int pte_same(pte_t pte_a, pte_t pte_b)
167{ 180{
@@ -381,48 +394,59 @@ static inline void ptep_modify_prot_commit(struct mm_struct *mm,
381 394
382#ifndef __HAVE_PFNMAP_TRACKING 395#ifndef __HAVE_PFNMAP_TRACKING
383/* 396/*
384 * Interface that can be used by architecture code to keep track of 397 * Interfaces that can be used by architecture code to keep track of
385 * memory type of pfn mappings (remap_pfn_range, vm_insert_pfn) 398 * memory type of pfn mappings specified by the remap_pfn_range,
386 * 399 * vm_insert_pfn.
387 * track_pfn_vma_new is called when a _new_ pfn mapping is being established 400 */
388 * for physical range indicated by pfn and size. 401
402/*
403 * track_pfn_remap is called when a _new_ pfn mapping is being established
404 * by remap_pfn_range() for physical range indicated by pfn and size.
389 */ 405 */
390static inline int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot, 406static inline int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
391 unsigned long pfn, unsigned long size) 407 unsigned long pfn, unsigned long addr,
408 unsigned long size)
392{ 409{
393 return 0; 410 return 0;
394} 411}
395 412
396/* 413/*
397 * Interface that can be used by architecture code to keep track of 414 * track_pfn_insert is called when a _new_ single pfn is established
398 * memory type of pfn mappings (remap_pfn_range, vm_insert_pfn) 415 * by vm_insert_pfn().
399 * 416 */
400 * track_pfn_vma_copy is called when vma that is covering the pfnmap gets 417static inline int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
418 unsigned long pfn)
419{
420 return 0;
421}
422
423/*
424 * track_pfn_copy is called when vma that is covering the pfnmap gets
401 * copied through copy_page_range(). 425 * copied through copy_page_range().
402 */ 426 */
403static inline int track_pfn_vma_copy(struct vm_area_struct *vma) 427static inline int track_pfn_copy(struct vm_area_struct *vma)
404{ 428{
405 return 0; 429 return 0;
406} 430}
407 431
408/* 432/*
409 * Interface that can be used by architecture code to keep track of
410 * memory type of pfn mappings (remap_pfn_range, vm_insert_pfn)
411 *
412 * untrack_pfn_vma is called while unmapping a pfnmap for a region. 433 * untrack_pfn_vma is called while unmapping a pfnmap for a region.
413 * untrack can be called for a specific region indicated by pfn and size or 434 * untrack can be called for a specific region indicated by pfn and size or
414 * can be for the entire vma (in which case size can be zero). 435 * can be for the entire vma (in which case pfn, size are zero).
415 */ 436 */
416static inline void untrack_pfn_vma(struct vm_area_struct *vma, 437static inline void untrack_pfn(struct vm_area_struct *vma,
417 unsigned long pfn, unsigned long size) 438 unsigned long pfn, unsigned long size)
418{ 439{
419} 440}
420#else 441#else
421extern int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot, 442extern int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
422 unsigned long pfn, unsigned long size); 443 unsigned long pfn, unsigned long addr,
423extern int track_pfn_vma_copy(struct vm_area_struct *vma); 444 unsigned long size);
424extern void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, 445extern int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
425 unsigned long size); 446 unsigned long pfn);
447extern int track_pfn_copy(struct vm_area_struct *vma);
448extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
449 unsigned long size);
426#endif 450#endif
427 451
428#ifdef CONFIG_MMU 452#ifdef CONFIG_MMU
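The tracking interface is now split by call site: track_pfn_remap() covers a whole remap_pfn_range() range, track_pfn_insert() a single vm_insert_pfn() page, and track_pfn_copy()/untrack_pfn() the fork and unmap paths. An architecture defining __HAVE_PFNMAP_TRACKING supplies bodies with the signatures declared above; the stubs below only illustrate the shape and are not the x86/PAT implementation:

int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
		    unsigned long pfn, unsigned long addr, unsigned long size)
{
	/* reserve / attribute-check [pfn, pfn + (size >> PAGE_SHIFT)) here */
	return 0;
}

int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
		     unsigned long pfn)
{
	/* single-page variant used by vm_insert_pfn() */
	return 0;
}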
diff --git a/include/linux/atomic.h b/include/linux/atomic.h
index 70cfcb2d63c4..5b08a8540ecf 100644
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -86,6 +86,31 @@ static inline int atomic_dec_unless_positive(atomic_t *p)
86} 86}
87#endif 87#endif
88 88
89/*
90 * atomic_dec_if_positive - decrement by 1 if old value positive
91 * @v: pointer of type atomic_t
92 *
93 * The function returns the old value of *v minus 1, even if
94 * the atomic variable, v, was not decremented.
95 */
96#ifndef atomic_dec_if_positive
97static inline int atomic_dec_if_positive(atomic_t *v)
98{
99 int c, old, dec;
100 c = atomic_read(v);
101 for (;;) {
102 dec = c - 1;
103 if (unlikely(dec < 0))
104 break;
105 old = atomic_cmpxchg((v), c, dec);
106 if (likely(old == c))
107 break;
108 c = old;
109 }
110 return dec;
111}
112#endif
113
89#ifndef CONFIG_ARCH_HAS_ATOMIC_OR 114#ifndef CONFIG_ARCH_HAS_ATOMIC_OR
90static inline void atomic_or(int i, atomic_t *v) 115static inline void atomic_or(int i, atomic_t *v)
91{ 116{
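The new generic atomic_dec_if_positive() only lowers the counter while it is still positive, and its return value (old value minus one, even when nothing was decremented) tells the caller which case occurred. Typical use, with take_token() as an illustrative wrapper:

static bool take_token(atomic_t *available)
{
	/* decremented only if *available was >= 1 before the call;
	 * a negative return means the counter was already 0 */
	return atomic_dec_if_positive(available) >= 0;
}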
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index ef658147e4e8..6ecb6dc2f303 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -22,8 +22,9 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write,
22extern int fragmentation_index(struct zone *zone, unsigned int order); 22extern int fragmentation_index(struct zone *zone, unsigned int order);
23extern unsigned long try_to_compact_pages(struct zonelist *zonelist, 23extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
24 int order, gfp_t gfp_mask, nodemask_t *mask, 24 int order, gfp_t gfp_mask, nodemask_t *mask,
25 bool sync, bool *contended); 25 bool sync, bool *contended, struct page **page);
26extern int compact_pgdat(pg_data_t *pgdat, int order); 26extern int compact_pgdat(pg_data_t *pgdat, int order);
27extern void reset_isolation_suitable(pg_data_t *pgdat);
27extern unsigned long compaction_suitable(struct zone *zone, int order); 28extern unsigned long compaction_suitable(struct zone *zone, int order);
28 29
29/* Do not skip compaction more than 64 times */ 30/* Do not skip compaction more than 64 times */
@@ -61,10 +62,20 @@ static inline bool compaction_deferred(struct zone *zone, int order)
61 return zone->compact_considered < defer_limit; 62 return zone->compact_considered < defer_limit;
62} 63}
63 64
65/* Returns true if restarting compaction after many failures */
66static inline bool compaction_restarting(struct zone *zone, int order)
67{
68 if (order < zone->compact_order_failed)
69 return false;
70
71 return zone->compact_defer_shift == COMPACT_MAX_DEFER_SHIFT &&
72 zone->compact_considered >= 1UL << zone->compact_defer_shift;
73}
74
64#else 75#else
65static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, 76static inline unsigned long try_to_compact_pages(struct zonelist *zonelist,
66 int order, gfp_t gfp_mask, nodemask_t *nodemask, 77 int order, gfp_t gfp_mask, nodemask_t *nodemask,
67 bool sync, bool *contended) 78 bool sync, bool *contended, struct page **page)
68{ 79{
69 return COMPACT_CONTINUE; 80 return COMPACT_CONTINUE;
70} 81}
@@ -74,6 +85,10 @@ static inline int compact_pgdat(pg_data_t *pgdat, int order)
74 return COMPACT_CONTINUE; 85 return COMPACT_CONTINUE;
75} 86}
76 87
88static inline void reset_isolation_suitable(pg_data_t *pgdat)
89{
90}
91
77static inline unsigned long compaction_suitable(struct zone *zone, int order) 92static inline unsigned long compaction_suitable(struct zone *zone, int order)
78{ 93{
79 return COMPACT_SKIPPED; 94 return COMPACT_SKIPPED;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ca6d8c806f47..c617ed024df8 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -401,7 +401,7 @@ struct inodes_stat_t {
401#include <linux/cache.h> 401#include <linux/cache.h>
402#include <linux/list.h> 402#include <linux/list.h>
403#include <linux/radix-tree.h> 403#include <linux/radix-tree.h>
404#include <linux/prio_tree.h> 404#include <linux/rbtree.h>
405#include <linux/init.h> 405#include <linux/init.h>
406#include <linux/pid.h> 406#include <linux/pid.h>
407#include <linux/bug.h> 407#include <linux/bug.h>
@@ -669,7 +669,7 @@ struct address_space {
669 struct radix_tree_root page_tree; /* radix tree of all pages */ 669 struct radix_tree_root page_tree; /* radix tree of all pages */
670 spinlock_t tree_lock; /* and lock protecting it */ 670 spinlock_t tree_lock; /* and lock protecting it */
671 unsigned int i_mmap_writable;/* count VM_SHARED mappings */ 671 unsigned int i_mmap_writable;/* count VM_SHARED mappings */
672 struct prio_tree_root i_mmap; /* tree of private and shared mappings */ 672 struct rb_root i_mmap; /* tree of private and shared mappings */
673 struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */ 673 struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */
674 struct mutex i_mmap_mutex; /* protect tree, count, list */ 674 struct mutex i_mmap_mutex; /* protect tree, count, list */
675 /* Protected by tree_lock together with the radix tree */ 675 /* Protected by tree_lock together with the radix tree */
@@ -741,7 +741,7 @@ int mapping_tagged(struct address_space *mapping, int tag);
741 */ 741 */
742static inline int mapping_mapped(struct address_space *mapping) 742static inline int mapping_mapped(struct address_space *mapping)
743{ 743{
744 return !prio_tree_empty(&mapping->i_mmap) || 744 return !RB_EMPTY_ROOT(&mapping->i_mmap) ||
745 !list_empty(&mapping->i_mmap_nonlinear); 745 !list_empty(&mapping->i_mmap_nonlinear);
746} 746}
747 747
@@ -2552,6 +2552,8 @@ extern int sb_min_blocksize(struct super_block *, int);
2552 2552
2553extern int generic_file_mmap(struct file *, struct vm_area_struct *); 2553extern int generic_file_mmap(struct file *, struct vm_area_struct *);
2554extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); 2554extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
2555extern int generic_file_remap_pages(struct vm_area_struct *, unsigned long addr,
2556 unsigned long size, pgoff_t pgoff);
2555extern int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size); 2557extern int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size);
2556int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk); 2558int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk);
2557extern ssize_t generic_file_aio_read(struct kiocb *, const struct iovec *, unsigned long, loff_t); 2559extern ssize_t generic_file_aio_read(struct kiocb *, const struct iovec *, unsigned long, loff_t);
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 4883f393f50a..02c1c9710be0 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -30,12 +30,7 @@ struct vm_area_struct;
30#define ___GFP_HARDWALL 0x20000u 30#define ___GFP_HARDWALL 0x20000u
31#define ___GFP_THISNODE 0x40000u 31#define ___GFP_THISNODE 0x40000u
32#define ___GFP_RECLAIMABLE 0x80000u 32#define ___GFP_RECLAIMABLE 0x80000u
33#ifdef CONFIG_KMEMCHECK
34#define ___GFP_NOTRACK 0x200000u 33#define ___GFP_NOTRACK 0x200000u
35#else
36#define ___GFP_NOTRACK 0
37#endif
38#define ___GFP_NO_KSWAPD 0x400000u
39#define ___GFP_OTHER_NODE 0x800000u 34#define ___GFP_OTHER_NODE 0x800000u
40#define ___GFP_WRITE 0x1000000u 35#define ___GFP_WRITE 0x1000000u
41 36
@@ -90,7 +85,6 @@ struct vm_area_struct;
90#define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) /* Page is reclaimable */ 85#define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) /* Page is reclaimable */
91#define __GFP_NOTRACK ((__force gfp_t)___GFP_NOTRACK) /* Don't track with kmemcheck */ 86#define __GFP_NOTRACK ((__force gfp_t)___GFP_NOTRACK) /* Don't track with kmemcheck */
92 87
93#define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD)
94#define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */ 88#define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */
95#define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) /* Allocator intends to dirty page */ 89#define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) /* Allocator intends to dirty page */
96 90
@@ -120,8 +114,7 @@ struct vm_area_struct;
120 __GFP_MOVABLE) 114 __GFP_MOVABLE)
121#define GFP_IOFS (__GFP_IO | __GFP_FS) 115#define GFP_IOFS (__GFP_IO | __GFP_FS)
122#define GFP_TRANSHUGE (GFP_HIGHUSER_MOVABLE | __GFP_COMP | \ 116#define GFP_TRANSHUGE (GFP_HIGHUSER_MOVABLE | __GFP_COMP | \
123 __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \ 117 __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN)
124 __GFP_NO_KSWAPD)
125 118
126#ifdef CONFIG_NUMA 119#ifdef CONFIG_NUMA
127#define GFP_THISNODE (__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY) 120#define GFP_THISNODE (__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY)
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 4c59b1131187..b31cb7da0346 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -11,8 +11,7 @@ extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
11extern int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, 11extern int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
12 unsigned long address, pmd_t *pmd, 12 unsigned long address, pmd_t *pmd,
13 pmd_t orig_pmd); 13 pmd_t orig_pmd);
14extern pgtable_t get_pmd_huge_pte(struct mm_struct *mm); 14extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
15extern struct page *follow_trans_huge_pmd(struct mm_struct *mm,
16 unsigned long addr, 15 unsigned long addr,
17 pmd_t *pmd, 16 pmd_t *pmd,
18 unsigned int flags); 17 unsigned int flags);
diff --git a/include/linux/interval_tree.h b/include/linux/interval_tree.h
new file mode 100644
index 000000000000..724556aa3c95
--- /dev/null
+++ b/include/linux/interval_tree.h
@@ -0,0 +1,27 @@
1#ifndef _LINUX_INTERVAL_TREE_H
2#define _LINUX_INTERVAL_TREE_H
3
4#include <linux/rbtree.h>
5
6struct interval_tree_node {
7 struct rb_node rb;
8 unsigned long start; /* Start of interval */
9 unsigned long last; /* Last location _in_ interval */
10 unsigned long __subtree_last;
11};
12
13extern void
14interval_tree_insert(struct interval_tree_node *node, struct rb_root *root);
15
16extern void
17interval_tree_remove(struct interval_tree_node *node, struct rb_root *root);
18
19extern struct interval_tree_node *
20interval_tree_iter_first(struct rb_root *root,
21 unsigned long start, unsigned long last);
22
23extern struct interval_tree_node *
24interval_tree_iter_next(struct interval_tree_node *node,
25 unsigned long start, unsigned long last);
26
27#endif /* _LINUX_INTERVAL_TREE_H */
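The non-generic API stores closed [start, last] ranges keyed on unsigned long and answers overlap queries through the iter_first/iter_next pair. A small sketch with stack nodes and illustrative values:

static void interval_tree_example(void)
{
	struct rb_root root = RB_ROOT;
	struct interval_tree_node a = { .start = 10, .last = 19 };
	struct interval_tree_node b = { .start = 15, .last = 30 };
	struct interval_tree_node *it;

	interval_tree_insert(&a, &root);
	interval_tree_insert(&b, &root);

	/* visits both a and b, since each overlaps [18, 25] */
	for (it = interval_tree_iter_first(&root, 18, 25); it;
	     it = interval_tree_iter_next(it, 18, 25))
		;

	interval_tree_remove(&b, &root);
	interval_tree_remove(&a, &root);
}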
diff --git a/include/linux/interval_tree_generic.h b/include/linux/interval_tree_generic.h
new file mode 100644
index 000000000000..58370e1862ad
--- /dev/null
+++ b/include/linux/interval_tree_generic.h
@@ -0,0 +1,191 @@
1/*
2 Interval Trees
3 (C) 2012 Michel Lespinasse <walken@google.com>
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18
19 include/linux/interval_tree_generic.h
20*/
21
22#include <linux/rbtree_augmented.h>
23
24/*
25 * Template for implementing interval trees
26 *
27 * ITSTRUCT: struct type of the interval tree nodes
28 * ITRB: name of struct rb_node field within ITSTRUCT
29 * ITTYPE: type of the interval endpoints
30 * ITSUBTREE: name of ITTYPE field within ITSTRUCT holding last-in-subtree
31 * ITSTART(n): start endpoint of ITSTRUCT node n
32 * ITLAST(n): last endpoint of ITSTRUCT node n
33 * ITSTATIC: 'static' or empty
34 * ITPREFIX: prefix to use for the inline tree definitions
35 *
36 * Note - before using this, please consider if non-generic version
37 * (interval_tree.h) would work for you...
38 */
39
40#define INTERVAL_TREE_DEFINE(ITSTRUCT, ITRB, ITTYPE, ITSUBTREE, \
41 ITSTART, ITLAST, ITSTATIC, ITPREFIX) \
42 \
43/* Callbacks for augmented rbtree insert and remove */ \
44 \
45static inline ITTYPE ITPREFIX ## _compute_subtree_last(ITSTRUCT *node) \
46{ \
47 ITTYPE max = ITLAST(node), subtree_last; \
48 if (node->ITRB.rb_left) { \
49 subtree_last = rb_entry(node->ITRB.rb_left, \
50 ITSTRUCT, ITRB)->ITSUBTREE; \
51 if (max < subtree_last) \
52 max = subtree_last; \
53 } \
54 if (node->ITRB.rb_right) { \
55 subtree_last = rb_entry(node->ITRB.rb_right, \
56 ITSTRUCT, ITRB)->ITSUBTREE; \
57 if (max < subtree_last) \
58 max = subtree_last; \
59 } \
60 return max; \
61} \
62 \
63RB_DECLARE_CALLBACKS(static, ITPREFIX ## _augment, ITSTRUCT, ITRB, \
64 ITTYPE, ITSUBTREE, ITPREFIX ## _compute_subtree_last) \
65 \
66/* Insert / remove interval nodes from the tree */ \
67 \
68ITSTATIC void ITPREFIX ## _insert(ITSTRUCT *node, struct rb_root *root) \
69{ \
70 struct rb_node **link = &root->rb_node, *rb_parent = NULL; \
71 ITTYPE start = ITSTART(node), last = ITLAST(node); \
72 ITSTRUCT *parent; \
73 \
74 while (*link) { \
75 rb_parent = *link; \
76 parent = rb_entry(rb_parent, ITSTRUCT, ITRB); \
77 if (parent->ITSUBTREE < last) \
78 parent->ITSUBTREE = last; \
79 if (start < ITSTART(parent)) \
80 link = &parent->ITRB.rb_left; \
81 else \
82 link = &parent->ITRB.rb_right; \
83 } \
84 \
85 node->ITSUBTREE = last; \
86 rb_link_node(&node->ITRB, rb_parent, link); \
87 rb_insert_augmented(&node->ITRB, root, &ITPREFIX ## _augment); \
88} \
89 \
90ITSTATIC void ITPREFIX ## _remove(ITSTRUCT *node, struct rb_root *root) \
91{ \
92 rb_erase_augmented(&node->ITRB, root, &ITPREFIX ## _augment); \
93} \
94 \
95/* \
96 * Iterate over intervals intersecting [start;last] \
97 * \
98 * Note that a node's interval intersects [start;last] iff: \
99 * Cond1: ITSTART(node) <= last \
100 * and \
101 * Cond2: start <= ITLAST(node) \
102 */ \
103 \
104static ITSTRUCT * \
105ITPREFIX ## _subtree_search(ITSTRUCT *node, ITTYPE start, ITTYPE last) \
106{ \
107 while (true) { \
108 /* \
109 * Loop invariant: start <= node->ITSUBTREE \
110 * (Cond2 is satisfied by one of the subtree nodes) \
111 */ \
112 if (node->ITRB.rb_left) { \
113 ITSTRUCT *left = rb_entry(node->ITRB.rb_left, \
114 ITSTRUCT, ITRB); \
115 if (start <= left->ITSUBTREE) { \
116 /* \
117 * Some nodes in left subtree satisfy Cond2. \
118 * Iterate to find the leftmost such node N. \
119 * If it also satisfies Cond1, that's the \
120 * match we are looking for. Otherwise, there \
121 * is no matching interval as nodes to the \
122 * right of N can't satisfy Cond1 either. \
123 */ \
124 node = left; \
125 continue; \
126 } \
127 } \
128 if (ITSTART(node) <= last) { /* Cond1 */ \
129 if (start <= ITLAST(node)) /* Cond2 */ \
130 return node; /* node is leftmost match */ \
131 if (node->ITRB.rb_right) { \
132 node = rb_entry(node->ITRB.rb_right, \
133 ITSTRUCT, ITRB); \
134 if (start <= node->ITSUBTREE) \
135 continue; \
136 } \
137 } \
138 return NULL; /* No match */ \
139 } \
140} \
141 \
142ITSTATIC ITSTRUCT * \
143ITPREFIX ## _iter_first(struct rb_root *root, ITTYPE start, ITTYPE last) \
144{ \
145 ITSTRUCT *node; \
146 \
147 if (!root->rb_node) \
148 return NULL; \
149 node = rb_entry(root->rb_node, ITSTRUCT, ITRB); \
150 if (node->ITSUBTREE < start) \
151 return NULL; \
152 return ITPREFIX ## _subtree_search(node, start, last); \
153} \
154 \
155ITSTATIC ITSTRUCT * \
156ITPREFIX ## _iter_next(ITSTRUCT *node, ITTYPE start, ITTYPE last) \
157{ \
158 struct rb_node *rb = node->ITRB.rb_right, *prev; \
159 \
160 while (true) { \
161 /* \
162 * Loop invariants: \
163 * Cond1: ITSTART(node) <= last \
164 * rb == node->ITRB.rb_right \
165 * \
166 * First, search right subtree if suitable \
167 */ \
168 if (rb) { \
169 ITSTRUCT *right = rb_entry(rb, ITSTRUCT, ITRB); \
170 if (start <= right->ITSUBTREE) \
171 return ITPREFIX ## _subtree_search(right, \
172 start, last); \
173 } \
174 \
175 /* Move up the tree until we come from a node's left child */ \
176 do { \
177 rb = rb_parent(&node->ITRB); \
178 if (!rb) \
179 return NULL; \
180 prev = &node->ITRB; \
181 node = rb_entry(rb, ITSTRUCT, ITRB); \
182 rb = node->ITRB.rb_right; \
183 } while (prev == rb); \
184 \
185 /* Check if the node intersects [start;last] */ \
186 if (last < ITSTART(node)) /* !Cond1 */ \
187 return NULL; \
188 else if (start <= ITLAST(node)) /* Cond2 */ \
189 return node; \
190 } \
191}
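Instantiating the template generates a full prefix_insert/_remove/_iter_first/_iter_next set for any structure that embeds an rb_node, the two endpoints, and a last-in-subtree field. The plain interval_tree_node API above could be produced with an instantiation along these lines (a sketch of what a lib/interval_tree.c would do; that file is not part of this hunk):

#include <linux/interval_tree.h>
#include <linux/interval_tree_generic.h>

#define START(node)	((node)->start)
#define LAST(node)	((node)->last)

INTERVAL_TREE_DEFINE(struct interval_tree_node,	/* ITSTRUCT  */
		     rb,			/* ITRB      */
		     unsigned long,		/* ITTYPE    */
		     __subtree_last,		/* ITSUBTREE */
		     START, LAST,		/* ITSTART, ITLAST */
		     ,				/* ITSTATIC: extern linkage */
		     interval_tree)		/* ITPREFIX  */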
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 19dc455b4f3d..569d67d4243e 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -70,8 +70,7 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
70 * @p_end: ptr to ulong for end pfn of the range, can be %NULL 70 * @p_end: ptr to ulong for end pfn of the range, can be %NULL
71 * @p_nid: ptr to int for nid of the range, can be %NULL 71 * @p_nid: ptr to int for nid of the range, can be %NULL
72 * 72 *
73 * Walks over configured memory ranges. Available after early_node_map is 73 * Walks over configured memory ranges.
74 * populated.
75 */ 74 */
76#define for_each_mem_pfn_range(i, nid, p_start, p_end, p_nid) \ 75#define for_each_mem_pfn_range(i, nid, p_start, p_end, p_nid) \
77 for (i = -1, __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid); \ 76 for (i = -1, __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid); \
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 8d9489fdab2e..fd0e6d53836e 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -84,14 +84,14 @@ extern struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg);
84extern struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont); 84extern struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont);
85 85
86static inline 86static inline
87int mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *cgroup) 87bool mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *memcg)
88{ 88{
89 struct mem_cgroup *memcg; 89 struct mem_cgroup *task_memcg;
90 int match; 90 bool match;
91 91
92 rcu_read_lock(); 92 rcu_read_lock();
93 memcg = mem_cgroup_from_task(rcu_dereference((mm)->owner)); 93 task_memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
94 match = __mem_cgroup_same_or_subtree(cgroup, memcg); 94 match = __mem_cgroup_same_or_subtree(memcg, task_memcg);
95 rcu_read_unlock(); 95 rcu_read_unlock();
96 return match; 96 return match;
97} 97}
@@ -258,10 +258,10 @@ static inline struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm
258 return NULL; 258 return NULL;
259} 259}
260 260
261static inline int mm_match_cgroup(struct mm_struct *mm, 261static inline bool mm_match_cgroup(struct mm_struct *mm,
262 struct mem_cgroup *memcg) 262 struct mem_cgroup *memcg)
263{ 263{
264 return 1; 264 return true;
265} 265}
266 266
267static inline int task_in_mem_cgroup(struct task_struct *task, 267static inline int task_in_mem_cgroup(struct task_struct *task,
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 910550f3b70e..95573ec4ee6c 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -10,6 +10,7 @@ struct page;
10struct zone; 10struct zone;
11struct pglist_data; 11struct pglist_data;
12struct mem_section; 12struct mem_section;
13struct memory_block;
13 14
14#ifdef CONFIG_MEMORY_HOTPLUG 15#ifdef CONFIG_MEMORY_HOTPLUG
15 16
@@ -233,6 +234,8 @@ static inline int is_mem_section_removable(unsigned long pfn,
233extern int mem_online_node(int nid); 234extern int mem_online_node(int nid);
234extern int add_memory(int nid, u64 start, u64 size); 235extern int add_memory(int nid, u64 start, u64 size);
235extern int arch_add_memory(int nid, u64 start, u64 size); 236extern int arch_add_memory(int nid, u64 start, u64 size);
237extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
238extern int offline_memory_block(struct memory_block *mem);
236extern int remove_memory(u64 start, u64 size); 239extern int remove_memory(u64 start, u64 size);
237extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, 240extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
238 int nr_pages); 241 int nr_pages);
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 95b738c7abff..cec569325608 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -188,7 +188,7 @@ struct sp_node {
188 188
189struct shared_policy { 189struct shared_policy {
190 struct rb_root root; 190 struct rb_root root;
191 spinlock_t lock; 191 struct mutex mutex;
192}; 192};
193 193
194void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol); 194void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol);
@@ -239,7 +239,7 @@ extern int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol,
239/* Check if a vma is migratable */ 239/* Check if a vma is migratable */
240static inline int vma_migratable(struct vm_area_struct *vma) 240static inline int vma_migratable(struct vm_area_struct *vma)
241{ 241{
242 if (vma->vm_flags & (VM_IO|VM_HUGETLB|VM_PFNMAP|VM_RESERVED)) 242 if (vma->vm_flags & (VM_IO | VM_HUGETLB | VM_PFNMAP))
243 return 0; 243 return 0;
244 /* 244 /*
245 * Migration allocates pages in the highest zone. If we cannot 245 * Migration allocates pages in the highest zone. If we cannot
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 311be906b57d..fa0680402738 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -10,7 +10,6 @@
10#include <linux/list.h> 10#include <linux/list.h>
11#include <linux/mmzone.h> 11#include <linux/mmzone.h>
12#include <linux/rbtree.h> 12#include <linux/rbtree.h>
13#include <linux/prio_tree.h>
14#include <linux/atomic.h> 13#include <linux/atomic.h>
15#include <linux/debug_locks.h> 14#include <linux/debug_locks.h>
16#include <linux/mm_types.h> 15#include <linux/mm_types.h>
@@ -21,6 +20,7 @@
21 20
22struct mempolicy; 21struct mempolicy;
23struct anon_vma; 22struct anon_vma;
23struct anon_vma_chain;
24struct file_ra_state; 24struct file_ra_state;
25struct user_struct; 25struct user_struct;
26struct writeback_control; 26struct writeback_control;
@@ -70,6 +70,8 @@ extern unsigned int kobjsize(const void *objp);
70/* 70/*
71 * vm_flags in vm_area_struct, see mm_types.h. 71 * vm_flags in vm_area_struct, see mm_types.h.
72 */ 72 */
73#define VM_NONE 0x00000000
74
73#define VM_READ 0x00000001 /* currently active flags */ 75#define VM_READ 0x00000001 /* currently active flags */
74#define VM_WRITE 0x00000002 76#define VM_WRITE 0x00000002
75#define VM_EXEC 0x00000004 77#define VM_EXEC 0x00000004
@@ -82,16 +84,9 @@ extern unsigned int kobjsize(const void *objp);
82#define VM_MAYSHARE 0x00000080 84#define VM_MAYSHARE 0x00000080
83 85
84#define VM_GROWSDOWN 0x00000100 /* general info on the segment */ 86#define VM_GROWSDOWN 0x00000100 /* general info on the segment */
85#if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64)
86#define VM_GROWSUP 0x00000200
87#else
88#define VM_GROWSUP 0x00000000
89#define VM_NOHUGEPAGE 0x00000200 /* MADV_NOHUGEPAGE marked this vma */
90#endif
91#define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ 87#define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */
92#define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */ 88#define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */
93 89
94#define VM_EXECUTABLE 0x00001000
95#define VM_LOCKED 0x00002000 90#define VM_LOCKED 0x00002000
96#define VM_IO 0x00004000 /* Memory mapped I/O or similar */ 91#define VM_IO 0x00004000 /* Memory mapped I/O or similar */
97 92
@@ -101,25 +96,34 @@ extern unsigned int kobjsize(const void *objp);
101 96
102#define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */ 97#define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */
103#define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */ 98#define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */
104#define VM_RESERVED 0x00080000 /* Count as reserved_vm like IO */
105#define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ 99#define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */
106#define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ 100#define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */
107#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ 101#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */
108#define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ 102#define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */
109#ifndef CONFIG_TRANSPARENT_HUGEPAGE 103#define VM_ARCH_1 0x01000000 /* Architecture-specific flag */
110#define VM_MAPPED_COPY 0x01000000 /* T if mapped copy of data (nommu mmap) */ 104#define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */
111#else
112#define VM_HUGEPAGE 0x01000000 /* MADV_HUGEPAGE marked this vma */
113#endif
114#define VM_INSERTPAGE 0x02000000 /* The vma has had "vm_insert_page()" done on it */
115#define VM_NODUMP 0x04000000 /* Do not include in the core dump */
116 105
117#define VM_CAN_NONLINEAR 0x08000000 /* Has ->fault & does nonlinear pages */
118#define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ 106#define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */
119#define VM_SAO 0x20000000 /* Strong Access Ordering (powerpc) */ 107#define VM_HUGEPAGE 0x20000000 /* MADV_HUGEPAGE marked this vma */
120#define VM_PFN_AT_MMAP 0x40000000 /* PFNMAP vma that is fully mapped at mmap time */ 108#define VM_NOHUGEPAGE 0x40000000 /* MADV_NOHUGEPAGE marked this vma */
121#define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */ 109#define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */
122 110
111#if defined(CONFIG_X86)
112# define VM_PAT VM_ARCH_1 /* PAT reserves whole VMA at once (x86) */
113#elif defined(CONFIG_PPC)
114# define VM_SAO VM_ARCH_1 /* Strong Access Ordering (powerpc) */
115#elif defined(CONFIG_PARISC)
116# define VM_GROWSUP VM_ARCH_1
117#elif defined(CONFIG_IA64)
118# define VM_GROWSUP VM_ARCH_1
119#elif !defined(CONFIG_MMU)
120# define VM_MAPPED_COPY VM_ARCH_1 /* T if mapped copy of data (nommu mmap) */
121#endif
122
123#ifndef VM_GROWSUP
124# define VM_GROWSUP VM_NONE
125#endif
126
123/* Bits set in the VMA until the stack is in its final location */ 127/* Bits set in the VMA until the stack is in its final location */
124#define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ) 128#define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ)
125 129
@@ -143,7 +147,7 @@ extern unsigned int kobjsize(const void *objp);
 143 * Special vmas that are non-mergeable, non-mlock()able. 147
144 * Note: mm/huge_memory.c VM_NO_THP depends on this definition. 148 * Note: mm/huge_memory.c VM_NO_THP depends on this definition.
145 */ 149 */
146#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP) 150#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP)
147 151
148/* 152/*
149 * mapping from the currently active vm_flags protection bits (the 153 * mapping from the currently active vm_flags protection bits (the
@@ -157,24 +161,7 @@ extern pgprot_t protection_map[16];
157#define FAULT_FLAG_ALLOW_RETRY 0x08 /* Retry fault if blocking */ 161#define FAULT_FLAG_ALLOW_RETRY 0x08 /* Retry fault if blocking */
158#define FAULT_FLAG_RETRY_NOWAIT 0x10 /* Don't drop mmap_sem and wait when retrying */ 162#define FAULT_FLAG_RETRY_NOWAIT 0x10 /* Don't drop mmap_sem and wait when retrying */
159#define FAULT_FLAG_KILLABLE 0x20 /* The fault task is in SIGKILL killable region */ 163#define FAULT_FLAG_KILLABLE 0x20 /* The fault task is in SIGKILL killable region */
160 164#define FAULT_FLAG_TRIED 0x40 /* second try */
161/*
162 * This interface is used by x86 PAT code to identify a pfn mapping that is
163 * linear over entire vma. This is to optimize PAT code that deals with
164 * marking the physical region with a particular prot. This is not for generic
165 * mm use. Note also that this check will not work if the pfn mapping is
166 * linear for a vma starting at physical address 0. In which case PAT code
167 * falls back to slow path of reserving physical range page by page.
168 */
169static inline int is_linear_pfn_mapping(struct vm_area_struct *vma)
170{
171 return !!(vma->vm_flags & VM_PFN_AT_MMAP);
172}
173
174static inline int is_pfn_mapping(struct vm_area_struct *vma)
175{
176 return !!(vma->vm_flags & VM_PFNMAP);
177}
178 165
179/* 166/*
 180 * vm_fault is filled by the pagefault handler and passed to the vma's 167
@@ -182,8 +169,7 @@ static inline int is_pfn_mapping(struct vm_area_struct *vma)
182 * of VM_FAULT_xxx flags that give details about how the fault was handled. 169 * of VM_FAULT_xxx flags that give details about how the fault was handled.
183 * 170 *
184 * pgoff should be used in favour of virtual_address, if possible. If pgoff 171 * pgoff should be used in favour of virtual_address, if possible. If pgoff
185 * is used, one may set VM_CAN_NONLINEAR in the vma->vm_flags to get nonlinear 172 * is used, one may implement ->remap_pages to get nonlinear mapping support.
186 * mapping support.
187 */ 173 */
188struct vm_fault { 174struct vm_fault {
189 unsigned int flags; /* FAULT_FLAG_xxx flags */ 175 unsigned int flags; /* FAULT_FLAG_xxx flags */
@@ -241,6 +227,9 @@ struct vm_operations_struct {
241 int (*migrate)(struct vm_area_struct *vma, const nodemask_t *from, 227 int (*migrate)(struct vm_area_struct *vma, const nodemask_t *from,
242 const nodemask_t *to, unsigned long flags); 228 const nodemask_t *to, unsigned long flags);
243#endif 229#endif
230 /* called by sys_remap_file_pages() to populate non-linear mapping */
231 int (*remap_pages)(struct vm_area_struct *vma, unsigned long addr,
232 unsigned long size, pgoff_t pgoff);
244}; 233};
245 234
246struct mmu_gather; 235struct mmu_gather;
@@ -249,6 +238,18 @@ struct inode;
249#define page_private(page) ((page)->private) 238#define page_private(page) ((page)->private)
250#define set_page_private(page, v) ((page)->private = (v)) 239#define set_page_private(page, v) ((page)->private = (v))
251 240
 241/* It's valid only if the page is on the free path or in the free_list */
242static inline void set_freepage_migratetype(struct page *page, int migratetype)
243{
244 page->index = migratetype;
245}
246
 247/* It's valid only if the page is on the free path or in the free_list */
248static inline int get_freepage_migratetype(struct page *page)
249{
250 return page->index;
251}
252
252/* 253/*
253 * FIXME: take this include out, include page-flags.h in 254 * FIXME: take this include out, include page-flags.h in
254 * files which need it (119 of them) 255 * files which need it (119 of them)
@@ -454,6 +455,7 @@ void put_pages_list(struct list_head *pages);
454 455
455void split_page(struct page *page, unsigned int order); 456void split_page(struct page *page, unsigned int order);
456int split_free_page(struct page *page); 457int split_free_page(struct page *page);
458int capture_free_page(struct page *page, int alloc_order, int migratetype);
457 459
458/* 460/*
459 * Compound pages have a destructor function. Provide a 461 * Compound pages have a destructor function. Provide a
@@ -1071,7 +1073,8 @@ vm_is_stack(struct task_struct *task, struct vm_area_struct *vma, int in_group);
1071 1073
1072extern unsigned long move_page_tables(struct vm_area_struct *vma, 1074extern unsigned long move_page_tables(struct vm_area_struct *vma,
1073 unsigned long old_addr, struct vm_area_struct *new_vma, 1075 unsigned long old_addr, struct vm_area_struct *new_vma,
1074 unsigned long new_addr, unsigned long len); 1076 unsigned long new_addr, unsigned long len,
1077 bool need_rmap_locks);
1075extern unsigned long do_mremap(unsigned long addr, 1078extern unsigned long do_mremap(unsigned long addr,
1076 unsigned long old_len, unsigned long new_len, 1079 unsigned long old_len, unsigned long new_len,
1077 unsigned long flags, unsigned long new_addr); 1080 unsigned long flags, unsigned long new_addr);
@@ -1366,24 +1369,45 @@ extern void zone_pcp_reset(struct zone *zone);
1366extern atomic_long_t mmap_pages_allocated; 1369extern atomic_long_t mmap_pages_allocated;
1367extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t); 1370extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t);
1368 1371
1369/* prio_tree.c */ 1372/* interval_tree.c */
1370void vma_prio_tree_add(struct vm_area_struct *, struct vm_area_struct *old); 1373void vma_interval_tree_insert(struct vm_area_struct *node,
1371void vma_prio_tree_insert(struct vm_area_struct *, struct prio_tree_root *); 1374 struct rb_root *root);
1372void vma_prio_tree_remove(struct vm_area_struct *, struct prio_tree_root *); 1375void vma_interval_tree_insert_after(struct vm_area_struct *node,
1373struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma, 1376 struct vm_area_struct *prev,
1374 struct prio_tree_iter *iter); 1377 struct rb_root *root);
1375 1378void vma_interval_tree_remove(struct vm_area_struct *node,
1376#define vma_prio_tree_foreach(vma, iter, root, begin, end) \ 1379 struct rb_root *root);
1377 for (prio_tree_iter_init(iter, root, begin, end), vma = NULL; \ 1380struct vm_area_struct *vma_interval_tree_iter_first(struct rb_root *root,
1378 (vma = vma_prio_tree_next(vma, iter)); ) 1381 unsigned long start, unsigned long last);
1382struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node,
1383 unsigned long start, unsigned long last);
1384
1385#define vma_interval_tree_foreach(vma, root, start, last) \
1386 for (vma = vma_interval_tree_iter_first(root, start, last); \
1387 vma; vma = vma_interval_tree_iter_next(vma, start, last))
1379 1388
1380static inline void vma_nonlinear_insert(struct vm_area_struct *vma, 1389static inline void vma_nonlinear_insert(struct vm_area_struct *vma,
1381 struct list_head *list) 1390 struct list_head *list)
1382{ 1391{
1383 vma->shared.vm_set.parent = NULL; 1392 list_add_tail(&vma->shared.nonlinear, list);
1384 list_add_tail(&vma->shared.vm_set.list, list);
1385} 1393}
1386 1394
1395void anon_vma_interval_tree_insert(struct anon_vma_chain *node,
1396 struct rb_root *root);
1397void anon_vma_interval_tree_remove(struct anon_vma_chain *node,
1398 struct rb_root *root);
1399struct anon_vma_chain *anon_vma_interval_tree_iter_first(
1400 struct rb_root *root, unsigned long start, unsigned long last);
1401struct anon_vma_chain *anon_vma_interval_tree_iter_next(
1402 struct anon_vma_chain *node, unsigned long start, unsigned long last);
1403#ifdef CONFIG_DEBUG_VM_RB
1404void anon_vma_interval_tree_verify(struct anon_vma_chain *node);
1405#endif
1406
1407#define anon_vma_interval_tree_foreach(avc, root, start, last) \
1408 for (avc = anon_vma_interval_tree_iter_first(root, start, last); \
1409 avc; avc = anon_vma_interval_tree_iter_next(avc, start, last))
1410
1387/* mmap.c */ 1411/* mmap.c */
1388extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin); 1412extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin);
1389extern int vma_adjust(struct vm_area_struct *vma, unsigned long start, 1413extern int vma_adjust(struct vm_area_struct *vma, unsigned long start,
@@ -1400,15 +1424,13 @@ extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *,
1400 struct rb_node **, struct rb_node *); 1424 struct rb_node **, struct rb_node *);
1401extern void unlink_file_vma(struct vm_area_struct *); 1425extern void unlink_file_vma(struct vm_area_struct *);
1402extern struct vm_area_struct *copy_vma(struct vm_area_struct **, 1426extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
1403 unsigned long addr, unsigned long len, pgoff_t pgoff); 1427 unsigned long addr, unsigned long len, pgoff_t pgoff,
1428 bool *need_rmap_locks);
1404extern void exit_mmap(struct mm_struct *); 1429extern void exit_mmap(struct mm_struct *);
1405 1430
1406extern int mm_take_all_locks(struct mm_struct *mm); 1431extern int mm_take_all_locks(struct mm_struct *mm);
1407extern void mm_drop_all_locks(struct mm_struct *mm); 1432extern void mm_drop_all_locks(struct mm_struct *mm);
1408 1433
1409/* From fs/proc/base.c. callers must _not_ hold the mm's exe_file_lock */
1410extern void added_exe_file_vma(struct mm_struct *mm);
1411extern void removed_exe_file_vma(struct mm_struct *mm);
1412extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); 1434extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file);
1413extern struct file *get_mm_exe_file(struct mm_struct *mm); 1435extern struct file *get_mm_exe_file(struct mm_struct *mm);
1414 1436
@@ -1662,5 +1684,9 @@ static inline unsigned int debug_guardpage_minorder(void) { return 0; }
1662static inline bool page_is_guard(struct page *page) { return false; } 1684static inline bool page_is_guard(struct page *page) { return false; }
1663#endif /* CONFIG_DEBUG_PAGEALLOC */ 1685#endif /* CONFIG_DEBUG_PAGEALLOC */
1664 1686
1687extern void reset_zone_present_pages(void);
1688extern void fixup_zone_present_pages(int nid, unsigned long start_pfn,
1689 unsigned long end_pfn);
1690
1665#endif /* __KERNEL__ */ 1691#endif /* __KERNEL__ */
1666#endif /* _LINUX_MM_H */ 1692#endif /* _LINUX_MM_H */
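
The prio_tree calls removed above are replaced by the vma_interval_tree_* and anon_vma_interval_tree_* declarations. A hedged usage sketch of the file-backed side follows; the function, its locals and the i_mmap_mutex locking are assumptions about a typical caller of this era, not code from this patch.

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/printk.h>

/* Walk every vma that maps the page's offset in this address_space. */
static void walk_file_mappers(struct address_space *mapping, struct page *page)
{
	struct vm_area_struct *vma;
	pgoff_t pgoff = page->index;		/* order-0 page assumed */

	mutex_lock(&mapping->i_mmap_mutex);
	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
		unsigned long address = vma->vm_start +
			((pgoff - vma->vm_pgoff) << PAGE_SHIFT);

		pr_debug("vma %p maps the page at %#lx\n", vma, address);
	}
	mutex_unlock(&mapping->i_mmap_mutex);
}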
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index bf7867200b95..31f8a3af7d94 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -6,7 +6,6 @@
6#include <linux/threads.h> 6#include <linux/threads.h>
7#include <linux/list.h> 7#include <linux/list.h>
8#include <linux/spinlock.h> 8#include <linux/spinlock.h>
9#include <linux/prio_tree.h>
10#include <linux/rbtree.h> 9#include <linux/rbtree.h>
11#include <linux/rwsem.h> 10#include <linux/rwsem.h>
12#include <linux/completion.h> 11#include <linux/completion.h>
@@ -240,18 +239,15 @@ struct vm_area_struct {
240 239
241 /* 240 /*
242 * For areas with an address space and backing store, 241 * For areas with an address space and backing store,
243 * linkage into the address_space->i_mmap prio tree, or 242 * linkage into the address_space->i_mmap interval tree, or
244 * linkage to the list of like vmas hanging off its node, or
245 * linkage of vma in the address_space->i_mmap_nonlinear list. 243 * linkage of vma in the address_space->i_mmap_nonlinear list.
246 */ 244 */
247 union { 245 union {
248 struct { 246 struct {
249 struct list_head list; 247 struct rb_node rb;
250 void *parent; /* aligns with prio_tree_node parent */ 248 unsigned long rb_subtree_last;
251 struct vm_area_struct *head; 249 } linear;
252 } vm_set; 250 struct list_head nonlinear;
253
254 struct raw_prio_tree_node prio_tree_node;
255 } shared; 251 } shared;
256 252
257 /* 253 /*
@@ -349,7 +345,6 @@ struct mm_struct {
349 unsigned long shared_vm; /* Shared pages (files) */ 345 unsigned long shared_vm; /* Shared pages (files) */
350 unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE */ 346 unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE */
351 unsigned long stack_vm; /* VM_GROWSUP/DOWN */ 347 unsigned long stack_vm; /* VM_GROWSUP/DOWN */
352 unsigned long reserved_vm; /* VM_RESERVED|VM_IO pages */
353 unsigned long def_flags; 348 unsigned long def_flags;
354 unsigned long nr_ptes; /* Page table pages */ 349 unsigned long nr_ptes; /* Page table pages */
355 unsigned long start_code, end_code, start_data, end_data; 350 unsigned long start_code, end_code, start_data, end_data;
@@ -394,7 +389,6 @@ struct mm_struct {
394 389
395 /* store ref to file /proc/<pid>/exe symlink points to */ 390 /* store ref to file /proc/<pid>/exe symlink points to */
396 struct file *exe_file; 391 struct file *exe_file;
397 unsigned long num_exe_file_vmas;
398#ifdef CONFIG_MMU_NOTIFIER 392#ifdef CONFIG_MMU_NOTIFIER
399 struct mmu_notifier_mm *mmu_notifier_mm; 393 struct mmu_notifier_mm *mmu_notifier_mm;
400#endif 394#endif
diff --git a/include/linux/mman.h b/include/linux/mman.h
index 8b74e9b1d0ad..77cec2f45cb7 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -86,7 +86,6 @@ calc_vm_flag_bits(unsigned long flags)
86{ 86{
87 return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) | 87 return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) |
88 _calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) | 88 _calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) |
89 _calc_vm_trans(flags, MAP_EXECUTABLE, VM_EXECUTABLE) |
90 _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ); 89 _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED );
91} 90}
92#endif /* __KERNEL__ */ 91#endif /* __KERNEL__ */
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 1d1b1e13f79f..bc823c4c028b 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -4,6 +4,7 @@
4#include <linux/list.h> 4#include <linux/list.h>
5#include <linux/spinlock.h> 5#include <linux/spinlock.h>
6#include <linux/mm_types.h> 6#include <linux/mm_types.h>
7#include <linux/srcu.h>
7 8
8struct mmu_notifier; 9struct mmu_notifier;
9struct mmu_notifier_ops; 10struct mmu_notifier_ops;
@@ -245,50 +246,6 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
245 __mmu_notifier_mm_destroy(mm); 246 __mmu_notifier_mm_destroy(mm);
246} 247}
247 248
248/*
249 * These two macros will sometime replace ptep_clear_flush.
250 * ptep_clear_flush is implemented as macro itself, so this also is
251 * implemented as a macro until ptep_clear_flush will converted to an
252 * inline function, to diminish the risk of compilation failure. The
253 * invalidate_page method over time can be moved outside the PT lock
254 * and these two macros can be later removed.
255 */
256#define ptep_clear_flush_notify(__vma, __address, __ptep) \
257({ \
258 pte_t __pte; \
259 struct vm_area_struct *___vma = __vma; \
260 unsigned long ___address = __address; \
261 __pte = ptep_clear_flush(___vma, ___address, __ptep); \
262 mmu_notifier_invalidate_page(___vma->vm_mm, ___address); \
263 __pte; \
264})
265
266#define pmdp_clear_flush_notify(__vma, __address, __pmdp) \
267({ \
268 pmd_t __pmd; \
269 struct vm_area_struct *___vma = __vma; \
270 unsigned long ___address = __address; \
271 VM_BUG_ON(__address & ~HPAGE_PMD_MASK); \
272 mmu_notifier_invalidate_range_start(___vma->vm_mm, ___address, \
273 (__address)+HPAGE_PMD_SIZE);\
274 __pmd = pmdp_clear_flush(___vma, ___address, __pmdp); \
275 mmu_notifier_invalidate_range_end(___vma->vm_mm, ___address, \
276 (__address)+HPAGE_PMD_SIZE); \
277 __pmd; \
278})
279
280#define pmdp_splitting_flush_notify(__vma, __address, __pmdp) \
281({ \
282 struct vm_area_struct *___vma = __vma; \
283 unsigned long ___address = __address; \
284 VM_BUG_ON(__address & ~HPAGE_PMD_MASK); \
285 mmu_notifier_invalidate_range_start(___vma->vm_mm, ___address, \
286 (__address)+HPAGE_PMD_SIZE);\
287 pmdp_splitting_flush(___vma, ___address, __pmdp); \
288 mmu_notifier_invalidate_range_end(___vma->vm_mm, ___address, \
289 (__address)+HPAGE_PMD_SIZE); \
290})
291
292#define ptep_clear_flush_young_notify(__vma, __address, __ptep) \ 249#define ptep_clear_flush_young_notify(__vma, __address, __ptep) \
293({ \ 250({ \
294 int __young; \ 251 int __young; \
@@ -311,14 +268,24 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
311 __young; \ 268 __young; \
312}) 269})
313 270
271/*
272 * set_pte_at_notify() sets the pte _after_ running the notifier.
 273 * It is safe to start by updating the secondary MMUs, because the primary MMU
274 * pte invalidate must have already happened with a ptep_clear_flush() before
275 * set_pte_at_notify() has been invoked. Updating the secondary MMUs first is
276 * required when we change both the protection of the mapping from read-only to
277 * read-write and the pfn (like during copy on write page faults). Otherwise the
278 * old page would remain mapped readonly in the secondary MMUs after the new
279 * page is already writable by some CPU through the primary MMU.
280 */
314#define set_pte_at_notify(__mm, __address, __ptep, __pte) \ 281#define set_pte_at_notify(__mm, __address, __ptep, __pte) \
315({ \ 282({ \
316 struct mm_struct *___mm = __mm; \ 283 struct mm_struct *___mm = __mm; \
317 unsigned long ___address = __address; \ 284 unsigned long ___address = __address; \
318 pte_t ___pte = __pte; \ 285 pte_t ___pte = __pte; \
319 \ 286 \
320 set_pte_at(___mm, ___address, __ptep, ___pte); \
321 mmu_notifier_change_pte(___mm, ___address, ___pte); \ 287 mmu_notifier_change_pte(___mm, ___address, ___pte); \
288 set_pte_at(___mm, ___address, __ptep, ___pte); \
322}) 289})
323 290
324#else /* CONFIG_MMU_NOTIFIER */ 291#else /* CONFIG_MMU_NOTIFIER */
@@ -369,9 +336,6 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
369 336
370#define ptep_clear_flush_young_notify ptep_clear_flush_young 337#define ptep_clear_flush_young_notify ptep_clear_flush_young
371#define pmdp_clear_flush_young_notify pmdp_clear_flush_young 338#define pmdp_clear_flush_young_notify pmdp_clear_flush_young
372#define ptep_clear_flush_notify ptep_clear_flush
373#define pmdp_clear_flush_notify pmdp_clear_flush
374#define pmdp_splitting_flush_notify pmdp_splitting_flush
375#define set_pte_at_notify set_pte_at 339#define set_pte_at_notify set_pte_at
376 340
377#endif /* CONFIG_MMU_NOTIFIER */ 341#endif /* CONFIG_MMU_NOTIFIER */
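
The comment added above is about ordering in the copy-on-write path. Here is a hedged, heavily simplified sketch of that call pattern; the function name, its parameters and the omitted locking/accounting are assumptions, and it only loosely mirrors the tail of the write-fault handling in mm/memory.c.

#include <linux/mm.h>
#include <linux/rmap.h>
#include <linux/mmu_notifier.h>

static void cow_install_new_page(struct mm_struct *mm, struct vm_area_struct *vma,
				 unsigned long address, pte_t *page_table,
				 struct page *new_page)
{
	pte_t entry;

	entry = mk_pte(new_page, vma->vm_page_prot);
	entry = maybe_mkwrite(pte_mkdirty(entry), vma);

	ptep_clear_flush(vma, address, page_table);	/* old primary pte gone */
	page_add_new_anon_rmap(new_page, vma, address);
	/*
	 * With the reordering above, mmu_notifier_change_pte() runs before
	 * set_pte_at(): secondary MMUs already point at new_page by the time
	 * any CPU can write through the primary pte.
	 */
	set_pte_at_notify(mm, address, page_table, entry);
	update_mmu_cache(vma, address, page_table);
}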
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 2daa54f55db7..50aaca81f63d 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -142,6 +142,7 @@ enum zone_stat_item {
142 NUMA_OTHER, /* allocation from other node */ 142 NUMA_OTHER, /* allocation from other node */
143#endif 143#endif
144 NR_ANON_TRANSPARENT_HUGEPAGES, 144 NR_ANON_TRANSPARENT_HUGEPAGES,
145 NR_FREE_CMA_PAGES,
145 NR_VM_ZONE_STAT_ITEMS }; 146 NR_VM_ZONE_STAT_ITEMS };
146 147
147/* 148/*
@@ -217,6 +218,8 @@ struct lruvec {
217#define ISOLATE_UNMAPPED ((__force isolate_mode_t)0x2) 218#define ISOLATE_UNMAPPED ((__force isolate_mode_t)0x2)
218/* Isolate for asynchronous migration */ 219/* Isolate for asynchronous migration */
219#define ISOLATE_ASYNC_MIGRATE ((__force isolate_mode_t)0x4) 220#define ISOLATE_ASYNC_MIGRATE ((__force isolate_mode_t)0x4)
221/* Isolate unevictable pages */
222#define ISOLATE_UNEVICTABLE ((__force isolate_mode_t)0x8)
220 223
221/* LRU Isolation modes. */ 224/* LRU Isolation modes. */
222typedef unsigned __bitwise__ isolate_mode_t; 225typedef unsigned __bitwise__ isolate_mode_t;
@@ -369,8 +372,12 @@ struct zone {
369 spinlock_t lock; 372 spinlock_t lock;
370 int all_unreclaimable; /* All pages pinned */ 373 int all_unreclaimable; /* All pages pinned */
371#if defined CONFIG_COMPACTION || defined CONFIG_CMA 374#if defined CONFIG_COMPACTION || defined CONFIG_CMA
372 /* pfn where the last incremental compaction isolated free pages */ 375 /* Set to true when the PG_migrate_skip bits should be cleared */
376 bool compact_blockskip_flush;
377
378 /* pfns where compaction scanners should start */
373 unsigned long compact_cached_free_pfn; 379 unsigned long compact_cached_free_pfn;
380 unsigned long compact_cached_migrate_pfn;
374#endif 381#endif
375#ifdef CONFIG_MEMORY_HOTPLUG 382#ifdef CONFIG_MEMORY_HOTPLUG
376 /* see spanned/present_pages for more description */ 383 /* see spanned/present_pages for more description */
@@ -704,6 +711,7 @@ typedef struct pglist_data {
704 unsigned long node_spanned_pages; /* total size of physical page 711 unsigned long node_spanned_pages; /* total size of physical page
705 range, including holes */ 712 range, including holes */
706 int node_id; 713 int node_id;
714 nodemask_t reclaim_nodes; /* Nodes allowed to reclaim from */
707 wait_queue_head_t kswapd_wait; 715 wait_queue_head_t kswapd_wait;
708 wait_queue_head_t pfmemalloc_wait; 716 wait_queue_head_t pfmemalloc_wait;
709 struct task_struct *kswapd; /* Protected by lock_memory_hotplug() */ 717 struct task_struct *kswapd; /* Protected by lock_memory_hotplug() */
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 49a3031fda50..d36a8221f58b 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -2,17 +2,6 @@
2#define __INCLUDE_LINUX_OOM_H 2#define __INCLUDE_LINUX_OOM_H
3 3
4/* 4/*
5 * /proc/<pid>/oom_adj is deprecated, see
6 * Documentation/feature-removal-schedule.txt.
7 *
8 * /proc/<pid>/oom_adj set to -17 protects from the oom-killer
9 */
10#define OOM_DISABLE (-17)
11/* inclusive */
12#define OOM_ADJUST_MIN (-16)
13#define OOM_ADJUST_MAX 15
14
15/*
16 * /proc/<pid>/oom_score_adj set to OOM_SCORE_ADJ_MIN disables oom killing for 5 * /proc/<pid>/oom_score_adj set to OOM_SCORE_ADJ_MIN disables oom killing for
17 * pid. 6 * pid.
18 */ 7 */
diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
index 105077aa7685..76a9539cfd3f 100644
--- a/include/linux/page-isolation.h
+++ b/include/linux/page-isolation.h
@@ -6,6 +6,10 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count);
6void set_pageblock_migratetype(struct page *page, int migratetype); 6void set_pageblock_migratetype(struct page *page, int migratetype);
7int move_freepages_block(struct zone *zone, struct page *page, 7int move_freepages_block(struct zone *zone, struct page *page,
8 int migratetype); 8 int migratetype);
9int move_freepages(struct zone *zone,
10 struct page *start_page, struct page *end_page,
11 int migratetype);
12
9/* 13/*
10 * Changes migrate type in [start_pfn, end_pfn) to be MIGRATE_ISOLATE. 14 * Changes migrate type in [start_pfn, end_pfn) to be MIGRATE_ISOLATE.
11 * If specified range includes migrate types other than MOVABLE or CMA, 15 * If specified range includes migrate types other than MOVABLE or CMA,
@@ -37,6 +41,7 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn);
37 */ 41 */
38int set_migratetype_isolate(struct page *page); 42int set_migratetype_isolate(struct page *page);
39void unset_migratetype_isolate(struct page *page, unsigned migratetype); 43void unset_migratetype_isolate(struct page *page, unsigned migratetype);
40 44struct page *alloc_migrate_target(struct page *page, unsigned long private,
45 int **resultp);
41 46
42#endif 47#endif
diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h
index 19ef95d293ae..eed27f4f4c3e 100644
--- a/include/linux/pageblock-flags.h
+++ b/include/linux/pageblock-flags.h
@@ -30,6 +30,9 @@ enum pageblock_bits {
30 PB_migrate, 30 PB_migrate,
31 PB_migrate_end = PB_migrate + 3 - 1, 31 PB_migrate_end = PB_migrate + 3 - 1,
32 /* 3 bits required for migrate types */ 32 /* 3 bits required for migrate types */
33#ifdef CONFIG_COMPACTION
34 PB_migrate_skip,/* If set the block is skipped by compaction */
35#endif /* CONFIG_COMPACTION */
33 NR_PAGEBLOCK_BITS 36 NR_PAGEBLOCK_BITS
34}; 37};
35 38
@@ -65,10 +68,22 @@ unsigned long get_pageblock_flags_group(struct page *page,
65void set_pageblock_flags_group(struct page *page, unsigned long flags, 68void set_pageblock_flags_group(struct page *page, unsigned long flags,
66 int start_bitidx, int end_bitidx); 69 int start_bitidx, int end_bitidx);
67 70
71#ifdef CONFIG_COMPACTION
72#define get_pageblock_skip(page) \
73 get_pageblock_flags_group(page, PB_migrate_skip, \
74 PB_migrate_skip + 1)
75#define clear_pageblock_skip(page) \
76 set_pageblock_flags_group(page, 0, PB_migrate_skip, \
77 PB_migrate_skip + 1)
78#define set_pageblock_skip(page) \
79 set_pageblock_flags_group(page, 1, PB_migrate_skip, \
80 PB_migrate_skip + 1)
81#endif /* CONFIG_COMPACTION */
82
68#define get_pageblock_flags(page) \ 83#define get_pageblock_flags(page) \
69 get_pageblock_flags_group(page, 0, NR_PAGEBLOCK_BITS-1) 84 get_pageblock_flags_group(page, 0, PB_migrate_end)
70#define set_pageblock_flags(page, flags) \ 85#define set_pageblock_flags(page, flags) \
71 set_pageblock_flags_group(page, flags, \ 86 set_pageblock_flags_group(page, flags, \
72 0, NR_PAGEBLOCK_BITS-1) 87 0, PB_migrate_end)
73 88
74#endif /* PAGEBLOCK_FLAGS_H */ 89#endif /* PAGEBLOCK_FLAGS_H */
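
A hedged sketch of how the new skip bit is meant to be consumed by a pageblock scanner; scan_one_pageblock() is a hypothetical helper standing in for the real compaction isolation loop.

#include <linux/mm_types.h>
#include <linux/pageblock-flags.h>

static unsigned long scan_one_pageblock(struct page *page);	/* hypothetical */

static void maybe_scan_pageblock(struct page *page)
{
	if (get_pageblock_skip(page))
		return;				/* unproductive on the last pass */

	if (!scan_one_pageblock(page))
		set_pageblock_skip(page);	/* remember to skip it next time */
	/* clear_pageblock_skip() is left to the periodic reset of the hints */
}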
diff --git a/include/linux/prio_tree.h b/include/linux/prio_tree.h
deleted file mode 100644
index db04abb557e0..000000000000
--- a/include/linux/prio_tree.h
+++ /dev/null
@@ -1,120 +0,0 @@
1#ifndef _LINUX_PRIO_TREE_H
2#define _LINUX_PRIO_TREE_H
3
4/*
5 * K&R 2nd ed. A8.3 somewhat obliquely hints that initial sequences of struct
6 * fields with identical types should end up at the same location. We'll use
7 * this until we can scrap struct raw_prio_tree_node.
8 *
9 * Note: all this could be done more elegantly by using unnamed union/struct
10 * fields. However, gcc 2.95.3 and apparently also gcc 3.0.4 don't support this
11 * language extension.
12 */
13
14struct raw_prio_tree_node {
15 struct prio_tree_node *left;
16 struct prio_tree_node *right;
17 struct prio_tree_node *parent;
18};
19
20struct prio_tree_node {
21 struct prio_tree_node *left;
22 struct prio_tree_node *right;
23 struct prio_tree_node *parent;
24 unsigned long start;
25 unsigned long last; /* last location _in_ interval */
26};
27
28struct prio_tree_root {
29 struct prio_tree_node *prio_tree_node;
30 unsigned short index_bits;
31 unsigned short raw;
32 /*
33 * 0: nodes are of type struct prio_tree_node
34 * 1: nodes are of type raw_prio_tree_node
35 */
36};
37
38struct prio_tree_iter {
39 struct prio_tree_node *cur;
40 unsigned long mask;
41 unsigned long value;
42 int size_level;
43
44 struct prio_tree_root *root;
45 pgoff_t r_index;
46 pgoff_t h_index;
47};
48
49static inline void prio_tree_iter_init(struct prio_tree_iter *iter,
50 struct prio_tree_root *root, pgoff_t r_index, pgoff_t h_index)
51{
52 iter->root = root;
53 iter->r_index = r_index;
54 iter->h_index = h_index;
55 iter->cur = NULL;
56}
57
58#define __INIT_PRIO_TREE_ROOT(ptr, _raw) \
59do { \
60 (ptr)->prio_tree_node = NULL; \
61 (ptr)->index_bits = 1; \
62 (ptr)->raw = (_raw); \
63} while (0)
64
65#define INIT_PRIO_TREE_ROOT(ptr) __INIT_PRIO_TREE_ROOT(ptr, 0)
66#define INIT_RAW_PRIO_TREE_ROOT(ptr) __INIT_PRIO_TREE_ROOT(ptr, 1)
67
68#define INIT_PRIO_TREE_NODE(ptr) \
69do { \
70 (ptr)->left = (ptr)->right = (ptr)->parent = (ptr); \
71} while (0)
72
73#define INIT_PRIO_TREE_ITER(ptr) \
74do { \
75 (ptr)->cur = NULL; \
76 (ptr)->mask = 0UL; \
77 (ptr)->value = 0UL; \
78 (ptr)->size_level = 0; \
79} while (0)
80
81#define prio_tree_entry(ptr, type, member) \
82 ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))
83
84static inline int prio_tree_empty(const struct prio_tree_root *root)
85{
86 return root->prio_tree_node == NULL;
87}
88
89static inline int prio_tree_root(const struct prio_tree_node *node)
90{
91 return node->parent == node;
92}
93
94static inline int prio_tree_left_empty(const struct prio_tree_node *node)
95{
96 return node->left == node;
97}
98
99static inline int prio_tree_right_empty(const struct prio_tree_node *node)
100{
101 return node->right == node;
102}
103
104
105struct prio_tree_node *prio_tree_replace(struct prio_tree_root *root,
106 struct prio_tree_node *old, struct prio_tree_node *node);
107struct prio_tree_node *prio_tree_insert(struct prio_tree_root *root,
108 struct prio_tree_node *node);
109void prio_tree_remove(struct prio_tree_root *root, struct prio_tree_node *node);
110struct prio_tree_node *prio_tree_next(struct prio_tree_iter *iter);
111
112#define raw_prio_tree_replace(root, old, node) \
113 prio_tree_replace(root, (struct prio_tree_node *) (old), \
114 (struct prio_tree_node *) (node))
115#define raw_prio_tree_insert(root, node) \
116 prio_tree_insert(root, (struct prio_tree_node *) (node))
117#define raw_prio_tree_remove(root, node) \
118 prio_tree_remove(root, (struct prio_tree_node *) (node))
119
120#endif /* _LINUX_PRIO_TREE_H */
diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h
index 033b507b33b1..0022c1bb1e26 100644
--- a/include/linux/rbtree.h
+++ b/include/linux/rbtree.h
@@ -23,72 +23,7 @@
23 I know it's not the cleaner way, but in C (not in C++) to get 23 I know it's not the cleaner way, but in C (not in C++) to get
24 performances and genericity... 24 performances and genericity...
25 25
26 Some example of insert and search follows here. The search is a plain 26 See Documentation/rbtree.txt for documentation and samples.
27 normal search over an ordered tree. The insert instead must be implemented
28 in two steps: First, the code must insert the element in order as a red leaf
29 in the tree, and then the support library function rb_insert_color() must
30 be called. Such function will do the not trivial work to rebalance the
31 rbtree, if necessary.
32
33-----------------------------------------------------------------------
34static inline struct page * rb_search_page_cache(struct inode * inode,
35 unsigned long offset)
36{
37 struct rb_node * n = inode->i_rb_page_cache.rb_node;
38 struct page * page;
39
40 while (n)
41 {
42 page = rb_entry(n, struct page, rb_page_cache);
43
44 if (offset < page->offset)
45 n = n->rb_left;
46 else if (offset > page->offset)
47 n = n->rb_right;
48 else
49 return page;
50 }
51 return NULL;
52}
53
54static inline struct page * __rb_insert_page_cache(struct inode * inode,
55 unsigned long offset,
56 struct rb_node * node)
57{
58 struct rb_node ** p = &inode->i_rb_page_cache.rb_node;
59 struct rb_node * parent = NULL;
60 struct page * page;
61
62 while (*p)
63 {
64 parent = *p;
65 page = rb_entry(parent, struct page, rb_page_cache);
66
67 if (offset < page->offset)
68 p = &(*p)->rb_left;
69 else if (offset > page->offset)
70 p = &(*p)->rb_right;
71 else
72 return page;
73 }
74
75 rb_link_node(node, parent, p);
76
77 return NULL;
78}
79
80static inline struct page * rb_insert_page_cache(struct inode * inode,
81 unsigned long offset,
82 struct rb_node * node)
83{
84 struct page * ret;
85 if ((ret = __rb_insert_page_cache(inode, offset, node)))
86 goto out;
87 rb_insert_color(node, &inode->i_rb_page_cache);
88 out:
89 return ret;
90}
91-----------------------------------------------------------------------
92*/ 27*/
93 28
94#ifndef _LINUX_RBTREE_H 29#ifndef _LINUX_RBTREE_H
@@ -97,63 +32,35 @@ static inline struct page * rb_insert_page_cache(struct inode * inode,
97#include <linux/kernel.h> 32#include <linux/kernel.h>
98#include <linux/stddef.h> 33#include <linux/stddef.h>
99 34
100struct rb_node 35struct rb_node {
101{ 36 unsigned long __rb_parent_color;
102 unsigned long rb_parent_color;
103#define RB_RED 0
104#define RB_BLACK 1
105 struct rb_node *rb_right; 37 struct rb_node *rb_right;
106 struct rb_node *rb_left; 38 struct rb_node *rb_left;
107} __attribute__((aligned(sizeof(long)))); 39} __attribute__((aligned(sizeof(long))));
108 /* The alignment might seem pointless, but allegedly CRIS needs it */ 40 /* The alignment might seem pointless, but allegedly CRIS needs it */
109 41
110struct rb_root 42struct rb_root {
111{
112 struct rb_node *rb_node; 43 struct rb_node *rb_node;
113}; 44};
114 45
115 46
116#define rb_parent(r) ((struct rb_node *)((r)->rb_parent_color & ~3)) 47#define rb_parent(r) ((struct rb_node *)((r)->__rb_parent_color & ~3))
117#define rb_color(r) ((r)->rb_parent_color & 1)
118#define rb_is_red(r) (!rb_color(r))
119#define rb_is_black(r) rb_color(r)
120#define rb_set_red(r) do { (r)->rb_parent_color &= ~1; } while (0)
121#define rb_set_black(r) do { (r)->rb_parent_color |= 1; } while (0)
122
123static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p)
124{
125 rb->rb_parent_color = (rb->rb_parent_color & 3) | (unsigned long)p;
126}
127static inline void rb_set_color(struct rb_node *rb, int color)
128{
129 rb->rb_parent_color = (rb->rb_parent_color & ~1) | color;
130}
131 48
132#define RB_ROOT (struct rb_root) { NULL, } 49#define RB_ROOT (struct rb_root) { NULL, }
133#define rb_entry(ptr, type, member) container_of(ptr, type, member) 50#define rb_entry(ptr, type, member) container_of(ptr, type, member)
134 51
135#define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL) 52#define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL)
136#define RB_EMPTY_NODE(node) (rb_parent(node) == node) 53
 137#define RB_CLEAR_NODE(node) (rb_set_parent(node, node)) 54/* 'empty' nodes are nodes that are known not to be inserted in an rbtree */
55#define RB_EMPTY_NODE(node) \
56 ((node)->__rb_parent_color == (unsigned long)(node))
57#define RB_CLEAR_NODE(node) \
58 ((node)->__rb_parent_color = (unsigned long)(node))
138 59
139static inline void rb_init_node(struct rb_node *rb)
140{
141 rb->rb_parent_color = 0;
142 rb->rb_right = NULL;
143 rb->rb_left = NULL;
144 RB_CLEAR_NODE(rb);
145}
146 60
147extern void rb_insert_color(struct rb_node *, struct rb_root *); 61extern void rb_insert_color(struct rb_node *, struct rb_root *);
148extern void rb_erase(struct rb_node *, struct rb_root *); 62extern void rb_erase(struct rb_node *, struct rb_root *);
149 63
150typedef void (*rb_augment_f)(struct rb_node *node, void *data);
151
152extern void rb_augment_insert(struct rb_node *node,
153 rb_augment_f func, void *data);
154extern struct rb_node *rb_augment_erase_begin(struct rb_node *node);
155extern void rb_augment_erase_end(struct rb_node *node,
156 rb_augment_f func, void *data);
157 64
158/* Find logical next and previous nodes in a tree */ 65/* Find logical next and previous nodes in a tree */
159extern struct rb_node *rb_next(const struct rb_node *); 66extern struct rb_node *rb_next(const struct rb_node *);
@@ -168,7 +75,7 @@ extern void rb_replace_node(struct rb_node *victim, struct rb_node *new,
168static inline void rb_link_node(struct rb_node * node, struct rb_node * parent, 75static inline void rb_link_node(struct rb_node * node, struct rb_node * parent,
169 struct rb_node ** rb_link) 76 struct rb_node ** rb_link)
170{ 77{
171 node->rb_parent_color = (unsigned long )parent; 78 node->__rb_parent_color = (unsigned long)parent;
172 node->rb_left = node->rb_right = NULL; 79 node->rb_left = node->rb_right = NULL;
173 80
174 *rb_link = node; 81 *rb_link = node;
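
Since the in-header sample code is dropped in favour of Documentation/rbtree.txt, a compact reminder of the (unchanged) two-step insertion protocol may help. This is a hedged sketch in the documentation's style, with mytype/my_insert as illustrative names; rb_link_node() and rb_insert_color() keep their existing semantics.

#include <linux/errno.h>
#include <linux/rbtree.h>

struct mytype {
	struct rb_node node;
	unsigned long key;
};

static int my_insert(struct rb_root *root, struct mytype *data)
{
	struct rb_node **new = &root->rb_node, *parent = NULL;

	/* step 1: descend to the ordered leaf position */
	while (*new) {
		struct mytype *this = rb_entry(*new, struct mytype, node);

		parent = *new;
		if (data->key < this->key)
			new = &(*new)->rb_left;
		else if (data->key > this->key)
			new = &(*new)->rb_right;
		else
			return -EEXIST;		/* duplicate key */
	}

	/* step 2: link as a red leaf, then let the library rebalance */
	rb_link_node(&data->node, parent, new);
	rb_insert_color(&data->node, root);
	return 0;
}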
diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h
new file mode 100644
index 000000000000..214caa33433b
--- /dev/null
+++ b/include/linux/rbtree_augmented.h
@@ -0,0 +1,223 @@
1/*
2 Red Black Trees
3 (C) 1999 Andrea Arcangeli <andrea@suse.de>
4 (C) 2002 David Woodhouse <dwmw2@infradead.org>
5 (C) 2012 Michel Lespinasse <walken@google.com>
6
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2 of the License, or
10 (at your option) any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20
21 linux/include/linux/rbtree_augmented.h
22*/
23
24#ifndef _LINUX_RBTREE_AUGMENTED_H
25#define _LINUX_RBTREE_AUGMENTED_H
26
27#include <linux/rbtree.h>
28
29/*
30 * Please note - only struct rb_augment_callbacks and the prototypes for
31 * rb_insert_augmented() and rb_erase_augmented() are intended to be public.
32 * The rest are implementation details you are not expected to depend on.
33 *
34 * See Documentation/rbtree.txt for documentation and samples.
35 */
36
37struct rb_augment_callbacks {
38 void (*propagate)(struct rb_node *node, struct rb_node *stop);
39 void (*copy)(struct rb_node *old, struct rb_node *new);
40 void (*rotate)(struct rb_node *old, struct rb_node *new);
41};
42
43extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root,
44 void (*augment_rotate)(struct rb_node *old, struct rb_node *new));
45static inline void
46rb_insert_augmented(struct rb_node *node, struct rb_root *root,
47 const struct rb_augment_callbacks *augment)
48{
49 __rb_insert_augmented(node, root, augment->rotate);
50}
51
52#define RB_DECLARE_CALLBACKS(rbstatic, rbname, rbstruct, rbfield, \
53 rbtype, rbaugmented, rbcompute) \
54static inline void \
55rbname ## _propagate(struct rb_node *rb, struct rb_node *stop) \
56{ \
57 while (rb != stop) { \
58 rbstruct *node = rb_entry(rb, rbstruct, rbfield); \
59 rbtype augmented = rbcompute(node); \
60 if (node->rbaugmented == augmented) \
61 break; \
62 node->rbaugmented = augmented; \
63 rb = rb_parent(&node->rbfield); \
64 } \
65} \
66static inline void \
67rbname ## _copy(struct rb_node *rb_old, struct rb_node *rb_new) \
68{ \
69 rbstruct *old = rb_entry(rb_old, rbstruct, rbfield); \
70 rbstruct *new = rb_entry(rb_new, rbstruct, rbfield); \
71 new->rbaugmented = old->rbaugmented; \
72} \
73static void \
74rbname ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \
75{ \
76 rbstruct *old = rb_entry(rb_old, rbstruct, rbfield); \
77 rbstruct *new = rb_entry(rb_new, rbstruct, rbfield); \
78 new->rbaugmented = old->rbaugmented; \
79 old->rbaugmented = rbcompute(old); \
80} \
81rbstatic const struct rb_augment_callbacks rbname = { \
82 rbname ## _propagate, rbname ## _copy, rbname ## _rotate \
83};
84
85
86#define RB_RED 0
87#define RB_BLACK 1
88
89#define __rb_parent(pc) ((struct rb_node *)(pc & ~3))
90
91#define __rb_color(pc) ((pc) & 1)
92#define __rb_is_black(pc) __rb_color(pc)
93#define __rb_is_red(pc) (!__rb_color(pc))
94#define rb_color(rb) __rb_color((rb)->__rb_parent_color)
95#define rb_is_red(rb) __rb_is_red((rb)->__rb_parent_color)
96#define rb_is_black(rb) __rb_is_black((rb)->__rb_parent_color)
97
98static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p)
99{
100 rb->__rb_parent_color = rb_color(rb) | (unsigned long)p;
101}
102
103static inline void rb_set_parent_color(struct rb_node *rb,
104 struct rb_node *p, int color)
105{
106 rb->__rb_parent_color = (unsigned long)p | color;
107}
108
109static inline void
110__rb_change_child(struct rb_node *old, struct rb_node *new,
111 struct rb_node *parent, struct rb_root *root)
112{
113 if (parent) {
114 if (parent->rb_left == old)
115 parent->rb_left = new;
116 else
117 parent->rb_right = new;
118 } else
119 root->rb_node = new;
120}
121
122extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root,
123 void (*augment_rotate)(struct rb_node *old, struct rb_node *new));
124
125static __always_inline void
126rb_erase_augmented(struct rb_node *node, struct rb_root *root,
127 const struct rb_augment_callbacks *augment)
128{
129 struct rb_node *child = node->rb_right, *tmp = node->rb_left;
130 struct rb_node *parent, *rebalance;
131 unsigned long pc;
132
133 if (!tmp) {
134 /*
135 * Case 1: node to erase has no more than 1 child (easy!)
136 *
137 * Note that if there is one child it must be red due to 5)
138 * and node must be black due to 4). We adjust colors locally
139 * so as to bypass __rb_erase_color() later on.
140 */
141 pc = node->__rb_parent_color;
142 parent = __rb_parent(pc);
143 __rb_change_child(node, child, parent, root);
144 if (child) {
145 child->__rb_parent_color = pc;
146 rebalance = NULL;
147 } else
148 rebalance = __rb_is_black(pc) ? parent : NULL;
149 tmp = parent;
150 } else if (!child) {
151 /* Still case 1, but this time the child is node->rb_left */
152 tmp->__rb_parent_color = pc = node->__rb_parent_color;
153 parent = __rb_parent(pc);
154 __rb_change_child(node, tmp, parent, root);
155 rebalance = NULL;
156 tmp = parent;
157 } else {
158 struct rb_node *successor = child, *child2;
159 tmp = child->rb_left;
160 if (!tmp) {
161 /*
162 * Case 2: node's successor is its right child
163 *
164 * (n) (s)
165 * / \ / \
166 * (x) (s) -> (x) (c)
167 * \
168 * (c)
169 */
170 parent = successor;
171 child2 = successor->rb_right;
172 augment->copy(node, successor);
173 } else {
174 /*
175 * Case 3: node's successor is leftmost under
176 * node's right child subtree
177 *
178 * (n) (s)
179 * / \ / \
180 * (x) (y) -> (x) (y)
181 * / /
182 * (p) (p)
183 * / /
184 * (s) (c)
185 * \
186 * (c)
187 */
188 do {
189 parent = successor;
190 successor = tmp;
191 tmp = tmp->rb_left;
192 } while (tmp);
193 parent->rb_left = child2 = successor->rb_right;
194 successor->rb_right = child;
195 rb_set_parent(child, successor);
196 augment->copy(node, successor);
197 augment->propagate(parent, successor);
198 }
199
200 successor->rb_left = tmp = node->rb_left;
201 rb_set_parent(tmp, successor);
202
203 pc = node->__rb_parent_color;
204 tmp = __rb_parent(pc);
205 __rb_change_child(node, successor, tmp, root);
206 if (child2) {
207 successor->__rb_parent_color = pc;
208 rb_set_parent_color(child2, parent, RB_BLACK);
209 rebalance = NULL;
210 } else {
211 unsigned long pc2 = successor->__rb_parent_color;
212 successor->__rb_parent_color = pc;
213 rebalance = __rb_is_black(pc2) ? parent : NULL;
214 }
215 tmp = successor;
216 }
217
218 augment->propagate(tmp, NULL);
219 if (rebalance)
220 __rb_erase_color(rebalance, root, augment->rotate);
221}
222
223#endif /* _LINUX_RBTREE_AUGMENTED_H */
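
A hedged sketch of how a user of this new header wires up an augmented tree, keyed on 'start' and caching the maximum 'last' per subtree. All itnode_* names are illustrative; only RB_DECLARE_CALLBACKS(), rb_insert_augmented() and rb_erase_augmented() come from the header above.

#include <linux/rbtree_augmented.h>

struct itnode {
	struct rb_node rb;
	unsigned long start, last;	/* interval [start;last], ordered by start */
	unsigned long subtree_last;	/* augmented: max 'last' in this subtree */
};

static inline unsigned long itnode_compute_subtree_last(struct itnode *node)
{
	unsigned long max = node->last, child;

	if (node->rb.rb_left) {
		child = rb_entry(node->rb.rb_left, struct itnode, rb)->subtree_last;
		if (child > max)
			max = child;
	}
	if (node->rb.rb_right) {
		child = rb_entry(node->rb.rb_right, struct itnode, rb)->subtree_last;
		if (child > max)
			max = child;
	}
	return max;
}

RB_DECLARE_CALLBACKS(static, itnode_callbacks, struct itnode, rb,
		     unsigned long, subtree_last, itnode_compute_subtree_last)

static void itnode_insert(struct itnode *new, struct rb_root *root)
{
	struct rb_node **link = &root->rb_node, *parent = NULL;

	while (*link) {
		struct itnode *cur = rb_entry(*link, struct itnode, rb);

		parent = *link;
		if (new->last > cur->subtree_last)
			cur->subtree_last = new->last;	/* fix augment on the way down */
		link = (new->start < cur->start) ?
			&(*link)->rb_left : &(*link)->rb_right;
	}

	new->subtree_last = new->last;
	rb_link_node(&new->rb, parent, link);
	rb_insert_augmented(&new->rb, root, &itnode_callbacks);
}

static void itnode_remove(struct itnode *node, struct rb_root *root)
{
	rb_erase_augmented(&node->rb, root, &itnode_callbacks);
}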
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 3fce545df394..bfe1f4780644 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -37,14 +37,14 @@ struct anon_vma {
37 atomic_t refcount; 37 atomic_t refcount;
38 38
39 /* 39 /*
40 * NOTE: the LSB of the head.next is set by 40 * NOTE: the LSB of the rb_root.rb_node is set by
41 * mm_take_all_locks() _after_ taking the above lock. So the 41 * mm_take_all_locks() _after_ taking the above lock. So the
42 * head must only be read/written after taking the above lock 42 * rb_root must only be read/written after taking the above lock
43 * to be sure to see a valid next pointer. The LSB bit itself 43 * to be sure to see a valid next pointer. The LSB bit itself
44 * is serialized by a system wide lock only visible to 44 * is serialized by a system wide lock only visible to
45 * mm_take_all_locks() (mm_all_locks_mutex). 45 * mm_take_all_locks() (mm_all_locks_mutex).
46 */ 46 */
47 struct list_head head; /* Chain of private "related" vmas */ 47 struct rb_root rb_root; /* Interval tree of private "related" vmas */
48}; 48};
49 49
50/* 50/*
@@ -57,14 +57,29 @@ struct anon_vma {
57 * with a VMA, or the VMAs associated with an anon_vma. 57 * with a VMA, or the VMAs associated with an anon_vma.
58 * The "same_vma" list contains the anon_vma_chains linking 58 * The "same_vma" list contains the anon_vma_chains linking
59 * all the anon_vmas associated with this VMA. 59 * all the anon_vmas associated with this VMA.
60 * The "same_anon_vma" list contains the anon_vma_chains 60 * The "rb" field indexes on an interval tree the anon_vma_chains
61 * which link all the VMAs associated with this anon_vma. 61 * which link all the VMAs associated with this anon_vma.
62 */ 62 */
63struct anon_vma_chain { 63struct anon_vma_chain {
64 struct vm_area_struct *vma; 64 struct vm_area_struct *vma;
65 struct anon_vma *anon_vma; 65 struct anon_vma *anon_vma;
66 struct list_head same_vma; /* locked by mmap_sem & page_table_lock */ 66 struct list_head same_vma; /* locked by mmap_sem & page_table_lock */
67 struct list_head same_anon_vma; /* locked by anon_vma->mutex */ 67 struct rb_node rb; /* locked by anon_vma->mutex */
68 unsigned long rb_subtree_last;
69#ifdef CONFIG_DEBUG_VM_RB
70 unsigned long cached_vma_start, cached_vma_last;
71#endif
72};
73
74enum ttu_flags {
75 TTU_UNMAP = 0, /* unmap mode */
76 TTU_MIGRATION = 1, /* migration mode */
77 TTU_MUNLOCK = 2, /* munlock mode */
78 TTU_ACTION_MASK = 0xff,
79
80 TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */
81 TTU_IGNORE_ACCESS = (1 << 9), /* don't age */
82 TTU_IGNORE_HWPOISON = (1 << 10),/* corrupted page is recoverable */
68}; 83};
69 84
70#ifdef CONFIG_MMU 85#ifdef CONFIG_MMU
@@ -120,7 +135,6 @@ void anon_vma_init(void); /* create anon_vma_cachep */
120int anon_vma_prepare(struct vm_area_struct *); 135int anon_vma_prepare(struct vm_area_struct *);
121void unlink_anon_vmas(struct vm_area_struct *); 136void unlink_anon_vmas(struct vm_area_struct *);
122int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *); 137int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *);
123void anon_vma_moveto_tail(struct vm_area_struct *);
124int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *); 138int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *);
125 139
126static inline void anon_vma_merge(struct vm_area_struct *vma, 140static inline void anon_vma_merge(struct vm_area_struct *vma,
@@ -161,16 +175,6 @@ int page_referenced(struct page *, int is_locked,
161int page_referenced_one(struct page *, struct vm_area_struct *, 175int page_referenced_one(struct page *, struct vm_area_struct *,
162 unsigned long address, unsigned int *mapcount, unsigned long *vm_flags); 176 unsigned long address, unsigned int *mapcount, unsigned long *vm_flags);
163 177
164enum ttu_flags {
165 TTU_UNMAP = 0, /* unmap mode */
166 TTU_MIGRATION = 1, /* migration mode */
167 TTU_MUNLOCK = 2, /* munlock mode */
168 TTU_ACTION_MASK = 0xff,
169
170 TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */
171 TTU_IGNORE_ACCESS = (1 << 9), /* don't age */
172 TTU_IGNORE_HWPOISON = (1 << 10),/* corrupted page is recoverable */
173};
174#define TTU_ACTION(x) ((x) & TTU_ACTION_MASK) 178#define TTU_ACTION(x) ((x) & TTU_ACTION_MASK)
175 179
176int try_to_unmap(struct page *, enum ttu_flags flags); 180int try_to_unmap(struct page *, enum ttu_flags flags);
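
With anon_vma's list head replaced by an interval tree of anon_vma_chains, the anon rmap walk changes shape too. A hedged sketch follows; the function, its locals and the anon_vma_lock()/anon_vma_unlock() helpers are assumptions about callers of this era, not part of this hunk.

#include <linux/mm.h>
#include <linux/rmap.h>

static void walk_anon_mappers(struct anon_vma *anon_vma, struct page *page)
{
	struct anon_vma_chain *avc;
	pgoff_t pgoff = page->index;	/* vma-relative index stored at fault time */

	anon_vma_lock(anon_vma);
	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
		struct vm_area_struct *vma = avc->vma;
		unsigned long address = vma->vm_start +
			((pgoff - vma->vm_pgoff) << PAGE_SHIFT);

		pr_debug("anon vma %p maps the page at %#lx\n", vma, address);
	}
	anon_vma_unlock(anon_vma);
}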
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9c5612f0374b..c2070e92a9d6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -671,7 +671,6 @@ struct signal_struct {
671 struct rw_semaphore group_rwsem; 671 struct rw_semaphore group_rwsem;
672#endif 672#endif
673 673
674 int oom_adj; /* OOM kill score adjustment (bit shift) */
675 int oom_score_adj; /* OOM kill score adjustment */ 674 int oom_score_adj; /* OOM kill score adjustment */
676 int oom_score_adj_min; /* OOM kill score adjustment minimum value. 675 int oom_score_adj_min; /* OOM kill score adjustment minimum value.
677 * Only settable by CAP_SYS_RESOURCE. */ 676 * Only settable by CAP_SYS_RESOURCE. */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 388e70601413..68df9c17fbbb 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -281,7 +281,7 @@ static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order)
281} 281}
282#endif 282#endif
283 283
284extern int page_evictable(struct page *page, struct vm_area_struct *vma); 284extern int page_evictable(struct page *page);
285extern void check_move_unevictable_pages(struct page **, int nr_pages); 285extern void check_move_unevictable_pages(struct page **, int nr_pages);
286 286
287extern unsigned long scan_unevictable_pages; 287extern unsigned long scan_unevictable_pages;
diff --git a/include/linux/timerqueue.h b/include/linux/timerqueue.h
index 5088727478fd..a520fd70a59f 100644
--- a/include/linux/timerqueue.h
+++ b/include/linux/timerqueue.h
@@ -39,7 +39,7 @@ struct timerqueue_node *timerqueue_getnext(struct timerqueue_head *head)
39 39
40static inline void timerqueue_init(struct timerqueue_node *node) 40static inline void timerqueue_init(struct timerqueue_node *node)
41{ 41{
42 rb_init_node(&node->node); 42 RB_CLEAR_NODE(&node->node);
43} 43}
44 44
45static inline void timerqueue_init_head(struct timerqueue_head *head) 45static inline void timerqueue_init_head(struct timerqueue_head *head)
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 57f7b1091511..3d3114594370 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -52,7 +52,6 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
52 UNEVICTABLE_PGMUNLOCKED, 52 UNEVICTABLE_PGMUNLOCKED,
53 UNEVICTABLE_PGCLEARED, /* on COW, page truncate */ 53 UNEVICTABLE_PGCLEARED, /* on COW, page truncate */
54 UNEVICTABLE_PGSTRANDED, /* unable to isolate on unlock */ 54 UNEVICTABLE_PGSTRANDED, /* unable to isolate on unlock */
55 UNEVICTABLE_MLOCKFREED,
56#ifdef CONFIG_TRANSPARENT_HUGEPAGE 55#ifdef CONFIG_TRANSPARENT_HUGEPAGE
57 THP_FAULT_ALLOC, 56 THP_FAULT_ALLOC,
58 THP_FAULT_FALLBACK, 57 THP_FAULT_FALLBACK,
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index ad2cfd53dadc..92a86b2cce33 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -198,6 +198,8 @@ extern void __dec_zone_state(struct zone *, enum zone_stat_item);
198void refresh_cpu_vm_stats(int); 198void refresh_cpu_vm_stats(int);
199void refresh_zone_stat_thresholds(void); 199void refresh_zone_stat_thresholds(void);
200 200
201void drain_zonestat(struct zone *zone, struct per_cpu_pageset *);
202
201int calculate_pressure_threshold(struct zone *zone); 203int calculate_pressure_threshold(struct zone *zone);
202int calculate_normal_threshold(struct zone *zone); 204int calculate_normal_threshold(struct zone *zone);
203void set_pgdat_percpu_threshold(pg_data_t *pgdat, 205void set_pgdat_percpu_threshold(pg_data_t *pgdat,
@@ -251,8 +253,18 @@ static inline void __dec_zone_page_state(struct page *page,
251static inline void refresh_cpu_vm_stats(int cpu) { } 253static inline void refresh_cpu_vm_stats(int cpu) { }
252static inline void refresh_zone_stat_thresholds(void) { } 254static inline void refresh_zone_stat_thresholds(void) { }
253 255
256static inline void drain_zonestat(struct zone *zone,
257 struct per_cpu_pageset *pset) { }
254#endif /* CONFIG_SMP */ 258#endif /* CONFIG_SMP */
255 259
260static inline void __mod_zone_freepage_state(struct zone *zone, int nr_pages,
261 int migratetype)
262{
263 __mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages);
264 if (is_migrate_cma(migratetype))
265 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages);
266}
267
256extern const char * const vmstat_text[]; 268extern const char * const vmstat_text[];
257 269
258#endif /* _LINUX_VMSTAT_H */ 270#endif /* _LINUX_VMSTAT_H */
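
A rough sketch of how the new __mod_zone_freepage_state() helper would be driven from an allocator path, assuming the caller already holds zone->lock with interrupts off (as the non-atomic __mod_zone_page_state() variants require). The demo_* wrappers are illustrative only; the authoritative callers are in mm/page_alloc.c:

#include <linux/mm.h>
#include <linux/vmstat.h>

/* pages of @order returned to the free lists of @migratetype */
static void demo_account_freed(struct zone *zone, unsigned int order,
                               int migratetype)
{
        __mod_zone_freepage_state(zone, 1 << order, migratetype);
}

/* pages of @order taken off the free lists of @migratetype */
static void demo_account_allocated(struct zone *zone, unsigned int order,
                                   int migratetype)
{
        __mod_zone_freepage_state(zone, -(1 << order), migratetype);
}
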
diff --git a/include/trace/events/gfpflags.h b/include/trace/events/gfpflags.h
index d6fd8e5b14b7..9391706e9254 100644
--- a/include/trace/events/gfpflags.h
+++ b/include/trace/events/gfpflags.h
@@ -36,7 +36,6 @@
36 {(unsigned long)__GFP_RECLAIMABLE, "GFP_RECLAIMABLE"}, \ 36 {(unsigned long)__GFP_RECLAIMABLE, "GFP_RECLAIMABLE"}, \
37 {(unsigned long)__GFP_MOVABLE, "GFP_MOVABLE"}, \ 37 {(unsigned long)__GFP_MOVABLE, "GFP_MOVABLE"}, \
38 {(unsigned long)__GFP_NOTRACK, "GFP_NOTRACK"}, \ 38 {(unsigned long)__GFP_NOTRACK, "GFP_NOTRACK"}, \
39 {(unsigned long)__GFP_NO_KSWAPD, "GFP_NO_KSWAPD"}, \
40 {(unsigned long)__GFP_OTHER_NODE, "GFP_OTHER_NODE"} \ 39 {(unsigned long)__GFP_OTHER_NODE, "GFP_OTHER_NODE"} \
41 ) : "GFP_NOWAIT" 40 ) : "GFP_NOWAIT"
42 41
diff --git a/init/Kconfig b/init/Kconfig
index ed6334dd5e71..4c93533da42c 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1125,10 +1125,12 @@ menuconfig EXPERT
1125 environments which can tolerate a "non-standard" kernel. 1125 environments which can tolerate a "non-standard" kernel.
1126 Only use this if you really know what you are doing. 1126 Only use this if you really know what you are doing.
1127 1127
1128config HAVE_UID16
1129 bool
1130
1128config UID16 1131config UID16
1129 bool "Enable 16-bit UID system calls" if EXPERT 1132 bool "Enable 16-bit UID system calls" if EXPERT
1130 depends on ARM || BLACKFIN || CRIS || FRV || H8300 || X86_32 || M68K || (S390 && !64BIT) || SUPERH || SPARC32 || (SPARC64 && COMPAT) || UML || (X86_64 && IA32_EMULATION) \ 1133 depends on HAVE_UID16
1131 || AARCH32_EMULATION
1132 default y 1134 default y
1133 help 1135 help
1134 This enables the legacy 16-bit UID syscall wrappers. 1136 This enables the legacy 16-bit UID syscall wrappers.
@@ -1150,6 +1152,11 @@ config SYSCTL_SYSCALL
1150 1152
1151 If unsure say N here. 1153 If unsure say N here.
1152 1154
1155config SYSCTL_EXCEPTION_TRACE
1156 bool
1157 help
1158 Enable support for /proc/sys/debug/exception-trace.
1159
1153config KALLSYMS 1160config KALLSYMS
1154 bool "Load all symbols for debugging/ksymoops" if EXPERT 1161 bool "Load all symbols for debugging/ksymoops" if EXPERT
1155 default y 1162 default y
diff --git a/init/main.c b/init/main.c
index db34c0ec4711..313360fe1118 100644
--- a/init/main.c
+++ b/init/main.c
@@ -86,7 +86,6 @@ extern void init_IRQ(void);
86extern void fork_init(unsigned long); 86extern void fork_init(unsigned long);
87extern void mca_init(void); 87extern void mca_init(void);
88extern void sbus_init(void); 88extern void sbus_init(void);
89extern void prio_tree_init(void);
90extern void radix_tree_init(void); 89extern void radix_tree_init(void);
91#ifndef CONFIG_DEBUG_RODATA 90#ifndef CONFIG_DEBUG_RODATA
92static inline void mark_rodata_ro(void) { } 91static inline void mark_rodata_ro(void) { }
@@ -547,7 +546,6 @@ asmlinkage void __init start_kernel(void)
547 /* init some links before init_ISA_irqs() */ 546 /* init some links before init_ISA_irqs() */
548 early_irq_init(); 547 early_irq_init();
549 init_IRQ(); 548 init_IRQ();
550 prio_tree_init();
551 init_timers(); 549 init_timers();
552 hrtimers_init(); 550 hrtimers_init();
553 softirq_init(); 551 softirq_init();
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 6d255e535d03..6b97e2466fad 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -142,7 +142,6 @@ static int msg_insert(struct msg_msg *msg, struct mqueue_inode_info *info)
142 leaf = kmalloc(sizeof(*leaf), GFP_ATOMIC); 142 leaf = kmalloc(sizeof(*leaf), GFP_ATOMIC);
143 if (!leaf) 143 if (!leaf)
144 return -ENOMEM; 144 return -ENOMEM;
145 rb_init_node(&leaf->rb_node);
146 INIT_LIST_HEAD(&leaf->msg_list); 145 INIT_LIST_HEAD(&leaf->msg_list);
147 info->qsize += sizeof(*leaf); 146 info->qsize += sizeof(*leaf);
148 } 147 }
@@ -1013,7 +1012,6 @@ SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr,
1013 1012
1014 if (!info->node_cache && new_leaf) { 1013 if (!info->node_cache && new_leaf) {
1015 /* Save our speculative allocation into the cache */ 1014 /* Save our speculative allocation into the cache */
1016 rb_init_node(&new_leaf->rb_node);
1017 INIT_LIST_HEAD(&new_leaf->msg_list); 1015 INIT_LIST_HEAD(&new_leaf->msg_list);
1018 info->node_cache = new_leaf; 1016 info->node_cache = new_leaf;
1019 info->qsize += sizeof(*new_leaf); 1017 info->qsize += sizeof(*new_leaf);
@@ -1121,7 +1119,6 @@ SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes, char __user *, u_msg_ptr,
1121 1119
1122 if (!info->node_cache && new_leaf) { 1120 if (!info->node_cache && new_leaf) {
1123 /* Save our speculative allocation into the cache */ 1121 /* Save our speculative allocation into the cache */
1124 rb_init_node(&new_leaf->rb_node);
1125 INIT_LIST_HEAD(&new_leaf->msg_list); 1122 INIT_LIST_HEAD(&new_leaf->msg_list);
1126 info->node_cache = new_leaf; 1123 info->node_cache = new_leaf;
1127 info->qsize += sizeof(*new_leaf); 1124 info->qsize += sizeof(*new_leaf);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 29e090cc0e46..f4a7756f999c 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1151,7 +1151,6 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
1151 const struct cred *cred; 1151 const struct cred *cred;
1152 char name[sizeof(tsk->comm)]; 1152 char name[sizeof(tsk->comm)];
1153 struct mm_struct *mm = tsk->mm; 1153 struct mm_struct *mm = tsk->mm;
1154 struct vm_area_struct *vma;
1155 char *tty; 1154 char *tty;
1156 1155
1157 if (!ab) 1156 if (!ab)
@@ -1191,16 +1190,8 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
1191 1190
1192 if (mm) { 1191 if (mm) {
1193 down_read(&mm->mmap_sem); 1192 down_read(&mm->mmap_sem);
1194 vma = mm->mmap; 1193 if (mm->exe_file)
1195 while (vma) { 1194 audit_log_d_path(ab, " exe=", &mm->exe_file->f_path);
1196 if ((vma->vm_flags & VM_EXECUTABLE) &&
1197 vma->vm_file) {
1198 audit_log_d_path(ab, " exe=",
1199 &vma->vm_file->f_path);
1200 break;
1201 }
1202 vma = vma->vm_next;
1203 }
1204 up_read(&mm->mmap_sem); 1195 up_read(&mm->mmap_sem);
1205 } 1196 }
1206 audit_log_task_context(ab); 1197 audit_log_task_context(ab);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index f560598807c1..42bd331ee0ab 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -80,6 +80,10 @@ void put_online_cpus(void)
80 if (cpu_hotplug.active_writer == current) 80 if (cpu_hotplug.active_writer == current)
81 return; 81 return;
82 mutex_lock(&cpu_hotplug.lock); 82 mutex_lock(&cpu_hotplug.lock);
83
84 if (WARN_ON(!cpu_hotplug.refcount))
85 cpu_hotplug.refcount++; /* try to fix things up */
86
83 if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer)) 87 if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer))
84 wake_up_process(cpu_hotplug.active_writer); 88 wake_up_process(cpu_hotplug.active_writer);
85 mutex_unlock(&cpu_hotplug.lock); 89 mutex_unlock(&cpu_hotplug.lock);
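
The WARN_ON() added above catches a put_online_cpus() call without a matching get_online_cpus(). For reference, the balanced pattern the refcount expects; demo_walk_online_cpus() and the work inside the loop are hypothetical:

#include <linux/cpu.h>
#include <linux/cpumask.h>

static void demo_walk_online_cpus(void)
{
        int cpu;

        get_online_cpus();              /* pin the current set of online CPUs */
        for_each_online_cpu(cpu) {
                /* per-CPU work that must not race with CPU hotplug */
        }
        put_online_cpus();              /* must pair with get_online_cpus() */
}
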
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f16f3c58f11a..cda3ebd49e86 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3671,7 +3671,7 @@ unlock:
3671 atomic_inc(&event->mmap_count); 3671 atomic_inc(&event->mmap_count);
3672 mutex_unlock(&event->mmap_mutex); 3672 mutex_unlock(&event->mmap_mutex);
3673 3673
3674 vma->vm_flags |= VM_RESERVED; 3674 vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
3675 vma->vm_ops = &perf_mmap_vmops; 3675 vma->vm_ops = &perf_mmap_vmops;
3676 3676
3677 return ret; 3677 return ret;
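
VM_RESERVED users are converted the same way as the perf mmap handler above: the behaviours still relied on are typically spelled out as VM_DONTEXPAND | VM_DONTDUMP. A sketch of the pattern for a driver ->mmap() handler; demo_mmap() and demo_vm_ops are hypothetical names, not from this patch:

#include <linux/fs.h>
#include <linux/mm.h>

static const struct vm_operations_struct demo_vm_ops = {
        /* .fault handler etc. assumed to be provided elsewhere */
};

static int demo_mmap(struct file *file, struct vm_area_struct *vma)
{
        /* never grown by mremap(), never written into core dumps */
        vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
        vma->vm_ops = &demo_vm_ops;
        return 0;
}
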
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 912ef48d28ab..98256bc71ee1 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -141,10 +141,14 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
141 spinlock_t *ptl; 141 spinlock_t *ptl;
142 pte_t *ptep; 142 pte_t *ptep;
143 int err; 143 int err;
144 /* For mmu_notifiers */
145 const unsigned long mmun_start = addr;
146 const unsigned long mmun_end = addr + PAGE_SIZE;
144 147
145 /* For try_to_free_swap() and munlock_vma_page() below */ 148 /* For try_to_free_swap() and munlock_vma_page() below */
146 lock_page(page); 149 lock_page(page);
147 150
151 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
148 err = -EAGAIN; 152 err = -EAGAIN;
149 ptep = page_check_address(page, mm, addr, &ptl, 0); 153 ptep = page_check_address(page, mm, addr, &ptl, 0);
150 if (!ptep) 154 if (!ptep)
@@ -173,6 +177,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
173 177
174 err = 0; 178 err = 0;
175 unlock: 179 unlock:
180 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
176 unlock_page(page); 181 unlock_page(page);
177 return err; 182 return err;
178} 183}
@@ -735,7 +740,6 @@ static struct map_info *
735build_map_info(struct address_space *mapping, loff_t offset, bool is_register) 740build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
736{ 741{
737 unsigned long pgoff = offset >> PAGE_SHIFT; 742 unsigned long pgoff = offset >> PAGE_SHIFT;
738 struct prio_tree_iter iter;
739 struct vm_area_struct *vma; 743 struct vm_area_struct *vma;
740 struct map_info *curr = NULL; 744 struct map_info *curr = NULL;
741 struct map_info *prev = NULL; 745 struct map_info *prev = NULL;
@@ -744,7 +748,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
744 748
745 again: 749 again:
746 mutex_lock(&mapping->i_mmap_mutex); 750 mutex_lock(&mapping->i_mmap_mutex);
747 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 751 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
748 if (!valid_vma(vma, is_register)) 752 if (!valid_vma(vma, is_register))
749 continue; 753 continue;
750 754
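
The start/end pair added around __replace_page() is the general mmu_notifier bracketing for a page-table rewrite: secondary MMUs are told before the PTEs change and again once the change is complete. A stripped-down sketch of the same shape; demo_replace_one_page() and the elided PTE update are placeholders:

#include <linux/mm.h>
#include <linux/mmu_notifier.h>

static void demo_replace_one_page(struct mm_struct *mm, unsigned long addr)
{
        const unsigned long start = addr;
        const unsigned long end = addr + PAGE_SIZE;

        mmu_notifier_invalidate_range_start(mm, start, end);
        /*
         * ...look up and rewrite the PTE for @addr here, under the page
         * table lock, much as __replace_page() does above...
         */
        mmu_notifier_invalidate_range_end(mm, start, end);
}
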
diff --git a/kernel/fork.c b/kernel/fork.c
index a2b1efc20928..1cd7d581b3b2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -423,7 +423,12 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
423 mapping->i_mmap_writable++; 423 mapping->i_mmap_writable++;
424 flush_dcache_mmap_lock(mapping); 424 flush_dcache_mmap_lock(mapping);
425 /* insert tmp into the share list, just after mpnt */ 425 /* insert tmp into the share list, just after mpnt */
426 vma_prio_tree_add(tmp, mpnt); 426 if (unlikely(tmp->vm_flags & VM_NONLINEAR))
427 vma_nonlinear_insert(tmp,
428 &mapping->i_mmap_nonlinear);
429 else
430 vma_interval_tree_insert_after(tmp, mpnt,
431 &mapping->i_mmap);
427 flush_dcache_mmap_unlock(mapping); 432 flush_dcache_mmap_unlock(mapping);
428 mutex_unlock(&mapping->i_mmap_mutex); 433 mutex_unlock(&mapping->i_mmap_mutex);
429 } 434 }
@@ -622,26 +627,6 @@ void mmput(struct mm_struct *mm)
622} 627}
623EXPORT_SYMBOL_GPL(mmput); 628EXPORT_SYMBOL_GPL(mmput);
624 629
625/*
626 * We added or removed a vma mapping the executable. The vmas are only mapped
627 * during exec and are not mapped with the mmap system call.
628 * Callers must hold down_write() on the mm's mmap_sem for these
629 */
630void added_exe_file_vma(struct mm_struct *mm)
631{
632 mm->num_exe_file_vmas++;
633}
634
635void removed_exe_file_vma(struct mm_struct *mm)
636{
637 mm->num_exe_file_vmas--;
638 if ((mm->num_exe_file_vmas == 0) && mm->exe_file) {
639 fput(mm->exe_file);
640 mm->exe_file = NULL;
641 }
642
643}
644
645void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) 630void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
646{ 631{
647 if (new_exe_file) 632 if (new_exe_file)
@@ -649,15 +634,13 @@ void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
649 if (mm->exe_file) 634 if (mm->exe_file)
650 fput(mm->exe_file); 635 fput(mm->exe_file);
651 mm->exe_file = new_exe_file; 636 mm->exe_file = new_exe_file;
652 mm->num_exe_file_vmas = 0;
653} 637}
654 638
655struct file *get_mm_exe_file(struct mm_struct *mm) 639struct file *get_mm_exe_file(struct mm_struct *mm)
656{ 640{
657 struct file *exe_file; 641 struct file *exe_file;
658 642
659 /* We need mmap_sem to protect against races with removal of 643 /* We need mmap_sem to protect against races with removal of exe_file */
660 * VM_EXECUTABLE vmas */
661 down_read(&mm->mmap_sem); 644 down_read(&mm->mmap_sem);
662 exe_file = mm->exe_file; 645 exe_file = mm->exe_file;
663 if (exe_file) 646 if (exe_file)
@@ -1078,7 +1061,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
1078 init_rwsem(&sig->group_rwsem); 1061 init_rwsem(&sig->group_rwsem);
1079#endif 1062#endif
1080 1063
1081 sig->oom_adj = current->signal->oom_adj;
1082 sig->oom_score_adj = current->signal->oom_score_adj; 1064 sig->oom_score_adj = current->signal->oom_score_adj;
1083 sig->oom_score_adj_min = current->signal->oom_score_adj_min; 1065 sig->oom_score_adj_min = current->signal->oom_score_adj_min;
1084 1066
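
The prio_tree iterators that dup_mmap() and build_map_info() used are replaced by the interval tree API declared in <linux/mm.h>. The common lookup idiom after this series, sketched with a hypothetical demo_for_each_mapping_vma(); locking follows the callers above:

#include <linux/fs.h>
#include <linux/mm.h>

static void demo_for_each_mapping_vma(struct address_space *mapping,
                                      pgoff_t first, pgoff_t last)
{
        struct vm_area_struct *vma;

        mutex_lock(&mapping->i_mmap_mutex);
        vma_interval_tree_foreach(vma, &mapping->i_mmap, first, last) {
                /* @vma maps at least one file page in [first, last] */
        }
        mutex_unlock(&mapping->i_mmap_mutex);
}
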
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c2a2f8084bad..26f65eaa01f9 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1549,8 +1549,7 @@ static struct ctl_table fs_table[] = {
1549}; 1549};
1550 1550
1551static struct ctl_table debug_table[] = { 1551static struct ctl_table debug_table[] = {
1552#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \ 1552#ifdef CONFIG_SYSCTL_EXCEPTION_TRACE
1553 defined(CONFIG_S390) || defined(CONFIG_TILE) || defined(CONFIG_ARM64)
1554 { 1553 {
1555 .procname = "exception-trace", 1554 .procname = "exception-trace",
1556 .data = &show_unhandled_signals, 1555 .data = &show_unhandled_signals,
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 7fba3a98967f..28e9d6c98941 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -450,12 +450,12 @@ config SLUB_STATS
450 out which slabs are relevant to a particular load. 450 out which slabs are relevant to a particular load.
451 Try running: slabinfo -DA 451 Try running: slabinfo -DA
452 452
453config HAVE_DEBUG_KMEMLEAK
454 bool
455
453config DEBUG_KMEMLEAK 456config DEBUG_KMEMLEAK
454 bool "Kernel memory leak detector" 457 bool "Kernel memory leak detector"
455 depends on DEBUG_KERNEL && EXPERIMENTAL && \ 458 depends on DEBUG_KERNEL && EXPERIMENTAL && HAVE_DEBUG_KMEMLEAK
456 (X86 || ARM || PPC || MIPS || S390 || SPARC64 || SUPERH || \
457 MICROBLAZE || TILE || ARM64)
458
459 select DEBUG_FS 459 select DEBUG_FS
460 select STACKTRACE if STACKTRACE_SUPPORT 460 select STACKTRACE if STACKTRACE_SUPPORT
461 select KALLSYMS 461 select KALLSYMS
@@ -751,12 +751,12 @@ config DEBUG_HIGHMEM
751 This option enables additional error checking for high memory systems. 751 This option enables additional error checking for high memory systems.
752 Disable for production systems. 752 Disable for production systems.
753 753
754config HAVE_DEBUG_BUGVERBOSE
755 bool
756
754config DEBUG_BUGVERBOSE 757config DEBUG_BUGVERBOSE
755 bool "Verbose BUG() reporting (adds 70K)" if DEBUG_KERNEL && EXPERT 758 bool "Verbose BUG() reporting (adds 70K)" if DEBUG_KERNEL && EXPERT
756 depends on BUG 759 depends on BUG && (GENERIC_BUG || HAVE_DEBUG_BUGVERBOSE)
757 depends on ARM || AVR32 || M32R || M68K || SPARC32 || SPARC64 || \
758 FRV || SUPERH || GENERIC_BUG || BLACKFIN || MN10300 || \
759 TILE || ARM64
760 default y 760 default y
761 help 761 help
762 Say Y here to make BUG() panics output the file name and line number 762 Say Y here to make BUG() panics output the file name and line number
@@ -798,6 +798,15 @@ config DEBUG_VM
798 798
799 If unsure, say N. 799 If unsure, say N.
800 800
801config DEBUG_VM_RB
802 bool "Debug VM red-black trees"
803 depends on DEBUG_VM
804 help
805 Enable this to turn on more extended checks in the virtual-memory
806 system that may impact performance.
807
808 If unsure, say N.
809
801config DEBUG_VIRTUAL 810config DEBUG_VIRTUAL
802 bool "Debug VM translations" 811 bool "Debug VM translations"
803 depends on DEBUG_KERNEL && X86 812 depends on DEBUG_KERNEL && X86
@@ -1282,6 +1291,19 @@ config LATENCYTOP
1282source mm/Kconfig.debug 1291source mm/Kconfig.debug
1283source kernel/trace/Kconfig 1292source kernel/trace/Kconfig
1284 1293
1294config RBTREE_TEST
1295 tristate "Red-Black tree test"
1296 depends on m && DEBUG_KERNEL
1297 help
1298 A benchmark measuring the performance of the rbtree library.
1299 Also includes rbtree invariant checks.
1300
1301config INTERVAL_TREE_TEST
1302 tristate "Interval tree test"
1303 depends on m && DEBUG_KERNEL
1304 help
1305 A benchmark measuring the performance of the interval tree library.
1306
1285config PROVIDE_OHCI1394_DMA_INIT 1307config PROVIDE_OHCI1394_DMA_INIT
1286 bool "Remote debugging over FireWire early on boot" 1308 bool "Remote debugging over FireWire early on boot"
1287 depends on PCI && X86 1309 depends on PCI && X86
diff --git a/lib/Makefile b/lib/Makefile
index 42d283edc4d3..3128e357e286 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -9,7 +9,7 @@ endif
9 9
10lib-y := ctype.o string.o vsprintf.o cmdline.o \ 10lib-y := ctype.o string.o vsprintf.o cmdline.o \
11 rbtree.o radix-tree.o dump_stack.o timerqueue.o\ 11 rbtree.o radix-tree.o dump_stack.o timerqueue.o\
12 idr.o int_sqrt.o extable.o prio_tree.o \ 12 idr.o int_sqrt.o extable.o \
13 sha1.o md5.o irq_regs.o reciprocal_div.o argv_split.o \ 13 sha1.o md5.o irq_regs.o reciprocal_div.o argv_split.o \
14 proportions.o flex_proportions.o prio_heap.o ratelimit.o show_mem.o \ 14 proportions.o flex_proportions.o prio_heap.o ratelimit.o show_mem.o \
15 is_single_threaded.o plist.o decompress.o 15 is_single_threaded.o plist.o decompress.o
@@ -140,6 +140,11 @@ $(foreach file, $(libfdt_files), \
140 $(eval CFLAGS_$(file) = -I$(src)/../scripts/dtc/libfdt)) 140 $(eval CFLAGS_$(file) = -I$(src)/../scripts/dtc/libfdt))
141lib-$(CONFIG_LIBFDT) += $(libfdt_files) 141lib-$(CONFIG_LIBFDT) += $(libfdt_files)
142 142
143obj-$(CONFIG_RBTREE_TEST) += rbtree_test.o
144obj-$(CONFIG_INTERVAL_TREE_TEST) += interval_tree_test.o
145
146interval_tree_test-objs := interval_tree_test_main.o interval_tree.o
147
143hostprogs-y := gen_crc32table 148hostprogs-y := gen_crc32table
144clean-files := crc32table.h 149clean-files := crc32table.h
145 150
diff --git a/lib/interval_tree.c b/lib/interval_tree.c
new file mode 100644
index 000000000000..e6eb406f2d65
--- /dev/null
+++ b/lib/interval_tree.c
@@ -0,0 +1,10 @@
1#include <linux/init.h>
2#include <linux/interval_tree.h>
3#include <linux/interval_tree_generic.h>
4
5#define START(node) ((node)->start)
6#define LAST(node) ((node)->last)
7
8INTERVAL_TREE_DEFINE(struct interval_tree_node, rb,
9 unsigned long, __subtree_last,
10 START, LAST,, interval_tree)
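
INTERVAL_TREE_DEFINE() above expands into interval_tree_insert(), interval_tree_remove(), interval_tree_iter_first() and interval_tree_iter_next(), all operating on closed [start, last] ranges. A minimal sketch of the generated API; the demo_* names are illustrative, and the test module added next exercises the same calls:

#include <linux/interval_tree.h>

static struct rb_root demo_root = RB_ROOT;

static void demo_add(struct interval_tree_node *node,
                     unsigned long start, unsigned long last)
{
        node->start = start;    /* closed interval: last == start is one unit */
        node->last = last;
        interval_tree_insert(node, &demo_root);
}

static unsigned long demo_count_overlaps(unsigned long start, unsigned long last)
{
        struct interval_tree_node *node;
        unsigned long hits = 0;

        for (node = interval_tree_iter_first(&demo_root, start, last); node;
             node = interval_tree_iter_next(node, start, last))
                hits++;
        return hits;
}
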
diff --git a/lib/interval_tree_test_main.c b/lib/interval_tree_test_main.c
new file mode 100644
index 000000000000..b25903987f7a
--- /dev/null
+++ b/lib/interval_tree_test_main.c
@@ -0,0 +1,105 @@
1#include <linux/module.h>
2#include <linux/interval_tree.h>
3#include <linux/random.h>
4#include <asm/timex.h>
5
6#define NODES 100
7#define PERF_LOOPS 100000
8#define SEARCHES 100
9#define SEARCH_LOOPS 10000
10
11static struct rb_root root = RB_ROOT;
12static struct interval_tree_node nodes[NODES];
13static u32 queries[SEARCHES];
14
15static struct rnd_state rnd;
16
17static inline unsigned long
18search(unsigned long query, struct rb_root *root)
19{
20 struct interval_tree_node *node;
21 unsigned long results = 0;
22
23 for (node = interval_tree_iter_first(root, query, query); node;
24 node = interval_tree_iter_next(node, query, query))
25 results++;
26 return results;
27}
28
29static void init(void)
30{
31 int i;
32 for (i = 0; i < NODES; i++) {
33 u32 a = prandom32(&rnd), b = prandom32(&rnd);
34 if (a <= b) {
35 nodes[i].start = a;
36 nodes[i].last = b;
37 } else {
38 nodes[i].start = b;
39 nodes[i].last = a;
40 }
41 }
42 for (i = 0; i < SEARCHES; i++)
43 queries[i] = prandom32(&rnd);
44}
45
46static int interval_tree_test_init(void)
47{
48 int i, j;
49 unsigned long results;
50 cycles_t time1, time2, time;
51
52 printk(KERN_ALERT "interval tree insert/remove");
53
54 prandom32_seed(&rnd, 3141592653589793238ULL);
55 init();
56
57 time1 = get_cycles();
58
59 for (i = 0; i < PERF_LOOPS; i++) {
60 for (j = 0; j < NODES; j++)
61 interval_tree_insert(nodes + j, &root);
62 for (j = 0; j < NODES; j++)
63 interval_tree_remove(nodes + j, &root);
64 }
65
66 time2 = get_cycles();
67 time = time2 - time1;
68
69 time = div_u64(time, PERF_LOOPS);
70 printk(" -> %llu cycles\n", (unsigned long long)time);
71
72 printk(KERN_ALERT "interval tree search");
73
74 for (j = 0; j < NODES; j++)
75 interval_tree_insert(nodes + j, &root);
76
77 time1 = get_cycles();
78
79 results = 0;
80 for (i = 0; i < SEARCH_LOOPS; i++)
81 for (j = 0; j < SEARCHES; j++)
82 results += search(queries[j], &root);
83
84 time2 = get_cycles();
85 time = time2 - time1;
86
87 time = div_u64(time, SEARCH_LOOPS);
88 results = div_u64(results, SEARCH_LOOPS);
89 printk(" -> %llu cycles (%lu results)\n",
90 (unsigned long long)time, results);
91
92 return -EAGAIN; /* failing the init will directly unload the module */
93}
94
95static void interval_tree_test_exit(void)
96{
97 printk(KERN_ALERT "test exit\n");
98}
99
100module_init(interval_tree_test_init)
101module_exit(interval_tree_test_exit)
102
103MODULE_LICENSE("GPL");
104MODULE_AUTHOR("Michel Lespinasse");
105MODULE_DESCRIPTION("Interval Tree test");
diff --git a/lib/prio_tree.c b/lib/prio_tree.c
deleted file mode 100644
index 8d443af03b4c..000000000000
--- a/lib/prio_tree.c
+++ /dev/null
@@ -1,466 +0,0 @@
1/*
2 * lib/prio_tree.c - priority search tree
3 *
4 * Copyright (C) 2004, Rajesh Venkatasubramanian <vrajesh@umich.edu>
5 *
6 * This file is released under the GPL v2.
7 *
8 * Based on the radix priority search tree proposed by Edward M. McCreight
9 * SIAM Journal of Computing, vol. 14, no.2, pages 257-276, May 1985
10 *
11 * 02Feb2004 Initial version
12 */
13
14#include <linux/init.h>
15#include <linux/mm.h>
16#include <linux/prio_tree.h>
17
18/*
19 * A clever mix of heap and radix trees forms a radix priority search tree (PST)
20 * which is useful for storing intervals, e.g, we can consider a vma as a closed
21 * interval of file pages [offset_begin, offset_end], and store all vmas that
22 * map a file in a PST. Then, using the PST, we can answer a stabbing query,
23 * i.e., selecting a set of stored intervals (vmas) that overlap with (map) a
24 * given input interval X (a set of consecutive file pages), in "O(log n + m)"
25 * time where 'log n' is the height of the PST, and 'm' is the number of stored
26 * intervals (vmas) that overlap (map) with the input interval X (the set of
27 * consecutive file pages).
28 *
29 * In our implementation, we store closed intervals of the form [radix_index,
30 * heap_index]. We assume that always radix_index <= heap_index. McCreight's PST
31 * is designed for storing intervals with unique radix indices, i.e., each
32 * interval have different radix_index. However, this limitation can be easily
33 * overcome by using the size, i.e., heap_index - radix_index, as part of the
34 * index, so we index the tree using [(radix_index,size), heap_index].
35 *
36 * When the above-mentioned indexing scheme is used, theoretically, in a 32 bit
37 * machine, the maximum height of a PST can be 64. We can use a balanced version
38 * of the priority search tree to optimize the tree height, but the balanced
39 * tree proposed by McCreight is too complex and memory-hungry for our purpose.
40 */
41
42/*
43 * The following macros are used for implementing prio_tree for i_mmap
44 */
45
46#define RADIX_INDEX(vma) ((vma)->vm_pgoff)
47#define VMA_SIZE(vma) (((vma)->vm_end - (vma)->vm_start) >> PAGE_SHIFT)
48/* avoid overflow */
49#define HEAP_INDEX(vma) ((vma)->vm_pgoff + (VMA_SIZE(vma) - 1))
50
51
52static void get_index(const struct prio_tree_root *root,
53 const struct prio_tree_node *node,
54 unsigned long *radix, unsigned long *heap)
55{
56 if (root->raw) {
57 struct vm_area_struct *vma = prio_tree_entry(
58 node, struct vm_area_struct, shared.prio_tree_node);
59
60 *radix = RADIX_INDEX(vma);
61 *heap = HEAP_INDEX(vma);
62 }
63 else {
64 *radix = node->start;
65 *heap = node->last;
66 }
67}
68
69static unsigned long index_bits_to_maxindex[BITS_PER_LONG];
70
71void __init prio_tree_init(void)
72{
73 unsigned int i;
74
75 for (i = 0; i < ARRAY_SIZE(index_bits_to_maxindex) - 1; i++)
76 index_bits_to_maxindex[i] = (1UL << (i + 1)) - 1;
77 index_bits_to_maxindex[ARRAY_SIZE(index_bits_to_maxindex) - 1] = ~0UL;
78}
79
80/*
81 * Maximum heap_index that can be stored in a PST with index_bits bits
82 */
83static inline unsigned long prio_tree_maxindex(unsigned int bits)
84{
85 return index_bits_to_maxindex[bits - 1];
86}
87
88static void prio_set_parent(struct prio_tree_node *parent,
89 struct prio_tree_node *child, bool left)
90{
91 if (left)
92 parent->left = child;
93 else
94 parent->right = child;
95
96 child->parent = parent;
97}
98
99/*
100 * Extend a priority search tree so that it can store a node with heap_index
101 * max_heap_index. In the worst case, this algorithm takes O((log n)^2).
102 * However, this function is used rarely and the common case performance is
103 * not bad.
104 */
105static struct prio_tree_node *prio_tree_expand(struct prio_tree_root *root,
106 struct prio_tree_node *node, unsigned long max_heap_index)
107{
108 struct prio_tree_node *prev;
109
110 if (max_heap_index > prio_tree_maxindex(root->index_bits))
111 root->index_bits++;
112
113 prev = node;
114 INIT_PRIO_TREE_NODE(node);
115
116 while (max_heap_index > prio_tree_maxindex(root->index_bits)) {
117 struct prio_tree_node *tmp = root->prio_tree_node;
118
119 root->index_bits++;
120
121 if (prio_tree_empty(root))
122 continue;
123
124 prio_tree_remove(root, root->prio_tree_node);
125 INIT_PRIO_TREE_NODE(tmp);
126
127 prio_set_parent(prev, tmp, true);
128 prev = tmp;
129 }
130
131 if (!prio_tree_empty(root))
132 prio_set_parent(prev, root->prio_tree_node, true);
133
134 root->prio_tree_node = node;
135 return node;
136}
137
138/*
139 * Replace a prio_tree_node with a new node and return the old node
140 */
141struct prio_tree_node *prio_tree_replace(struct prio_tree_root *root,
142 struct prio_tree_node *old, struct prio_tree_node *node)
143{
144 INIT_PRIO_TREE_NODE(node);
145
146 if (prio_tree_root(old)) {
147 BUG_ON(root->prio_tree_node != old);
148 /*
149 * We can reduce root->index_bits here. However, it is complex
150 * and does not help much to improve performance (IMO).
151 */
152 root->prio_tree_node = node;
153 } else
154 prio_set_parent(old->parent, node, old->parent->left == old);
155
156 if (!prio_tree_left_empty(old))
157 prio_set_parent(node, old->left, true);
158
159 if (!prio_tree_right_empty(old))
160 prio_set_parent(node, old->right, false);
161
162 return old;
163}
164
165/*
166 * Insert a prio_tree_node @node into a radix priority search tree @root. The
167 * algorithm typically takes O(log n) time where 'log n' is the number of bits
168 * required to represent the maximum heap_index. In the worst case, the algo
169 * can take O((log n)^2) - check prio_tree_expand.
170 *
171 * If a prior node with same radix_index and heap_index is already found in
172 * the tree, then returns the address of the prior node. Otherwise, inserts
173 * @node into the tree and returns @node.
174 */
175struct prio_tree_node *prio_tree_insert(struct prio_tree_root *root,
176 struct prio_tree_node *node)
177{
178 struct prio_tree_node *cur, *res = node;
179 unsigned long radix_index, heap_index;
180 unsigned long r_index, h_index, index, mask;
181 int size_flag = 0;
182
183 get_index(root, node, &radix_index, &heap_index);
184
185 if (prio_tree_empty(root) ||
186 heap_index > prio_tree_maxindex(root->index_bits))
187 return prio_tree_expand(root, node, heap_index);
188
189 cur = root->prio_tree_node;
190 mask = 1UL << (root->index_bits - 1);
191
192 while (mask) {
193 get_index(root, cur, &r_index, &h_index);
194
195 if (r_index == radix_index && h_index == heap_index)
196 return cur;
197
198 if (h_index < heap_index ||
199 (h_index == heap_index && r_index > radix_index)) {
200 struct prio_tree_node *tmp = node;
201 node = prio_tree_replace(root, cur, node);
202 cur = tmp;
203 /* swap indices */
204 index = r_index;
205 r_index = radix_index;
206 radix_index = index;
207 index = h_index;
208 h_index = heap_index;
209 heap_index = index;
210 }
211
212 if (size_flag)
213 index = heap_index - radix_index;
214 else
215 index = radix_index;
216
217 if (index & mask) {
218 if (prio_tree_right_empty(cur)) {
219 INIT_PRIO_TREE_NODE(node);
220 prio_set_parent(cur, node, false);
221 return res;
222 } else
223 cur = cur->right;
224 } else {
225 if (prio_tree_left_empty(cur)) {
226 INIT_PRIO_TREE_NODE(node);
227 prio_set_parent(cur, node, true);
228 return res;
229 } else
230 cur = cur->left;
231 }
232
233 mask >>= 1;
234
235 if (!mask) {
236 mask = 1UL << (BITS_PER_LONG - 1);
237 size_flag = 1;
238 }
239 }
240 /* Should not reach here */
241 BUG();
242 return NULL;
243}
244
245/*
246 * Remove a prio_tree_node @node from a radix priority search tree @root. The
247 * algorithm takes O(log n) time where 'log n' is the number of bits required
248 * to represent the maximum heap_index.
249 */
250void prio_tree_remove(struct prio_tree_root *root, struct prio_tree_node *node)
251{
252 struct prio_tree_node *cur;
253 unsigned long r_index, h_index_right, h_index_left;
254
255 cur = node;
256
257 while (!prio_tree_left_empty(cur) || !prio_tree_right_empty(cur)) {
258 if (!prio_tree_left_empty(cur))
259 get_index(root, cur->left, &r_index, &h_index_left);
260 else {
261 cur = cur->right;
262 continue;
263 }
264
265 if (!prio_tree_right_empty(cur))
266 get_index(root, cur->right, &r_index, &h_index_right);
267 else {
268 cur = cur->left;
269 continue;
270 }
271
272 /* both h_index_left and h_index_right cannot be 0 */
273 if (h_index_left >= h_index_right)
274 cur = cur->left;
275 else
276 cur = cur->right;
277 }
278
279 if (prio_tree_root(cur)) {
280 BUG_ON(root->prio_tree_node != cur);
281 __INIT_PRIO_TREE_ROOT(root, root->raw);
282 return;
283 }
284
285 if (cur->parent->right == cur)
286 cur->parent->right = cur->parent;
287 else
288 cur->parent->left = cur->parent;
289
290 while (cur != node)
291 cur = prio_tree_replace(root, cur->parent, cur);
292}
293
294static void iter_walk_down(struct prio_tree_iter *iter)
295{
296 iter->mask >>= 1;
297 if (iter->mask) {
298 if (iter->size_level)
299 iter->size_level++;
300 return;
301 }
302
303 if (iter->size_level) {
304 BUG_ON(!prio_tree_left_empty(iter->cur));
305 BUG_ON(!prio_tree_right_empty(iter->cur));
306 iter->size_level++;
307 iter->mask = ULONG_MAX;
308 } else {
309 iter->size_level = 1;
310 iter->mask = 1UL << (BITS_PER_LONG - 1);
311 }
312}
313
314static void iter_walk_up(struct prio_tree_iter *iter)
315{
316 if (iter->mask == ULONG_MAX)
317 iter->mask = 1UL;
318 else if (iter->size_level == 1)
319 iter->mask = 1UL;
320 else
321 iter->mask <<= 1;
322 if (iter->size_level)
323 iter->size_level--;
324 if (!iter->size_level && (iter->value & iter->mask))
325 iter->value ^= iter->mask;
326}
327
328/*
329 * Following functions help to enumerate all prio_tree_nodes in the tree that
330 * overlap with the input interval X [radix_index, heap_index]. The enumeration
331 * takes O(log n + m) time where 'log n' is the height of the tree (which is
332 * proportional to # of bits required to represent the maximum heap_index) and
333 * 'm' is the number of prio_tree_nodes that overlap the interval X.
334 */
335
336static struct prio_tree_node *prio_tree_left(struct prio_tree_iter *iter,
337 unsigned long *r_index, unsigned long *h_index)
338{
339 if (prio_tree_left_empty(iter->cur))
340 return NULL;
341
342 get_index(iter->root, iter->cur->left, r_index, h_index);
343
344 if (iter->r_index <= *h_index) {
345 iter->cur = iter->cur->left;
346 iter_walk_down(iter);
347 return iter->cur;
348 }
349
350 return NULL;
351}
352
353static struct prio_tree_node *prio_tree_right(struct prio_tree_iter *iter,
354 unsigned long *r_index, unsigned long *h_index)
355{
356 unsigned long value;
357
358 if (prio_tree_right_empty(iter->cur))
359 return NULL;
360
361 if (iter->size_level)
362 value = iter->value;
363 else
364 value = iter->value | iter->mask;
365
366 if (iter->h_index < value)
367 return NULL;
368
369 get_index(iter->root, iter->cur->right, r_index, h_index);
370
371 if (iter->r_index <= *h_index) {
372 iter->cur = iter->cur->right;
373 iter_walk_down(iter);
374 return iter->cur;
375 }
376
377 return NULL;
378}
379
380static struct prio_tree_node *prio_tree_parent(struct prio_tree_iter *iter)
381{
382 iter->cur = iter->cur->parent;
383 iter_walk_up(iter);
384 return iter->cur;
385}
386
387static inline int overlap(struct prio_tree_iter *iter,
388 unsigned long r_index, unsigned long h_index)
389{
390 return iter->h_index >= r_index && iter->r_index <= h_index;
391}
392
393/*
394 * prio_tree_first:
395 *
396 * Get the first prio_tree_node that overlaps with the interval [radix_index,
397 * heap_index]. Note that always radix_index <= heap_index. We do a pre-order
398 * traversal of the tree.
399 */
400static struct prio_tree_node *prio_tree_first(struct prio_tree_iter *iter)
401{
402 struct prio_tree_root *root;
403 unsigned long r_index, h_index;
404
405 INIT_PRIO_TREE_ITER(iter);
406
407 root = iter->root;
408 if (prio_tree_empty(root))
409 return NULL;
410
411 get_index(root, root->prio_tree_node, &r_index, &h_index);
412
413 if (iter->r_index > h_index)
414 return NULL;
415
416 iter->mask = 1UL << (root->index_bits - 1);
417 iter->cur = root->prio_tree_node;
418
419 while (1) {
420 if (overlap(iter, r_index, h_index))
421 return iter->cur;
422
423 if (prio_tree_left(iter, &r_index, &h_index))
424 continue;
425
426 if (prio_tree_right(iter, &r_index, &h_index))
427 continue;
428
429 break;
430 }
431 return NULL;
432}
433
434/*
435 * prio_tree_next:
436 *
437 * Get the next prio_tree_node that overlaps with the input interval in iter
438 */
439struct prio_tree_node *prio_tree_next(struct prio_tree_iter *iter)
440{
441 unsigned long r_index, h_index;
442
443 if (iter->cur == NULL)
444 return prio_tree_first(iter);
445
446repeat:
447 while (prio_tree_left(iter, &r_index, &h_index))
448 if (overlap(iter, r_index, h_index))
449 return iter->cur;
450
451 while (!prio_tree_right(iter, &r_index, &h_index)) {
452 while (!prio_tree_root(iter->cur) &&
453 iter->cur->parent->right == iter->cur)
454 prio_tree_parent(iter);
455
456 if (prio_tree_root(iter->cur))
457 return NULL;
458
459 prio_tree_parent(iter);
460 }
461
462 if (overlap(iter, r_index, h_index))
463 return iter->cur;
464
465 goto repeat;
466}
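
For reference, the indexing scheme the removed header comment describes, written out for the i_mmap case: each VMA was stored as the closed file-page interval [RADIX_INDEX(vma), HEAP_INDEX(vma)]. The sketch below only restates those deleted macros (demo_vma_file_interval() is hypothetical); the interval-tree replacement carries the same [start, last] pair in its nodes directly.

#include <linux/mm.h>

static void demo_vma_file_interval(struct vm_area_struct *vma,
                                   unsigned long *first, unsigned long *last)
{
        unsigned long nr_pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;

        *first = vma->vm_pgoff;                    /* RADIX_INDEX(vma) */
        *last  = vma->vm_pgoff + (nr_pages - 1);   /* HEAP_INDEX(vma) */
}
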
diff --git a/lib/rbtree.c b/lib/rbtree.c
index d4175565dc2c..4f56a11d67fa 100644
--- a/lib/rbtree.c
+++ b/lib/rbtree.c
@@ -2,7 +2,8 @@
2 Red Black Trees 2 Red Black Trees
3 (C) 1999 Andrea Arcangeli <andrea@suse.de> 3 (C) 1999 Andrea Arcangeli <andrea@suse.de>
4 (C) 2002 David Woodhouse <dwmw2@infradead.org> 4 (C) 2002 David Woodhouse <dwmw2@infradead.org>
5 5 (C) 2012 Michel Lespinasse <walken@google.com>
6
6 This program is free software; you can redistribute it and/or modify 7 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by 8 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or 9 the Free Software Foundation; either version 2 of the License, or
@@ -20,339 +21,382 @@
20 linux/lib/rbtree.c 21 linux/lib/rbtree.c
21*/ 22*/
22 23
23#include <linux/rbtree.h> 24#include <linux/rbtree_augmented.h>
24#include <linux/export.h> 25#include <linux/export.h>
25 26
26static void __rb_rotate_left(struct rb_node *node, struct rb_root *root) 27/*
27{ 28 * red-black trees properties: http://en.wikipedia.org/wiki/Rbtree
28 struct rb_node *right = node->rb_right; 29 *
29 struct rb_node *parent = rb_parent(node); 30 * 1) A node is either red or black
30 31 * 2) The root is black
31 if ((node->rb_right = right->rb_left)) 32 * 3) All leaves (NULL) are black
32 rb_set_parent(right->rb_left, node); 33 * 4) Both children of every red node are black
33 right->rb_left = node; 34 * 5) Every simple path from root to leaves contains the same number
34 35 * of black nodes.
35 rb_set_parent(right, parent); 36 *
37 * 4 and 5 give the O(log n) guarantee, since 4 implies you cannot have two
38 * consecutive red nodes in a path and every red node is therefore followed by
39 * a black. So if B is the number of black nodes on every simple path (as per
40 * 5), then the longest possible path due to 4 is 2B.
41 *
42 * We shall indicate color with case, where black nodes are uppercase and red
43 * nodes will be lowercase. Unknown color nodes shall be drawn as red within
44 * parentheses and have some accompanying text comment.
45 */
36 46
37 if (parent) 47static inline void rb_set_black(struct rb_node *rb)
38 { 48{
39 if (node == parent->rb_left) 49 rb->__rb_parent_color |= RB_BLACK;
40 parent->rb_left = right;
41 else
42 parent->rb_right = right;
43 }
44 else
45 root->rb_node = right;
46 rb_set_parent(node, right);
47} 50}
48 51
49static void __rb_rotate_right(struct rb_node *node, struct rb_root *root) 52static inline struct rb_node *rb_red_parent(struct rb_node *red)
50{ 53{
51 struct rb_node *left = node->rb_left; 54 return (struct rb_node *)red->__rb_parent_color;
52 struct rb_node *parent = rb_parent(node); 55}
53
54 if ((node->rb_left = left->rb_right))
55 rb_set_parent(left->rb_right, node);
56 left->rb_right = node;
57
58 rb_set_parent(left, parent);
59 56
60 if (parent) 57/*
61 { 58 * Helper function for rotations:
62 if (node == parent->rb_right) 59 * - old's parent and color get assigned to new
63 parent->rb_right = left; 60 * - old gets assigned new as a parent and 'color' as a color.
64 else 61 */
65 parent->rb_left = left; 62static inline void
66 } 63__rb_rotate_set_parents(struct rb_node *old, struct rb_node *new,
67 else 64 struct rb_root *root, int color)
68 root->rb_node = left; 65{
69 rb_set_parent(node, left); 66 struct rb_node *parent = rb_parent(old);
67 new->__rb_parent_color = old->__rb_parent_color;
68 rb_set_parent_color(old, new, color);
69 __rb_change_child(old, new, parent, root);
70} 70}
71 71
72void rb_insert_color(struct rb_node *node, struct rb_root *root) 72static __always_inline void
73__rb_insert(struct rb_node *node, struct rb_root *root,
74 void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
73{ 75{
74 struct rb_node *parent, *gparent; 76 struct rb_node *parent = rb_red_parent(node), *gparent, *tmp;
75 77
76 while ((parent = rb_parent(node)) && rb_is_red(parent)) 78 while (true) {
77 { 79 /*
78 gparent = rb_parent(parent); 80 * Loop invariant: node is red
79 81 *
80 if (parent == gparent->rb_left) 82 * If there is a black parent, we are done.
81 { 83 * Otherwise, take some corrective action as we don't
82 { 84 * want a red root or two consecutive red nodes.
83 register struct rb_node *uncle = gparent->rb_right; 85 */
84 if (uncle && rb_is_red(uncle)) 86 if (!parent) {
85 { 87 rb_set_parent_color(node, NULL, RB_BLACK);
86 rb_set_black(uncle); 88 break;
87 rb_set_black(parent); 89 } else if (rb_is_black(parent))
88 rb_set_red(gparent); 90 break;
89 node = gparent; 91
90 continue; 92 gparent = rb_red_parent(parent);
91 } 93
94 tmp = gparent->rb_right;
95 if (parent != tmp) { /* parent == gparent->rb_left */
96 if (tmp && rb_is_red(tmp)) {
97 /*
98 * Case 1 - color flips
99 *
100 * G g
101 * / \ / \
102 * p u --> P U
103 * / /
104 * n N
105 *
106 * However, since g's parent might be red, and
107 * 4) does not allow this, we need to recurse
108 * at g.
109 */
110 rb_set_parent_color(tmp, gparent, RB_BLACK);
111 rb_set_parent_color(parent, gparent, RB_BLACK);
112 node = gparent;
113 parent = rb_parent(node);
114 rb_set_parent_color(node, parent, RB_RED);
115 continue;
92 } 116 }
93 117
94 if (parent->rb_right == node) 118 tmp = parent->rb_right;
95 { 119 if (node == tmp) {
96 register struct rb_node *tmp; 120 /*
97 __rb_rotate_left(parent, root); 121 * Case 2 - left rotate at parent
98 tmp = parent; 122 *
123 * G G
124 * / \ / \
125 * p U --> n U
126 * \ /
127 * n p
128 *
129 * This still leaves us in violation of 4), the
130 * continuation into Case 3 will fix that.
131 */
132 parent->rb_right = tmp = node->rb_left;
133 node->rb_left = parent;
134 if (tmp)
135 rb_set_parent_color(tmp, parent,
136 RB_BLACK);
137 rb_set_parent_color(parent, node, RB_RED);
138 augment_rotate(parent, node);
99 parent = node; 139 parent = node;
100 node = tmp; 140 tmp = node->rb_right;
101 } 141 }
102 142
103 rb_set_black(parent); 143 /*
104 rb_set_red(gparent); 144 * Case 3 - right rotate at gparent
105 __rb_rotate_right(gparent, root); 145 *
146 * G P
147 * / \ / \
148 * p U --> n g
149 * / \
150 * n U
151 */
152 gparent->rb_left = tmp; /* == parent->rb_right */
153 parent->rb_right = gparent;
154 if (tmp)
155 rb_set_parent_color(tmp, gparent, RB_BLACK);
156 __rb_rotate_set_parents(gparent, parent, root, RB_RED);
157 augment_rotate(gparent, parent);
158 break;
106 } else { 159 } else {
107 { 160 tmp = gparent->rb_left;
108 register struct rb_node *uncle = gparent->rb_left; 161 if (tmp && rb_is_red(tmp)) {
109 if (uncle && rb_is_red(uncle)) 162 /* Case 1 - color flips */
110 { 163 rb_set_parent_color(tmp, gparent, RB_BLACK);
111 rb_set_black(uncle); 164 rb_set_parent_color(parent, gparent, RB_BLACK);
112 rb_set_black(parent); 165 node = gparent;
113 rb_set_red(gparent); 166 parent = rb_parent(node);
114 node = gparent; 167 rb_set_parent_color(node, parent, RB_RED);
115 continue; 168 continue;
116 }
117 } 169 }
118 170
119 if (parent->rb_left == node) 171 tmp = parent->rb_left;
120 { 172 if (node == tmp) {
121 register struct rb_node *tmp; 173 /* Case 2 - right rotate at parent */
122 __rb_rotate_right(parent, root); 174 parent->rb_left = tmp = node->rb_right;
123 tmp = parent; 175 node->rb_right = parent;
176 if (tmp)
177 rb_set_parent_color(tmp, parent,
178 RB_BLACK);
179 rb_set_parent_color(parent, node, RB_RED);
180 augment_rotate(parent, node);
124 parent = node; 181 parent = node;
125 node = tmp; 182 tmp = node->rb_left;
126 } 183 }
127 184
128 rb_set_black(parent); 185 /* Case 3 - left rotate at gparent */
129 rb_set_red(gparent); 186 gparent->rb_right = tmp; /* == parent->rb_left */
130 __rb_rotate_left(gparent, root); 187 parent->rb_left = gparent;
188 if (tmp)
189 rb_set_parent_color(tmp, gparent, RB_BLACK);
190 __rb_rotate_set_parents(gparent, parent, root, RB_RED);
191 augment_rotate(gparent, parent);
192 break;
131 } 193 }
132 } 194 }
133
134 rb_set_black(root->rb_node);
135} 195}
136EXPORT_SYMBOL(rb_insert_color);
137 196
138static void __rb_erase_color(struct rb_node *node, struct rb_node *parent, 197__always_inline void
139 struct rb_root *root) 198__rb_erase_color(struct rb_node *parent, struct rb_root *root,
199 void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
140{ 200{
141 struct rb_node *other; 201 struct rb_node *node = NULL, *sibling, *tmp1, *tmp2;
142 202
143 while ((!node || rb_is_black(node)) && node != root->rb_node) 203 while (true) {
144 { 204 /*
145 if (parent->rb_left == node) 205 * Loop invariants:
146 { 206 * - node is black (or NULL on first iteration)
147 other = parent->rb_right; 207 * - node is not the root (parent is not NULL)
148 if (rb_is_red(other)) 208 * - All leaf paths going through parent and node have a
149 { 209 * black node count that is 1 lower than other leaf paths.
150 rb_set_black(other); 210 */
151 rb_set_red(parent); 211 sibling = parent->rb_right;
152 __rb_rotate_left(parent, root); 212 if (node != sibling) { /* node == parent->rb_left */
153 other = parent->rb_right; 213 if (rb_is_red(sibling)) {
214 /*
215 * Case 1 - left rotate at parent
216 *
217 * P S
218 * / \ / \
219 * N s --> p Sr
220 * / \ / \
221 * Sl Sr N Sl
222 */
223 parent->rb_right = tmp1 = sibling->rb_left;
224 sibling->rb_left = parent;
225 rb_set_parent_color(tmp1, parent, RB_BLACK);
226 __rb_rotate_set_parents(parent, sibling, root,
227 RB_RED);
228 augment_rotate(parent, sibling);
229 sibling = tmp1;
154 } 230 }
155 if ((!other->rb_left || rb_is_black(other->rb_left)) && 231 tmp1 = sibling->rb_right;
156 (!other->rb_right || rb_is_black(other->rb_right))) 232 if (!tmp1 || rb_is_black(tmp1)) {
157 { 233 tmp2 = sibling->rb_left;
158 rb_set_red(other); 234 if (!tmp2 || rb_is_black(tmp2)) {
159 node = parent; 235 /*
160 parent = rb_parent(node); 236 * Case 2 - sibling color flip
161 } 237 * (p could be either color here)
162 else 238 *
163 { 239 * (p) (p)
164 if (!other->rb_right || rb_is_black(other->rb_right)) 240 * / \ / \
165 { 241 * N S --> N s
166 rb_set_black(other->rb_left); 242 * / \ / \
167 rb_set_red(other); 243 * Sl Sr Sl Sr
168 __rb_rotate_right(other, root); 244 *
169 other = parent->rb_right; 245 * This leaves us violating 5) which
246 * can be fixed by flipping p to black
247 * if it was red, or by recursing at p.
248 * p is red when coming from Case 1.
249 */
250 rb_set_parent_color(sibling, parent,
251 RB_RED);
252 if (rb_is_red(parent))
253 rb_set_black(parent);
254 else {
255 node = parent;
256 parent = rb_parent(node);
257 if (parent)
258 continue;
259 }
260 break;
170 } 261 }
171 rb_set_color(other, rb_color(parent)); 262 /*
172 rb_set_black(parent); 263 * Case 3 - right rotate at sibling
173 rb_set_black(other->rb_right); 264 * (p could be either color here)
174 __rb_rotate_left(parent, root); 265 *
175 node = root->rb_node; 266 * (p) (p)
176 break; 267 * / \ / \
177 } 268 * N S --> N Sl
178 } 269 * / \ \
179 else 270 * sl Sr s
180 { 271 * \
181 other = parent->rb_left; 272 * Sr
182 if (rb_is_red(other)) 273 */
183 { 274 sibling->rb_left = tmp1 = tmp2->rb_right;
184 rb_set_black(other); 275 tmp2->rb_right = sibling;
185 rb_set_red(parent); 276 parent->rb_right = tmp2;
186 __rb_rotate_right(parent, root); 277 if (tmp1)
187 other = parent->rb_left; 278 rb_set_parent_color(tmp1, sibling,
279 RB_BLACK);
280 augment_rotate(sibling, tmp2);
281 tmp1 = sibling;
282 sibling = tmp2;
188 } 283 }
189 if ((!other->rb_left || rb_is_black(other->rb_left)) && 284 /*
190 (!other->rb_right || rb_is_black(other->rb_right))) 285 * Case 4 - left rotate at parent + color flips
191 { 286 * (p and sl could be either color here.
192 rb_set_red(other); 287 * After rotation, p becomes black, s acquires
193 node = parent; 288 * p's color, and sl keeps its color)
194 parent = rb_parent(node); 289 *
290 * (p) (s)
291 * / \ / \
292 * N S --> P Sr
293 * / \ / \
294 * (sl) sr N (sl)
295 */
296 parent->rb_right = tmp2 = sibling->rb_left;
297 sibling->rb_left = parent;
298 rb_set_parent_color(tmp1, sibling, RB_BLACK);
299 if (tmp2)
300 rb_set_parent(tmp2, parent);
301 __rb_rotate_set_parents(parent, sibling, root,
302 RB_BLACK);
303 augment_rotate(parent, sibling);
304 break;
305 } else {
306 sibling = parent->rb_left;
307 if (rb_is_red(sibling)) {
308 /* Case 1 - right rotate at parent */
309 parent->rb_left = tmp1 = sibling->rb_right;
310 sibling->rb_right = parent;
311 rb_set_parent_color(tmp1, parent, RB_BLACK);
312 __rb_rotate_set_parents(parent, sibling, root,
313 RB_RED);
314 augment_rotate(parent, sibling);
315 sibling = tmp1;
195 } 316 }
196 else 317 tmp1 = sibling->rb_left;
197 { 318 if (!tmp1 || rb_is_black(tmp1)) {
198 if (!other->rb_left || rb_is_black(other->rb_left)) 319 tmp2 = sibling->rb_right;
199 { 320 if (!tmp2 || rb_is_black(tmp2)) {
200 rb_set_black(other->rb_right); 321 /* Case 2 - sibling color flip */
201 rb_set_red(other); 322 rb_set_parent_color(sibling, parent,
202 __rb_rotate_left(other, root); 323 RB_RED);
203 other = parent->rb_left; 324 if (rb_is_red(parent))
325 rb_set_black(parent);
326 else {
327 node = parent;
328 parent = rb_parent(node);
329 if (parent)
330 continue;
331 }
332 break;
204 } 333 }
205 rb_set_color(other, rb_color(parent)); 334 /* Case 3 - right rotate at sibling */
206 rb_set_black(parent); 335 sibling->rb_right = tmp1 = tmp2->rb_left;
207 rb_set_black(other->rb_left); 336 tmp2->rb_left = sibling;
208 __rb_rotate_right(parent, root); 337 parent->rb_left = tmp2;
209 node = root->rb_node; 338 if (tmp1)
210 break; 339 rb_set_parent_color(tmp1, sibling,
340 RB_BLACK);
341 augment_rotate(sibling, tmp2);
342 tmp1 = sibling;
343 sibling = tmp2;
211 } 344 }
345 /* Case 4 - left rotate at parent + color flips */
346 parent->rb_left = tmp2 = sibling->rb_right;
347 sibling->rb_right = parent;
348 rb_set_parent_color(tmp1, sibling, RB_BLACK);
349 if (tmp2)
350 rb_set_parent(tmp2, parent);
351 __rb_rotate_set_parents(parent, sibling, root,
352 RB_BLACK);
353 augment_rotate(parent, sibling);
354 break;
212 } 355 }
213 } 356 }
214 if (node)
215 rb_set_black(node);
216} 357}
358EXPORT_SYMBOL(__rb_erase_color);
217 359
218void rb_erase(struct rb_node *node, struct rb_root *root) 360/*
219{ 361 * Non-augmented rbtree manipulation functions.
220 struct rb_node *child, *parent; 362 *
221 int color; 363 * We use dummy augmented callbacks here, and have the compiler optimize them
222 364 * out of the rb_insert_color() and rb_erase() function definitions.
223 if (!node->rb_left) 365 */
224 child = node->rb_right;
225 else if (!node->rb_right)
226 child = node->rb_left;
227 else
228 {
229 struct rb_node *old = node, *left;
230
231 node = node->rb_right;
232 while ((left = node->rb_left) != NULL)
233 node = left;
234
235 if (rb_parent(old)) {
236 if (rb_parent(old)->rb_left == old)
237 rb_parent(old)->rb_left = node;
238 else
239 rb_parent(old)->rb_right = node;
240 } else
241 root->rb_node = node;
242
243 child = node->rb_right;
244 parent = rb_parent(node);
245 color = rb_color(node);
246
247 if (parent == old) {
248 parent = node;
249 } else {
250 if (child)
251 rb_set_parent(child, parent);
252 parent->rb_left = child;
253
254 node->rb_right = old->rb_right;
255 rb_set_parent(old->rb_right, node);
256 }
257
258 node->rb_parent_color = old->rb_parent_color;
259 node->rb_left = old->rb_left;
260 rb_set_parent(old->rb_left, node);
261 366
262 goto color; 367static inline void dummy_propagate(struct rb_node *node, struct rb_node *stop) {}
263 } 368static inline void dummy_copy(struct rb_node *old, struct rb_node *new) {}
369static inline void dummy_rotate(struct rb_node *old, struct rb_node *new) {}
264 370
265 parent = rb_parent(node); 371static const struct rb_augment_callbacks dummy_callbacks = {
266 color = rb_color(node); 372 dummy_propagate, dummy_copy, dummy_rotate
267 373};
268 if (child)
269 rb_set_parent(child, parent);
270 if (parent)
271 {
272 if (parent->rb_left == node)
273 parent->rb_left = child;
274 else
275 parent->rb_right = child;
276 }
277 else
278 root->rb_node = child;
279 374
280 color: 375void rb_insert_color(struct rb_node *node, struct rb_root *root)
281 if (color == RB_BLACK)
282 __rb_erase_color(child, parent, root);
283}
284EXPORT_SYMBOL(rb_erase);
285
286static void rb_augment_path(struct rb_node *node, rb_augment_f func, void *data)
287{ 376{
288 struct rb_node *parent; 377 __rb_insert(node, root, dummy_rotate);
289
290up:
291 func(node, data);
292 parent = rb_parent(node);
293 if (!parent)
294 return;
295
296 if (node == parent->rb_left && parent->rb_right)
297 func(parent->rb_right, data);
298 else if (parent->rb_left)
299 func(parent->rb_left, data);
300
301 node = parent;
302 goto up;
303} 378}
379EXPORT_SYMBOL(rb_insert_color);
304 380
305/* 381void rb_erase(struct rb_node *node, struct rb_root *root)
306 * after inserting @node into the tree, update the tree to account for
307 * both the new entry and any damage done by rebalance
308 */
309void rb_augment_insert(struct rb_node *node, rb_augment_f func, void *data)
310{ 382{
311 if (node->rb_left) 383 rb_erase_augmented(node, root, &dummy_callbacks);
312 node = node->rb_left;
313 else if (node->rb_right)
314 node = node->rb_right;
315
316 rb_augment_path(node, func, data);
317} 384}
318EXPORT_SYMBOL(rb_augment_insert); 385EXPORT_SYMBOL(rb_erase);
319 386
320/* 387/*
321 * before removing the node, find the deepest node on the rebalance path 388 * Augmented rbtree manipulation functions.
322 * that will still be there after @node gets removed 389 *
390 * This instantiates the same __always_inline functions as in the non-augmented
391 * case, but this time with user-defined callbacks.
323 */ 392 */
324struct rb_node *rb_augment_erase_begin(struct rb_node *node)
325{
326 struct rb_node *deepest;
327
328 if (!node->rb_right && !node->rb_left)
329 deepest = rb_parent(node);
330 else if (!node->rb_right)
331 deepest = node->rb_left;
332 else if (!node->rb_left)
333 deepest = node->rb_right;
334 else {
335 deepest = rb_next(node);
336 if (deepest->rb_right)
337 deepest = deepest->rb_right;
338 else if (rb_parent(deepest) != node)
339 deepest = rb_parent(deepest);
340 }
341
342 return deepest;
343}
344EXPORT_SYMBOL(rb_augment_erase_begin);
345 393
346/* 394void __rb_insert_augmented(struct rb_node *node, struct rb_root *root,
347 * after removal, update the tree to account for the removed entry 395 void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
348 * and any rebalance damage.
349 */
350void rb_augment_erase_end(struct rb_node *node, rb_augment_f func, void *data)
351{ 396{
352 if (node) 397 __rb_insert(node, root, augment_rotate);
353 rb_augment_path(node, func, data);
354} 398}
355EXPORT_SYMBOL(rb_augment_erase_end); 399EXPORT_SYMBOL(__rb_insert_augmented);
356 400
357/* 401/*
358 * This function returns the first node (in sort order) of the tree. 402 * This function returns the first node (in sort order) of the tree.
@@ -387,11 +431,13 @@ struct rb_node *rb_next(const struct rb_node *node)
387{ 431{
388 struct rb_node *parent; 432 struct rb_node *parent;
389 433
390 if (rb_parent(node) == node) 434 if (RB_EMPTY_NODE(node))
391 return NULL; 435 return NULL;
392 436
393 /* If we have a right-hand child, go down and then left as far 437 /*
394 as we can. */ 438 * If we have a right-hand child, go down and then left as far
439 * as we can.
440 */
395 if (node->rb_right) { 441 if (node->rb_right) {
396 node = node->rb_right; 442 node = node->rb_right;
397 while (node->rb_left) 443 while (node->rb_left)
@@ -399,12 +445,13 @@ struct rb_node *rb_next(const struct rb_node *node)
399 return (struct rb_node *)node; 445 return (struct rb_node *)node;
400 } 446 }
401 447
402 /* No right-hand children. Everything down and left is 448 /*
403 smaller than us, so any 'next' node must be in the general 449 * No right-hand children. Everything down and left is smaller than us,
404 direction of our parent. Go up the tree; any time the 450 * so any 'next' node must be in the general direction of our parent.
405 ancestor is a right-hand child of its parent, keep going 451 * Go up the tree; any time the ancestor is a right-hand child of its
406 up. First time it's a left-hand child of its parent, said 452 * parent, keep going up. First time it's a left-hand child of its
407 parent is our 'next' node. */ 453 * parent, said parent is our 'next' node.
454 */
408 while ((parent = rb_parent(node)) && node == parent->rb_right) 455 while ((parent = rb_parent(node)) && node == parent->rb_right)
409 node = parent; 456 node = parent;
410 457
@@ -416,11 +463,13 @@ struct rb_node *rb_prev(const struct rb_node *node)
416{ 463{
417 struct rb_node *parent; 464 struct rb_node *parent;
418 465
419 if (rb_parent(node) == node) 466 if (RB_EMPTY_NODE(node))
420 return NULL; 467 return NULL;
421 468
422 /* If we have a left-hand child, go down and then right as far 469 /*
423 as we can. */ 470 * If we have a left-hand child, go down and then right as far
471 * as we can.
472 */
424 if (node->rb_left) { 473 if (node->rb_left) {
425 node = node->rb_left; 474 node = node->rb_left;
426 while (node->rb_right) 475 while (node->rb_right)
@@ -428,8 +477,10 @@ struct rb_node *rb_prev(const struct rb_node *node)
428 return (struct rb_node *)node; 477 return (struct rb_node *)node;
429 } 478 }
430 479
431 /* No left-hand children. Go up till we find an ancestor which 480 /*
432 is a right-hand child of its parent */ 481 * No left-hand children. Go up till we find an ancestor which
482 * is a right-hand child of its parent.
483 */
433 while ((parent = rb_parent(node)) && node == parent->rb_left) 484 while ((parent = rb_parent(node)) && node == parent->rb_left)
434 node = parent; 485 node = parent;
435 486
@@ -443,14 +494,7 @@ void rb_replace_node(struct rb_node *victim, struct rb_node *new,
443 struct rb_node *parent = rb_parent(victim); 494 struct rb_node *parent = rb_parent(victim);
444 495
445 /* Set the surrounding nodes to point to the replacement */ 496 /* Set the surrounding nodes to point to the replacement */
446 if (parent) { 497 __rb_change_child(victim, new, parent, root);
447 if (victim == parent->rb_left)
448 parent->rb_left = new;
449 else
450 parent->rb_right = new;
451 } else {
452 root->rb_node = new;
453 }
454 if (victim->rb_left) 498 if (victim->rb_left)
455 rb_set_parent(victim->rb_left, new); 499 rb_set_parent(victim->rb_left, new);
456 if (victim->rb_right) 500 if (victim->rb_right)
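For reference, a minimal sketch (not part of this patch) of how the rb_first()/rb_next() iteration documented in the hunks above is typically consumed. "struct thing" and its "key" field are invented for the example; the API calls themselves are the ones shown in lib/rbtree.c.

#include <linux/rbtree.h>

struct thing {
	struct rb_node node;
	unsigned long key;
};

/* Walk the tree in ascending key order and sum the keys. */
static unsigned long sum_keys(struct rb_root *root)
{
	struct rb_node *rb;
	unsigned long sum = 0;

	for (rb = rb_first(root); rb; rb = rb_next(rb)) {
		struct thing *t = rb_entry(rb, struct thing, node);

		sum += t->key;
	}
	return sum;
}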
diff --git a/lib/rbtree_test.c b/lib/rbtree_test.c
new file mode 100644
index 000000000000..268b23951fec
--- /dev/null
+++ b/lib/rbtree_test.c
@@ -0,0 +1,234 @@
1#include <linux/module.h>
2#include <linux/rbtree_augmented.h>
3#include <linux/random.h>
4#include <asm/timex.h>
5
6#define NODES 100
7#define PERF_LOOPS 100000
8#define CHECK_LOOPS 100
9
10struct test_node {
11 struct rb_node rb;
12 u32 key;
13
14 /* following fields used for testing augmented rbtree functionality */
15 u32 val;
16 u32 augmented;
17};
18
19static struct rb_root root = RB_ROOT;
20static struct test_node nodes[NODES];
21
22static struct rnd_state rnd;
23
24static void insert(struct test_node *node, struct rb_root *root)
25{
26 struct rb_node **new = &root->rb_node, *parent = NULL;
27 u32 key = node->key;
28
29 while (*new) {
30 parent = *new;
31 if (key < rb_entry(parent, struct test_node, rb)->key)
32 new = &parent->rb_left;
33 else
34 new = &parent->rb_right;
35 }
36
37 rb_link_node(&node->rb, parent, new);
38 rb_insert_color(&node->rb, root);
39}
40
41static inline void erase(struct test_node *node, struct rb_root *root)
42{
43 rb_erase(&node->rb, root);
44}
45
46static inline u32 augment_recompute(struct test_node *node)
47{
48 u32 max = node->val, child_augmented;
49 if (node->rb.rb_left) {
50 child_augmented = rb_entry(node->rb.rb_left, struct test_node,
51 rb)->augmented;
52 if (max < child_augmented)
53 max = child_augmented;
54 }
55 if (node->rb.rb_right) {
56 child_augmented = rb_entry(node->rb.rb_right, struct test_node,
57 rb)->augmented;
58 if (max < child_augmented)
59 max = child_augmented;
60 }
61 return max;
62}
63
64RB_DECLARE_CALLBACKS(static, augment_callbacks, struct test_node, rb,
65 u32, augmented, augment_recompute)
66
67static void insert_augmented(struct test_node *node, struct rb_root *root)
68{
69 struct rb_node **new = &root->rb_node, *rb_parent = NULL;
70 u32 key = node->key;
71 u32 val = node->val;
72 struct test_node *parent;
73
74 while (*new) {
75 rb_parent = *new;
76 parent = rb_entry(rb_parent, struct test_node, rb);
77 if (parent->augmented < val)
78 parent->augmented = val;
79 if (key < parent->key)
80 new = &parent->rb.rb_left;
81 else
82 new = &parent->rb.rb_right;
83 }
84
85 node->augmented = val;
86 rb_link_node(&node->rb, rb_parent, new);
87 rb_insert_augmented(&node->rb, root, &augment_callbacks);
88}
89
90static void erase_augmented(struct test_node *node, struct rb_root *root)
91{
92 rb_erase_augmented(&node->rb, root, &augment_callbacks);
93}
94
95static void init(void)
96{
97 int i;
98 for (i = 0; i < NODES; i++) {
99 nodes[i].key = prandom32(&rnd);
100 nodes[i].val = prandom32(&rnd);
101 }
102}
103
104static bool is_red(struct rb_node *rb)
105{
106 return !(rb->__rb_parent_color & 1);
107}
108
109static int black_path_count(struct rb_node *rb)
110{
111 int count;
112 for (count = 0; rb; rb = rb_parent(rb))
113 count += !is_red(rb);
114 return count;
115}
116
117static void check(int nr_nodes)
118{
119 struct rb_node *rb;
120 int count = 0;
121 int blacks;
122 u32 prev_key = 0;
123
124 for (rb = rb_first(&root); rb; rb = rb_next(rb)) {
125 struct test_node *node = rb_entry(rb, struct test_node, rb);
126 WARN_ON_ONCE(node->key < prev_key);
127 WARN_ON_ONCE(is_red(rb) &&
128 (!rb_parent(rb) || is_red(rb_parent(rb))));
129 if (!count)
130 blacks = black_path_count(rb);
131 else
132 WARN_ON_ONCE((!rb->rb_left || !rb->rb_right) &&
133 blacks != black_path_count(rb));
134 prev_key = node->key;
135 count++;
136 }
137 WARN_ON_ONCE(count != nr_nodes);
138}
139
140static void check_augmented(int nr_nodes)
141{
142 struct rb_node *rb;
143
144 check(nr_nodes);
145 for (rb = rb_first(&root); rb; rb = rb_next(rb)) {
146 struct test_node *node = rb_entry(rb, struct test_node, rb);
147 WARN_ON_ONCE(node->augmented != augment_recompute(node));
148 }
149}
150
151static int rbtree_test_init(void)
152{
153 int i, j;
154 cycles_t time1, time2, time;
155
156 printk(KERN_ALERT "rbtree testing");
157
158 prandom32_seed(&rnd, 3141592653589793238ULL);
159 init();
160
161 time1 = get_cycles();
162
163 for (i = 0; i < PERF_LOOPS; i++) {
164 for (j = 0; j < NODES; j++)
165 insert(nodes + j, &root);
166 for (j = 0; j < NODES; j++)
167 erase(nodes + j, &root);
168 }
169
170 time2 = get_cycles();
171 time = time2 - time1;
172
173 time = div_u64(time, PERF_LOOPS);
174 printk(" -> %llu cycles\n", (unsigned long long)time);
175
176 for (i = 0; i < CHECK_LOOPS; i++) {
177 init();
178 for (j = 0; j < NODES; j++) {
179 check(j);
180 insert(nodes + j, &root);
181 }
182 for (j = 0; j < NODES; j++) {
183 check(NODES - j);
184 erase(nodes + j, &root);
185 }
186 check(0);
187 }
188
189 printk(KERN_ALERT "augmented rbtree testing");
190
191 init();
192
193 time1 = get_cycles();
194
195 for (i = 0; i < PERF_LOOPS; i++) {
196 for (j = 0; j < NODES; j++)
197 insert_augmented(nodes + j, &root);
198 for (j = 0; j < NODES; j++)
199 erase_augmented(nodes + j, &root);
200 }
201
202 time2 = get_cycles();
203 time = time2 - time1;
204
205 time = div_u64(time, PERF_LOOPS);
206 printk(" -> %llu cycles\n", (unsigned long long)time);
207
208 for (i = 0; i < CHECK_LOOPS; i++) {
209 init();
210 for (j = 0; j < NODES; j++) {
211 check_augmented(j);
212 insert_augmented(nodes + j, &root);
213 }
214 for (j = 0; j < NODES; j++) {
215 check_augmented(NODES - j);
216 erase_augmented(nodes + j, &root);
217 }
218 check_augmented(0);
219 }
220
 221 return -EAGAIN; /* Failure will directly unload the module */
222}
223
224static void rbtree_test_exit(void)
225{
226 printk(KERN_ALERT "test exit\n");
227}
228
229module_init(rbtree_test_init)
230module_exit(rbtree_test_exit)
231
232MODULE_LICENSE("GPL");
233MODULE_AUTHOR("Michel Lespinasse");
234MODULE_DESCRIPTION("Red Black Tree test");
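A sketch (not part of the patch) of one way the cached subtree maximum maintained by rbtree_test.c above can be queried. It relies only on the test_node layout and the invariant that node->augmented is the largest val in the node's subtree; because of that invariant, a node holding the tree-wide maximum can be found in O(log n) by always descending into a child whose cached value still equals that maximum.

/* Returns the node holding the largest val, or NULL if the tree is empty. */
static struct test_node *find_max_val(struct rb_root *root)
{
	struct test_node *node;
	u32 max;

	if (!root->rb_node)
		return NULL;

	node = rb_entry(root->rb_node, struct test_node, rb);
	max = node->augmented;	/* by construction: max val over the tree */

	for (;;) {
		struct test_node *child;

		/* Descend into a child whose cached maximum still matches. */
		if (node->rb.rb_left) {
			child = rb_entry(node->rb.rb_left,
					 struct test_node, rb);
			if (child->augmented == max) {
				node = child;
				continue;
			}
		}
		if (node->rb.rb_right) {
			child = rb_entry(node->rb.rb_right,
					 struct test_node, rb);
			if (child->augmented == max) {
				node = child;
				continue;
			}
		}
		/* Neither child reaches max, so node->val == max here. */
		return node;
	}
}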
diff --git a/mm/Kconfig b/mm/Kconfig
index d5c8019c6627..a3f8dddaaab3 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -191,6 +191,7 @@ config SPLIT_PTLOCK_CPUS
191# support for memory compaction 191# support for memory compaction
192config COMPACTION 192config COMPACTION
193 bool "Allow for memory compaction" 193 bool "Allow for memory compaction"
194 def_bool y
194 select MIGRATION 195 select MIGRATION
195 depends on MMU 196 depends on MMU
196 help 197 help
@@ -318,7 +319,7 @@ config NOMMU_INITIAL_TRIM_EXCESS
318 319
319config TRANSPARENT_HUGEPAGE 320config TRANSPARENT_HUGEPAGE
320 bool "Transparent Hugepage Support" 321 bool "Transparent Hugepage Support"
321 depends on X86 && MMU 322 depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE
322 select COMPACTION 323 select COMPACTION
323 help 324 help
324 Transparent Hugepages allows the kernel to use huge pages and 325 Transparent Hugepages allows the kernel to use huge pages and
diff --git a/mm/Makefile b/mm/Makefile
index 92753e2d82da..6b025f80af34 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -14,9 +14,9 @@ endif
14obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ 14obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
15 maccess.o page_alloc.o page-writeback.o \ 15 maccess.o page_alloc.o page-writeback.o \
16 readahead.o swap.o truncate.o vmscan.o shmem.o \ 16 readahead.o swap.o truncate.o vmscan.o shmem.o \
17 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ 17 util.o mmzone.o vmstat.o backing-dev.o \
18 mm_init.o mmu_context.o percpu.o slab_common.o \ 18 mm_init.o mmu_context.o percpu.o slab_common.o \
19 compaction.o $(mmu-y) 19 compaction.o interval_tree.o $(mmu-y)
20 20
21obj-y += init-mm.o 21obj-y += init-mm.o
22 22
diff --git a/mm/bootmem.c b/mm/bootmem.c
index f468185b3b28..434be4ae7a04 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -198,6 +198,8 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
198 int order = ilog2(BITS_PER_LONG); 198 int order = ilog2(BITS_PER_LONG);
199 199
200 __free_pages_bootmem(pfn_to_page(start), order); 200 __free_pages_bootmem(pfn_to_page(start), order);
201 fixup_zone_present_pages(page_to_nid(pfn_to_page(start)),
202 start, start + BITS_PER_LONG);
201 count += BITS_PER_LONG; 203 count += BITS_PER_LONG;
202 start += BITS_PER_LONG; 204 start += BITS_PER_LONG;
203 } else { 205 } else {
@@ -208,6 +210,9 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
208 if (vec & 1) { 210 if (vec & 1) {
209 page = pfn_to_page(start + off); 211 page = pfn_to_page(start + off);
210 __free_pages_bootmem(page, 0); 212 __free_pages_bootmem(page, 0);
213 fixup_zone_present_pages(
214 page_to_nid(page),
215 start + off, start + off + 1);
211 count++; 216 count++;
212 } 217 }
213 vec >>= 1; 218 vec >>= 1;
@@ -221,8 +226,11 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
221 pages = bdata->node_low_pfn - bdata->node_min_pfn; 226 pages = bdata->node_low_pfn - bdata->node_min_pfn;
222 pages = bootmem_bootmap_pages(pages); 227 pages = bootmem_bootmap_pages(pages);
223 count += pages; 228 count += pages;
224 while (pages--) 229 while (pages--) {
230 fixup_zone_present_pages(page_to_nid(page),
231 page_to_pfn(page), page_to_pfn(page) + 1);
225 __free_pages_bootmem(page++, 0); 232 __free_pages_bootmem(page++, 0);
233 }
226 234
227 bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count); 235 bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count);
228 236
diff --git a/mm/compaction.c b/mm/compaction.c
index 7fcd3a52e68d..2c4ce17651d8 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -50,6 +50,111 @@ static inline bool migrate_async_suitable(int migratetype)
50 return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; 50 return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
51} 51}
52 52
53#ifdef CONFIG_COMPACTION
54/* Returns true if the pageblock should be scanned for pages to isolate. */
55static inline bool isolation_suitable(struct compact_control *cc,
56 struct page *page)
57{
58 if (cc->ignore_skip_hint)
59 return true;
60
61 return !get_pageblock_skip(page);
62}
63
64/*
65 * This function is called to clear all cached information on pageblocks that
66 * should be skipped for page isolation when the migrate and free page scanner
67 * meet.
68 */
69static void __reset_isolation_suitable(struct zone *zone)
70{
71 unsigned long start_pfn = zone->zone_start_pfn;
72 unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages;
73 unsigned long pfn;
74
75 zone->compact_cached_migrate_pfn = start_pfn;
76 zone->compact_cached_free_pfn = end_pfn;
77 zone->compact_blockskip_flush = false;
78
79 /* Walk the zone and mark every pageblock as suitable for isolation */
80 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
81 struct page *page;
82
83 cond_resched();
84
85 if (!pfn_valid(pfn))
86 continue;
87
88 page = pfn_to_page(pfn);
89 if (zone != page_zone(page))
90 continue;
91
92 clear_pageblock_skip(page);
93 }
94}
95
96void reset_isolation_suitable(pg_data_t *pgdat)
97{
98 int zoneid;
99
100 for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
101 struct zone *zone = &pgdat->node_zones[zoneid];
102 if (!populated_zone(zone))
103 continue;
104
105 /* Only flush if a full compaction finished recently */
106 if (zone->compact_blockskip_flush)
107 __reset_isolation_suitable(zone);
108 }
109}
110
111/*
112 * If no pages were isolated then mark this pageblock to be skipped in the
113 * future. The information is later cleared by __reset_isolation_suitable().
114 */
115static void update_pageblock_skip(struct compact_control *cc,
116 struct page *page, unsigned long nr_isolated,
117 bool migrate_scanner)
118{
119 struct zone *zone = cc->zone;
120 if (!page)
121 return;
122
123 if (!nr_isolated) {
124 unsigned long pfn = page_to_pfn(page);
125 set_pageblock_skip(page);
126
127 /* Update where compaction should restart */
128 if (migrate_scanner) {
129 if (!cc->finished_update_migrate &&
130 pfn > zone->compact_cached_migrate_pfn)
131 zone->compact_cached_migrate_pfn = pfn;
132 } else {
133 if (!cc->finished_update_free &&
134 pfn < zone->compact_cached_free_pfn)
135 zone->compact_cached_free_pfn = pfn;
136 }
137 }
138}
139#else
140static inline bool isolation_suitable(struct compact_control *cc,
141 struct page *page)
142{
143 return true;
144}
145
146static void update_pageblock_skip(struct compact_control *cc,
147 struct page *page, unsigned long nr_isolated,
148 bool migrate_scanner)
149{
150}
151#endif /* CONFIG_COMPACTION */
152
153static inline bool should_release_lock(spinlock_t *lock)
154{
155 return need_resched() || spin_is_contended(lock);
156}
157
53/* 158/*
54 * Compaction requires the taking of some coarse locks that are potentially 159 * Compaction requires the taking of some coarse locks that are potentially
55 * very heavily contended. Check if the process needs to be scheduled or 160 * very heavily contended. Check if the process needs to be scheduled or
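The skip hints introduced in the hunk above amount to a per-pageblock "isolation failed here recently" bit: scanners consult it before doing any work, mark it when a scan of a block isolates nothing, and a reset pass wipes it once a full compaction cycle finishes. A small userspace toy model of that contract, with all names invented for the sketch:

#include <stdbool.h>
#include <string.h>

#define NR_BLOCKS 128	/* stand-in for the number of pageblocks in a zone */

static bool skip_hint[NR_BLOCKS];	/* "isolation failed here recently" */

/* Scanner-side check, mirroring isolation_suitable(). */
static bool block_suitable(int blk, bool ignore_skip_hint)
{
	if (ignore_skip_hint)		/* e.g. CMA wants an exhaustive scan */
		return true;
	return !skip_hint[blk];
}

/* Record a fruitless scan, mirroring update_pageblock_skip(). */
static void note_scan_result(int blk, int nr_isolated)
{
	if (!nr_isolated)
		skip_hint[blk] = true;
}

/* Periodic reset, mirroring __reset_isolation_suitable(). */
static void reset_skip_hints(void)
{
	memset(skip_hint, 0, sizeof(skip_hint));
}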
@@ -62,7 +167,7 @@ static inline bool migrate_async_suitable(int migratetype)
62static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, 167static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
63 bool locked, struct compact_control *cc) 168 bool locked, struct compact_control *cc)
64{ 169{
65 if (need_resched() || spin_is_contended(lock)) { 170 if (should_release_lock(lock)) {
66 if (locked) { 171 if (locked) {
67 spin_unlock_irqrestore(lock, *flags); 172 spin_unlock_irqrestore(lock, *flags);
68 locked = false; 173 locked = false;
@@ -70,14 +175,11 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
70 175
71 /* async aborts if taking too long or contended */ 176 /* async aborts if taking too long or contended */
72 if (!cc->sync) { 177 if (!cc->sync) {
73 if (cc->contended) 178 cc->contended = true;
74 *cc->contended = true;
75 return false; 179 return false;
76 } 180 }
77 181
78 cond_resched(); 182 cond_resched();
79 if (fatal_signal_pending(current))
80 return false;
81 } 183 }
82 184
83 if (!locked) 185 if (!locked)
@@ -91,44 +193,139 @@ static inline bool compact_trylock_irqsave(spinlock_t *lock,
91 return compact_checklock_irqsave(lock, flags, false, cc); 193 return compact_checklock_irqsave(lock, flags, false, cc);
92} 194}
93 195
196/* Returns true if the page is within a block suitable for migration to */
197static bool suitable_migration_target(struct page *page)
198{
199 int migratetype = get_pageblock_migratetype(page);
200
201 /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
202 if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE)
203 return false;
204
205 /* If the page is a large free page, then allow migration */
206 if (PageBuddy(page) && page_order(page) >= pageblock_order)
207 return true;
208
209 /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
210 if (migrate_async_suitable(migratetype))
211 return true;
212
213 /* Otherwise skip the block */
214 return false;
215}
216
217static void compact_capture_page(struct compact_control *cc)
218{
219 unsigned long flags;
220 int mtype, mtype_low, mtype_high;
221
222 if (!cc->page || *cc->page)
223 return;
224
225 /*
226 * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP
 227 * regardless of the migratetype of the freelist it is captured from.
228 * This is fine because the order for a high-order MIGRATE_MOVABLE
229 * allocation is typically at least a pageblock size and overall
230 * fragmentation is not impaired. Other allocation types must
231 * capture pages from their own migratelist because otherwise they
232 * could pollute other pageblocks like MIGRATE_MOVABLE with
 233 * difficult-to-move pages, making fragmentation worse overall.
234 */
235 if (cc->migratetype == MIGRATE_MOVABLE) {
236 mtype_low = 0;
237 mtype_high = MIGRATE_PCPTYPES;
238 } else {
239 mtype_low = cc->migratetype;
240 mtype_high = cc->migratetype + 1;
241 }
242
243 /* Speculatively examine the free lists without zone lock */
244 for (mtype = mtype_low; mtype < mtype_high; mtype++) {
245 int order;
246 for (order = cc->order; order < MAX_ORDER; order++) {
247 struct page *page;
248 struct free_area *area;
249 area = &(cc->zone->free_area[order]);
250 if (list_empty(&area->free_list[mtype]))
251 continue;
252
253 /* Take the lock and attempt capture of the page */
254 if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc))
255 return;
256 if (!list_empty(&area->free_list[mtype])) {
257 page = list_entry(area->free_list[mtype].next,
258 struct page, lru);
259 if (capture_free_page(page, cc->order, mtype)) {
260 spin_unlock_irqrestore(&cc->zone->lock,
261 flags);
262 *cc->page = page;
263 return;
264 }
265 }
266 spin_unlock_irqrestore(&cc->zone->lock, flags);
267 }
268 }
269}
270
94/* 271/*
95 * Isolate free pages onto a private freelist. Caller must hold zone->lock. 272 * Isolate free pages onto a private freelist. Caller must hold zone->lock.
96 * If @strict is true, will abort returning 0 on any invalid PFNs or non-free 273 * If @strict is true, will abort returning 0 on any invalid PFNs or non-free
97 * pages inside of the pageblock (even though it may still end up isolating 274 * pages inside of the pageblock (even though it may still end up isolating
98 * some pages). 275 * some pages).
99 */ 276 */
100static unsigned long isolate_freepages_block(unsigned long blockpfn, 277static unsigned long isolate_freepages_block(struct compact_control *cc,
278 unsigned long blockpfn,
101 unsigned long end_pfn, 279 unsigned long end_pfn,
102 struct list_head *freelist, 280 struct list_head *freelist,
103 bool strict) 281 bool strict)
104{ 282{
105 int nr_scanned = 0, total_isolated = 0; 283 int nr_scanned = 0, total_isolated = 0;
106 struct page *cursor; 284 struct page *cursor, *valid_page = NULL;
285 unsigned long nr_strict_required = end_pfn - blockpfn;
286 unsigned long flags;
287 bool locked = false;
107 288
108 cursor = pfn_to_page(blockpfn); 289 cursor = pfn_to_page(blockpfn);
109 290
110 /* Isolate free pages. This assumes the block is valid */ 291 /* Isolate free pages. */
111 for (; blockpfn < end_pfn; blockpfn++, cursor++) { 292 for (; blockpfn < end_pfn; blockpfn++, cursor++) {
112 int isolated, i; 293 int isolated, i;
113 struct page *page = cursor; 294 struct page *page = cursor;
114 295
115 if (!pfn_valid_within(blockpfn)) {
116 if (strict)
117 return 0;
118 continue;
119 }
120 nr_scanned++; 296 nr_scanned++;
297 if (!pfn_valid_within(blockpfn))
298 continue;
299 if (!valid_page)
300 valid_page = page;
301 if (!PageBuddy(page))
302 continue;
121 303
122 if (!PageBuddy(page)) { 304 /*
123 if (strict) 305 * The zone lock must be held to isolate freepages.
124 return 0; 306 * Unfortunately this is a very coarse lock and can be
307 * heavily contended if there are parallel allocations
308 * or parallel compactions. For async compaction do not
309 * spin on the lock and we acquire the lock as late as
310 * possible.
311 */
312 locked = compact_checklock_irqsave(&cc->zone->lock, &flags,
313 locked, cc);
314 if (!locked)
315 break;
316
317 /* Recheck this is a suitable migration target under lock */
318 if (!strict && !suitable_migration_target(page))
319 break;
320
321 /* Recheck this is a buddy page under lock */
322 if (!PageBuddy(page))
125 continue; 323 continue;
126 }
127 324
128 /* Found a free page, break it into order-0 pages */ 325 /* Found a free page, break it into order-0 pages */
129 isolated = split_free_page(page); 326 isolated = split_free_page(page);
130 if (!isolated && strict) 327 if (!isolated && strict)
131 return 0; 328 break;
132 total_isolated += isolated; 329 total_isolated += isolated;
133 for (i = 0; i < isolated; i++) { 330 for (i = 0; i < isolated; i++) {
134 list_add(&page->lru, freelist); 331 list_add(&page->lru, freelist);
@@ -143,6 +340,22 @@ static unsigned long isolate_freepages_block(unsigned long blockpfn,
143 } 340 }
144 341
145 trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated); 342 trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated);
343
344 /*
345 * If strict isolation is requested by CMA then check that all the
346 * pages requested were isolated. If there were any failures, 0 is
347 * returned and CMA will fail.
348 */
349 if (strict && nr_strict_required != total_isolated)
350 total_isolated = 0;
351
352 if (locked)
353 spin_unlock_irqrestore(&cc->zone->lock, flags);
354
355 /* Update the pageblock-skip if the whole pageblock was scanned */
356 if (blockpfn == end_pfn)
357 update_pageblock_skip(cc, valid_page, total_isolated, false);
358
146 return total_isolated; 359 return total_isolated;
147} 360}
148 361
@@ -160,17 +373,14 @@ static unsigned long isolate_freepages_block(unsigned long blockpfn,
160 * a free page). 373 * a free page).
161 */ 374 */
162unsigned long 375unsigned long
163isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn) 376isolate_freepages_range(struct compact_control *cc,
377 unsigned long start_pfn, unsigned long end_pfn)
164{ 378{
165 unsigned long isolated, pfn, block_end_pfn, flags; 379 unsigned long isolated, pfn, block_end_pfn;
166 struct zone *zone = NULL;
167 LIST_HEAD(freelist); 380 LIST_HEAD(freelist);
168 381
169 if (pfn_valid(start_pfn))
170 zone = page_zone(pfn_to_page(start_pfn));
171
172 for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) { 382 for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) {
173 if (!pfn_valid(pfn) || zone != page_zone(pfn_to_page(pfn))) 383 if (!pfn_valid(pfn) || cc->zone != page_zone(pfn_to_page(pfn)))
174 break; 384 break;
175 385
176 /* 386 /*
@@ -180,10 +390,8 @@ isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn)
180 block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); 390 block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
181 block_end_pfn = min(block_end_pfn, end_pfn); 391 block_end_pfn = min(block_end_pfn, end_pfn);
182 392
183 spin_lock_irqsave(&zone->lock, flags); 393 isolated = isolate_freepages_block(cc, pfn, block_end_pfn,
184 isolated = isolate_freepages_block(pfn, block_end_pfn,
185 &freelist, true); 394 &freelist, true);
186 spin_unlock_irqrestore(&zone->lock, flags);
187 395
188 /* 396 /*
189 * In strict mode, isolate_freepages_block() returns 0 if 397 * In strict mode, isolate_freepages_block() returns 0 if
@@ -253,6 +461,7 @@ static bool too_many_isolated(struct zone *zone)
253 * @cc: Compaction control structure. 461 * @cc: Compaction control structure.
254 * @low_pfn: The first PFN of the range. 462 * @low_pfn: The first PFN of the range.
255 * @end_pfn: The one-past-the-last PFN of the range. 463 * @end_pfn: The one-past-the-last PFN of the range.
 464 * @unevictable: true if unevictable pages may be isolated
256 * 465 *
257 * Isolate all pages that can be migrated from the range specified by 466 * Isolate all pages that can be migrated from the range specified by
258 * [low_pfn, end_pfn). Returns zero if there is a fatal signal 467 * [low_pfn, end_pfn). Returns zero if there is a fatal signal
@@ -268,7 +477,7 @@ static bool too_many_isolated(struct zone *zone)
268 */ 477 */
269unsigned long 478unsigned long
270isolate_migratepages_range(struct zone *zone, struct compact_control *cc, 479isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
271 unsigned long low_pfn, unsigned long end_pfn) 480 unsigned long low_pfn, unsigned long end_pfn, bool unevictable)
272{ 481{
273 unsigned long last_pageblock_nr = 0, pageblock_nr; 482 unsigned long last_pageblock_nr = 0, pageblock_nr;
274 unsigned long nr_scanned = 0, nr_isolated = 0; 483 unsigned long nr_scanned = 0, nr_isolated = 0;
@@ -276,7 +485,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
276 isolate_mode_t mode = 0; 485 isolate_mode_t mode = 0;
277 struct lruvec *lruvec; 486 struct lruvec *lruvec;
278 unsigned long flags; 487 unsigned long flags;
279 bool locked; 488 bool locked = false;
489 struct page *page = NULL, *valid_page = NULL;
280 490
281 /* 491 /*
282 * Ensure that there are not too many pages isolated from the LRU 492 * Ensure that there are not too many pages isolated from the LRU
@@ -296,23 +506,15 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
296 506
297 /* Time to isolate some pages for migration */ 507 /* Time to isolate some pages for migration */
298 cond_resched(); 508 cond_resched();
299 spin_lock_irqsave(&zone->lru_lock, flags);
300 locked = true;
301 for (; low_pfn < end_pfn; low_pfn++) { 509 for (; low_pfn < end_pfn; low_pfn++) {
302 struct page *page;
303
304 /* give a chance to irqs before checking need_resched() */ 510 /* give a chance to irqs before checking need_resched() */
305 if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) { 511 if (locked && !((low_pfn+1) % SWAP_CLUSTER_MAX)) {
306 spin_unlock_irqrestore(&zone->lru_lock, flags); 512 if (should_release_lock(&zone->lru_lock)) {
307 locked = false; 513 spin_unlock_irqrestore(&zone->lru_lock, flags);
514 locked = false;
515 }
308 } 516 }
309 517
310 /* Check if it is ok to still hold the lock */
311 locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
312 locked, cc);
313 if (!locked)
314 break;
315
316 /* 518 /*
317 * migrate_pfn does not necessarily start aligned to a 519 * migrate_pfn does not necessarily start aligned to a
318 * pageblock. Ensure that pfn_valid is called when moving 520 * pageblock. Ensure that pfn_valid is called when moving
@@ -340,6 +542,14 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
340 if (page_zone(page) != zone) 542 if (page_zone(page) != zone)
341 continue; 543 continue;
342 544
545 if (!valid_page)
546 valid_page = page;
547
548 /* If isolation recently failed, do not retry */
549 pageblock_nr = low_pfn >> pageblock_order;
550 if (!isolation_suitable(cc, page))
551 goto next_pageblock;
552
343 /* Skip if free */ 553 /* Skip if free */
344 if (PageBuddy(page)) 554 if (PageBuddy(page))
345 continue; 555 continue;
@@ -349,24 +559,43 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
349 * migration is optimistic to see if the minimum amount of work 559 * migration is optimistic to see if the minimum amount of work
350 * satisfies the allocation 560 * satisfies the allocation
351 */ 561 */
352 pageblock_nr = low_pfn >> pageblock_order;
353 if (!cc->sync && last_pageblock_nr != pageblock_nr && 562 if (!cc->sync && last_pageblock_nr != pageblock_nr &&
354 !migrate_async_suitable(get_pageblock_migratetype(page))) { 563 !migrate_async_suitable(get_pageblock_migratetype(page))) {
355 low_pfn += pageblock_nr_pages; 564 cc->finished_update_migrate = true;
356 low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; 565 goto next_pageblock;
357 last_pageblock_nr = pageblock_nr;
358 continue;
359 } 566 }
360 567
568 /* Check may be lockless but that's ok as we recheck later */
361 if (!PageLRU(page)) 569 if (!PageLRU(page))
362 continue; 570 continue;
363 571
364 /* 572 /*
365 * PageLRU is set, and lru_lock excludes isolation, 573 * PageLRU is set. lru_lock normally excludes isolation
366 * splitting and collapsing (collapsing has already 574 * splitting and collapsing (collapsing has already happened
367 * happened if PageLRU is set). 575 * if PageLRU is set) but the lock is not necessarily taken
576 * here and it is wasteful to take it just to check transhuge.
577 * Check TransHuge without lock and skip the whole pageblock if
578 * it's either a transhuge or hugetlbfs page, as calling
579 * compound_order() without preventing THP from splitting the
580 * page underneath us may return surprising results.
368 */ 581 */
369 if (PageTransHuge(page)) { 582 if (PageTransHuge(page)) {
583 if (!locked)
584 goto next_pageblock;
585 low_pfn += (1 << compound_order(page)) - 1;
586 continue;
587 }
588
589 /* Check if it is ok to still hold the lock */
590 locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
591 locked, cc);
592 if (!locked || fatal_signal_pending(current))
593 break;
594
595 /* Recheck PageLRU and PageTransHuge under lock */
596 if (!PageLRU(page))
597 continue;
598 if (PageTransHuge(page)) {
370 low_pfn += (1 << compound_order(page)) - 1; 599 low_pfn += (1 << compound_order(page)) - 1;
371 continue; 600 continue;
372 } 601 }
@@ -374,6 +603,9 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
374 if (!cc->sync) 603 if (!cc->sync)
375 mode |= ISOLATE_ASYNC_MIGRATE; 604 mode |= ISOLATE_ASYNC_MIGRATE;
376 605
606 if (unevictable)
607 mode |= ISOLATE_UNEVICTABLE;
608
377 lruvec = mem_cgroup_page_lruvec(page, zone); 609 lruvec = mem_cgroup_page_lruvec(page, zone);
378 610
379 /* Try isolate the page */ 611 /* Try isolate the page */
@@ -383,6 +615,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
383 VM_BUG_ON(PageTransCompound(page)); 615 VM_BUG_ON(PageTransCompound(page));
384 616
385 /* Successfully isolated */ 617 /* Successfully isolated */
618 cc->finished_update_migrate = true;
386 del_page_from_lru_list(page, lruvec, page_lru(page)); 619 del_page_from_lru_list(page, lruvec, page_lru(page));
387 list_add(&page->lru, migratelist); 620 list_add(&page->lru, migratelist);
388 cc->nr_migratepages++; 621 cc->nr_migratepages++;
@@ -393,6 +626,13 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
393 ++low_pfn; 626 ++low_pfn;
394 break; 627 break;
395 } 628 }
629
630 continue;
631
632next_pageblock:
633 low_pfn += pageblock_nr_pages;
634 low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;
635 last_pageblock_nr = pageblock_nr;
396 } 636 }
397 637
398 acct_isolated(zone, locked, cc); 638 acct_isolated(zone, locked, cc);
@@ -400,6 +640,10 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
400 if (locked) 640 if (locked)
401 spin_unlock_irqrestore(&zone->lru_lock, flags); 641 spin_unlock_irqrestore(&zone->lru_lock, flags);
402 642
643 /* Update the pageblock-skip if the whole pageblock was scanned */
644 if (low_pfn == end_pfn)
645 update_pageblock_skip(cc, valid_page, nr_isolated, true);
646
403 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); 647 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
404 648
405 return low_pfn; 649 return low_pfn;
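With the changes above, the migrate scanner acquires zone->lru_lock lazily (only once a page actually needs it) and drops it every SWAP_CLUSTER_MAX pages when the lock is contended or a reschedule is due. A userspace sketch of the same lazy-lock discipline, with invented names and a pthread mutex standing in for the spinlock:

#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

#define BATCH_SIZE 32			/* stand-in for SWAP_CLUSTER_MAX */

struct work_item { bool needs_lock; };

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

static void scan(struct work_item *items, size_t n)
{
	bool locked = false;
	size_t i;

	for (i = 0; i < n; i++) {
		/* Periodically release the lock to bound hold times. */
		if (locked && ((i + 1) % BATCH_SIZE) == 0) {
			pthread_mutex_unlock(&list_lock);
			locked = false;
		}

		/* Cheap, lockless filtering first. */
		if (!items[i].needs_lock)
			continue;

		/* Acquire the lock as late as possible. */
		if (!locked) {
			pthread_mutex_lock(&list_lock);
			locked = true;
		}

		/* ... recheck item state and process it under the lock ... */
	}

	if (locked)
		pthread_mutex_unlock(&list_lock);
}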
@@ -407,43 +651,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
407 651
408#endif /* CONFIG_COMPACTION || CONFIG_CMA */ 652#endif /* CONFIG_COMPACTION || CONFIG_CMA */
409#ifdef CONFIG_COMPACTION 653#ifdef CONFIG_COMPACTION
410
411/* Returns true if the page is within a block suitable for migration to */
412static bool suitable_migration_target(struct page *page)
413{
414
415 int migratetype = get_pageblock_migratetype(page);
416
417 /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
418 if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE)
419 return false;
420
421 /* If the page is a large free page, then allow migration */
422 if (PageBuddy(page) && page_order(page) >= pageblock_order)
423 return true;
424
425 /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
426 if (migrate_async_suitable(migratetype))
427 return true;
428
429 /* Otherwise skip the block */
430 return false;
431}
432
433/*
434 * Returns the start pfn of the last page block in a zone. This is the starting
435 * point for full compaction of a zone. Compaction searches for free pages from
436 * the end of each zone, while isolate_freepages_block scans forward inside each
437 * page block.
438 */
439static unsigned long start_free_pfn(struct zone *zone)
440{
441 unsigned long free_pfn;
442 free_pfn = zone->zone_start_pfn + zone->spanned_pages;
443 free_pfn &= ~(pageblock_nr_pages-1);
444 return free_pfn;
445}
446
447/* 654/*
448 * Based on information in the current compact_control, find blocks 655 * Based on information in the current compact_control, find blocks
449 * suitable for isolating free pages from and then isolate them. 656 * suitable for isolating free pages from and then isolate them.
@@ -453,7 +660,6 @@ static void isolate_freepages(struct zone *zone,
453{ 660{
454 struct page *page; 661 struct page *page;
455 unsigned long high_pfn, low_pfn, pfn, zone_end_pfn, end_pfn; 662 unsigned long high_pfn, low_pfn, pfn, zone_end_pfn, end_pfn;
456 unsigned long flags;
457 int nr_freepages = cc->nr_freepages; 663 int nr_freepages = cc->nr_freepages;
458 struct list_head *freelist = &cc->freepages; 664 struct list_head *freelist = &cc->freepages;
459 665
@@ -501,30 +707,16 @@ static void isolate_freepages(struct zone *zone,
501 if (!suitable_migration_target(page)) 707 if (!suitable_migration_target(page))
502 continue; 708 continue;
503 709
504 /* 710 /* If isolation recently failed, do not retry */
505 * Found a block suitable for isolating free pages from. Now 711 if (!isolation_suitable(cc, page))
506 * we disabled interrupts, double check things are ok and 712 continue;
507 * isolate the pages. This is to minimise the time IRQs
508 * are disabled
509 */
510 isolated = 0;
511 713
512 /* 714 /* Found a block suitable for isolating free pages from */
513 * The zone lock must be held to isolate freepages. This 715 isolated = 0;
514 * unfortunately this is a very coarse lock and can be 716 end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn);
515 * heavily contended if there are parallel allocations 717 isolated = isolate_freepages_block(cc, pfn, end_pfn,
516 * or parallel compactions. For async compaction do not 718 freelist, false);
517 * spin on the lock 719 nr_freepages += isolated;
518 */
519 if (!compact_trylock_irqsave(&zone->lock, &flags, cc))
520 break;
521 if (suitable_migration_target(page)) {
522 end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn);
523 isolated = isolate_freepages_block(pfn, end_pfn,
524 freelist, false);
525 nr_freepages += isolated;
526 }
527 spin_unlock_irqrestore(&zone->lock, flags);
528 720
529 /* 721 /*
530 * Record the highest PFN we isolated pages from. When next 722 * Record the highest PFN we isolated pages from. When next
@@ -532,17 +724,8 @@ static void isolate_freepages(struct zone *zone,
532 * page migration may have returned some pages to the allocator 724 * page migration may have returned some pages to the allocator
533 */ 725 */
534 if (isolated) { 726 if (isolated) {
727 cc->finished_update_free = true;
535 high_pfn = max(high_pfn, pfn); 728 high_pfn = max(high_pfn, pfn);
536
537 /*
538 * If the free scanner has wrapped, update
539 * compact_cached_free_pfn to point to the highest
540 * pageblock with free pages. This reduces excessive
541 * scanning of full pageblocks near the end of the
542 * zone
543 */
544 if (cc->order > 0 && cc->wrapped)
545 zone->compact_cached_free_pfn = high_pfn;
546 } 729 }
547 } 730 }
548 731
@@ -551,11 +734,6 @@ static void isolate_freepages(struct zone *zone,
551 734
552 cc->free_pfn = high_pfn; 735 cc->free_pfn = high_pfn;
553 cc->nr_freepages = nr_freepages; 736 cc->nr_freepages = nr_freepages;
554
555 /* If compact_cached_free_pfn is reset then set it now */
556 if (cc->order > 0 && !cc->wrapped &&
557 zone->compact_cached_free_pfn == start_free_pfn(zone))
558 zone->compact_cached_free_pfn = high_pfn;
559} 737}
560 738
561/* 739/*
@@ -633,8 +811,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
633 } 811 }
634 812
635 /* Perform the isolation */ 813 /* Perform the isolation */
636 low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn); 814 low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn, false);
637 if (!low_pfn) 815 if (!low_pfn || cc->contended)
638 return ISOLATE_ABORT; 816 return ISOLATE_ABORT;
639 817
640 cc->migrate_pfn = low_pfn; 818 cc->migrate_pfn = low_pfn;
@@ -645,33 +823,24 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
645static int compact_finished(struct zone *zone, 823static int compact_finished(struct zone *zone,
646 struct compact_control *cc) 824 struct compact_control *cc)
647{ 825{
648 unsigned int order;
649 unsigned long watermark; 826 unsigned long watermark;
650 827
651 if (fatal_signal_pending(current)) 828 if (fatal_signal_pending(current))
652 return COMPACT_PARTIAL; 829 return COMPACT_PARTIAL;
653 830
654 /* 831 /* Compaction run completes if the migrate and free scanner meet */
655 * A full (order == -1) compaction run starts at the beginning and
656 * end of a zone; it completes when the migrate and free scanner meet.
657 * A partial (order > 0) compaction can start with the free scanner
658 * at a random point in the zone, and may have to restart.
659 */
660 if (cc->free_pfn <= cc->migrate_pfn) { 832 if (cc->free_pfn <= cc->migrate_pfn) {
661 if (cc->order > 0 && !cc->wrapped) { 833 /*
662 /* We started partway through; restart at the end. */ 834 * Mark that the PG_migrate_skip information should be cleared
663 unsigned long free_pfn = start_free_pfn(zone); 835 * by kswapd when it goes to sleep. kswapd does not set the
664 zone->compact_cached_free_pfn = free_pfn; 836 * flag itself as the decision to be clear should be directly
665 cc->free_pfn = free_pfn; 837 * based on an allocation request.
666 cc->wrapped = 1; 838 */
667 return COMPACT_CONTINUE; 839 if (!current_is_kswapd())
668 } 840 zone->compact_blockskip_flush = true;
669 return COMPACT_COMPLETE;
670 }
671 841
672 /* We wrapped around and ended up where we started. */
673 if (cc->wrapped && cc->free_pfn <= cc->start_free_pfn)
674 return COMPACT_COMPLETE; 842 return COMPACT_COMPLETE;
843 }
675 844
676 /* 845 /*
677 * order == -1 is expected when compacting via 846 * order == -1 is expected when compacting via
@@ -688,14 +857,22 @@ static int compact_finished(struct zone *zone,
688 return COMPACT_CONTINUE; 857 return COMPACT_CONTINUE;
689 858
690 /* Direct compactor: Is a suitable page free? */ 859 /* Direct compactor: Is a suitable page free? */
691 for (order = cc->order; order < MAX_ORDER; order++) { 860 if (cc->page) {
692 /* Job done if page is free of the right migratetype */ 861 /* Was a suitable page captured? */
693 if (!list_empty(&zone->free_area[order].free_list[cc->migratetype])) 862 if (*cc->page)
694 return COMPACT_PARTIAL;
695
696 /* Job done if allocation would set block type */
697 if (order >= pageblock_order && zone->free_area[order].nr_free)
698 return COMPACT_PARTIAL; 863 return COMPACT_PARTIAL;
864 } else {
865 unsigned int order;
866 for (order = cc->order; order < MAX_ORDER; order++) {
867 struct free_area *area = &zone->free_area[cc->order];
868 /* Job done if page is free of the right migratetype */
869 if (!list_empty(&area->free_list[cc->migratetype]))
870 return COMPACT_PARTIAL;
871
872 /* Job done if allocation would set block type */
873 if (cc->order >= pageblock_order && area->nr_free)
874 return COMPACT_PARTIAL;
875 }
699 } 876 }
700 877
701 return COMPACT_CONTINUE; 878 return COMPACT_CONTINUE;
@@ -754,6 +931,8 @@ unsigned long compaction_suitable(struct zone *zone, int order)
754static int compact_zone(struct zone *zone, struct compact_control *cc) 931static int compact_zone(struct zone *zone, struct compact_control *cc)
755{ 932{
756 int ret; 933 int ret;
934 unsigned long start_pfn = zone->zone_start_pfn;
935 unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages;
757 936
758 ret = compaction_suitable(zone, cc->order); 937 ret = compaction_suitable(zone, cc->order);
759 switch (ret) { 938 switch (ret) {
@@ -766,18 +945,30 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
766 ; 945 ;
767 } 946 }
768 947
769 /* Setup to move all movable pages to the end of the zone */ 948 /*
 770 cc->migrate_pfn = zone->zone_start_pfn; 949 * Setup to move all movable pages to the end of the zone. Use cached
771 950 * information on where the scanners should start but check that it
772 if (cc->order > 0) { 951 * is initialised by ensuring the values are within zone boundaries.
773 /* Incremental compaction. Start where the last one stopped. */ 952 */
774 cc->free_pfn = zone->compact_cached_free_pfn; 953 cc->migrate_pfn = zone->compact_cached_migrate_pfn;
775 cc->start_free_pfn = cc->free_pfn; 954 cc->free_pfn = zone->compact_cached_free_pfn;
776 } else { 955 if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
777 /* Order == -1 starts at the end of the zone. */ 956 cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
778 cc->free_pfn = start_free_pfn(zone); 957 zone->compact_cached_free_pfn = cc->free_pfn;
958 }
959 if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) {
960 cc->migrate_pfn = start_pfn;
961 zone->compact_cached_migrate_pfn = cc->migrate_pfn;
779 } 962 }
780 963
964 /*
965 * Clear pageblock skip if there were failures recently and compaction
966 * is about to be retried after being deferred. kswapd does not do
967 * this reset as it'll reset the cached information when going to sleep.
968 */
969 if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
970 __reset_isolation_suitable(zone);
971
781 migrate_prep_local(); 972 migrate_prep_local();
782 973
783 while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { 974 while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
@@ -787,6 +978,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
787 switch (isolate_migratepages(zone, cc)) { 978 switch (isolate_migratepages(zone, cc)) {
788 case ISOLATE_ABORT: 979 case ISOLATE_ABORT:
789 ret = COMPACT_PARTIAL; 980 ret = COMPACT_PARTIAL;
981 putback_lru_pages(&cc->migratepages);
982 cc->nr_migratepages = 0;
790 goto out; 983 goto out;
791 case ISOLATE_NONE: 984 case ISOLATE_NONE:
792 continue; 985 continue;
@@ -817,6 +1010,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
817 goto out; 1010 goto out;
818 } 1011 }
819 } 1012 }
1013
1014 /* Capture a page now if it is a suitable size */
1015 compact_capture_page(cc);
820 } 1016 }
821 1017
822out: 1018out:
@@ -829,8 +1025,10 @@ out:
829 1025
830static unsigned long compact_zone_order(struct zone *zone, 1026static unsigned long compact_zone_order(struct zone *zone,
831 int order, gfp_t gfp_mask, 1027 int order, gfp_t gfp_mask,
832 bool sync, bool *contended) 1028 bool sync, bool *contended,
1029 struct page **page)
833{ 1030{
1031 unsigned long ret;
834 struct compact_control cc = { 1032 struct compact_control cc = {
835 .nr_freepages = 0, 1033 .nr_freepages = 0,
836 .nr_migratepages = 0, 1034 .nr_migratepages = 0,
@@ -838,12 +1036,18 @@ static unsigned long compact_zone_order(struct zone *zone,
838 .migratetype = allocflags_to_migratetype(gfp_mask), 1036 .migratetype = allocflags_to_migratetype(gfp_mask),
839 .zone = zone, 1037 .zone = zone,
840 .sync = sync, 1038 .sync = sync,
841 .contended = contended, 1039 .page = page,
842 }; 1040 };
843 INIT_LIST_HEAD(&cc.freepages); 1041 INIT_LIST_HEAD(&cc.freepages);
844 INIT_LIST_HEAD(&cc.migratepages); 1042 INIT_LIST_HEAD(&cc.migratepages);
845 1043
846 return compact_zone(zone, &cc); 1044 ret = compact_zone(zone, &cc);
1045
1046 VM_BUG_ON(!list_empty(&cc.freepages));
1047 VM_BUG_ON(!list_empty(&cc.migratepages));
1048
1049 *contended = cc.contended;
1050 return ret;
847} 1051}
848 1052
849int sysctl_extfrag_threshold = 500; 1053int sysctl_extfrag_threshold = 500;
@@ -855,12 +1059,14 @@ int sysctl_extfrag_threshold = 500;
855 * @gfp_mask: The GFP mask of the current allocation 1059 * @gfp_mask: The GFP mask of the current allocation
856 * @nodemask: The allowed nodes to allocate from 1060 * @nodemask: The allowed nodes to allocate from
857 * @sync: Whether migration is synchronous or not 1061 * @sync: Whether migration is synchronous or not
1062 * @contended: Return value that is true if compaction was aborted due to lock contention
1063 * @page: Optionally capture a free page of the requested order during compaction
858 * 1064 *
859 * This is the main entry point for direct page compaction. 1065 * This is the main entry point for direct page compaction.
860 */ 1066 */
861unsigned long try_to_compact_pages(struct zonelist *zonelist, 1067unsigned long try_to_compact_pages(struct zonelist *zonelist,
862 int order, gfp_t gfp_mask, nodemask_t *nodemask, 1068 int order, gfp_t gfp_mask, nodemask_t *nodemask,
863 bool sync, bool *contended) 1069 bool sync, bool *contended, struct page **page)
864{ 1070{
865 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 1071 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
866 int may_enter_fs = gfp_mask & __GFP_FS; 1072 int may_enter_fs = gfp_mask & __GFP_FS;
@@ -868,28 +1074,30 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
868 struct zoneref *z; 1074 struct zoneref *z;
869 struct zone *zone; 1075 struct zone *zone;
870 int rc = COMPACT_SKIPPED; 1076 int rc = COMPACT_SKIPPED;
1077 int alloc_flags = 0;
871 1078
872 /* 1079 /* Check if the GFP flags allow compaction */
873 * Check whether it is worth even starting compaction. The order check is
874 * made because an assumption is made that the page allocator can satisfy
875 * the "cheaper" orders without taking special steps
876 */
877 if (!order || !may_enter_fs || !may_perform_io) 1080 if (!order || !may_enter_fs || !may_perform_io)
878 return rc; 1081 return rc;
879 1082
880 count_vm_event(COMPACTSTALL); 1083 count_vm_event(COMPACTSTALL);
881 1084
1085#ifdef CONFIG_CMA
1086 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
1087 alloc_flags |= ALLOC_CMA;
1088#endif
882 /* Compact each zone in the list */ 1089 /* Compact each zone in the list */
883 for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, 1090 for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
884 nodemask) { 1091 nodemask) {
885 int status; 1092 int status;
886 1093
887 status = compact_zone_order(zone, order, gfp_mask, sync, 1094 status = compact_zone_order(zone, order, gfp_mask, sync,
888 contended); 1095 contended, page);
889 rc = max(status, rc); 1096 rc = max(status, rc);
890 1097
891 /* If a normal allocation would succeed, stop compacting */ 1098 /* If a normal allocation would succeed, stop compacting */
892 if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) 1099 if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0,
1100 alloc_flags))
893 break; 1101 break;
894 } 1102 }
895 1103
@@ -940,6 +1148,7 @@ int compact_pgdat(pg_data_t *pgdat, int order)
940 struct compact_control cc = { 1148 struct compact_control cc = {
941 .order = order, 1149 .order = order,
942 .sync = false, 1150 .sync = false,
1151 .page = NULL,
943 }; 1152 };
944 1153
945 return __compact_pgdat(pgdat, &cc); 1154 return __compact_pgdat(pgdat, &cc);
@@ -950,6 +1159,7 @@ static int compact_node(int nid)
950 struct compact_control cc = { 1159 struct compact_control cc = {
951 .order = -1, 1160 .order = -1,
952 .sync = true, 1161 .sync = true,
1162 .page = NULL,
953 }; 1163 };
954 1164
955 return __compact_pgdat(NODE_DATA(nid), &cc); 1165 return __compact_pgdat(NODE_DATA(nid), &cc);
diff --git a/mm/filemap.c b/mm/filemap.c
index 384344575c37..83efee76a5c0 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1607,13 +1607,13 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1607 * Do we have something in the page cache already? 1607 * Do we have something in the page cache already?
1608 */ 1608 */
1609 page = find_get_page(mapping, offset); 1609 page = find_get_page(mapping, offset);
1610 if (likely(page)) { 1610 if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
1611 /* 1611 /*
1612 * We found the page, so try async readahead before 1612 * We found the page, so try async readahead before
1613 * waiting for the lock. 1613 * waiting for the lock.
1614 */ 1614 */
1615 do_async_mmap_readahead(vma, ra, file, page, offset); 1615 do_async_mmap_readahead(vma, ra, file, page, offset);
1616 } else { 1616 } else if (!page) {
1617 /* No page in the page cache at all */ 1617 /* No page in the page cache at all */
1618 do_sync_mmap_readahead(vma, ra, file, offset); 1618 do_sync_mmap_readahead(vma, ra, file, offset);
1619 count_vm_event(PGMAJFAULT); 1619 count_vm_event(PGMAJFAULT);
@@ -1737,6 +1737,7 @@ EXPORT_SYMBOL(filemap_page_mkwrite);
1737const struct vm_operations_struct generic_file_vm_ops = { 1737const struct vm_operations_struct generic_file_vm_ops = {
1738 .fault = filemap_fault, 1738 .fault = filemap_fault,
1739 .page_mkwrite = filemap_page_mkwrite, 1739 .page_mkwrite = filemap_page_mkwrite,
1740 .remap_pages = generic_file_remap_pages,
1740}; 1741};
1741 1742
1742/* This is used for a general mmap of a disk file */ 1743/* This is used for a general mmap of a disk file */
@@ -1749,7 +1750,6 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1749 return -ENOEXEC; 1750 return -ENOEXEC;
1750 file_accessed(file); 1751 file_accessed(file);
1751 vma->vm_ops = &generic_file_vm_ops; 1752 vma->vm_ops = &generic_file_vm_ops;
1752 vma->vm_flags |= VM_CAN_NONLINEAR;
1753 return 0; 1753 return 0;
1754} 1754}
1755 1755
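The filemap_fault() change above only skips the optional async readahead when FAULT_FLAG_TRIED is set; the fault itself still proceeds. A hedged sketch of how another ->fault() handler could apply the same check — my_readahead() and my_lookup_page() are hypothetical placeholders, not real kernel functions:

#include <linux/mm.h>

/* Hypothetical helpers -- placeholders for this sketch only. */
static void my_readahead(struct file *file, pgoff_t pgoff);
static struct page *my_lookup_page(struct file *file, pgoff_t pgoff);

static int my_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct page *page;

	/* On a retried fault, skip the optional readahead work. */
	if (!(vmf->flags & FAULT_FLAG_TRIED))
		my_readahead(vma->vm_file, vmf->pgoff);

	page = my_lookup_page(vma->vm_file, vmf->pgoff);
	if (!page)
		return VM_FAULT_SIGBUS;

	vmf->page = page;	/* caller expects a referenced page here */
	return 0;
}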
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 13e013b1270c..a912da6ddfd4 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -167,7 +167,6 @@ __xip_unmap (struct address_space * mapping,
167{ 167{
168 struct vm_area_struct *vma; 168 struct vm_area_struct *vma;
169 struct mm_struct *mm; 169 struct mm_struct *mm;
170 struct prio_tree_iter iter;
171 unsigned long address; 170 unsigned long address;
172 pte_t *pte; 171 pte_t *pte;
173 pte_t pteval; 172 pte_t pteval;
@@ -184,7 +183,7 @@ __xip_unmap (struct address_space * mapping,
184 183
185retry: 184retry:
186 mutex_lock(&mapping->i_mmap_mutex); 185 mutex_lock(&mapping->i_mmap_mutex);
187 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 186 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
188 mm = vma->vm_mm; 187 mm = vma->vm_mm;
189 address = vma->vm_start + 188 address = vma->vm_start +
190 ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); 189 ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
@@ -193,11 +192,13 @@ retry:
193 if (pte) { 192 if (pte) {
194 /* Nuke the page table entry. */ 193 /* Nuke the page table entry. */
195 flush_cache_page(vma, address, pte_pfn(*pte)); 194 flush_cache_page(vma, address, pte_pfn(*pte));
196 pteval = ptep_clear_flush_notify(vma, address, pte); 195 pteval = ptep_clear_flush(vma, address, pte);
197 page_remove_rmap(page); 196 page_remove_rmap(page);
198 dec_mm_counter(mm, MM_FILEPAGES); 197 dec_mm_counter(mm, MM_FILEPAGES);
199 BUG_ON(pte_dirty(pteval)); 198 BUG_ON(pte_dirty(pteval));
200 pte_unmap_unlock(pte, ptl); 199 pte_unmap_unlock(pte, ptl);
200 /* must invalidate_page _before_ freeing the page */
201 mmu_notifier_invalidate_page(mm, address);
201 page_cache_release(page); 202 page_cache_release(page);
202 } 203 }
203 } 204 }
@@ -305,6 +306,7 @@ out:
305static const struct vm_operations_struct xip_file_vm_ops = { 306static const struct vm_operations_struct xip_file_vm_ops = {
306 .fault = xip_file_fault, 307 .fault = xip_file_fault,
307 .page_mkwrite = filemap_page_mkwrite, 308 .page_mkwrite = filemap_page_mkwrite,
309 .remap_pages = generic_file_remap_pages,
308}; 310};
309 311
310int xip_file_mmap(struct file * file, struct vm_area_struct * vma) 312int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
@@ -313,7 +315,7 @@ int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
313 315
314 file_accessed(file); 316 file_accessed(file);
315 vma->vm_ops = &xip_file_vm_ops; 317 vma->vm_ops = &xip_file_vm_ops;
316 vma->vm_flags |= VM_CAN_NONLINEAR | VM_MIXEDMAP; 318 vma->vm_flags |= VM_MIXEDMAP;
317 return 0; 319 return 0;
318} 320}
319EXPORT_SYMBOL_GPL(xip_file_mmap); 321EXPORT_SYMBOL_GPL(xip_file_mmap);
diff --git a/mm/fremap.c b/mm/fremap.c
index 048659c0c03d..3899a86851ce 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -5,6 +5,7 @@
5 * 5 *
6 * started by Ingo Molnar, Copyright (C) 2002, 2003 6 * started by Ingo Molnar, Copyright (C) 2002, 2003
7 */ 7 */
8#include <linux/export.h>
8#include <linux/backing-dev.h> 9#include <linux/backing-dev.h>
9#include <linux/mm.h> 10#include <linux/mm.h>
10#include <linux/swap.h> 11#include <linux/swap.h>
@@ -80,9 +81,10 @@ out:
80 return err; 81 return err;
81} 82}
82 83
83static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma, 84int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr,
84 unsigned long addr, unsigned long size, pgoff_t pgoff) 85 unsigned long size, pgoff_t pgoff)
85{ 86{
87 struct mm_struct *mm = vma->vm_mm;
86 int err; 88 int err;
87 89
88 do { 90 do {
@@ -95,9 +97,9 @@ static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma,
95 pgoff++; 97 pgoff++;
96 } while (size); 98 } while (size);
97 99
98 return 0; 100 return 0;
99
100} 101}
102EXPORT_SYMBOL(generic_file_remap_pages);
101 103
102/** 104/**
103 * sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma 105 * sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma
@@ -167,7 +169,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
167 if (vma->vm_private_data && !(vma->vm_flags & VM_NONLINEAR)) 169 if (vma->vm_private_data && !(vma->vm_flags & VM_NONLINEAR))
168 goto out; 170 goto out;
169 171
170 if (!(vma->vm_flags & VM_CAN_NONLINEAR)) 172 if (!vma->vm_ops->remap_pages)
171 goto out; 173 goto out;
172 174
173 if (start < vma->vm_start || start + size > vma->vm_end) 175 if (start < vma->vm_start || start + size > vma->vm_end)
@@ -212,7 +214,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
212 mutex_lock(&mapping->i_mmap_mutex); 214 mutex_lock(&mapping->i_mmap_mutex);
213 flush_dcache_mmap_lock(mapping); 215 flush_dcache_mmap_lock(mapping);
214 vma->vm_flags |= VM_NONLINEAR; 216 vma->vm_flags |= VM_NONLINEAR;
215 vma_prio_tree_remove(vma, &mapping->i_mmap); 217 vma_interval_tree_remove(vma, &mapping->i_mmap);
216 vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); 218 vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
217 flush_dcache_mmap_unlock(mapping); 219 flush_dcache_mmap_unlock(mapping);
218 mutex_unlock(&mapping->i_mmap_mutex); 220 mutex_unlock(&mapping->i_mmap_mutex);
@@ -228,7 +230,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
228 } 230 }
229 231
230 mmu_notifier_invalidate_range_start(mm, start, start + size); 232 mmu_notifier_invalidate_range_start(mm, start, start + size);
231 err = populate_range(mm, vma, start, size, pgoff); 233 err = vma->vm_ops->remap_pages(vma, start, size, pgoff);
232 mmu_notifier_invalidate_range_end(mm, start, start + size); 234 mmu_notifier_invalidate_range_end(mm, start, start + size);
233 if (!err && !(flags & MAP_NONBLOCK)) { 235 if (!err && !(flags & MAP_NONBLOCK)) {
234 if (vma->vm_flags & VM_LOCKED) { 236 if (vma->vm_flags & VM_LOCKED) {
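With VM_CAN_NONLINEAR gone, a file mapping opts in to remap_file_pages() simply by providing a ->remap_pages operation, usually the generic_file_remap_pages() helper exported above; this mirrors the generic_file_vm_ops change earlier in the patch. A sketch for a hypothetical filesystem ("myfs" is an invented name):

#include <linux/fs.h>
#include <linux/mm.h>

static const struct vm_operations_struct myfs_file_vm_ops = {
	.fault		= filemap_fault,
	.page_mkwrite	= filemap_page_mkwrite,
	.remap_pages	= generic_file_remap_pages,
};

static int myfs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	file_accessed(file);
	vma->vm_ops = &myfs_file_vm_ops;
	return 0;
}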
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 141dbb695097..a863af26c79c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -102,10 +102,7 @@ static int set_recommended_min_free_kbytes(void)
102 unsigned long recommended_min; 102 unsigned long recommended_min;
103 extern int min_free_kbytes; 103 extern int min_free_kbytes;
104 104
105 if (!test_bit(TRANSPARENT_HUGEPAGE_FLAG, 105 if (!khugepaged_enabled())
106 &transparent_hugepage_flags) &&
107 !test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
108 &transparent_hugepage_flags))
109 return 0; 106 return 0;
110 107
111 for_each_populated_zone(zone) 108 for_each_populated_zone(zone)
@@ -139,12 +136,6 @@ static int start_khugepaged(void)
139{ 136{
140 int err = 0; 137 int err = 0;
141 if (khugepaged_enabled()) { 138 if (khugepaged_enabled()) {
142 int wakeup;
143 if (unlikely(!mm_slot_cache || !mm_slots_hash)) {
144 err = -ENOMEM;
145 goto out;
146 }
147 mutex_lock(&khugepaged_mutex);
148 if (!khugepaged_thread) 139 if (!khugepaged_thread)
149 khugepaged_thread = kthread_run(khugepaged, NULL, 140 khugepaged_thread = kthread_run(khugepaged, NULL,
150 "khugepaged"); 141 "khugepaged");
@@ -154,16 +145,16 @@ static int start_khugepaged(void)
154 err = PTR_ERR(khugepaged_thread); 145 err = PTR_ERR(khugepaged_thread);
155 khugepaged_thread = NULL; 146 khugepaged_thread = NULL;
156 } 147 }
157 wakeup = !list_empty(&khugepaged_scan.mm_head); 148
158 mutex_unlock(&khugepaged_mutex); 149 if (!list_empty(&khugepaged_scan.mm_head))
159 if (wakeup)
160 wake_up_interruptible(&khugepaged_wait); 150 wake_up_interruptible(&khugepaged_wait);
161 151
162 set_recommended_min_free_kbytes(); 152 set_recommended_min_free_kbytes();
163 } else 153 } else if (khugepaged_thread) {
164 /* wakeup to exit */ 154 kthread_stop(khugepaged_thread);
165 wake_up_interruptible(&khugepaged_wait); 155 khugepaged_thread = NULL;
166out: 156 }
157
167 return err; 158 return err;
168} 159}
169 160
@@ -224,18 +215,16 @@ static ssize_t enabled_store(struct kobject *kobj,
224 TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); 215 TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
225 216
226 if (ret > 0) { 217 if (ret > 0) {
227 int err = start_khugepaged(); 218 int err;
219
220 mutex_lock(&khugepaged_mutex);
221 err = start_khugepaged();
222 mutex_unlock(&khugepaged_mutex);
223
228 if (err) 224 if (err)
229 ret = err; 225 ret = err;
230 } 226 }
231 227
232 if (ret > 0 &&
233 (test_bit(TRANSPARENT_HUGEPAGE_FLAG,
234 &transparent_hugepage_flags) ||
235 test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
236 &transparent_hugepage_flags)))
237 set_recommended_min_free_kbytes();
238
239 return ret; 228 return ret;
240} 229}
241static struct kobj_attribute enabled_attr = 230static struct kobj_attribute enabled_attr =
@@ -570,8 +559,6 @@ static int __init hugepage_init(void)
570 559
571 start_khugepaged(); 560 start_khugepaged();
572 561
573 set_recommended_min_free_kbytes();
574
575 return 0; 562 return 0;
576out: 563out:
577 hugepage_exit_sysfs(hugepage_kobj); 564 hugepage_exit_sysfs(hugepage_kobj);
@@ -611,19 +598,6 @@ out:
611} 598}
612__setup("transparent_hugepage=", setup_transparent_hugepage); 599__setup("transparent_hugepage=", setup_transparent_hugepage);
613 600
614static void prepare_pmd_huge_pte(pgtable_t pgtable,
615 struct mm_struct *mm)
616{
617 assert_spin_locked(&mm->page_table_lock);
618
619 /* FIFO */
620 if (!mm->pmd_huge_pte)
621 INIT_LIST_HEAD(&pgtable->lru);
622 else
623 list_add(&pgtable->lru, &mm->pmd_huge_pte->lru);
624 mm->pmd_huge_pte = pgtable;
625}
626
627static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) 601static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
628{ 602{
629 if (likely(vma->vm_flags & VM_WRITE)) 603 if (likely(vma->vm_flags & VM_WRITE))
@@ -665,7 +639,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
665 */ 639 */
666 page_add_new_anon_rmap(page, vma, haddr); 640 page_add_new_anon_rmap(page, vma, haddr);
667 set_pmd_at(mm, haddr, pmd, entry); 641 set_pmd_at(mm, haddr, pmd, entry);
668 prepare_pmd_huge_pte(pgtable, mm); 642 pgtable_trans_huge_deposit(mm, pgtable);
669 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); 643 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
670 mm->nr_ptes++; 644 mm->nr_ptes++;
671 spin_unlock(&mm->page_table_lock); 645 spin_unlock(&mm->page_table_lock);
@@ -791,7 +765,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
791 pmdp_set_wrprotect(src_mm, addr, src_pmd); 765 pmdp_set_wrprotect(src_mm, addr, src_pmd);
792 pmd = pmd_mkold(pmd_wrprotect(pmd)); 766 pmd = pmd_mkold(pmd_wrprotect(pmd));
793 set_pmd_at(dst_mm, addr, dst_pmd, pmd); 767 set_pmd_at(dst_mm, addr, dst_pmd, pmd);
794 prepare_pmd_huge_pte(pgtable, dst_mm); 768 pgtable_trans_huge_deposit(dst_mm, pgtable);
795 dst_mm->nr_ptes++; 769 dst_mm->nr_ptes++;
796 770
797 ret = 0; 771 ret = 0;
@@ -802,25 +776,6 @@ out:
802 return ret; 776 return ret;
803} 777}
804 778
805/* no "address" argument so destroys page coloring of some arch */
806pgtable_t get_pmd_huge_pte(struct mm_struct *mm)
807{
808 pgtable_t pgtable;
809
810 assert_spin_locked(&mm->page_table_lock);
811
812 /* FIFO */
813 pgtable = mm->pmd_huge_pte;
814 if (list_empty(&pgtable->lru))
815 mm->pmd_huge_pte = NULL;
816 else {
817 mm->pmd_huge_pte = list_entry(pgtable->lru.next,
818 struct page, lru);
819 list_del(&pgtable->lru);
820 }
821 return pgtable;
822}
823
824static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, 779static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
825 struct vm_area_struct *vma, 780 struct vm_area_struct *vma,
826 unsigned long address, 781 unsigned long address,
@@ -832,6 +787,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
832 pmd_t _pmd; 787 pmd_t _pmd;
833 int ret = 0, i; 788 int ret = 0, i;
834 struct page **pages; 789 struct page **pages;
790 unsigned long mmun_start; /* For mmu_notifiers */
791 unsigned long mmun_end; /* For mmu_notifiers */
835 792
836 pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR, 793 pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR,
837 GFP_KERNEL); 794 GFP_KERNEL);
@@ -868,15 +825,19 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
868 cond_resched(); 825 cond_resched();
869 } 826 }
870 827
828 mmun_start = haddr;
829 mmun_end = haddr + HPAGE_PMD_SIZE;
830 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
831
871 spin_lock(&mm->page_table_lock); 832 spin_lock(&mm->page_table_lock);
872 if (unlikely(!pmd_same(*pmd, orig_pmd))) 833 if (unlikely(!pmd_same(*pmd, orig_pmd)))
873 goto out_free_pages; 834 goto out_free_pages;
874 VM_BUG_ON(!PageHead(page)); 835 VM_BUG_ON(!PageHead(page));
875 836
876 pmdp_clear_flush_notify(vma, haddr, pmd); 837 pmdp_clear_flush(vma, haddr, pmd);
877 /* leave pmd empty until pte is filled */ 838 /* leave pmd empty until pte is filled */
878 839
879 pgtable = get_pmd_huge_pte(mm); 840 pgtable = pgtable_trans_huge_withdraw(mm);
880 pmd_populate(mm, &_pmd, pgtable); 841 pmd_populate(mm, &_pmd, pgtable);
881 842
882 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { 843 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
@@ -896,6 +857,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
896 page_remove_rmap(page); 857 page_remove_rmap(page);
897 spin_unlock(&mm->page_table_lock); 858 spin_unlock(&mm->page_table_lock);
898 859
860 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
861
899 ret |= VM_FAULT_WRITE; 862 ret |= VM_FAULT_WRITE;
900 put_page(page); 863 put_page(page);
901 864
@@ -904,6 +867,7 @@ out:
904 867
905out_free_pages: 868out_free_pages:
906 spin_unlock(&mm->page_table_lock); 869 spin_unlock(&mm->page_table_lock);
870 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
907 mem_cgroup_uncharge_start(); 871 mem_cgroup_uncharge_start();
908 for (i = 0; i < HPAGE_PMD_NR; i++) { 872 for (i = 0; i < HPAGE_PMD_NR; i++) {
909 mem_cgroup_uncharge_page(pages[i]); 873 mem_cgroup_uncharge_page(pages[i]);
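The fallback-COW hunk above shows the pattern this series applies to every transparent-hugepage path: the *_notify page-table helpers such as pmdp_clear_flush_notify() are dropped, the affected range is recorded in mmun_start/mmun_end, and mmu_notifier_invalidate_range_start()/end() bracket the critical section from outside the page-table spinlock, where notifier callbacks (which may sleep) are safe to run. Schematically, as an illustrative fragment rather than additional code from the patch:

	mmun_start = haddr;
	mmun_end   = haddr + HPAGE_PMD_SIZE;
	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);

	spin_lock(&mm->page_table_lock);
	/* ... clear and repopulate the pmd with pmdp_clear_flush() etc. ... */
	spin_unlock(&mm->page_table_lock);

	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);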
@@ -920,6 +884,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
920 int ret = 0; 884 int ret = 0;
921 struct page *page, *new_page; 885 struct page *page, *new_page;
922 unsigned long haddr; 886 unsigned long haddr;
887 unsigned long mmun_start; /* For mmu_notifiers */
888 unsigned long mmun_end; /* For mmu_notifiers */
923 889
924 VM_BUG_ON(!vma->anon_vma); 890 VM_BUG_ON(!vma->anon_vma);
925 spin_lock(&mm->page_table_lock); 891 spin_lock(&mm->page_table_lock);
@@ -934,7 +900,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
934 entry = pmd_mkyoung(orig_pmd); 900 entry = pmd_mkyoung(orig_pmd);
935 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 901 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
936 if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1)) 902 if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1))
937 update_mmu_cache(vma, address, entry); 903 update_mmu_cache_pmd(vma, address, pmd);
938 ret |= VM_FAULT_WRITE; 904 ret |= VM_FAULT_WRITE;
939 goto out_unlock; 905 goto out_unlock;
940 } 906 }
@@ -970,38 +936,47 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
970 copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); 936 copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
971 __SetPageUptodate(new_page); 937 __SetPageUptodate(new_page);
972 938
939 mmun_start = haddr;
940 mmun_end = haddr + HPAGE_PMD_SIZE;
941 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
942
973 spin_lock(&mm->page_table_lock); 943 spin_lock(&mm->page_table_lock);
974 put_page(page); 944 put_page(page);
975 if (unlikely(!pmd_same(*pmd, orig_pmd))) { 945 if (unlikely(!pmd_same(*pmd, orig_pmd))) {
976 spin_unlock(&mm->page_table_lock); 946 spin_unlock(&mm->page_table_lock);
977 mem_cgroup_uncharge_page(new_page); 947 mem_cgroup_uncharge_page(new_page);
978 put_page(new_page); 948 put_page(new_page);
979 goto out; 949 goto out_mn;
980 } else { 950 } else {
981 pmd_t entry; 951 pmd_t entry;
982 VM_BUG_ON(!PageHead(page)); 952 VM_BUG_ON(!PageHead(page));
983 entry = mk_pmd(new_page, vma->vm_page_prot); 953 entry = mk_pmd(new_page, vma->vm_page_prot);
984 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 954 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
985 entry = pmd_mkhuge(entry); 955 entry = pmd_mkhuge(entry);
986 pmdp_clear_flush_notify(vma, haddr, pmd); 956 pmdp_clear_flush(vma, haddr, pmd);
987 page_add_new_anon_rmap(new_page, vma, haddr); 957 page_add_new_anon_rmap(new_page, vma, haddr);
988 set_pmd_at(mm, haddr, pmd, entry); 958 set_pmd_at(mm, haddr, pmd, entry);
989 update_mmu_cache(vma, address, entry); 959 update_mmu_cache_pmd(vma, address, pmd);
990 page_remove_rmap(page); 960 page_remove_rmap(page);
991 put_page(page); 961 put_page(page);
992 ret |= VM_FAULT_WRITE; 962 ret |= VM_FAULT_WRITE;
993 } 963 }
994out_unlock:
995 spin_unlock(&mm->page_table_lock); 964 spin_unlock(&mm->page_table_lock);
965out_mn:
966 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
996out: 967out:
997 return ret; 968 return ret;
969out_unlock:
970 spin_unlock(&mm->page_table_lock);
971 return ret;
998} 972}
999 973
1000struct page *follow_trans_huge_pmd(struct mm_struct *mm, 974struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
1001 unsigned long addr, 975 unsigned long addr,
1002 pmd_t *pmd, 976 pmd_t *pmd,
1003 unsigned int flags) 977 unsigned int flags)
1004{ 978{
979 struct mm_struct *mm = vma->vm_mm;
1005 struct page *page = NULL; 980 struct page *page = NULL;
1006 981
1007 assert_spin_locked(&mm->page_table_lock); 982 assert_spin_locked(&mm->page_table_lock);
@@ -1024,6 +999,14 @@ struct page *follow_trans_huge_pmd(struct mm_struct *mm,
1024 _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); 999 _pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
1025 set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd); 1000 set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd);
1026 } 1001 }
1002 if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
1003 if (page->mapping && trylock_page(page)) {
1004 lru_add_drain();
1005 if (page->mapping)
1006 mlock_vma_page(page);
1007 unlock_page(page);
1008 }
1009 }
1027 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; 1010 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
1028 VM_BUG_ON(!PageCompound(page)); 1011 VM_BUG_ON(!PageCompound(page));
1029 if (flags & FOLL_GET) 1012 if (flags & FOLL_GET)
@@ -1041,9 +1024,10 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1041 if (__pmd_trans_huge_lock(pmd, vma) == 1) { 1024 if (__pmd_trans_huge_lock(pmd, vma) == 1) {
1042 struct page *page; 1025 struct page *page;
1043 pgtable_t pgtable; 1026 pgtable_t pgtable;
1044 pgtable = get_pmd_huge_pte(tlb->mm); 1027 pmd_t orig_pmd;
1045 page = pmd_page(*pmd); 1028 pgtable = pgtable_trans_huge_withdraw(tlb->mm);
1046 pmd_clear(pmd); 1029 orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd);
1030 page = pmd_page(orig_pmd);
1047 tlb_remove_pmd_tlb_entry(tlb, pmd, addr); 1031 tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
1048 page_remove_rmap(page); 1032 page_remove_rmap(page);
1049 VM_BUG_ON(page_mapcount(page) < 0); 1033 VM_BUG_ON(page_mapcount(page) < 0);
@@ -1207,7 +1191,11 @@ static int __split_huge_page_splitting(struct page *page,
1207 struct mm_struct *mm = vma->vm_mm; 1191 struct mm_struct *mm = vma->vm_mm;
1208 pmd_t *pmd; 1192 pmd_t *pmd;
1209 int ret = 0; 1193 int ret = 0;
1194 /* For mmu_notifiers */
1195 const unsigned long mmun_start = address;
1196 const unsigned long mmun_end = address + HPAGE_PMD_SIZE;
1210 1197
1198 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
1211 spin_lock(&mm->page_table_lock); 1199 spin_lock(&mm->page_table_lock);
1212 pmd = page_check_address_pmd(page, mm, address, 1200 pmd = page_check_address_pmd(page, mm, address,
1213 PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG); 1201 PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG);
@@ -1219,10 +1207,11 @@ static int __split_huge_page_splitting(struct page *page,
1219 * and it won't wait on the anon_vma->root->mutex to 1207 * and it won't wait on the anon_vma->root->mutex to
1220 * serialize against split_huge_page*. 1208 * serialize against split_huge_page*.
1221 */ 1209 */
1222 pmdp_splitting_flush_notify(vma, address, pmd); 1210 pmdp_splitting_flush(vma, address, pmd);
1223 ret = 1; 1211 ret = 1;
1224 } 1212 }
1225 spin_unlock(&mm->page_table_lock); 1213 spin_unlock(&mm->page_table_lock);
1214 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1226 1215
1227 return ret; 1216 return ret;
1228} 1217}
@@ -1358,11 +1347,11 @@ static int __split_huge_page_map(struct page *page,
1358 pmd = page_check_address_pmd(page, mm, address, 1347 pmd = page_check_address_pmd(page, mm, address,
1359 PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG); 1348 PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
1360 if (pmd) { 1349 if (pmd) {
1361 pgtable = get_pmd_huge_pte(mm); 1350 pgtable = pgtable_trans_huge_withdraw(mm);
1362 pmd_populate(mm, &_pmd, pgtable); 1351 pmd_populate(mm, &_pmd, pgtable);
1363 1352
1364 for (i = 0, haddr = address; i < HPAGE_PMD_NR; 1353 haddr = address;
1365 i++, haddr += PAGE_SIZE) { 1354 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
1366 pte_t *pte, entry; 1355 pte_t *pte, entry;
1367 BUG_ON(PageCompound(page+i)); 1356 BUG_ON(PageCompound(page+i));
1368 entry = mk_pte(page + i, vma->vm_page_prot); 1357 entry = mk_pte(page + i, vma->vm_page_prot);
@@ -1406,8 +1395,7 @@ static int __split_huge_page_map(struct page *page,
1406 * SMP TLB and finally we write the non-huge version 1395 * SMP TLB and finally we write the non-huge version
1407 * of the pmd entry with pmd_populate. 1396 * of the pmd entry with pmd_populate.
1408 */ 1397 */
1409 set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd)); 1398 pmdp_invalidate(vma, address, pmd);
1410 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
1411 pmd_populate(mm, pmd, pgtable); 1399 pmd_populate(mm, pmd, pgtable);
1412 ret = 1; 1400 ret = 1;
1413 } 1401 }
@@ -1421,18 +1409,17 @@ static void __split_huge_page(struct page *page,
1421 struct anon_vma *anon_vma) 1409 struct anon_vma *anon_vma)
1422{ 1410{
1423 int mapcount, mapcount2; 1411 int mapcount, mapcount2;
1412 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
1424 struct anon_vma_chain *avc; 1413 struct anon_vma_chain *avc;
1425 1414
1426 BUG_ON(!PageHead(page)); 1415 BUG_ON(!PageHead(page));
1427 BUG_ON(PageTail(page)); 1416 BUG_ON(PageTail(page));
1428 1417
1429 mapcount = 0; 1418 mapcount = 0;
1430 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { 1419 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
1431 struct vm_area_struct *vma = avc->vma; 1420 struct vm_area_struct *vma = avc->vma;
1432 unsigned long addr = vma_address(page, vma); 1421 unsigned long addr = vma_address(page, vma);
1433 BUG_ON(is_vma_temporary_stack(vma)); 1422 BUG_ON(is_vma_temporary_stack(vma));
1434 if (addr == -EFAULT)
1435 continue;
1436 mapcount += __split_huge_page_splitting(page, vma, addr); 1423 mapcount += __split_huge_page_splitting(page, vma, addr);
1437 } 1424 }
1438 /* 1425 /*
@@ -1453,12 +1440,10 @@ static void __split_huge_page(struct page *page,
1453 __split_huge_page_refcount(page); 1440 __split_huge_page_refcount(page);
1454 1441
1455 mapcount2 = 0; 1442 mapcount2 = 0;
1456 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { 1443 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
1457 struct vm_area_struct *vma = avc->vma; 1444 struct vm_area_struct *vma = avc->vma;
1458 unsigned long addr = vma_address(page, vma); 1445 unsigned long addr = vma_address(page, vma);
1459 BUG_ON(is_vma_temporary_stack(vma)); 1446 BUG_ON(is_vma_temporary_stack(vma));
1460 if (addr == -EFAULT)
1461 continue;
1462 mapcount2 += __split_huge_page_map(page, vma, addr); 1447 mapcount2 += __split_huge_page_map(page, vma, addr);
1463 } 1448 }
1464 if (mapcount != mapcount2) 1449 if (mapcount != mapcount2)
@@ -1491,12 +1476,13 @@ out:
1491 return ret; 1476 return ret;
1492} 1477}
1493 1478
1494#define VM_NO_THP (VM_SPECIAL|VM_INSERTPAGE|VM_MIXEDMAP|VM_SAO| \ 1479#define VM_NO_THP (VM_SPECIAL|VM_MIXEDMAP|VM_HUGETLB|VM_SHARED|VM_MAYSHARE)
1495 VM_HUGETLB|VM_SHARED|VM_MAYSHARE)
1496 1480
1497int hugepage_madvise(struct vm_area_struct *vma, 1481int hugepage_madvise(struct vm_area_struct *vma,
1498 unsigned long *vm_flags, int advice) 1482 unsigned long *vm_flags, int advice)
1499{ 1483{
1484 struct mm_struct *mm = vma->vm_mm;
1485
1500 switch (advice) { 1486 switch (advice) {
1501 case MADV_HUGEPAGE: 1487 case MADV_HUGEPAGE:
1502 /* 1488 /*
@@ -1504,6 +1490,8 @@ int hugepage_madvise(struct vm_area_struct *vma,
1504 */ 1490 */
1505 if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP)) 1491 if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP))
1506 return -EINVAL; 1492 return -EINVAL;
1493 if (mm->def_flags & VM_NOHUGEPAGE)
1494 return -EINVAL;
1507 *vm_flags &= ~VM_NOHUGEPAGE; 1495 *vm_flags &= ~VM_NOHUGEPAGE;
1508 *vm_flags |= VM_HUGEPAGE; 1496 *vm_flags |= VM_HUGEPAGE;
1509 /* 1497 /*
@@ -1655,11 +1643,7 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
1655 if (vma->vm_ops) 1643 if (vma->vm_ops)
1656 /* khugepaged not yet working on file or special mappings */ 1644 /* khugepaged not yet working on file or special mappings */
1657 return 0; 1645 return 0;
1658 /* 1646 VM_BUG_ON(vma->vm_flags & VM_NO_THP);
1659 * If is_pfn_mapping() is true is_learn_pfn_mapping() must be
1660 * true too, verify it here.
1661 */
1662 VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP);
1663 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; 1647 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
1664 hend = vma->vm_end & HPAGE_PMD_MASK; 1648 hend = vma->vm_end & HPAGE_PMD_MASK;
1665 if (hstart < hend) 1649 if (hstart < hend)
@@ -1833,28 +1817,35 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
1833 } 1817 }
1834} 1818}
1835 1819
1836static void collapse_huge_page(struct mm_struct *mm, 1820static void khugepaged_alloc_sleep(void)
1837 unsigned long address,
1838 struct page **hpage,
1839 struct vm_area_struct *vma,
1840 int node)
1841{ 1821{
1842 pgd_t *pgd; 1822 wait_event_freezable_timeout(khugepaged_wait, false,
1843 pud_t *pud; 1823 msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
1844 pmd_t *pmd, _pmd; 1824}
1845 pte_t *pte;
1846 pgtable_t pgtable;
1847 struct page *new_page;
1848 spinlock_t *ptl;
1849 int isolated;
1850 unsigned long hstart, hend;
1851 1825
1852 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 1826#ifdef CONFIG_NUMA
1853#ifndef CONFIG_NUMA 1827static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
1854 up_read(&mm->mmap_sem); 1828{
1855 VM_BUG_ON(!*hpage); 1829 if (IS_ERR(*hpage)) {
1856 new_page = *hpage; 1830 if (!*wait)
1857#else 1831 return false;
1832
1833 *wait = false;
1834 *hpage = NULL;
1835 khugepaged_alloc_sleep();
1836 } else if (*hpage) {
1837 put_page(*hpage);
1838 *hpage = NULL;
1839 }
1840
1841 return true;
1842}
1843
1844static struct page
1845*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm,
1846 struct vm_area_struct *vma, unsigned long address,
1847 int node)
1848{
1858 VM_BUG_ON(*hpage); 1849 VM_BUG_ON(*hpage);
1859 /* 1850 /*
1860 * Allocate the page while the vma is still valid and under 1851 * Allocate the page while the vma is still valid and under
@@ -1866,7 +1857,7 @@ static void collapse_huge_page(struct mm_struct *mm,
1866 * mmap_sem in read mode is good idea also to allow greater 1857 * mmap_sem in read mode is good idea also to allow greater
1867 * scalability. 1858 * scalability.
1868 */ 1859 */
1869 new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address, 1860 *hpage = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
1870 node, __GFP_OTHER_NODE); 1861 node, __GFP_OTHER_NODE);
1871 1862
1872 /* 1863 /*
@@ -1874,20 +1865,85 @@ static void collapse_huge_page(struct mm_struct *mm,
1874 * preparation for taking it in write mode. 1865 * preparation for taking it in write mode.
1875 */ 1866 */
1876 up_read(&mm->mmap_sem); 1867 up_read(&mm->mmap_sem);
1877 if (unlikely(!new_page)) { 1868 if (unlikely(!*hpage)) {
1878 count_vm_event(THP_COLLAPSE_ALLOC_FAILED); 1869 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
1879 *hpage = ERR_PTR(-ENOMEM); 1870 *hpage = ERR_PTR(-ENOMEM);
1880 return; 1871 return NULL;
1881 } 1872 }
1882#endif
1883 1873
1884 count_vm_event(THP_COLLAPSE_ALLOC); 1874 count_vm_event(THP_COLLAPSE_ALLOC);
1885 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { 1875 return *hpage;
1886#ifdef CONFIG_NUMA 1876}
1887 put_page(new_page); 1877#else
1878static struct page *khugepaged_alloc_hugepage(bool *wait)
1879{
1880 struct page *hpage;
1881
1882 do {
1883 hpage = alloc_hugepage(khugepaged_defrag());
1884 if (!hpage) {
1885 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
1886 if (!*wait)
1887 return NULL;
1888
1889 *wait = false;
1890 khugepaged_alloc_sleep();
1891 } else
1892 count_vm_event(THP_COLLAPSE_ALLOC);
1893 } while (unlikely(!hpage) && likely(khugepaged_enabled()));
1894
1895 return hpage;
1896}
1897
1898static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
1899{
1900 if (!*hpage)
1901 *hpage = khugepaged_alloc_hugepage(wait);
1902
1903 if (unlikely(!*hpage))
1904 return false;
1905
1906 return true;
1907}
1908
1909static struct page
1910*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm,
1911 struct vm_area_struct *vma, unsigned long address,
1912 int node)
1913{
1914 up_read(&mm->mmap_sem);
1915 VM_BUG_ON(!*hpage);
1916 return *hpage;
1917}
1888#endif 1918#endif
1919
1920static void collapse_huge_page(struct mm_struct *mm,
1921 unsigned long address,
1922 struct page **hpage,
1923 struct vm_area_struct *vma,
1924 int node)
1925{
1926 pgd_t *pgd;
1927 pud_t *pud;
1928 pmd_t *pmd, _pmd;
1929 pte_t *pte;
1930 pgtable_t pgtable;
1931 struct page *new_page;
1932 spinlock_t *ptl;
1933 int isolated;
1934 unsigned long hstart, hend;
1935 unsigned long mmun_start; /* For mmu_notifiers */
1936 unsigned long mmun_end; /* For mmu_notifiers */
1937
1938 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1939
1940 /* release the mmap_sem read lock. */
1941 new_page = khugepaged_alloc_page(hpage, mm, vma, address, node);
1942 if (!new_page)
1943 return;
1944
1945 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)))
1889 return; 1946 return;
1890 }
1891 1947
1892 /* 1948 /*
1893 * Prevent all access to pagetables with the exception of 1949 * Prevent all access to pagetables with the exception of
@@ -1912,11 +1968,7 @@ static void collapse_huge_page(struct mm_struct *mm,
1912 goto out; 1968 goto out;
1913 if (is_vma_temporary_stack(vma)) 1969 if (is_vma_temporary_stack(vma))
1914 goto out; 1970 goto out;
1915 /* 1971 VM_BUG_ON(vma->vm_flags & VM_NO_THP);
1916 * If is_pfn_mapping() is true is_learn_pfn_mapping() must be
1917 * true too, verify it here.
1918 */
1919 VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP);
1920 1972
1921 pgd = pgd_offset(mm, address); 1973 pgd = pgd_offset(mm, address);
1922 if (!pgd_present(*pgd)) 1974 if (!pgd_present(*pgd))
@@ -1936,6 +1988,9 @@ static void collapse_huge_page(struct mm_struct *mm,
1936 pte = pte_offset_map(pmd, address); 1988 pte = pte_offset_map(pmd, address);
1937 ptl = pte_lockptr(mm, pmd); 1989 ptl = pte_lockptr(mm, pmd);
1938 1990
1991 mmun_start = address;
1992 mmun_end = address + HPAGE_PMD_SIZE;
1993 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
1939 spin_lock(&mm->page_table_lock); /* probably unnecessary */ 1994 spin_lock(&mm->page_table_lock); /* probably unnecessary */
1940 /* 1995 /*
1941 * After this gup_fast can't run anymore. This also removes 1996 * After this gup_fast can't run anymore. This also removes
@@ -1943,8 +1998,9 @@ static void collapse_huge_page(struct mm_struct *mm,
1943 * huge and small TLB entries for the same virtual address 1998 * huge and small TLB entries for the same virtual address
1944 * to avoid the risk of CPU bugs in that area. 1999 * to avoid the risk of CPU bugs in that area.
1945 */ 2000 */
1946 _pmd = pmdp_clear_flush_notify(vma, address, pmd); 2001 _pmd = pmdp_clear_flush(vma, address, pmd);
1947 spin_unlock(&mm->page_table_lock); 2002 spin_unlock(&mm->page_table_lock);
2003 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1948 2004
1949 spin_lock(ptl); 2005 spin_lock(ptl);
1950 isolated = __collapse_huge_page_isolate(vma, address, pte); 2006 isolated = __collapse_huge_page_isolate(vma, address, pte);
@@ -1970,8 +2026,6 @@ static void collapse_huge_page(struct mm_struct *mm,
1970 pte_unmap(pte); 2026 pte_unmap(pte);
1971 __SetPageUptodate(new_page); 2027 __SetPageUptodate(new_page);
1972 pgtable = pmd_pgtable(_pmd); 2028 pgtable = pmd_pgtable(_pmd);
1973 VM_BUG_ON(page_count(pgtable) != 1);
1974 VM_BUG_ON(page_mapcount(pgtable) != 0);
1975 2029
1976 _pmd = mk_pmd(new_page, vma->vm_page_prot); 2030 _pmd = mk_pmd(new_page, vma->vm_page_prot);
1977 _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); 2031 _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
@@ -1988,13 +2042,12 @@ static void collapse_huge_page(struct mm_struct *mm,
1988 BUG_ON(!pmd_none(*pmd)); 2042 BUG_ON(!pmd_none(*pmd));
1989 page_add_new_anon_rmap(new_page, vma, address); 2043 page_add_new_anon_rmap(new_page, vma, address);
1990 set_pmd_at(mm, address, pmd, _pmd); 2044 set_pmd_at(mm, address, pmd, _pmd);
1991 update_mmu_cache(vma, address, _pmd); 2045 update_mmu_cache_pmd(vma, address, pmd);
1992 prepare_pmd_huge_pte(pgtable, mm); 2046 pgtable_trans_huge_deposit(mm, pgtable);
1993 spin_unlock(&mm->page_table_lock); 2047 spin_unlock(&mm->page_table_lock);
1994 2048
1995#ifndef CONFIG_NUMA
1996 *hpage = NULL; 2049 *hpage = NULL;
1997#endif 2050
1998 khugepaged_pages_collapsed++; 2051 khugepaged_pages_collapsed++;
1999out_up_write: 2052out_up_write:
2000 up_write(&mm->mmap_sem); 2053 up_write(&mm->mmap_sem);
@@ -2002,9 +2055,6 @@ out_up_write:
2002 2055
2003out: 2056out:
2004 mem_cgroup_uncharge_page(new_page); 2057 mem_cgroup_uncharge_page(new_page);
2005#ifdef CONFIG_NUMA
2006 put_page(new_page);
2007#endif
2008 goto out_up_write; 2058 goto out_up_write;
2009} 2059}
2010 2060
@@ -2154,12 +2204,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
2154 goto skip; 2204 goto skip;
2155 if (is_vma_temporary_stack(vma)) 2205 if (is_vma_temporary_stack(vma))
2156 goto skip; 2206 goto skip;
2157 /* 2207 VM_BUG_ON(vma->vm_flags & VM_NO_THP);
2158 * If is_pfn_mapping() is true is_learn_pfn_mapping()
2159 * must be true too, verify it here.
2160 */
2161 VM_BUG_ON(is_linear_pfn_mapping(vma) ||
2162 vma->vm_flags & VM_NO_THP);
2163 2208
2164 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; 2209 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
2165 hend = vma->vm_end & HPAGE_PMD_MASK; 2210 hend = vma->vm_end & HPAGE_PMD_MASK;
@@ -2234,32 +2279,23 @@ static int khugepaged_has_work(void)
2234static int khugepaged_wait_event(void) 2279static int khugepaged_wait_event(void)
2235{ 2280{
2236 return !list_empty(&khugepaged_scan.mm_head) || 2281 return !list_empty(&khugepaged_scan.mm_head) ||
2237 !khugepaged_enabled(); 2282 kthread_should_stop();
2238} 2283}
2239 2284
2240static void khugepaged_do_scan(struct page **hpage) 2285static void khugepaged_do_scan(void)
2241{ 2286{
2287 struct page *hpage = NULL;
2242 unsigned int progress = 0, pass_through_head = 0; 2288 unsigned int progress = 0, pass_through_head = 0;
2243 unsigned int pages = khugepaged_pages_to_scan; 2289 unsigned int pages = khugepaged_pages_to_scan;
2290 bool wait = true;
2244 2291
2245 barrier(); /* write khugepaged_pages_to_scan to local stack */ 2292 barrier(); /* write khugepaged_pages_to_scan to local stack */
2246 2293
2247 while (progress < pages) { 2294 while (progress < pages) {
2248 cond_resched(); 2295 if (!khugepaged_prealloc_page(&hpage, &wait))
2249
2250#ifndef CONFIG_NUMA
2251 if (!*hpage) {
2252 *hpage = alloc_hugepage(khugepaged_defrag());
2253 if (unlikely(!*hpage)) {
2254 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
2255 break;
2256 }
2257 count_vm_event(THP_COLLAPSE_ALLOC);
2258 }
2259#else
2260 if (IS_ERR(*hpage))
2261 break; 2296 break;
2262#endif 2297
2298 cond_resched();
2263 2299
2264 if (unlikely(kthread_should_stop() || freezing(current))) 2300 if (unlikely(kthread_should_stop() || freezing(current)))
2265 break; 2301 break;
@@ -2270,73 +2306,32 @@ static void khugepaged_do_scan(struct page **hpage)
2270 if (khugepaged_has_work() && 2306 if (khugepaged_has_work() &&
2271 pass_through_head < 2) 2307 pass_through_head < 2)
2272 progress += khugepaged_scan_mm_slot(pages - progress, 2308 progress += khugepaged_scan_mm_slot(pages - progress,
2273 hpage); 2309 &hpage);
2274 else 2310 else
2275 progress = pages; 2311 progress = pages;
2276 spin_unlock(&khugepaged_mm_lock); 2312 spin_unlock(&khugepaged_mm_lock);
2277 } 2313 }
2278}
2279 2314
2280static void khugepaged_alloc_sleep(void) 2315 if (!IS_ERR_OR_NULL(hpage))
2281{ 2316 put_page(hpage);
2282 wait_event_freezable_timeout(khugepaged_wait, false,
2283 msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
2284} 2317}
2285 2318
2286#ifndef CONFIG_NUMA 2319static void khugepaged_wait_work(void)
2287static struct page *khugepaged_alloc_hugepage(void)
2288{ 2320{
2289 struct page *hpage; 2321 try_to_freeze();
2290
2291 do {
2292 hpage = alloc_hugepage(khugepaged_defrag());
2293 if (!hpage) {
2294 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
2295 khugepaged_alloc_sleep();
2296 } else
2297 count_vm_event(THP_COLLAPSE_ALLOC);
2298 } while (unlikely(!hpage) &&
2299 likely(khugepaged_enabled()));
2300 return hpage;
2301}
2302#endif
2303 2322
2304static void khugepaged_loop(void) 2323 if (khugepaged_has_work()) {
2305{ 2324 if (!khugepaged_scan_sleep_millisecs)
2306 struct page *hpage; 2325 return;
2307 2326
2308#ifdef CONFIG_NUMA 2327 wait_event_freezable_timeout(khugepaged_wait,
2309 hpage = NULL; 2328 kthread_should_stop(),
2310#endif 2329 msecs_to_jiffies(khugepaged_scan_sleep_millisecs));
2311 while (likely(khugepaged_enabled())) { 2330 return;
2312#ifndef CONFIG_NUMA
2313 hpage = khugepaged_alloc_hugepage();
2314 if (unlikely(!hpage))
2315 break;
2316#else
2317 if (IS_ERR(hpage)) {
2318 khugepaged_alloc_sleep();
2319 hpage = NULL;
2320 }
2321#endif
2322
2323 khugepaged_do_scan(&hpage);
2324#ifndef CONFIG_NUMA
2325 if (hpage)
2326 put_page(hpage);
2327#endif
2328 try_to_freeze();
2329 if (unlikely(kthread_should_stop()))
2330 break;
2331 if (khugepaged_has_work()) {
2332 if (!khugepaged_scan_sleep_millisecs)
2333 continue;
2334 wait_event_freezable_timeout(khugepaged_wait, false,
2335 msecs_to_jiffies(khugepaged_scan_sleep_millisecs));
2336 } else if (khugepaged_enabled())
2337 wait_event_freezable(khugepaged_wait,
2338 khugepaged_wait_event());
2339 } 2331 }
2332
2333 if (khugepaged_enabled())
2334 wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
2340} 2335}
2341 2336
2342static int khugepaged(void *none) 2337static int khugepaged(void *none)
@@ -2346,20 +2341,9 @@ static int khugepaged(void *none)
2346 set_freezable(); 2341 set_freezable();
2347 set_user_nice(current, 19); 2342 set_user_nice(current, 19);
2348 2343
2349 /* serialize with start_khugepaged() */ 2344 while (!kthread_should_stop()) {
2350 mutex_lock(&khugepaged_mutex); 2345 khugepaged_do_scan();
2351 2346 khugepaged_wait_work();
2352 for (;;) {
2353 mutex_unlock(&khugepaged_mutex);
2354 VM_BUG_ON(khugepaged_thread != current);
2355 khugepaged_loop();
2356 VM_BUG_ON(khugepaged_thread != current);
2357
2358 mutex_lock(&khugepaged_mutex);
2359 if (!khugepaged_enabled())
2360 break;
2361 if (unlikely(kthread_should_stop()))
2362 break;
2363 } 2347 }
2364 2348
2365 spin_lock(&khugepaged_mm_lock); 2349 spin_lock(&khugepaged_mm_lock);
@@ -2368,10 +2352,6 @@ static int khugepaged(void *none)
2368 if (mm_slot) 2352 if (mm_slot)
2369 collect_mm_slot(mm_slot); 2353 collect_mm_slot(mm_slot);
2370 spin_unlock(&khugepaged_mm_lock); 2354 spin_unlock(&khugepaged_mm_lock);
2371
2372 khugepaged_thread = NULL;
2373 mutex_unlock(&khugepaged_mutex);
2374
2375 return 0; 2355 return 0;
2376} 2356}
2377 2357
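Much of the churn in huge_memory.c comes from retiring the file-local prepare_pmd_huge_pte()/get_pmd_huge_pte() pair in favour of the generic pgtable_trans_huge_deposit()/pgtable_trans_huge_withdraw() helpers, which keep the same per-mm stash of preallocated page tables under mm->page_table_lock and can be overridden by architectures with their own page-table layout. The stash discipline is easy to model in plain C; the sketch below mirrors the list manipulation of the removed helpers, with struct pgtable and a file-scope pointer standing in for struct page and mm->pmd_huge_pte. It is an illustrative userspace model, not kernel code:

/* deposit_withdraw_model.c: userspace model of the page-table stash.
 * Build: cc -Wall -o deposit_withdraw_model deposit_withdraw_model.c */
#include <stddef.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *l) { l->next = l->prev = l; }

static void list_add(struct list_head *new, struct list_head *head)
{
	new->next = head->next;
	new->prev = head;
	head->next->prev = new;
	head->next = new;
}

static void list_del(struct list_head *entry)
{
	entry->prev->next = entry->next;
	entry->next->prev = entry->prev;
}

static int list_empty(const struct list_head *head)
{
	return head->next == head;
}

#define list_entry(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* stand-ins for struct page and mm_struct::pmd_huge_pte */
struct pgtable { int id; struct list_head lru; };
static struct pgtable *pmd_huge_pte;

/* same structure as the removed prepare_pmd_huge_pte() */
static void deposit(struct pgtable *pgtable)
{
	if (!pmd_huge_pte)
		INIT_LIST_HEAD(&pgtable->lru);
	else
		list_add(&pgtable->lru, &pmd_huge_pte->lru);
	pmd_huge_pte = pgtable;
}

/* same structure as the removed get_pmd_huge_pte() */
static struct pgtable *withdraw(void)
{
	struct pgtable *pgtable = pmd_huge_pte;

	if (list_empty(&pgtable->lru))
		pmd_huge_pte = NULL;
	else {
		pmd_huge_pte = list_entry(pgtable->lru.next,
					  struct pgtable, lru);
		list_del(&pgtable->lru);
	}
	return pgtable;
}

int main(void)
{
	struct pgtable t[3] = { { .id = 0 }, { .id = 1 }, { .id = 2 } };
	int i;

	for (i = 0; i < 3; i++)
		deposit(&t[i]);
	for (i = 0; i < 3; i++)
		printf("withdrew %d\n", withdraw()->id);	/* 2, then 0, then 1 */
	return 0;
}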
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index bc727122dd44..59a0059b39e2 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -30,7 +30,6 @@
30#include <linux/hugetlb.h> 30#include <linux/hugetlb.h>
31#include <linux/hugetlb_cgroup.h> 31#include <linux/hugetlb_cgroup.h>
32#include <linux/node.h> 32#include <linux/node.h>
33#include <linux/hugetlb_cgroup.h>
34#include "internal.h" 33#include "internal.h"
35 34
36const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; 35const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
@@ -637,6 +636,7 @@ static void free_huge_page(struct page *page)
637 h->surplus_huge_pages--; 636 h->surplus_huge_pages--;
638 h->surplus_huge_pages_node[nid]--; 637 h->surplus_huge_pages_node[nid]--;
639 } else { 638 } else {
639 arch_clear_hugepage_flags(page);
640 enqueue_huge_page(h, page); 640 enqueue_huge_page(h, page);
641 } 641 }
642 spin_unlock(&hugetlb_lock); 642 spin_unlock(&hugetlb_lock);
@@ -671,6 +671,11 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order)
671 } 671 }
672} 672}
673 673
674/*
675 * PageHuge() only returns true for hugetlbfs pages, but not for normal or
676 * transparent huge pages. See the PageTransHuge() documentation for more
677 * details.
678 */
674int PageHuge(struct page *page) 679int PageHuge(struct page *page)
675{ 680{
676 compound_page_dtor *dtor; 681 compound_page_dtor *dtor;
@@ -2355,13 +2360,15 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
2355 struct page *page; 2360 struct page *page;
2356 struct hstate *h = hstate_vma(vma); 2361 struct hstate *h = hstate_vma(vma);
2357 unsigned long sz = huge_page_size(h); 2362 unsigned long sz = huge_page_size(h);
2363 const unsigned long mmun_start = start; /* For mmu_notifiers */
2364 const unsigned long mmun_end = end; /* For mmu_notifiers */
2358 2365
2359 WARN_ON(!is_vm_hugetlb_page(vma)); 2366 WARN_ON(!is_vm_hugetlb_page(vma));
2360 BUG_ON(start & ~huge_page_mask(h)); 2367 BUG_ON(start & ~huge_page_mask(h));
2361 BUG_ON(end & ~huge_page_mask(h)); 2368 BUG_ON(end & ~huge_page_mask(h));
2362 2369
2363 tlb_start_vma(tlb, vma); 2370 tlb_start_vma(tlb, vma);
2364 mmu_notifier_invalidate_range_start(mm, start, end); 2371 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2365again: 2372again:
2366 spin_lock(&mm->page_table_lock); 2373 spin_lock(&mm->page_table_lock);
2367 for (address = start; address < end; address += sz) { 2374 for (address = start; address < end; address += sz) {
@@ -2425,7 +2432,7 @@ again:
2425 if (address < end && !ref_page) 2432 if (address < end && !ref_page)
2426 goto again; 2433 goto again;
2427 } 2434 }
2428 mmu_notifier_invalidate_range_end(mm, start, end); 2435 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2429 tlb_end_vma(tlb, vma); 2436 tlb_end_vma(tlb, vma);
2430} 2437}
2431 2438
@@ -2473,7 +2480,6 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
2473 struct hstate *h = hstate_vma(vma); 2480 struct hstate *h = hstate_vma(vma);
2474 struct vm_area_struct *iter_vma; 2481 struct vm_area_struct *iter_vma;
2475 struct address_space *mapping; 2482 struct address_space *mapping;
2476 struct prio_tree_iter iter;
2477 pgoff_t pgoff; 2483 pgoff_t pgoff;
2478 2484
2479 /* 2485 /*
@@ -2481,7 +2487,8 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
2481 * from page cache lookup which is in HPAGE_SIZE units. 2487 * from page cache lookup which is in HPAGE_SIZE units.
2482 */ 2488 */
2483 address = address & huge_page_mask(h); 2489 address = address & huge_page_mask(h);
2484 pgoff = vma_hugecache_offset(h, vma, address); 2490 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) +
2491 vma->vm_pgoff;
2485 mapping = vma->vm_file->f_dentry->d_inode->i_mapping; 2492 mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
2486 2493
2487 /* 2494 /*
@@ -2490,7 +2497,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
2490 * __unmap_hugepage_range() is called as the lock is already held 2497 * __unmap_hugepage_range() is called as the lock is already held
2491 */ 2498 */
2492 mutex_lock(&mapping->i_mmap_mutex); 2499 mutex_lock(&mapping->i_mmap_mutex);
2493 vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 2500 vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) {
2494 /* Do not unmap the current VMA */ 2501 /* Do not unmap the current VMA */
2495 if (iter_vma == vma) 2502 if (iter_vma == vma)
2496 continue; 2503 continue;
@@ -2525,6 +2532,8 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
2525 struct page *old_page, *new_page; 2532 struct page *old_page, *new_page;
2526 int avoidcopy; 2533 int avoidcopy;
2527 int outside_reserve = 0; 2534 int outside_reserve = 0;
2535 unsigned long mmun_start; /* For mmu_notifiers */
2536 unsigned long mmun_end; /* For mmu_notifiers */
2528 2537
2529 old_page = pte_page(pte); 2538 old_page = pte_page(pte);
2530 2539
@@ -2611,6 +2620,9 @@ retry_avoidcopy:
2611 pages_per_huge_page(h)); 2620 pages_per_huge_page(h));
2612 __SetPageUptodate(new_page); 2621 __SetPageUptodate(new_page);
2613 2622
2623 mmun_start = address & huge_page_mask(h);
2624 mmun_end = mmun_start + huge_page_size(h);
2625 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2614 /* 2626 /*
2615 * Retake the page_table_lock to check for racing updates 2627 * Retake the page_table_lock to check for racing updates
2616 * before the page tables are altered 2628 * before the page tables are altered
@@ -2619,9 +2631,6 @@ retry_avoidcopy:
2619 ptep = huge_pte_offset(mm, address & huge_page_mask(h)); 2631 ptep = huge_pte_offset(mm, address & huge_page_mask(h));
2620 if (likely(pte_same(huge_ptep_get(ptep), pte))) { 2632 if (likely(pte_same(huge_ptep_get(ptep), pte))) {
2621 /* Break COW */ 2633 /* Break COW */
2622 mmu_notifier_invalidate_range_start(mm,
2623 address & huge_page_mask(h),
2624 (address & huge_page_mask(h)) + huge_page_size(h));
2625 huge_ptep_clear_flush(vma, address, ptep); 2634 huge_ptep_clear_flush(vma, address, ptep);
2626 set_huge_pte_at(mm, address, ptep, 2635 set_huge_pte_at(mm, address, ptep,
2627 make_huge_pte(vma, new_page, 1)); 2636 make_huge_pte(vma, new_page, 1));
@@ -2629,10 +2638,11 @@ retry_avoidcopy:
2629 hugepage_add_new_anon_rmap(new_page, vma, address); 2638 hugepage_add_new_anon_rmap(new_page, vma, address);
2630 /* Make the old page be freed below */ 2639 /* Make the old page be freed below */
2631 new_page = old_page; 2640 new_page = old_page;
2632 mmu_notifier_invalidate_range_end(mm,
2633 address & huge_page_mask(h),
2634 (address & huge_page_mask(h)) + huge_page_size(h));
2635 } 2641 }
2642 spin_unlock(&mm->page_table_lock);
2643 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2644 /* Caller expects lock to be held */
2645 spin_lock(&mm->page_table_lock);
2636 page_cache_release(new_page); 2646 page_cache_release(new_page);
2637 page_cache_release(old_page); 2647 page_cache_release(old_page);
2638 return 0; 2648 return 0;
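Two things change in hugetlb.c: the mmu_notifier calls in __unmap_hugepage_range() and hugetlb_cow() are hoisted out of the page-table lock using the same mmun_start/mmun_end convention as above, and unmap_ref_private() stops using vma_hugecache_offset(), computing the offset directly in PAGE_SIZE units so that it matches vm_pgoff and the vma_interval_tree indexing of mapping->i_mmap (vma_hugecache_offset() yields huge-page units, as the surrounding comment notes). A trivial standalone check of that arithmetic, with all values hypothetical:

/* pgoff_demo.c: the offset computation added to unmap_ref_private(). */
#include <stdio.h>

#define PAGE_SHIFT 12				/* assumption: 4 KiB base pages */

int main(void)
{
	unsigned long vm_start = 0x7f0000400000UL;	/* start of the mapping */
	unsigned long vm_pgoff = 0x200;			/* file page backing vm_start */
	unsigned long address  = 0x7f0000600000UL;	/* faulting address, 2 MiB in */

	unsigned long pgoff = ((address - vm_start) >> PAGE_SHIFT) + vm_pgoff;

	printf("pgoff = %#lx\n", pgoff);	/* 0x200 pages in + 0x200 = 0x400 */
	return 0;
}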
diff --git a/mm/internal.h b/mm/internal.h
index b8c91b342e24..a4fa284f6bc2 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -118,26 +118,27 @@ struct compact_control {
118 unsigned long nr_freepages; /* Number of isolated free pages */ 118 unsigned long nr_freepages; /* Number of isolated free pages */
119 unsigned long nr_migratepages; /* Number of pages to migrate */ 119 unsigned long nr_migratepages; /* Number of pages to migrate */
120 unsigned long free_pfn; /* isolate_freepages search base */ 120 unsigned long free_pfn; /* isolate_freepages search base */
121 unsigned long start_free_pfn; /* where we started the search */
122 unsigned long migrate_pfn; /* isolate_migratepages search base */ 121 unsigned long migrate_pfn; /* isolate_migratepages search base */
123 bool sync; /* Synchronous migration */ 122 bool sync; /* Synchronous migration */
124 bool wrapped; /* Order > 0 compactions are 123 bool ignore_skip_hint; /* Scan blocks even if marked skip */
125 incremental, once free_pfn 124 bool finished_update_free; /* True when the zone cached pfns are
126 and migrate_pfn meet, we restart 125 * no longer being updated
127 from the top of the zone; 126 */
128 remember we wrapped around. */ 127 bool finished_update_migrate;
129 128
130 int order; /* order a direct compactor needs */ 129 int order; /* order a direct compactor needs */
131 int migratetype; /* MOVABLE, RECLAIMABLE etc */ 130 int migratetype; /* MOVABLE, RECLAIMABLE etc */
132 struct zone *zone; 131 struct zone *zone;
133 bool *contended; /* True if a lock was contended */ 132 bool contended; /* True if a lock was contended */
133 struct page **page; /* Page captured of requested size */
134}; 134};
135 135
136unsigned long 136unsigned long
137isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn); 137isolate_freepages_range(struct compact_control *cc,
138 unsigned long start_pfn, unsigned long end_pfn);
138unsigned long 139unsigned long
139isolate_migratepages_range(struct zone *zone, struct compact_control *cc, 140isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
140 unsigned long low_pfn, unsigned long end_pfn); 141 unsigned long low_pfn, unsigned long end_pfn, bool unevictable);
141 142
142#endif 143#endif
143 144
@@ -167,9 +168,8 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
167} 168}
168 169
169/* 170/*
170 * Called only in fault path via page_evictable() for a new page 171 * Called only in fault path, to determine if a new page is being
171 * to determine if it's being mapped into a LOCKED vma. 172 * mapped into a LOCKED vma. If it is, mark page as mlocked.
172 * If so, mark page as mlocked.
173 */ 173 */
174static inline int mlocked_vma_newpage(struct vm_area_struct *vma, 174static inline int mlocked_vma_newpage(struct vm_area_struct *vma,
175 struct page *page) 175 struct page *page)
@@ -180,7 +180,8 @@ static inline int mlocked_vma_newpage(struct vm_area_struct *vma,
180 return 0; 180 return 0;
181 181
182 if (!TestSetPageMlocked(page)) { 182 if (!TestSetPageMlocked(page)) {
183 inc_zone_page_state(page, NR_MLOCK); 183 mod_zone_page_state(page_zone(page), NR_MLOCK,
184 hpage_nr_pages(page));
184 count_vm_event(UNEVICTABLE_PGMLOCKED); 185 count_vm_event(UNEVICTABLE_PGMLOCKED);
185 } 186 }
186 return 1; 187 return 1;
@@ -201,12 +202,7 @@ extern void munlock_vma_page(struct page *page);
201 * If called for a page that is still mapped by mlocked vmas, all we do 202 * If called for a page that is still mapped by mlocked vmas, all we do
202 * is revert to lazy LRU behaviour -- semantics are not broken. 203 * is revert to lazy LRU behaviour -- semantics are not broken.
203 */ 204 */
204extern void __clear_page_mlock(struct page *page); 205extern void clear_page_mlock(struct page *page);
205static inline void clear_page_mlock(struct page *page)
206{
207 if (unlikely(TestClearPageMlocked(page)))
208 __clear_page_mlock(page);
209}
210 206
211/* 207/*
212 * mlock_migrate_page - called only from migrate_page_copy() to 208 * mlock_migrate_page - called only from migrate_page_copy() to
@@ -340,7 +336,6 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
340#define ZONE_RECLAIM_FULL -1 336#define ZONE_RECLAIM_FULL -1
341#define ZONE_RECLAIM_SOME 0 337#define ZONE_RECLAIM_SOME 0
342#define ZONE_RECLAIM_SUCCESS 1 338#define ZONE_RECLAIM_SUCCESS 1
343#endif
344 339
345extern int hwpoison_filter(struct page *p); 340extern int hwpoison_filter(struct page *p);
346 341
@@ -356,3 +351,20 @@ extern unsigned long vm_mmap_pgoff(struct file *, unsigned long,
356 unsigned long, unsigned long); 351 unsigned long, unsigned long);
357 352
358extern void set_pageblock_order(void); 353extern void set_pageblock_order(void);
354unsigned long reclaim_clean_pages_from_list(struct zone *zone,
355 struct list_head *page_list);
356/* The ALLOC_WMARK bits are used as an index to zone->watermark */
357#define ALLOC_WMARK_MIN WMARK_MIN
358#define ALLOC_WMARK_LOW WMARK_LOW
359#define ALLOC_WMARK_HIGH WMARK_HIGH
360#define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */
361
362/* Mask to get the watermark bits */
363#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1)
364
365#define ALLOC_HARDER 0x10 /* try to alloc harder */
366#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
367#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
368#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */
369
370#endif /* __MM_INTERNAL_H */
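mm/internal.h also gains the ALLOC_* allocator flags so that code outside page_alloc.c (the compaction changes elsewhere in this series) can pass them around. The encoding is worth spelling out: the two low bits are an index into zone->watermark, hence ALLOC_WMARK_MASK being ALLOC_NO_WATERMARKS - 1, and everything from 0x04 upward is an independent modifier bit. A small standalone check of that encoding; the WMARK_* values are assumed to be the usual 0/1/2 from include/linux/mmzone.h:

/* alloc_flags_demo.c: how the ALLOC_* bits above combine. */
#include <stdio.h>

/* Assumption: WMARK_MIN/LOW/HIGH are 0, 1, 2. */
#define WMARK_MIN		0
#define WMARK_LOW		1
#define WMARK_HIGH		2

/* Values copied from the hunk above. */
#define ALLOC_WMARK_MIN		WMARK_MIN
#define ALLOC_WMARK_LOW		WMARK_LOW
#define ALLOC_WMARK_HIGH	WMARK_HIGH
#define ALLOC_NO_WATERMARKS	0x04
#define ALLOC_WMARK_MASK	(ALLOC_NO_WATERMARKS-1)
#define ALLOC_HARDER		0x10
#define ALLOC_HIGH		0x20
#define ALLOC_CPUSET		0x40
#define ALLOC_CMA		0x80

int main(void)
{
	int flags = ALLOC_WMARK_LOW | ALLOC_CPUSET | ALLOC_CMA;

	printf("watermark index: %d\n", flags & ALLOC_WMARK_MASK);	  /* 1 */
	printf("check cpuset:    %d\n", !!(flags & ALLOC_CPUSET));	  /* 1 */
	printf("ignore wmarks:   %d\n", !!(flags & ALLOC_NO_WATERMARKS)); /* 0 */
	return 0;
}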
diff --git a/mm/interval_tree.c b/mm/interval_tree.c
new file mode 100644
index 000000000000..4a5822a586e6
--- /dev/null
+++ b/mm/interval_tree.c
@@ -0,0 +1,112 @@
1/*
2 * mm/interval_tree.c - interval tree for mapping->i_mmap
3 *
4 * Copyright (C) 2012, Michel Lespinasse <walken@google.com>
5 *
6 * This file is released under the GPL v2.
7 */
8
9#include <linux/mm.h>
10#include <linux/fs.h>
11#include <linux/rmap.h>
12#include <linux/interval_tree_generic.h>
13
14static inline unsigned long vma_start_pgoff(struct vm_area_struct *v)
15{
16 return v->vm_pgoff;
17}
18
19static inline unsigned long vma_last_pgoff(struct vm_area_struct *v)
20{
21 return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1;
22}
23
24INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.linear.rb,
25 unsigned long, shared.linear.rb_subtree_last,
26 vma_start_pgoff, vma_last_pgoff,, vma_interval_tree)
27
28/* Insert node immediately after prev in the interval tree */
29void vma_interval_tree_insert_after(struct vm_area_struct *node,
30 struct vm_area_struct *prev,
31 struct rb_root *root)
32{
33 struct rb_node **link;
34 struct vm_area_struct *parent;
35 unsigned long last = vma_last_pgoff(node);
36
37 VM_BUG_ON(vma_start_pgoff(node) != vma_start_pgoff(prev));
38
39 if (!prev->shared.linear.rb.rb_right) {
40 parent = prev;
41 link = &prev->shared.linear.rb.rb_right;
42 } else {
43 parent = rb_entry(prev->shared.linear.rb.rb_right,
44 struct vm_area_struct, shared.linear.rb);
45 if (parent->shared.linear.rb_subtree_last < last)
46 parent->shared.linear.rb_subtree_last = last;
47 while (parent->shared.linear.rb.rb_left) {
48 parent = rb_entry(parent->shared.linear.rb.rb_left,
49 struct vm_area_struct, shared.linear.rb);
50 if (parent->shared.linear.rb_subtree_last < last)
51 parent->shared.linear.rb_subtree_last = last;
52 }
53 link = &parent->shared.linear.rb.rb_left;
54 }
55
56 node->shared.linear.rb_subtree_last = last;
57 rb_link_node(&node->shared.linear.rb, &parent->shared.linear.rb, link);
58 rb_insert_augmented(&node->shared.linear.rb, root,
59 &vma_interval_tree_augment);
60}
61
62static inline unsigned long avc_start_pgoff(struct anon_vma_chain *avc)
63{
64 return vma_start_pgoff(avc->vma);
65}
66
67static inline unsigned long avc_last_pgoff(struct anon_vma_chain *avc)
68{
69 return vma_last_pgoff(avc->vma);
70}
71
72INTERVAL_TREE_DEFINE(struct anon_vma_chain, rb, unsigned long, rb_subtree_last,
73 avc_start_pgoff, avc_last_pgoff,
74 static inline, __anon_vma_interval_tree)
75
76void anon_vma_interval_tree_insert(struct anon_vma_chain *node,
77 struct rb_root *root)
78{
79#ifdef CONFIG_DEBUG_VM_RB
80 node->cached_vma_start = avc_start_pgoff(node);
81 node->cached_vma_last = avc_last_pgoff(node);
82#endif
83 __anon_vma_interval_tree_insert(node, root);
84}
85
86void anon_vma_interval_tree_remove(struct anon_vma_chain *node,
87 struct rb_root *root)
88{
89 __anon_vma_interval_tree_remove(node, root);
90}
91
92struct anon_vma_chain *
93anon_vma_interval_tree_iter_first(struct rb_root *root,
94 unsigned long first, unsigned long last)
95{
96 return __anon_vma_interval_tree_iter_first(root, first, last);
97}
98
99struct anon_vma_chain *
100anon_vma_interval_tree_iter_next(struct anon_vma_chain *node,
101 unsigned long first, unsigned long last)
102{
103 return __anon_vma_interval_tree_iter_next(node, first, last);
104}
105
106#ifdef CONFIG_DEBUG_VM_RB
107void anon_vma_interval_tree_verify(struct anon_vma_chain *node)
108{
109 WARN_ON_ONCE(node->cached_vma_start != avc_start_pgoff(node));
110 WARN_ON_ONCE(node->cached_vma_last != avc_last_pgoff(node));
111}
112#endif
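The new mm/interval_tree.c instantiates two augmented rbtrees through INTERVAL_TREE_DEFINE: one over vmas in mapping->i_mmap (replacing the prio_tree) and one over anon_vma_chains in anon_vma->rb_root. Both are keyed by file page offsets as a closed interval, which is why vma_last_pgoff() subtracts one. Callers elsewhere in this diff query them with vma_interval_tree_foreach()/anon_vma_interval_tree_foreach() over a [first, last] pgoff range; the overlap condition those iterators select by can be restated in userspace as follows (values are hypothetical):

/* interval_overlap_demo.c: the pgoff interval indexed per vma. */
#include <stdbool.h>
#include <stdio.h>

#define PAGE_SHIFT 12			/* assumption: 4 KiB base pages */

struct vma { unsigned long vm_start, vm_end, vm_pgoff; };

/* same formulas as vma_start_pgoff()/vma_last_pgoff() above;
 * the interval is closed, hence the "- 1" on the last page */
static unsigned long vma_start_pgoff(const struct vma *v)
{
	return v->vm_pgoff;
}

static unsigned long vma_last_pgoff(const struct vma *v)
{
	return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1;
}

/* the stabbing test the interval tree answers efficiently:
 * does this vma map any file page in [first, last]? */
static bool vma_overlaps(const struct vma *v, unsigned long first,
			 unsigned long last)
{
	return vma_start_pgoff(v) <= last && first <= vma_last_pgoff(v);
}

int main(void)
{
	/* hypothetical vma: 16 pages of a file starting at file page 100 */
	struct vma v = { .vm_start = 0x10000000, .vm_end = 0x10010000,
			 .vm_pgoff = 100 };

	printf("last pgoff: %lu\n", vma_last_pgoff(&v));		/* 115 */
	printf("maps page 115? %d\n", vma_overlaps(&v, 115, 115));	/* 1 */
	printf("maps page 116? %d\n", vma_overlaps(&v, 116, 200));	/* 0 */
	return 0;
}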
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 0de83b4541e9..a217cc544060 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -29,7 +29,7 @@
29 * - kmemleak_lock (rwlock): protects the object_list modifications and 29 * - kmemleak_lock (rwlock): protects the object_list modifications and
30 * accesses to the object_tree_root. The object_list is the main list 30 * accesses to the object_tree_root. The object_list is the main list
31 * holding the metadata (struct kmemleak_object) for the allocated memory 31 * holding the metadata (struct kmemleak_object) for the allocated memory
32 * blocks. The object_tree_root is a priority search tree used to look-up 32 * blocks. The object_tree_root is a red black tree used to look-up
33 * metadata based on a pointer to the corresponding memory block. The 33 * metadata based on a pointer to the corresponding memory block. The
34 * kmemleak_object structures are added to the object_list and 34 * kmemleak_object structures are added to the object_list and
35 * object_tree_root in the create_object() function called from the 35 * object_tree_root in the create_object() function called from the
@@ -71,7 +71,7 @@
71#include <linux/delay.h> 71#include <linux/delay.h>
72#include <linux/export.h> 72#include <linux/export.h>
73#include <linux/kthread.h> 73#include <linux/kthread.h>
74#include <linux/prio_tree.h> 74#include <linux/rbtree.h>
75#include <linux/fs.h> 75#include <linux/fs.h>
76#include <linux/debugfs.h> 76#include <linux/debugfs.h>
77#include <linux/seq_file.h> 77#include <linux/seq_file.h>
@@ -132,7 +132,7 @@ struct kmemleak_scan_area {
132 * Structure holding the metadata for each allocated memory block. 132 * Structure holding the metadata for each allocated memory block.
133 * Modifications to such objects should be made while holding the 133 * Modifications to such objects should be made while holding the
134 * object->lock. Insertions or deletions from object_list, gray_list or 134 * object->lock. Insertions or deletions from object_list, gray_list or
135 * tree_node are already protected by the corresponding locks or mutex (see 135 * rb_node are already protected by the corresponding locks or mutex (see
136 * the notes on locking above). These objects are reference-counted 136 * the notes on locking above). These objects are reference-counted
137 * (use_count) and freed using the RCU mechanism. 137 * (use_count) and freed using the RCU mechanism.
138 */ 138 */
@@ -141,7 +141,7 @@ struct kmemleak_object {
141 unsigned long flags; /* object status flags */ 141 unsigned long flags; /* object status flags */
142 struct list_head object_list; 142 struct list_head object_list;
143 struct list_head gray_list; 143 struct list_head gray_list;
144 struct prio_tree_node tree_node; 144 struct rb_node rb_node;
145 struct rcu_head rcu; /* object_list lockless traversal */ 145 struct rcu_head rcu; /* object_list lockless traversal */
146 /* object usage count; object freed when use_count == 0 */ 146 /* object usage count; object freed when use_count == 0 */
147 atomic_t use_count; 147 atomic_t use_count;
@@ -182,9 +182,9 @@ struct kmemleak_object {
182static LIST_HEAD(object_list); 182static LIST_HEAD(object_list);
183/* the list of gray-colored objects (see color_gray comment below) */ 183/* the list of gray-colored objects (see color_gray comment below) */
184static LIST_HEAD(gray_list); 184static LIST_HEAD(gray_list);
185/* prio search tree for object boundaries */ 185/* search tree for object boundaries */
186static struct prio_tree_root object_tree_root; 186static struct rb_root object_tree_root = RB_ROOT;
187/* rw_lock protecting the access to object_list and prio_tree_root */ 187/* rw_lock protecting the access to object_list and object_tree_root */
188static DEFINE_RWLOCK(kmemleak_lock); 188static DEFINE_RWLOCK(kmemleak_lock);
189 189
190/* allocation caches for kmemleak internal data */ 190/* allocation caches for kmemleak internal data */
@@ -380,7 +380,7 @@ static void dump_object_info(struct kmemleak_object *object)
380 trace.entries = object->trace; 380 trace.entries = object->trace;
381 381
382 pr_notice("Object 0x%08lx (size %zu):\n", 382 pr_notice("Object 0x%08lx (size %zu):\n",
383 object->tree_node.start, object->size); 383 object->pointer, object->size);
384 pr_notice(" comm \"%s\", pid %d, jiffies %lu\n", 384 pr_notice(" comm \"%s\", pid %d, jiffies %lu\n",
385 object->comm, object->pid, object->jiffies); 385 object->comm, object->pid, object->jiffies);
386 pr_notice(" min_count = %d\n", object->min_count); 386 pr_notice(" min_count = %d\n", object->min_count);
@@ -392,32 +392,32 @@ static void dump_object_info(struct kmemleak_object *object)
392} 392}
393 393
394/* 394/*
395 * Look-up a memory block metadata (kmemleak_object) in the priority search 395 * Look-up a memory block metadata (kmemleak_object) in the object search
396 * tree based on a pointer value. If alias is 0, only values pointing to the 396 * tree based on a pointer value. If alias is 0, only values pointing to the
397 * beginning of the memory block are allowed. The kmemleak_lock must be held 397 * beginning of the memory block are allowed. The kmemleak_lock must be held
398 * when calling this function. 398 * when calling this function.
399 */ 399 */
400static struct kmemleak_object *lookup_object(unsigned long ptr, int alias) 400static struct kmemleak_object *lookup_object(unsigned long ptr, int alias)
401{ 401{
402 struct prio_tree_node *node; 402 struct rb_node *rb = object_tree_root.rb_node;
403 struct prio_tree_iter iter; 403
404 struct kmemleak_object *object; 404 while (rb) {
405 405 struct kmemleak_object *object =
406 prio_tree_iter_init(&iter, &object_tree_root, ptr, ptr); 406 rb_entry(rb, struct kmemleak_object, rb_node);
407 node = prio_tree_next(&iter); 407 if (ptr < object->pointer)
408 if (node) { 408 rb = object->rb_node.rb_left;
409 object = prio_tree_entry(node, struct kmemleak_object, 409 else if (object->pointer + object->size <= ptr)
410 tree_node); 410 rb = object->rb_node.rb_right;
411 if (!alias && object->pointer != ptr) { 411 else if (object->pointer == ptr || alias)
412 return object;
413 else {
412 kmemleak_warn("Found object by alias at 0x%08lx\n", 414 kmemleak_warn("Found object by alias at 0x%08lx\n",
413 ptr); 415 ptr);
414 dump_object_info(object); 416 dump_object_info(object);
415 object = NULL; 417 break;
416 } 418 }
417 } else 419 }
418 object = NULL; 420 return NULL;
419
420 return object;
421} 421}
422 422
423/* 423/*
@@ -471,7 +471,7 @@ static void put_object(struct kmemleak_object *object)
471} 471}
472 472
473/* 473/*
474 * Look up an object in the prio search tree and increase its use_count. 474 * Look up an object in the object search tree and increase its use_count.
475 */ 475 */
476static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias) 476static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
477{ 477{
@@ -516,8 +516,8 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
516 int min_count, gfp_t gfp) 516 int min_count, gfp_t gfp)
517{ 517{
518 unsigned long flags; 518 unsigned long flags;
519 struct kmemleak_object *object; 519 struct kmemleak_object *object, *parent;
520 struct prio_tree_node *node; 520 struct rb_node **link, *rb_parent;
521 521
522 object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp)); 522 object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp));
523 if (!object) { 523 if (!object) {
@@ -560,31 +560,34 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
560 /* kernel backtrace */ 560 /* kernel backtrace */
561 object->trace_len = __save_stack_trace(object->trace); 561 object->trace_len = __save_stack_trace(object->trace);
562 562
563 INIT_PRIO_TREE_NODE(&object->tree_node);
564 object->tree_node.start = ptr;
565 object->tree_node.last = ptr + size - 1;
566
567 write_lock_irqsave(&kmemleak_lock, flags); 563 write_lock_irqsave(&kmemleak_lock, flags);
568 564
569 min_addr = min(min_addr, ptr); 565 min_addr = min(min_addr, ptr);
570 max_addr = max(max_addr, ptr + size); 566 max_addr = max(max_addr, ptr + size);
571 node = prio_tree_insert(&object_tree_root, &object->tree_node); 567 link = &object_tree_root.rb_node;
572 /* 568 rb_parent = NULL;
573 * The code calling the kernel does not yet have the pointer to the 569 while (*link) {
574 * memory block to be able to free it. However, we still hold the 570 rb_parent = *link;
575 * kmemleak_lock here in case parts of the kernel started freeing 571 parent = rb_entry(rb_parent, struct kmemleak_object, rb_node);
576 * random memory blocks. 572 if (ptr + size <= parent->pointer)
577 */ 573 link = &parent->rb_node.rb_left;
578 if (node != &object->tree_node) { 574 else if (parent->pointer + parent->size <= ptr)
579 kmemleak_stop("Cannot insert 0x%lx into the object search tree " 575 link = &parent->rb_node.rb_right;
580 "(already existing)\n", ptr); 576 else {
581 object = lookup_object(ptr, 1); 577 kmemleak_stop("Cannot insert 0x%lx into the object "
582 spin_lock(&object->lock); 578 "search tree (overlaps existing)\n",
583 dump_object_info(object); 579 ptr);
584 spin_unlock(&object->lock); 580 kmem_cache_free(object_cache, object);
585 581 object = parent;
586 goto out; 582 spin_lock(&object->lock);
583 dump_object_info(object);
584 spin_unlock(&object->lock);
585 goto out;
586 }
587 } 587 }
588 rb_link_node(&object->rb_node, rb_parent, link);
589 rb_insert_color(&object->rb_node, &object_tree_root);
590
588 list_add_tail_rcu(&object->object_list, &object_list); 591 list_add_tail_rcu(&object->object_list, &object_list);
589out: 592out:
590 write_unlock_irqrestore(&kmemleak_lock, flags); 593 write_unlock_irqrestore(&kmemleak_lock, flags);
@@ -600,7 +603,7 @@ static void __delete_object(struct kmemleak_object *object)
600 unsigned long flags; 603 unsigned long flags;
601 604
602 write_lock_irqsave(&kmemleak_lock, flags); 605 write_lock_irqsave(&kmemleak_lock, flags);
603 prio_tree_remove(&object_tree_root, &object->tree_node); 606 rb_erase(&object->rb_node, &object_tree_root);
604 list_del_rcu(&object->object_list); 607 list_del_rcu(&object->object_list);
605 write_unlock_irqrestore(&kmemleak_lock, flags); 608 write_unlock_irqrestore(&kmemleak_lock, flags);
606 609
@@ -1766,7 +1769,6 @@ void __init kmemleak_init(void)
1766 1769
1767 object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE); 1770 object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE);
1768 scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE); 1771 scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE);
1769 INIT_PRIO_TREE_ROOT(&object_tree_root);
1770 1772
1771 if (crt_early_log >= ARRAY_SIZE(early_log)) 1773 if (crt_early_log >= ARRAY_SIZE(early_log))
1772 pr_warning("Early log buffer exceeded (%d), please increase " 1774 pr_warning("Early log buffer exceeded (%d), please increase "
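The kmemleak hunks above replace the prio_tree with a plain rbtree keyed on object->pointer: lookup walks left or right until ptr falls inside [pointer, pointer + size), and insertion refuses any range that overlaps an existing object. Below is a minimal userspace sketch of that comparison logic, using an invented struct range and an unbalanced binary tree instead of lib/rbtree and kmemleak_object, so it only illustrates the ordering rules, not the kernel code.

/*
 * Non-overlapping [start, start + size) ranges kept in a binary search
 * tree ordered by start address, mirroring the new lookup_object() and
 * create_object() comparisons.
 */
#include <stdio.h>
#include <stdlib.h>

struct range {
	unsigned long start;
	unsigned long size;
	struct range *left, *right;
};

/* Mirrors lookup_object(): walk down comparing ptr against each range. */
static struct range *range_lookup(struct range *root, unsigned long ptr, int alias)
{
	while (root) {
		if (ptr < root->start)
			root = root->left;
		else if (root->start + root->size <= ptr)
			root = root->right;
		else if (ptr == root->start || alias)
			return root;		/* hit (exact or aliased) */
		else
			return NULL;		/* inside a range, but not its start */
	}
	return NULL;
}

/* Mirrors the create_object() insertion: refuse any overlapping range. */
static int range_insert(struct range **link, struct range *new)
{
	while (*link) {
		struct range *parent = *link;

		if (new->start + new->size <= parent->start)
			link = &parent->left;
		else if (parent->start + parent->size <= new->start)
			link = &parent->right;
		else
			return -1;		/* overlaps an existing range */
	}
	*link = new;
	return 0;
}

int main(void)
{
	struct range a = { 0x1000, 0x100 }, b = { 0x2000, 0x80 }, *root = NULL;

	range_insert(&root, &a);
	range_insert(&root, &b);
	printf("0x1010 -> %p (alias allowed)\n", (void *)range_lookup(root, 0x1010, 1));
	printf("0x1010 -> %p (exact only)\n", (void *)range_lookup(root, 0x1010, 0));
	return 0;
}

Because the stored ranges never overlap, ordering by start address alone keeps both walks unambiguous, which is why no augmented interval tree is needed for the kmemleak object tree.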
diff --git a/mm/ksm.c b/mm/ksm.c
index 47c885368890..ae539f0b8aa1 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -709,15 +709,22 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
709 spinlock_t *ptl; 709 spinlock_t *ptl;
710 int swapped; 710 int swapped;
711 int err = -EFAULT; 711 int err = -EFAULT;
712 unsigned long mmun_start; /* For mmu_notifiers */
713 unsigned long mmun_end; /* For mmu_notifiers */
712 714
713 addr = page_address_in_vma(page, vma); 715 addr = page_address_in_vma(page, vma);
714 if (addr == -EFAULT) 716 if (addr == -EFAULT)
715 goto out; 717 goto out;
716 718
717 BUG_ON(PageTransCompound(page)); 719 BUG_ON(PageTransCompound(page));
720
721 mmun_start = addr;
722 mmun_end = addr + PAGE_SIZE;
723 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
724
718 ptep = page_check_address(page, mm, addr, &ptl, 0); 725 ptep = page_check_address(page, mm, addr, &ptl, 0);
719 if (!ptep) 726 if (!ptep)
720 goto out; 727 goto out_mn;
721 728
722 if (pte_write(*ptep) || pte_dirty(*ptep)) { 729 if (pte_write(*ptep) || pte_dirty(*ptep)) {
723 pte_t entry; 730 pte_t entry;
@@ -752,6 +759,8 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
752 759
753out_unlock: 760out_unlock:
754 pte_unmap_unlock(ptep, ptl); 761 pte_unmap_unlock(ptep, ptl);
762out_mn:
763 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
755out: 764out:
756 return err; 765 return err;
757} 766}
@@ -776,6 +785,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
776 spinlock_t *ptl; 785 spinlock_t *ptl;
777 unsigned long addr; 786 unsigned long addr;
778 int err = -EFAULT; 787 int err = -EFAULT;
788 unsigned long mmun_start; /* For mmu_notifiers */
789 unsigned long mmun_end; /* For mmu_notifiers */
779 790
780 addr = page_address_in_vma(page, vma); 791 addr = page_address_in_vma(page, vma);
781 if (addr == -EFAULT) 792 if (addr == -EFAULT)
@@ -794,10 +805,14 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
794 if (!pmd_present(*pmd)) 805 if (!pmd_present(*pmd))
795 goto out; 806 goto out;
796 807
808 mmun_start = addr;
809 mmun_end = addr + PAGE_SIZE;
810 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
811
797 ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); 812 ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
798 if (!pte_same(*ptep, orig_pte)) { 813 if (!pte_same(*ptep, orig_pte)) {
799 pte_unmap_unlock(ptep, ptl); 814 pte_unmap_unlock(ptep, ptl);
800 goto out; 815 goto out_mn;
801 } 816 }
802 817
803 get_page(kpage); 818 get_page(kpage);
@@ -814,6 +829,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
814 829
815 pte_unmap_unlock(ptep, ptl); 830 pte_unmap_unlock(ptep, ptl);
816 err = 0; 831 err = 0;
832out_mn:
833 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
817out: 834out:
818 return err; 835 return err;
819} 836}
@@ -1469,10 +1486,14 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
1469 */ 1486 */
1470 if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | 1487 if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE |
1471 VM_PFNMAP | VM_IO | VM_DONTEXPAND | 1488 VM_PFNMAP | VM_IO | VM_DONTEXPAND |
1472 VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | 1489 VM_HUGETLB | VM_NONLINEAR | VM_MIXEDMAP))
1473 VM_NONLINEAR | VM_MIXEDMAP | VM_SAO))
1474 return 0; /* just ignore the advice */ 1490 return 0; /* just ignore the advice */
1475 1491
1492#ifdef VM_SAO
1493 if (*vm_flags & VM_SAO)
1494 return 0;
1495#endif
1496
1476 if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { 1497 if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
1477 err = __ksm_enter(mm); 1498 err = __ksm_enter(mm);
1478 if (err) 1499 if (err)
@@ -1582,7 +1603,7 @@ struct page *ksm_does_need_to_copy(struct page *page,
1582 SetPageSwapBacked(new_page); 1603 SetPageSwapBacked(new_page);
1583 __set_page_locked(new_page); 1604 __set_page_locked(new_page);
1584 1605
1585 if (page_evictable(new_page, vma)) 1606 if (!mlocked_vma_newpage(vma, new_page))
1586 lru_cache_add_lru(new_page, LRU_ACTIVE_ANON); 1607 lru_cache_add_lru(new_page, LRU_ACTIVE_ANON);
1587 else 1608 else
1588 add_page_to_unevictable_list(new_page); 1609 add_page_to_unevictable_list(new_page);
@@ -1614,7 +1635,8 @@ again:
1614 struct vm_area_struct *vma; 1635 struct vm_area_struct *vma;
1615 1636
1616 anon_vma_lock(anon_vma); 1637 anon_vma_lock(anon_vma);
1617 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { 1638 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
1639 0, ULONG_MAX) {
1618 vma = vmac->vma; 1640 vma = vmac->vma;
1619 if (rmap_item->address < vma->vm_start || 1641 if (rmap_item->address < vma->vm_start ||
1620 rmap_item->address >= vma->vm_end) 1642 rmap_item->address >= vma->vm_end)
@@ -1667,7 +1689,8 @@ again:
1667 struct vm_area_struct *vma; 1689 struct vm_area_struct *vma;
1668 1690
1669 anon_vma_lock(anon_vma); 1691 anon_vma_lock(anon_vma);
1670 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { 1692 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
1693 0, ULONG_MAX) {
1671 vma = vmac->vma; 1694 vma = vmac->vma;
1672 if (rmap_item->address < vma->vm_start || 1695 if (rmap_item->address < vma->vm_start ||
1673 rmap_item->address >= vma->vm_end) 1696 rmap_item->address >= vma->vm_end)
@@ -1719,7 +1742,8 @@ again:
1719 struct vm_area_struct *vma; 1742 struct vm_area_struct *vma;
1720 1743
1721 anon_vma_lock(anon_vma); 1744 anon_vma_lock(anon_vma);
1722 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { 1745 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
1746 0, ULONG_MAX) {
1723 vma = vmac->vma; 1747 vma = vmac->vma;
1724 if (rmap_item->address < vma->vm_start || 1748 if (rmap_item->address < vma->vm_start ||
1725 rmap_item->address >= vma->vm_end) 1749 rmap_item->address >= vma->vm_end)
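The ksm changes above wrap each page-table update in an mmu_notifier_invalidate_range_start()/_range_end() pair, and every early exit taken after the start call is redirected through a new out_mn label so the end call always runs. The sketch below mirrors only that goto/label structure, with stubbed notifier functions and a fake pte lookup; none of it is the kernel API.

#include <stdio.h>

static void invalidate_range_start(unsigned long s, unsigned long e)
{
	printf("start [%#lx, %#lx)\n", s, e);
}

static void invalidate_range_end(unsigned long s, unsigned long e)
{
	printf("end   [%#lx, %#lx)\n", s, e);
}

/* Stand-in for page_check_address(): pretend the pte is sometimes missing. */
static int lookup_pte(unsigned long addr)
{
	return (addr & 1) == 0;
}

static int update_one_page(unsigned long addr, unsigned long page_size)
{
	unsigned long mmun_start = addr;
	unsigned long mmun_end = addr + page_size;
	int err = -1;

	invalidate_range_start(mmun_start, mmun_end);

	if (!lookup_pte(addr))
		goto out_mn;		/* bail out, but still run _range_end() */

	/* ... modify the mapping under the pte lock here ... */
	err = 0;

out_mn:
	invalidate_range_end(mmun_start, mmun_end);
	return err;
}

int main(void)
{
	update_one_page(0x1000, 0x1000);	/* success path */
	update_one_page(0x1001, 0x1000);	/* early-exit path */
	return 0;
}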
diff --git a/mm/madvise.c b/mm/madvise.c
index 14d260fa0d17..03dfa5c7adb3 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -69,10 +69,14 @@ static long madvise_behavior(struct vm_area_struct * vma,
69 new_flags &= ~VM_DONTCOPY; 69 new_flags &= ~VM_DONTCOPY;
70 break; 70 break;
71 case MADV_DONTDUMP: 71 case MADV_DONTDUMP:
72 new_flags |= VM_NODUMP; 72 new_flags |= VM_DONTDUMP;
73 break; 73 break;
74 case MADV_DODUMP: 74 case MADV_DODUMP:
75 new_flags &= ~VM_NODUMP; 75 if (new_flags & VM_SPECIAL) {
76 error = -EINVAL;
77 goto out;
78 }
79 new_flags &= ~VM_DONTDUMP;
76 break; 80 break;
77 case MADV_MERGEABLE: 81 case MADV_MERGEABLE:
78 case MADV_UNMERGEABLE: 82 case MADV_UNMERGEABLE:
diff --git a/mm/memblock.c b/mm/memblock.c
index 82aa349d2f7a..931eef145af5 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -41,7 +41,8 @@ static int memblock_memory_in_slab __initdata_memblock = 0;
41static int memblock_reserved_in_slab __initdata_memblock = 0; 41static int memblock_reserved_in_slab __initdata_memblock = 0;
42 42
43/* inline so we don't get a warning when pr_debug is compiled out */ 43/* inline so we don't get a warning when pr_debug is compiled out */
44static inline const char *memblock_type_name(struct memblock_type *type) 44static __init_memblock const char *
45memblock_type_name(struct memblock_type *type)
45{ 46{
46 if (type == &memblock.memory) 47 if (type == &memblock.memory)
47 return "memory"; 48 return "memory";
@@ -756,7 +757,7 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
756 return ret; 757 return ret;
757 758
758 for (i = start_rgn; i < end_rgn; i++) 759 for (i = start_rgn; i < end_rgn; i++)
759 type->regions[i].nid = nid; 760 memblock_set_region_node(&type->regions[i], nid);
760 761
761 memblock_merge_regions(type); 762 memblock_merge_regions(type);
762 return 0; 763 return 0;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a72f2ffdc3d0..7acf43bf04a2 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -51,6 +51,7 @@
51#include <linux/oom.h> 51#include <linux/oom.h>
52#include "internal.h" 52#include "internal.h"
53#include <net/sock.h> 53#include <net/sock.h>
54#include <net/ip.h>
54#include <net/tcp_memcontrol.h> 55#include <net/tcp_memcontrol.h>
55 56
56#include <asm/uaccess.h> 57#include <asm/uaccess.h>
@@ -326,7 +327,7 @@ struct mem_cgroup {
326 struct mem_cgroup_stat_cpu nocpu_base; 327 struct mem_cgroup_stat_cpu nocpu_base;
327 spinlock_t pcp_counter_lock; 328 spinlock_t pcp_counter_lock;
328 329
329#ifdef CONFIG_INET 330#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
330 struct tcp_memcontrol tcp_mem; 331 struct tcp_memcontrol tcp_mem;
331#endif 332#endif
332}; 333};
@@ -411,12 +412,14 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
411 return container_of(s, struct mem_cgroup, css); 412 return container_of(s, struct mem_cgroup, css);
412} 413}
413 414
415static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
416{
417 return (memcg == root_mem_cgroup);
418}
419
414/* Writing them here to avoid exposing memcg's inner layout */ 420/* Writing them here to avoid exposing memcg's inner layout */
415#ifdef CONFIG_MEMCG_KMEM 421#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
416#include <net/sock.h>
417#include <net/ip.h>
418 422
419static bool mem_cgroup_is_root(struct mem_cgroup *memcg);
420void sock_update_memcg(struct sock *sk) 423void sock_update_memcg(struct sock *sk)
421{ 424{
422 if (mem_cgroup_sockets_enabled) { 425 if (mem_cgroup_sockets_enabled) {
@@ -461,7 +464,6 @@ void sock_release_memcg(struct sock *sk)
461 } 464 }
462} 465}
463 466
464#ifdef CONFIG_INET
465struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) 467struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
466{ 468{
467 if (!memcg || mem_cgroup_is_root(memcg)) 469 if (!memcg || mem_cgroup_is_root(memcg))
@@ -470,10 +472,7 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
470 return &memcg->tcp_mem.cg_proto; 472 return &memcg->tcp_mem.cg_proto;
471} 473}
472EXPORT_SYMBOL(tcp_proto_cgroup); 474EXPORT_SYMBOL(tcp_proto_cgroup);
473#endif /* CONFIG_INET */
474#endif /* CONFIG_MEMCG_KMEM */
475 475
476#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
477static void disarm_sock_keys(struct mem_cgroup *memcg) 476static void disarm_sock_keys(struct mem_cgroup *memcg)
478{ 477{
479 if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto)) 478 if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto))
@@ -1016,11 +1015,6 @@ void mem_cgroup_iter_break(struct mem_cgroup *root,
1016 iter != NULL; \ 1015 iter != NULL; \
1017 iter = mem_cgroup_iter(NULL, iter, NULL)) 1016 iter = mem_cgroup_iter(NULL, iter, NULL))
1018 1017
1019static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
1020{
1021 return (memcg == root_mem_cgroup);
1022}
1023
1024void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) 1018void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
1025{ 1019{
1026 struct mem_cgroup *memcg; 1020 struct mem_cgroup *memcg;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index a6e2141a6610..6c5899b9034a 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -400,18 +400,21 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
400 struct vm_area_struct *vma; 400 struct vm_area_struct *vma;
401 struct task_struct *tsk; 401 struct task_struct *tsk;
402 struct anon_vma *av; 402 struct anon_vma *av;
403 pgoff_t pgoff;
403 404
404 av = page_lock_anon_vma(page); 405 av = page_lock_anon_vma(page);
405 if (av == NULL) /* Not actually mapped anymore */ 406 if (av == NULL) /* Not actually mapped anymore */
406 return; 407 return;
407 408
409 pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
408 read_lock(&tasklist_lock); 410 read_lock(&tasklist_lock);
409 for_each_process (tsk) { 411 for_each_process (tsk) {
410 struct anon_vma_chain *vmac; 412 struct anon_vma_chain *vmac;
411 413
412 if (!task_early_kill(tsk)) 414 if (!task_early_kill(tsk))
413 continue; 415 continue;
414 list_for_each_entry(vmac, &av->head, same_anon_vma) { 416 anon_vma_interval_tree_foreach(vmac, &av->rb_root,
417 pgoff, pgoff) {
415 vma = vmac->vma; 418 vma = vmac->vma;
416 if (!page_mapped_in_vma(page, vma)) 419 if (!page_mapped_in_vma(page, vma))
417 continue; 420 continue;
@@ -431,7 +434,6 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
431{ 434{
432 struct vm_area_struct *vma; 435 struct vm_area_struct *vma;
433 struct task_struct *tsk; 436 struct task_struct *tsk;
434 struct prio_tree_iter iter;
435 struct address_space *mapping = page->mapping; 437 struct address_space *mapping = page->mapping;
436 438
437 mutex_lock(&mapping->i_mmap_mutex); 439 mutex_lock(&mapping->i_mmap_mutex);
@@ -442,7 +444,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
442 if (!task_early_kill(tsk)) 444 if (!task_early_kill(tsk))
443 continue; 445 continue;
444 446
445 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, 447 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
446 pgoff) { 448 pgoff) {
447 /* 449 /*
448 * Send early kill signal to tasks where a vma covers 450 * Send early kill signal to tasks where a vma covers
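collect_procs_anon() above now computes the poisoned page's file offset once and lets anon_vma_interval_tree_foreach(..., pgoff, pgoff) hand back only the vmas whose range covers it, instead of scanning the whole same_anon_vma list. The userspace sketch below spells out the covering test with an invented struct fake_vma; the kernel gets the same filtering from the interval tree query.

#include <stdio.h>

#define PAGE_SHIFT	12

struct fake_vma {
	unsigned long vm_start;		/* virtual start address */
	unsigned long vm_end;		/* virtual end address (exclusive) */
	unsigned long vm_pgoff;		/* file offset of vm_start, in pages */
};

/* Does this vma map the page at file offset pgoff? */
static int vma_covers_pgoff(const struct fake_vma *vma, unsigned long pgoff)
{
	unsigned long pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;

	return pgoff >= vma->vm_pgoff && pgoff < vma->vm_pgoff + pages;
}

int main(void)
{
	struct fake_vma vmas[] = {
		{ 0x400000, 0x401000, 10 },	/* maps file pages [10, 11) */
		{ 0x600000, 0x610000, 32 },	/* maps file pages [32, 48) */
	};
	unsigned long page_index = 40;		/* page->index of the bad page */
	unsigned long pgoff = page_index;	/* identical when page size == cache size */

	for (unsigned i = 0; i < sizeof(vmas) / sizeof(vmas[0]); i++)
		if (vma_covers_pgoff(&vmas[i], pgoff))
			printf("vma %u maps the poisoned page\n", i);
	return 0;
}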
diff --git a/mm/memory.c b/mm/memory.c
index 57361708d1a5..fb135ba4aba9 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -712,7 +712,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
712 add_taint(TAINT_BAD_PAGE); 712 add_taint(TAINT_BAD_PAGE);
713} 713}
714 714
715static inline int is_cow_mapping(vm_flags_t flags) 715static inline bool is_cow_mapping(vm_flags_t flags)
716{ 716{
717 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 717 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
718} 718}
@@ -1039,6 +1039,9 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1039 unsigned long next; 1039 unsigned long next;
1040 unsigned long addr = vma->vm_start; 1040 unsigned long addr = vma->vm_start;
1041 unsigned long end = vma->vm_end; 1041 unsigned long end = vma->vm_end;
1042 unsigned long mmun_start; /* For mmu_notifiers */
1043 unsigned long mmun_end; /* For mmu_notifiers */
1044 bool is_cow;
1042 int ret; 1045 int ret;
1043 1046
1044 /* 1047 /*
@@ -1047,7 +1050,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1047 * readonly mappings. The tradeoff is that copy_page_range is more 1050 * readonly mappings. The tradeoff is that copy_page_range is more
1048 * efficient than faulting. 1051 * efficient than faulting.
1049 */ 1052 */
1050 if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) { 1053 if (!(vma->vm_flags & (VM_HUGETLB | VM_NONLINEAR |
1054 VM_PFNMAP | VM_MIXEDMAP))) {
1051 if (!vma->anon_vma) 1055 if (!vma->anon_vma)
1052 return 0; 1056 return 0;
1053 } 1057 }
@@ -1055,12 +1059,12 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1055 if (is_vm_hugetlb_page(vma)) 1059 if (is_vm_hugetlb_page(vma))
1056 return copy_hugetlb_page_range(dst_mm, src_mm, vma); 1060 return copy_hugetlb_page_range(dst_mm, src_mm, vma);
1057 1061
1058 if (unlikely(is_pfn_mapping(vma))) { 1062 if (unlikely(vma->vm_flags & VM_PFNMAP)) {
1059 /* 1063 /*
1060 * We do not free on error cases below as remove_vma 1064 * We do not free on error cases below as remove_vma
1061 * gets called on error from higher level routine 1065 * gets called on error from higher level routine
1062 */ 1066 */
1063 ret = track_pfn_vma_copy(vma); 1067 ret = track_pfn_copy(vma);
1064 if (ret) 1068 if (ret)
1065 return ret; 1069 return ret;
1066 } 1070 }
@@ -1071,8 +1075,12 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1071 * parent mm. And a permission downgrade will only happen if 1075 * parent mm. And a permission downgrade will only happen if
1072 * is_cow_mapping() returns true. 1076 * is_cow_mapping() returns true.
1073 */ 1077 */
1074 if (is_cow_mapping(vma->vm_flags)) 1078 is_cow = is_cow_mapping(vma->vm_flags);
1075 mmu_notifier_invalidate_range_start(src_mm, addr, end); 1079 mmun_start = addr;
1080 mmun_end = end;
1081 if (is_cow)
1082 mmu_notifier_invalidate_range_start(src_mm, mmun_start,
1083 mmun_end);
1076 1084
1077 ret = 0; 1085 ret = 0;
1078 dst_pgd = pgd_offset(dst_mm, addr); 1086 dst_pgd = pgd_offset(dst_mm, addr);
@@ -1088,9 +1096,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1088 } 1096 }
1089 } while (dst_pgd++, src_pgd++, addr = next, addr != end); 1097 } while (dst_pgd++, src_pgd++, addr = next, addr != end);
1090 1098
1091 if (is_cow_mapping(vma->vm_flags)) 1099 if (is_cow)
1092 mmu_notifier_invalidate_range_end(src_mm, 1100 mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end);
1093 vma->vm_start, end);
1094 return ret; 1101 return ret;
1095} 1102}
1096 1103
@@ -1327,8 +1334,8 @@ static void unmap_single_vma(struct mmu_gather *tlb,
1327 if (vma->vm_file) 1334 if (vma->vm_file)
1328 uprobe_munmap(vma, start, end); 1335 uprobe_munmap(vma, start, end);
1329 1336
1330 if (unlikely(is_pfn_mapping(vma))) 1337 if (unlikely(vma->vm_flags & VM_PFNMAP))
1331 untrack_pfn_vma(vma, 0, 0); 1338 untrack_pfn(vma, 0, 0);
1332 1339
1333 if (start != end) { 1340 if (start != end) {
1334 if (unlikely(is_vm_hugetlb_page(vma))) { 1341 if (unlikely(is_vm_hugetlb_page(vma))) {
@@ -1521,7 +1528,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1521 spin_unlock(&mm->page_table_lock); 1528 spin_unlock(&mm->page_table_lock);
1522 wait_split_huge_page(vma->anon_vma, pmd); 1529 wait_split_huge_page(vma->anon_vma, pmd);
1523 } else { 1530 } else {
1524 page = follow_trans_huge_pmd(mm, address, 1531 page = follow_trans_huge_pmd(vma, address,
1525 pmd, flags); 1532 pmd, flags);
1526 spin_unlock(&mm->page_table_lock); 1533 spin_unlock(&mm->page_table_lock);
1527 goto out; 1534 goto out;
@@ -1576,12 +1583,12 @@ split_fallthrough:
1576 if (page->mapping && trylock_page(page)) { 1583 if (page->mapping && trylock_page(page)) {
1577 lru_add_drain(); /* push cached pages to LRU */ 1584 lru_add_drain(); /* push cached pages to LRU */
1578 /* 1585 /*
1579 * Because we lock page here and migration is 1586 * Because we lock page here, and migration is
1580 * blocked by the pte's page reference, we need 1587 * blocked by the pte's page reference, and we
1581 * only check for file-cache page truncation. 1588 * know the page is still mapped, we don't even
1589 * need to check for file-cache page truncation.
1582 */ 1590 */
1583 if (page->mapping) 1591 mlock_vma_page(page);
1584 mlock_vma_page(page);
1585 unlock_page(page); 1592 unlock_page(page);
1586 } 1593 }
1587 } 1594 }
@@ -2085,6 +2092,11 @@ out:
2085 * ask for a shared writable mapping! 2092 * ask for a shared writable mapping!
2086 * 2093 *
2087 * The page does not need to be reserved. 2094 * The page does not need to be reserved.
2095 *
2096 * Usually this function is called from f_op->mmap() handler
2097 * under mm->mmap_sem write-lock, so it can change vma->vm_flags.
2098 * Caller must set VM_MIXEDMAP on vma if it wants to call this
2099 * function from other places, for example from page-fault handler.
2088 */ 2100 */
2089int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, 2101int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
2090 struct page *page) 2102 struct page *page)
@@ -2093,7 +2105,11 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
2093 return -EFAULT; 2105 return -EFAULT;
2094 if (!page_count(page)) 2106 if (!page_count(page))
2095 return -EINVAL; 2107 return -EINVAL;
2096 vma->vm_flags |= VM_INSERTPAGE; 2108 if (!(vma->vm_flags & VM_MIXEDMAP)) {
2109 BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem));
2110 BUG_ON(vma->vm_flags & VM_PFNMAP);
2111 vma->vm_flags |= VM_MIXEDMAP;
2112 }
2097 return insert_page(vma, addr, page, vma->vm_page_prot); 2113 return insert_page(vma, addr, page, vma->vm_page_prot);
2098} 2114}
2099EXPORT_SYMBOL(vm_insert_page); 2115EXPORT_SYMBOL(vm_insert_page);
@@ -2132,7 +2148,7 @@ out:
2132 * @addr: target user address of this page 2148 * @addr: target user address of this page
2133 * @pfn: source kernel pfn 2149 * @pfn: source kernel pfn
2134 * 2150 *
2135 * Similar to vm_inert_page, this allows drivers to insert individual pages 2151 * Similar to vm_insert_page, this allows drivers to insert individual pages
2136 * they've allocated into a user vma. Same comments apply. 2152 * they've allocated into a user vma. Same comments apply.
2137 * 2153 *
2138 * This function should only be called from a vm_ops->fault handler, and 2154 * This function should only be called from a vm_ops->fault handler, and
@@ -2162,14 +2178,11 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
2162 2178
2163 if (addr < vma->vm_start || addr >= vma->vm_end) 2179 if (addr < vma->vm_start || addr >= vma->vm_end)
2164 return -EFAULT; 2180 return -EFAULT;
2165 if (track_pfn_vma_new(vma, &pgprot, pfn, PAGE_SIZE)) 2181 if (track_pfn_insert(vma, &pgprot, pfn))
2166 return -EINVAL; 2182 return -EINVAL;
2167 2183
2168 ret = insert_pfn(vma, addr, pfn, pgprot); 2184 ret = insert_pfn(vma, addr, pfn, pgprot);
2169 2185
2170 if (ret)
2171 untrack_pfn_vma(vma, pfn, PAGE_SIZE);
2172
2173 return ret; 2186 return ret;
2174} 2187}
2175EXPORT_SYMBOL(vm_insert_pfn); 2188EXPORT_SYMBOL(vm_insert_pfn);
@@ -2290,37 +2303,30 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
2290 * rest of the world about it: 2303 * rest of the world about it:
2291 * VM_IO tells people not to look at these pages 2304 * VM_IO tells people not to look at these pages
2292 * (accesses can have side effects). 2305 * (accesses can have side effects).
2293 * VM_RESERVED is specified all over the place, because
2294 * in 2.4 it kept swapout's vma scan off this vma; but
2295 * in 2.6 the LRU scan won't even find its pages, so this
2296 * flag means no more than count its pages in reserved_vm,
2297 * and omit it from core dump, even when VM_IO turned off.
2298 * VM_PFNMAP tells the core MM that the base pages are just 2306 * VM_PFNMAP tells the core MM that the base pages are just
2299 * raw PFN mappings, and do not have a "struct page" associated 2307 * raw PFN mappings, and do not have a "struct page" associated
2300 * with them. 2308 * with them.
2309 * VM_DONTEXPAND
2310 * Disable vma merging and expanding with mremap().
2311 * VM_DONTDUMP
2312 * Omit vma from core dump, even when VM_IO turned off.
2301 * 2313 *
2302 * There's a horrible special case to handle copy-on-write 2314 * There's a horrible special case to handle copy-on-write
2303 * behaviour that some programs depend on. We mark the "original" 2315 * behaviour that some programs depend on. We mark the "original"
2304 * un-COW'ed pages by matching them up with "vma->vm_pgoff". 2316 * un-COW'ed pages by matching them up with "vma->vm_pgoff".
2317 * See vm_normal_page() for details.
2305 */ 2318 */
2306 if (addr == vma->vm_start && end == vma->vm_end) { 2319 if (is_cow_mapping(vma->vm_flags)) {
2320 if (addr != vma->vm_start || end != vma->vm_end)
2321 return -EINVAL;
2307 vma->vm_pgoff = pfn; 2322 vma->vm_pgoff = pfn;
2308 vma->vm_flags |= VM_PFN_AT_MMAP; 2323 }
2309 } else if (is_cow_mapping(vma->vm_flags))
2310 return -EINVAL;
2311
2312 vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
2313 2324
2314 err = track_pfn_vma_new(vma, &prot, pfn, PAGE_ALIGN(size)); 2325 err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size));
2315 if (err) { 2326 if (err)
2316 /*
2317 * To indicate that track_pfn related cleanup is not
2318 * needed from higher level routine calling unmap_vmas
2319 */
2320 vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP);
2321 vma->vm_flags &= ~VM_PFN_AT_MMAP;
2322 return -EINVAL; 2327 return -EINVAL;
2323 } 2328
2329 vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
2324 2330
2325 BUG_ON(addr >= end); 2331 BUG_ON(addr >= end);
2326 pfn -= addr >> PAGE_SHIFT; 2332 pfn -= addr >> PAGE_SHIFT;
@@ -2335,7 +2341,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
2335 } while (pgd++, addr = next, addr != end); 2341 } while (pgd++, addr = next, addr != end);
2336 2342
2337 if (err) 2343 if (err)
2338 untrack_pfn_vma(vma, pfn, PAGE_ALIGN(size)); 2344 untrack_pfn(vma, pfn, PAGE_ALIGN(size));
2339 2345
2340 return err; 2346 return err;
2341} 2347}
@@ -2516,11 +2522,14 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2516 spinlock_t *ptl, pte_t orig_pte) 2522 spinlock_t *ptl, pte_t orig_pte)
2517 __releases(ptl) 2523 __releases(ptl)
2518{ 2524{
2519 struct page *old_page, *new_page; 2525 struct page *old_page, *new_page = NULL;
2520 pte_t entry; 2526 pte_t entry;
2521 int ret = 0; 2527 int ret = 0;
2522 int page_mkwrite = 0; 2528 int page_mkwrite = 0;
2523 struct page *dirty_page = NULL; 2529 struct page *dirty_page = NULL;
2530 unsigned long mmun_start; /* For mmu_notifiers */
2531 unsigned long mmun_end; /* For mmu_notifiers */
2532 bool mmun_called = false; /* For mmu_notifiers */
2524 2533
2525 old_page = vm_normal_page(vma, address, orig_pte); 2534 old_page = vm_normal_page(vma, address, orig_pte);
2526 if (!old_page) { 2535 if (!old_page) {
@@ -2698,6 +2707,11 @@ gotten:
2698 if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) 2707 if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
2699 goto oom_free_new; 2708 goto oom_free_new;
2700 2709
2710 mmun_start = address & PAGE_MASK;
2711 mmun_end = (address & PAGE_MASK) + PAGE_SIZE;
2712 mmun_called = true;
2713 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2714
2701 /* 2715 /*
2702 * Re-check the pte - we dropped the lock 2716 * Re-check the pte - we dropped the lock
2703 */ 2717 */
@@ -2764,6 +2778,8 @@ gotten:
2764 page_cache_release(new_page); 2778 page_cache_release(new_page);
2765unlock: 2779unlock:
2766 pte_unmap_unlock(page_table, ptl); 2780 pte_unmap_unlock(page_table, ptl);
2781 if (mmun_called)
2782 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2767 if (old_page) { 2783 if (old_page) {
2768 /* 2784 /*
2769 * Don't let another task, with possibly unlocked vma, 2785 * Don't let another task, with possibly unlocked vma,
@@ -2801,14 +2817,13 @@ static void unmap_mapping_range_vma(struct vm_area_struct *vma,
2801 zap_page_range_single(vma, start_addr, end_addr - start_addr, details); 2817 zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
2802} 2818}
2803 2819
2804static inline void unmap_mapping_range_tree(struct prio_tree_root *root, 2820static inline void unmap_mapping_range_tree(struct rb_root *root,
2805 struct zap_details *details) 2821 struct zap_details *details)
2806{ 2822{
2807 struct vm_area_struct *vma; 2823 struct vm_area_struct *vma;
2808 struct prio_tree_iter iter;
2809 pgoff_t vba, vea, zba, zea; 2824 pgoff_t vba, vea, zba, zea;
2810 2825
2811 vma_prio_tree_foreach(vma, &iter, root, 2826 vma_interval_tree_foreach(vma, root,
2812 details->first_index, details->last_index) { 2827 details->first_index, details->last_index) {
2813 2828
2814 vba = vma->vm_pgoff; 2829 vba = vma->vm_pgoff;
@@ -2839,7 +2854,7 @@ static inline void unmap_mapping_range_list(struct list_head *head,
2839 * across *all* the pages in each nonlinear VMA, not just the pages 2854 * across *all* the pages in each nonlinear VMA, not just the pages
2840 * whose virtual address lies outside the file truncation point. 2855 * whose virtual address lies outside the file truncation point.
2841 */ 2856 */
2842 list_for_each_entry(vma, head, shared.vm_set.list) { 2857 list_for_each_entry(vma, head, shared.nonlinear) {
2843 details->nonlinear_vma = vma; 2858 details->nonlinear_vma = vma;
2844 unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details); 2859 unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details);
2845 } 2860 }
@@ -2883,7 +2898,7 @@ void unmap_mapping_range(struct address_space *mapping,
2883 2898
2884 2899
2885 mutex_lock(&mapping->i_mmap_mutex); 2900 mutex_lock(&mapping->i_mmap_mutex);
2886 if (unlikely(!prio_tree_empty(&mapping->i_mmap))) 2901 if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
2887 unmap_mapping_range_tree(&mapping->i_mmap, &details); 2902 unmap_mapping_range_tree(&mapping->i_mmap, &details);
2888 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) 2903 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
2889 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); 2904 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
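unmap_mapping_range_tree() above keeps the same index arithmetic it had with the prio_tree: for each vma returned by the interval tree query, the zap range (in file pages) is clipped to the vma's own span and converted back to virtual addresses. The sketch below reproduces just that arithmetic in userspace, with an invented vma struct standing in for the real one.

#include <stdio.h>

#define PAGE_SHIFT	12

struct fake_vma {
	unsigned long vm_start;
	unsigned long vm_end;
	unsigned long vm_pgoff;
};

static void clip_zap_range(const struct fake_vma *vma,
			   unsigned long first_index, unsigned long last_index)
{
	unsigned long vba = vma->vm_pgoff;
	unsigned long vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
	unsigned long zba = first_index < vba ? vba : first_index;
	unsigned long zea = last_index > vea ? vea : last_index;
	unsigned long start_addr = ((zba - vba) << PAGE_SHIFT) + vma->vm_start;
	unsigned long end_addr = ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start;

	printf("zap %#lx-%#lx (file pages %lu-%lu)\n",
	       start_addr, end_addr, zba, zea);
}

int main(void)
{
	/* vma mapping file pages [8, 24) at 0x700000 */
	struct fake_vma vma = { 0x700000, 0x710000, 8 };

	clip_zap_range(&vma, 0, ~0UL);	/* truncate whole file: zap the full vma */
	clip_zap_range(&vma, 16, 19);	/* punch only file pages 16..19 */
	return 0;
}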
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 6a5b90d0cfd7..56b758ae57d2 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -106,6 +106,7 @@ static void get_page_bootmem(unsigned long info, struct page *page,
106void __ref put_page_bootmem(struct page *page) 106void __ref put_page_bootmem(struct page *page)
107{ 107{
108 unsigned long type; 108 unsigned long type;
109 struct zone *zone;
109 110
110 type = (unsigned long) page->lru.next; 111 type = (unsigned long) page->lru.next;
111 BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || 112 BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
@@ -116,6 +117,12 @@ void __ref put_page_bootmem(struct page *page)
116 set_page_private(page, 0); 117 set_page_private(page, 0);
117 INIT_LIST_HEAD(&page->lru); 118 INIT_LIST_HEAD(&page->lru);
118 __free_pages_bootmem(page, 0); 119 __free_pages_bootmem(page, 0);
120
121 zone = page_zone(page);
122 zone_span_writelock(zone);
123 zone->present_pages++;
124 zone_span_writeunlock(zone);
125 totalram_pages++;
119 } 126 }
120 127
121} 128}
@@ -362,11 +369,11 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
362 BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); 369 BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
363 BUG_ON(nr_pages % PAGES_PER_SECTION); 370 BUG_ON(nr_pages % PAGES_PER_SECTION);
364 371
372 release_mem_region(phys_start_pfn << PAGE_SHIFT, nr_pages * PAGE_SIZE);
373
365 sections_to_remove = nr_pages / PAGES_PER_SECTION; 374 sections_to_remove = nr_pages / PAGES_PER_SECTION;
366 for (i = 0; i < sections_to_remove; i++) { 375 for (i = 0; i < sections_to_remove; i++) {
367 unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; 376 unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
368 release_mem_region(pfn << PAGE_SHIFT,
369 PAGES_PER_SECTION << PAGE_SHIFT);
370 ret = __remove_section(zone, __pfn_to_section(pfn)); 377 ret = __remove_section(zone, __pfn_to_section(pfn));
371 if (ret) 378 if (ret)
372 break; 379 break;
@@ -756,13 +763,6 @@ static unsigned long scan_lru_pages(unsigned long start, unsigned long end)
756 return 0; 763 return 0;
757} 764}
758 765
759static struct page *
760hotremove_migrate_alloc(struct page *page, unsigned long private, int **x)
761{
762 /* This should be improooooved!! */
763 return alloc_page(GFP_HIGHUSER_MOVABLE);
764}
765
766#define NR_OFFLINE_AT_ONCE_PAGES (256) 766#define NR_OFFLINE_AT_ONCE_PAGES (256)
767static int 767static int
768do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) 768do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
@@ -813,8 +813,12 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
813 putback_lru_pages(&source); 813 putback_lru_pages(&source);
814 goto out; 814 goto out;
815 } 815 }
816 /* this function returns # of failed pages */ 816
817 ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 817 /*
818 * alloc_migrate_target should be improooooved!!
819 * migrate_pages returns # of failed pages.
820 */
821 ret = migrate_pages(&source, alloc_migrate_target, 0,
818 true, MIGRATE_SYNC); 822 true, MIGRATE_SYNC);
819 if (ret) 823 if (ret)
820 putback_lru_pages(&source); 824 putback_lru_pages(&source);
@@ -870,7 +874,7 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
870 return offlined; 874 return offlined;
871} 875}
872 876
873static int __ref offline_pages(unsigned long start_pfn, 877static int __ref __offline_pages(unsigned long start_pfn,
874 unsigned long end_pfn, unsigned long timeout) 878 unsigned long end_pfn, unsigned long timeout)
875{ 879{
876 unsigned long pfn, nr_pages, expire; 880 unsigned long pfn, nr_pages, expire;
@@ -970,8 +974,13 @@ repeat:
970 974
971 init_per_zone_wmark_min(); 975 init_per_zone_wmark_min();
972 976
973 if (!populated_zone(zone)) 977 if (!populated_zone(zone)) {
974 zone_pcp_reset(zone); 978 zone_pcp_reset(zone);
979 mutex_lock(&zonelists_mutex);
980 build_all_zonelists(NULL, NULL);
981 mutex_unlock(&zonelists_mutex);
982 } else
983 zone_pcp_update(zone);
975 984
976 if (!node_present_pages(node)) { 985 if (!node_present_pages(node)) {
977 node_clear_state(node, N_HIGH_MEMORY); 986 node_clear_state(node, N_HIGH_MEMORY);
@@ -998,15 +1007,55 @@ out:
998 return ret; 1007 return ret;
999} 1008}
1000 1009
1010int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
1011{
1012 return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
1013}
1014
1001int remove_memory(u64 start, u64 size) 1015int remove_memory(u64 start, u64 size)
1002{ 1016{
1017 struct memory_block *mem = NULL;
1018 struct mem_section *section;
1003 unsigned long start_pfn, end_pfn; 1019 unsigned long start_pfn, end_pfn;
1020 unsigned long pfn, section_nr;
1021 int ret;
1004 1022
1005 start_pfn = PFN_DOWN(start); 1023 start_pfn = PFN_DOWN(start);
1006 end_pfn = start_pfn + PFN_DOWN(size); 1024 end_pfn = start_pfn + PFN_DOWN(size);
1007 return offline_pages(start_pfn, end_pfn, 120 * HZ); 1025
1026 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
1027 section_nr = pfn_to_section_nr(pfn);
1028 if (!present_section_nr(section_nr))
1029 continue;
1030
1031 section = __nr_to_section(section_nr);
1032 /* same memblock? */
1033 if (mem)
1034 if ((section_nr >= mem->start_section_nr) &&
1035 (section_nr <= mem->end_section_nr))
1036 continue;
1037
1038 mem = find_memory_block_hinted(section, mem);
1039 if (!mem)
1040 continue;
1041
1042 ret = offline_memory_block(mem);
1043 if (ret) {
1044 kobject_put(&mem->dev.kobj);
1045 return ret;
1046 }
1047 }
1048
1049 if (mem)
1050 kobject_put(&mem->dev.kobj);
1051
1052 return 0;
1008} 1053}
1009#else 1054#else
1055int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
1056{
1057 return -EINVAL;
1058}
1010int remove_memory(u64 start, u64 size) 1059int remove_memory(u64 start, u64 size)
1011{ 1060{
1012 return -EINVAL; 1061 return -EINVAL;
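The new remove_memory() above walks the pfn range one section at a time, skips sections belonging to the memory block it has already handled, and offlines each block once. The sketch below shows that dedup-by-block walk with made-up section and block sizes and a stub offline callback; the real code resolves blocks via find_memory_block_hinted() and manages kobject references.

#include <stdio.h>

#define PAGES_PER_SECTION	32768UL		/* assumed: 128MB sections of 4k pages */
#define SECTIONS_PER_BLOCK	2		/* assumed: one memory block = 2 sections */

static int offline_block(unsigned long block_nr)
{
	printf("offlining memory block %lu\n", block_nr);
	return 0;				/* pretend it always succeeds */
}

static int remove_range(unsigned long start_pfn, unsigned long end_pfn)
{
	long last_block = -1;
	unsigned long pfn;

	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		unsigned long section_nr = pfn / PAGES_PER_SECTION;
		unsigned long block_nr = section_nr / SECTIONS_PER_BLOCK;
		int ret;

		if ((long)block_nr == last_block)
			continue;		/* same block as the previous section */

		ret = offline_block(block_nr);
		if (ret)
			return ret;
		last_block = block_nr;
	}
	return 0;
}

int main(void)
{
	/* four sections -> two blocks */
	return remove_range(0, 4 * PAGES_PER_SECTION);
}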
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4ada3be6e252..0b78fb9ea65b 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -607,6 +607,42 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
607 return first; 607 return first;
608} 608}
609 609
610/*
611 * Apply policy to a single VMA
612 * This must be called with the mmap_sem held for writing.
613 */
614static int vma_replace_policy(struct vm_area_struct *vma,
615 struct mempolicy *pol)
616{
617 int err;
618 struct mempolicy *old;
619 struct mempolicy *new;
620
621 pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
622 vma->vm_start, vma->vm_end, vma->vm_pgoff,
623 vma->vm_ops, vma->vm_file,
624 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
625
626 new = mpol_dup(pol);
627 if (IS_ERR(new))
628 return PTR_ERR(new);
629
630 if (vma->vm_ops && vma->vm_ops->set_policy) {
631 err = vma->vm_ops->set_policy(vma, new);
632 if (err)
633 goto err_out;
634 }
635
636 old = vma->vm_policy;
637 vma->vm_policy = new; /* protected by mmap_sem */
638 mpol_put(old);
639
640 return 0;
641 err_out:
642 mpol_put(new);
643 return err;
644}
645
610/* Step 2: apply policy to a range and do splits. */ 646/* Step 2: apply policy to a range and do splits. */
611static int mbind_range(struct mm_struct *mm, unsigned long start, 647static int mbind_range(struct mm_struct *mm, unsigned long start,
612 unsigned long end, struct mempolicy *new_pol) 648 unsigned long end, struct mempolicy *new_pol)
@@ -655,23 +691,9 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
655 if (err) 691 if (err)
656 goto out; 692 goto out;
657 } 693 }
658 694 err = vma_replace_policy(vma, new_pol);
659 /* 695 if (err)
660 * Apply policy to a single VMA. The reference counting of 696 goto out;
661 * policy for vma_policy linkages has already been handled by
662 * vma_merge and split_vma as necessary. If this is a shared
663 * policy then ->set_policy will increment the reference count
664 * for an sp node.
665 */
666 pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
667 vma->vm_start, vma->vm_end, vma->vm_pgoff,
668 vma->vm_ops, vma->vm_file,
669 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
670 if (vma->vm_ops && vma->vm_ops->set_policy) {
671 err = vma->vm_ops->set_policy(vma, new_pol);
672 if (err)
673 goto out;
674 }
675 } 697 }
676 698
677 out: 699 out:
@@ -924,15 +946,18 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
924 nodemask_t nmask; 946 nodemask_t nmask;
925 LIST_HEAD(pagelist); 947 LIST_HEAD(pagelist);
926 int err = 0; 948 int err = 0;
927 struct vm_area_struct *vma;
928 949
929 nodes_clear(nmask); 950 nodes_clear(nmask);
930 node_set(source, nmask); 951 node_set(source, nmask);
931 952
932 vma = check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, 953 /*
954 * This does not "check" the range but isolates all pages that
955 * need migration. Between passing in the full user address
956 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
957 */
958 VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
959 check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
933 flags | MPOL_MF_DISCONTIG_OK, &pagelist); 960 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
934 if (IS_ERR(vma))
935 return PTR_ERR(vma);
936 961
937 if (!list_empty(&pagelist)) { 962 if (!list_empty(&pagelist)) {
938 err = migrate_pages(&pagelist, new_node_page, dest, 963 err = migrate_pages(&pagelist, new_node_page, dest,
@@ -1530,8 +1555,18 @@ struct mempolicy *get_vma_policy(struct task_struct *task,
1530 addr); 1555 addr);
1531 if (vpol) 1556 if (vpol)
1532 pol = vpol; 1557 pol = vpol;
1533 } else if (vma->vm_policy) 1558 } else if (vma->vm_policy) {
1534 pol = vma->vm_policy; 1559 pol = vma->vm_policy;
1560
1561 /*
1562 * shmem_alloc_page() passes MPOL_F_SHARED policy with
1563 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1564 * count on these policies which will be dropped by
1565 * mpol_cond_put() later
1566 */
1567 if (mpol_needs_cond_ref(pol))
1568 mpol_get(pol);
1569 }
1535 } 1570 }
1536 if (!pol) 1571 if (!pol)
1537 pol = &default_policy; 1572 pol = &default_policy;
@@ -2061,7 +2096,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2061 */ 2096 */
2062 2097
2063/* lookup first element intersecting start-end */ 2098/* lookup first element intersecting start-end */
2064/* Caller holds sp->lock */ 2099/* Caller holds sp->mutex */
2065static struct sp_node * 2100static struct sp_node *
2066sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) 2101sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2067{ 2102{
@@ -2125,36 +2160,50 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2125 2160
2126 if (!sp->root.rb_node) 2161 if (!sp->root.rb_node)
2127 return NULL; 2162 return NULL;
2128 spin_lock(&sp->lock); 2163 mutex_lock(&sp->mutex);
2129 sn = sp_lookup(sp, idx, idx+1); 2164 sn = sp_lookup(sp, idx, idx+1);
2130 if (sn) { 2165 if (sn) {
2131 mpol_get(sn->policy); 2166 mpol_get(sn->policy);
2132 pol = sn->policy; 2167 pol = sn->policy;
2133 } 2168 }
2134 spin_unlock(&sp->lock); 2169 mutex_unlock(&sp->mutex);
2135 return pol; 2170 return pol;
2136} 2171}
2137 2172
2173static void sp_free(struct sp_node *n)
2174{
2175 mpol_put(n->policy);
2176 kmem_cache_free(sn_cache, n);
2177}
2178
2138static void sp_delete(struct shared_policy *sp, struct sp_node *n) 2179static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2139{ 2180{
2140 pr_debug("deleting %lx-l%lx\n", n->start, n->end); 2181 pr_debug("deleting %lx-l%lx\n", n->start, n->end);
2141 rb_erase(&n->nd, &sp->root); 2182 rb_erase(&n->nd, &sp->root);
2142 mpol_put(n->policy); 2183 sp_free(n);
2143 kmem_cache_free(sn_cache, n);
2144} 2184}
2145 2185
2146static struct sp_node *sp_alloc(unsigned long start, unsigned long end, 2186static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2147 struct mempolicy *pol) 2187 struct mempolicy *pol)
2148{ 2188{
2149 struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL); 2189 struct sp_node *n;
2190 struct mempolicy *newpol;
2150 2191
2192 n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2151 if (!n) 2193 if (!n)
2152 return NULL; 2194 return NULL;
2195
2196 newpol = mpol_dup(pol);
2197 if (IS_ERR(newpol)) {
2198 kmem_cache_free(sn_cache, n);
2199 return NULL;
2200 }
2201 newpol->flags |= MPOL_F_SHARED;
2202
2153 n->start = start; 2203 n->start = start;
2154 n->end = end; 2204 n->end = end;
2155 mpol_get(pol); 2205 n->policy = newpol;
2156 pol->flags |= MPOL_F_SHARED; /* for unref */ 2206
2157 n->policy = pol;
2158 return n; 2207 return n;
2159} 2208}
2160 2209
@@ -2162,10 +2211,10 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2162static int shared_policy_replace(struct shared_policy *sp, unsigned long start, 2211static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2163 unsigned long end, struct sp_node *new) 2212 unsigned long end, struct sp_node *new)
2164{ 2213{
2165 struct sp_node *n, *new2 = NULL; 2214 struct sp_node *n;
2215 int ret = 0;
2166 2216
2167restart: 2217 mutex_lock(&sp->mutex);
2168 spin_lock(&sp->lock);
2169 n = sp_lookup(sp, start, end); 2218 n = sp_lookup(sp, start, end);
2170 /* Take care of old policies in the same range. */ 2219 /* Take care of old policies in the same range. */
2171 while (n && n->start < end) { 2220 while (n && n->start < end) {
@@ -2178,16 +2227,14 @@ restart:
2178 } else { 2227 } else {
2179 /* Old policy spanning whole new range. */ 2228 /* Old policy spanning whole new range. */
2180 if (n->end > end) { 2229 if (n->end > end) {
2230 struct sp_node *new2;
2231 new2 = sp_alloc(end, n->end, n->policy);
2181 if (!new2) { 2232 if (!new2) {
2182 spin_unlock(&sp->lock); 2233 ret = -ENOMEM;
2183 new2 = sp_alloc(end, n->end, n->policy); 2234 goto out;
2184 if (!new2)
2185 return -ENOMEM;
2186 goto restart;
2187 } 2235 }
2188 n->end = start; 2236 n->end = start;
2189 sp_insert(sp, new2); 2237 sp_insert(sp, new2);
2190 new2 = NULL;
2191 break; 2238 break;
2192 } else 2239 } else
2193 n->end = start; 2240 n->end = start;
@@ -2198,12 +2245,9 @@ restart:
2198 } 2245 }
2199 if (new) 2246 if (new)
2200 sp_insert(sp, new); 2247 sp_insert(sp, new);
2201 spin_unlock(&sp->lock); 2248out:
2202 if (new2) { 2249 mutex_unlock(&sp->mutex);
2203 mpol_put(new2->policy); 2250 return ret;
2204 kmem_cache_free(sn_cache, new2);
2205 }
2206 return 0;
2207} 2251}
2208 2252
2209/** 2253/**
@@ -2221,7 +2265,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2221 int ret; 2265 int ret;
2222 2266
2223 sp->root = RB_ROOT; /* empty tree == default mempolicy */ 2267 sp->root = RB_ROOT; /* empty tree == default mempolicy */
2224 spin_lock_init(&sp->lock); 2268 mutex_init(&sp->mutex);
2225 2269
2226 if (mpol) { 2270 if (mpol) {
2227 struct vm_area_struct pvma; 2271 struct vm_area_struct pvma;
@@ -2275,7 +2319,7 @@ int mpol_set_shared_policy(struct shared_policy *info,
2275 } 2319 }
2276 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new); 2320 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2277 if (err && new) 2321 if (err && new)
2278 kmem_cache_free(sn_cache, new); 2322 sp_free(new);
2279 return err; 2323 return err;
2280} 2324}
2281 2325
@@ -2287,16 +2331,14 @@ void mpol_free_shared_policy(struct shared_policy *p)
2287 2331
2288 if (!p->root.rb_node) 2332 if (!p->root.rb_node)
2289 return; 2333 return;
2290 spin_lock(&p->lock); 2334 mutex_lock(&p->mutex);
2291 next = rb_first(&p->root); 2335 next = rb_first(&p->root);
2292 while (next) { 2336 while (next) {
2293 n = rb_entry(next, struct sp_node, nd); 2337 n = rb_entry(next, struct sp_node, nd);
2294 next = rb_next(&n->nd); 2338 next = rb_next(&n->nd);
2295 rb_erase(&n->nd, &p->root); 2339 sp_delete(p, n);
2296 mpol_put(n->policy);
2297 kmem_cache_free(sn_cache, n);
2298 } 2340 }
2299 spin_unlock(&p->lock); 2341 mutex_unlock(&p->mutex);
2300} 2342}
2301 2343
2302/* assumes fs == KERNEL_DS */ 2344/* assumes fs == KERNEL_DS */
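With sp->lock turned into sp->mutex above, shared_policy_replace() can allocate the right-hand remainder node while still holding the lock, instead of dropping the spinlock, allocating, and restarting. The sketch below shows only the splitting case, where an old node spanning the whole new range is cut into a left piece and a freshly allocated right piece; the mempolicies and the rbtree bookkeeping are left out.

#include <stdio.h>
#include <stdlib.h>

struct node {
	unsigned long start, end;	/* half-open page range */
};

/* Insert policy range [start, end) into an old node that fully covers it. */
static int replace_range(struct node *old, unsigned long start, unsigned long end)
{
	if (old->end > end) {
		/*
		 * Allocate the right-hand remainder before touching anything,
		 * as the mutex-based code can now do in one pass.
		 */
		struct node *right = malloc(sizeof(*right));

		if (!right)
			return -1;			/* -ENOMEM */
		right->start = end;
		right->end = old->end;
		old->end = start;			/* keep only the left piece */
		printf("left  [%lu, %lu)\n", old->start, old->end);
		printf("new   [%lu, %lu)\n", start, end);
		printf("right [%lu, %lu)\n", right->start, right->end);
		free(right);
		return 0;
	}
	old->end = start;				/* old node is only truncated */
	printf("left [%lu, %lu)  new [%lu, %lu)\n", old->start, old->end, start, end);
	return 0;
}

int main(void)
{
	struct node n = { 10, 50 };

	return replace_range(&n, 20, 30) ? 1 : 0;
}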
diff --git a/mm/mlock.c b/mm/mlock.c
index ef726e8aa8e9..f0b9ce572fc7 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -51,15 +51,13 @@ EXPORT_SYMBOL(can_do_mlock);
51/* 51/*
52 * LRU accounting for clear_page_mlock() 52 * LRU accounting for clear_page_mlock()
53 */ 53 */
54void __clear_page_mlock(struct page *page) 54void clear_page_mlock(struct page *page)
55{ 55{
56 VM_BUG_ON(!PageLocked(page)); 56 if (!TestClearPageMlocked(page))
57
58 if (!page->mapping) { /* truncated ? */
59 return; 57 return;
60 }
61 58
62 dec_zone_page_state(page, NR_MLOCK); 59 mod_zone_page_state(page_zone(page), NR_MLOCK,
60 -hpage_nr_pages(page));
63 count_vm_event(UNEVICTABLE_PGCLEARED); 61 count_vm_event(UNEVICTABLE_PGCLEARED);
64 if (!isolate_lru_page(page)) { 62 if (!isolate_lru_page(page)) {
65 putback_lru_page(page); 63 putback_lru_page(page);
@@ -81,7 +79,8 @@ void mlock_vma_page(struct page *page)
81 BUG_ON(!PageLocked(page)); 79 BUG_ON(!PageLocked(page));
82 80
83 if (!TestSetPageMlocked(page)) { 81 if (!TestSetPageMlocked(page)) {
84 inc_zone_page_state(page, NR_MLOCK); 82 mod_zone_page_state(page_zone(page), NR_MLOCK,
83 hpage_nr_pages(page));
85 count_vm_event(UNEVICTABLE_PGMLOCKED); 84 count_vm_event(UNEVICTABLE_PGMLOCKED);
86 if (!isolate_lru_page(page)) 85 if (!isolate_lru_page(page))
87 putback_lru_page(page); 86 putback_lru_page(page);
@@ -108,7 +107,8 @@ void munlock_vma_page(struct page *page)
108 BUG_ON(!PageLocked(page)); 107 BUG_ON(!PageLocked(page));
109 108
110 if (TestClearPageMlocked(page)) { 109 if (TestClearPageMlocked(page)) {
111 dec_zone_page_state(page, NR_MLOCK); 110 mod_zone_page_state(page_zone(page), NR_MLOCK,
111 -hpage_nr_pages(page));
112 if (!isolate_lru_page(page)) { 112 if (!isolate_lru_page(page)) {
113 int ret = SWAP_AGAIN; 113 int ret = SWAP_AGAIN;
114 114
@@ -227,7 +227,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma,
227 if (vma->vm_flags & (VM_IO | VM_PFNMAP)) 227 if (vma->vm_flags & (VM_IO | VM_PFNMAP))
228 goto no_mlock; 228 goto no_mlock;
229 229
230 if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || 230 if (!((vma->vm_flags & VM_DONTEXPAND) ||
231 is_vm_hugetlb_page(vma) || 231 is_vm_hugetlb_page(vma) ||
232 vma == get_gate_vma(current->mm))) { 232 vma == get_gate_vma(current->mm))) {
233 233
@@ -290,14 +290,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
290 page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); 290 page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
291 if (page && !IS_ERR(page)) { 291 if (page && !IS_ERR(page)) {
292 lock_page(page); 292 lock_page(page);
293 /* 293 munlock_vma_page(page);
294 * Like in __mlock_vma_pages_range(),
295 * because we lock page here and migration is
296 * blocked by the elevated reference, we need
297 * only check for file-cache page truncation.
298 */
299 if (page->mapping)
300 munlock_vma_page(page);
301 unlock_page(page); 294 unlock_page(page);
302 put_page(page); 295 put_page(page);
303 } 296 }
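The mlock hunks above standardise on a test-and-set/test-and-clear of the Mlocked bit followed by a counter adjustment of hpage_nr_pages(page), so a transparent huge page moves NR_MLOCK by its full number of base pages and repeated calls stay idempotent. The userspace sketch below models that pattern with a fake page struct and a plain counter; nothing here is the kernel API.

#include <stdio.h>
#include <stdbool.h>

struct fake_page {
	bool mlocked;
	unsigned long nr_base_pages;	/* 1 for a normal page, 512 for a 2MB THP */
};

static long nr_mlock;			/* stand-in for the NR_MLOCK zone counter */

static void mlock_page(struct fake_page *page)
{
	if (!page->mlocked) {		/* TestSetPageMlocked() analogue */
		page->mlocked = true;
		nr_mlock += page->nr_base_pages;
	}
}

static void clear_page_mlock(struct fake_page *page)
{
	if (page->mlocked) {		/* TestClearPageMlocked() analogue */
		page->mlocked = false;
		nr_mlock -= page->nr_base_pages;
	}
}

int main(void)
{
	struct fake_page small = { false, 1 };
	struct fake_page huge = { false, 512 };

	mlock_page(&small);
	mlock_page(&huge);
	mlock_page(&huge);		/* second call is a no-op */
	clear_page_mlock(&small);
	printf("NR_MLOCK = %ld\n", nr_mlock);	/* 512 */
	return 0;
}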
diff --git a/mm/mmap.c b/mm/mmap.c
index 872441e81914..2d942353d681 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -51,12 +51,6 @@ static void unmap_region(struct mm_struct *mm,
51 struct vm_area_struct *vma, struct vm_area_struct *prev, 51 struct vm_area_struct *vma, struct vm_area_struct *prev,
52 unsigned long start, unsigned long end); 52 unsigned long start, unsigned long end);
53 53
54/*
55 * WARNING: the debugging will use recursive algorithms so never enable this
56 * unless you know what you are doing.
57 */
58#undef DEBUG_MM_RB
59
60/* description of effects of mapping type and prot in current implementation. 54/* description of effects of mapping type and prot in current implementation.
61 * this is due to the limited x86 page protection hardware. The expected 55 * this is due to the limited x86 page protection hardware. The expected
62 * behavior is in parens: 56 * behavior is in parens:
@@ -199,14 +193,14 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
199 193
200 flush_dcache_mmap_lock(mapping); 194 flush_dcache_mmap_lock(mapping);
201 if (unlikely(vma->vm_flags & VM_NONLINEAR)) 195 if (unlikely(vma->vm_flags & VM_NONLINEAR))
202 list_del_init(&vma->shared.vm_set.list); 196 list_del_init(&vma->shared.nonlinear);
203 else 197 else
204 vma_prio_tree_remove(vma, &mapping->i_mmap); 198 vma_interval_tree_remove(vma, &mapping->i_mmap);
205 flush_dcache_mmap_unlock(mapping); 199 flush_dcache_mmap_unlock(mapping);
206} 200}
207 201
208/* 202/*
209 * Unlink a file-based vm structure from its prio_tree, to hide 203 * Unlink a file-based vm structure from its interval tree, to hide
210 * vma from rmap and vmtruncate before freeing its page tables. 204 * vma from rmap and vmtruncate before freeing its page tables.
211 */ 205 */
212void unlink_file_vma(struct vm_area_struct *vma) 206void unlink_file_vma(struct vm_area_struct *vma)
@@ -231,11 +225,8 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
231 might_sleep(); 225 might_sleep();
232 if (vma->vm_ops && vma->vm_ops->close) 226 if (vma->vm_ops && vma->vm_ops->close)
233 vma->vm_ops->close(vma); 227 vma->vm_ops->close(vma);
234 if (vma->vm_file) { 228 if (vma->vm_file)
235 fput(vma->vm_file); 229 fput(vma->vm_file);
236 if (vma->vm_flags & VM_EXECUTABLE)
237 removed_exe_file_vma(vma->vm_mm);
238 }
239 mpol_put(vma_policy(vma)); 230 mpol_put(vma_policy(vma));
240 kmem_cache_free(vm_area_cachep, vma); 231 kmem_cache_free(vm_area_cachep, vma);
241 return next; 232 return next;
@@ -306,7 +297,7 @@ out:
306 return retval; 297 return retval;
307} 298}
308 299
309#ifdef DEBUG_MM_RB 300#ifdef CONFIG_DEBUG_VM_RB
310static int browse_rb(struct rb_root *root) 301static int browse_rb(struct rb_root *root)
311{ 302{
312 int i = 0, j; 303 int i = 0, j;
@@ -340,9 +331,12 @@ void validate_mm(struct mm_struct *mm)
340{ 331{
341 int bug = 0; 332 int bug = 0;
342 int i = 0; 333 int i = 0;
343 struct vm_area_struct *tmp = mm->mmap; 334 struct vm_area_struct *vma = mm->mmap;
344 while (tmp) { 335 while (vma) {
345 tmp = tmp->vm_next; 336 struct anon_vma_chain *avc;
337 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
338 anon_vma_interval_tree_verify(avc);
339 vma = vma->vm_next;
346 i++; 340 i++;
347 } 341 }
348 if (i != mm->map_count) 342 if (i != mm->map_count)
@@ -356,17 +350,46 @@ void validate_mm(struct mm_struct *mm)
356#define validate_mm(mm) do { } while (0) 350#define validate_mm(mm) do { } while (0)
357#endif 351#endif
358 352
359static struct vm_area_struct * 353/*
360find_vma_prepare(struct mm_struct *mm, unsigned long addr, 354 * vma has some anon_vma assigned, and is already inserted on that
361 struct vm_area_struct **pprev, struct rb_node ***rb_link, 355 * anon_vma's interval trees.
362 struct rb_node ** rb_parent) 356 *
357 * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the
358 * vma must be removed from the anon_vma's interval trees using
359 * anon_vma_interval_tree_pre_update_vma().
360 *
361 * After the update, the vma will be reinserted using
362 * anon_vma_interval_tree_post_update_vma().
363 *
364 * The entire update must be protected by exclusive mmap_sem and by
365 * the root anon_vma's mutex.
366 */
367static inline void
368anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
363{ 369{
364 struct vm_area_struct * vma; 370 struct anon_vma_chain *avc;
365 struct rb_node ** __rb_link, * __rb_parent, * rb_prev; 371
372 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
373 anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root);
374}
375
376static inline void
377anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
378{
379 struct anon_vma_chain *avc;
380
381 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
382 anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
383}
384
385static int find_vma_links(struct mm_struct *mm, unsigned long addr,
386 unsigned long end, struct vm_area_struct **pprev,
387 struct rb_node ***rb_link, struct rb_node **rb_parent)
388{
389 struct rb_node **__rb_link, *__rb_parent, *rb_prev;
366 390
367 __rb_link = &mm->mm_rb.rb_node; 391 __rb_link = &mm->mm_rb.rb_node;
368 rb_prev = __rb_parent = NULL; 392 rb_prev = __rb_parent = NULL;
369 vma = NULL;
370 393
371 while (*__rb_link) { 394 while (*__rb_link) {
372 struct vm_area_struct *vma_tmp; 395 struct vm_area_struct *vma_tmp;
@@ -375,9 +398,9 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr,
375 vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb); 398 vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb);
376 399
377 if (vma_tmp->vm_end > addr) { 400 if (vma_tmp->vm_end > addr) {
378 vma = vma_tmp; 401 /* Fail if an existing vma overlaps the area */
379 if (vma_tmp->vm_start <= addr) 402 if (vma_tmp->vm_start < end)
380 break; 403 return -ENOMEM;
381 __rb_link = &__rb_parent->rb_left; 404 __rb_link = &__rb_parent->rb_left;
382 } else { 405 } else {
383 rb_prev = __rb_parent; 406 rb_prev = __rb_parent;
@@ -390,7 +413,7 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr,
390 *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); 413 *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
391 *rb_link = __rb_link; 414 *rb_link = __rb_link;
392 *rb_parent = __rb_parent; 415 *rb_parent = __rb_parent;
393 return vma; 416 return 0;
394} 417}
395 418
396void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, 419void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -417,7 +440,7 @@ static void __vma_link_file(struct vm_area_struct *vma)
417 if (unlikely(vma->vm_flags & VM_NONLINEAR)) 440 if (unlikely(vma->vm_flags & VM_NONLINEAR))
418 vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); 441 vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
419 else 442 else
420 vma_prio_tree_insert(vma, &mapping->i_mmap); 443 vma_interval_tree_insert(vma, &mapping->i_mmap);
421 flush_dcache_mmap_unlock(mapping); 444 flush_dcache_mmap_unlock(mapping);
422 } 445 }
423} 446}
@@ -455,15 +478,16 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
455 478
456/* 479/*
457 * Helper for vma_adjust() in the split_vma insert case: insert a vma into the 480 * Helper for vma_adjust() in the split_vma insert case: insert a vma into the
458 * mm's list and rbtree. It has already been inserted into the prio_tree. 481 * mm's list and rbtree. It has already been inserted into the interval tree.
459 */ 482 */
460static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) 483static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
461{ 484{
462 struct vm_area_struct *__vma, *prev; 485 struct vm_area_struct *prev;
463 struct rb_node **rb_link, *rb_parent; 486 struct rb_node **rb_link, *rb_parent;
464 487
465 __vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent); 488 if (find_vma_links(mm, vma->vm_start, vma->vm_end,
466 BUG_ON(__vma && __vma->vm_start < vma->vm_end); 489 &prev, &rb_link, &rb_parent))
490 BUG();
467 __vma_link(mm, vma, prev, rb_link, rb_parent); 491 __vma_link(mm, vma, prev, rb_link, rb_parent);
468 mm->map_count++; 492 mm->map_count++;
469} 493}
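
find_vma_links() replaces find_vma_prepare(): rather than handing back an overlapping vma for the caller to inspect, it returns 0 when [addr, end) is free, filling *pprev, *rb_link and *rb_parent for the later link step, and -ENOMEM when any existing vma intersects the range. A minimal caller sketch under that contract, condensed from the mmap_region()/do_brk() "munmap_back" hunks further down (the helper name is illustrative only):

	/* Illustrative only: clear any old mapping over [addr, addr + len)
	 * and come back with prev/rb_link/rb_parent ready for __vma_link(). */
	static int prepare_range(struct mm_struct *mm, unsigned long addr,
				 unsigned long len, struct vm_area_struct **prev,
				 struct rb_node ***rb_link, struct rb_node **rb_parent)
	{
		while (find_vma_links(mm, addr, addr + len, prev, rb_link, rb_parent)) {
			/* an existing vma overlaps the range: unmap it and retry */
			if (do_munmap(mm, addr, len))
				return -ENOMEM;
		}
		return 0;	/* range free; *prev, *rb_link, *rb_parent valid */
	}
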
@@ -496,7 +520,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
496 struct vm_area_struct *next = vma->vm_next; 520 struct vm_area_struct *next = vma->vm_next;
497 struct vm_area_struct *importer = NULL; 521 struct vm_area_struct *importer = NULL;
498 struct address_space *mapping = NULL; 522 struct address_space *mapping = NULL;
499 struct prio_tree_root *root = NULL; 523 struct rb_root *root = NULL;
500 struct anon_vma *anon_vma = NULL; 524 struct anon_vma *anon_vma = NULL;
501 struct file *file = vma->vm_file; 525 struct file *file = vma->vm_file;
502 long adjust_next = 0; 526 long adjust_next = 0;
@@ -559,7 +583,7 @@ again: remove_next = 1 + (end > next->vm_end);
559 mutex_lock(&mapping->i_mmap_mutex); 583 mutex_lock(&mapping->i_mmap_mutex);
560 if (insert) { 584 if (insert) {
561 /* 585 /*
562 * Put into prio_tree now, so instantiated pages 586 * Put into interval tree now, so instantiated pages
563 * are visible to arm/parisc __flush_dcache_page 587 * are visible to arm/parisc __flush_dcache_page
564 * throughout; but we cannot insert into address 588 * throughout; but we cannot insert into address
565 * space until vma start or end is updated. 589 * space until vma start or end is updated.
@@ -570,22 +594,23 @@ again: remove_next = 1 + (end > next->vm_end);
570 594
571 vma_adjust_trans_huge(vma, start, end, adjust_next); 595 vma_adjust_trans_huge(vma, start, end, adjust_next);
572 596
573 /* 597 anon_vma = vma->anon_vma;
574 * When changing only vma->vm_end, we don't really need anon_vma 598 if (!anon_vma && adjust_next)
575 * lock. This is a fairly rare case by itself, but the anon_vma 599 anon_vma = next->anon_vma;
576 * lock may be shared between many sibling processes. Skipping 600 if (anon_vma) {
577 * the lock for brk adjustments makes a difference sometimes. 601 VM_BUG_ON(adjust_next && next->anon_vma &&
578 */ 602 anon_vma != next->anon_vma);
579 if (vma->anon_vma && (importer || start != vma->vm_start)) {
580 anon_vma = vma->anon_vma;
581 anon_vma_lock(anon_vma); 603 anon_vma_lock(anon_vma);
604 anon_vma_interval_tree_pre_update_vma(vma);
605 if (adjust_next)
606 anon_vma_interval_tree_pre_update_vma(next);
582 } 607 }
583 608
584 if (root) { 609 if (root) {
585 flush_dcache_mmap_lock(mapping); 610 flush_dcache_mmap_lock(mapping);
586 vma_prio_tree_remove(vma, root); 611 vma_interval_tree_remove(vma, root);
587 if (adjust_next) 612 if (adjust_next)
588 vma_prio_tree_remove(next, root); 613 vma_interval_tree_remove(next, root);
589 } 614 }
590 615
591 vma->vm_start = start; 616 vma->vm_start = start;
@@ -598,8 +623,8 @@ again: remove_next = 1 + (end > next->vm_end);
598 623
599 if (root) { 624 if (root) {
600 if (adjust_next) 625 if (adjust_next)
601 vma_prio_tree_insert(next, root); 626 vma_interval_tree_insert(next, root);
602 vma_prio_tree_insert(vma, root); 627 vma_interval_tree_insert(vma, root);
603 flush_dcache_mmap_unlock(mapping); 628 flush_dcache_mmap_unlock(mapping);
604 } 629 }
605 630
@@ -620,8 +645,12 @@ again: remove_next = 1 + (end > next->vm_end);
620 __insert_vm_struct(mm, insert); 645 __insert_vm_struct(mm, insert);
621 } 646 }
622 647
623 if (anon_vma) 648 if (anon_vma) {
649 anon_vma_interval_tree_post_update_vma(vma);
650 if (adjust_next)
651 anon_vma_interval_tree_post_update_vma(next);
624 anon_vma_unlock(anon_vma); 652 anon_vma_unlock(anon_vma);
653 }
625 if (mapping) 654 if (mapping)
626 mutex_unlock(&mapping->i_mmap_mutex); 655 mutex_unlock(&mapping->i_mmap_mutex);
627 656
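
This is the discipline the two helpers near the top of the file exist for: lock the anon_vma root, pull the vma out of each anon_vma interval tree, change the interval, reinsert, unlock. A condensed ordering sketch, assuming mmap_sem is already held for writing (the function name is illustrative; vma_adjust() of course does much more around it):

	static void resize_vma_end(struct vm_area_struct *vma, unsigned long new_end)
	{
		struct anon_vma *anon_vma = vma->anon_vma;

		if (anon_vma) {
			anon_vma_lock(anon_vma);
			/* take the vma out of every anon_vma interval tree */
			anon_vma_interval_tree_pre_update_vma(vma);
		}

		vma->vm_end = new_end;	/* the interval may only change here */

		if (anon_vma) {
			/* reinsert with the new [vm_start, vm_end) range */
			anon_vma_interval_tree_post_update_vma(vma);
			anon_vma_unlock(anon_vma);
		}
	}
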
@@ -636,8 +665,6 @@ again: remove_next = 1 + (end > next->vm_end);
636 if (file) { 665 if (file) {
637 uprobe_munmap(next, next->vm_start, next->vm_end); 666 uprobe_munmap(next, next->vm_start, next->vm_end);
638 fput(file); 667 fput(file);
639 if (next->vm_flags & VM_EXECUTABLE)
640 removed_exe_file_vma(mm);
641 } 668 }
642 if (next->anon_vma) 669 if (next->anon_vma)
643 anon_vma_merge(vma, next); 670 anon_vma_merge(vma, next);
@@ -669,8 +696,7 @@ again: remove_next = 1 + (end > next->vm_end);
669static inline int is_mergeable_vma(struct vm_area_struct *vma, 696static inline int is_mergeable_vma(struct vm_area_struct *vma,
670 struct file *file, unsigned long vm_flags) 697 struct file *file, unsigned long vm_flags)
671{ 698{
672 /* VM_CAN_NONLINEAR may get set later by f_op->mmap() */ 699 if (vma->vm_flags ^ vm_flags)
673 if ((vma->vm_flags ^ vm_flags) & ~VM_CAN_NONLINEAR)
674 return 0; 700 return 0;
675 if (vma->vm_file != file) 701 if (vma->vm_file != file)
676 return 0; 702 return 0;
@@ -951,8 +977,6 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags,
951 mm->exec_vm += pages; 977 mm->exec_vm += pages;
952 } else if (flags & stack_flags) 978 } else if (flags & stack_flags)
953 mm->stack_vm += pages; 979 mm->stack_vm += pages;
954 if (flags & (VM_RESERVED|VM_IO))
955 mm->reserved_vm += pages;
956} 980}
957#endif /* CONFIG_PROC_FS */ 981#endif /* CONFIG_PROC_FS */
958 982
@@ -1190,7 +1214,7 @@ int vma_wants_writenotify(struct vm_area_struct *vma)
1190 return 0; 1214 return 0;
1191 1215
1192 /* Specialty mapping? */ 1216 /* Specialty mapping? */
1193 if (vm_flags & (VM_PFNMAP|VM_INSERTPAGE)) 1217 if (vm_flags & VM_PFNMAP)
1194 return 0; 1218 return 0;
1195 1219
1196 /* Can the mapping track the dirty pages? */ 1220 /* Can the mapping track the dirty pages? */
@@ -1229,8 +1253,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
1229 /* Clear old maps */ 1253 /* Clear old maps */
1230 error = -ENOMEM; 1254 error = -ENOMEM;
1231munmap_back: 1255munmap_back:
1232 vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); 1256 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) {
1233 if (vma && vma->vm_start < addr + len) {
1234 if (do_munmap(mm, addr, len)) 1257 if (do_munmap(mm, addr, len))
1235 return -ENOMEM; 1258 return -ENOMEM;
1236 goto munmap_back; 1259 goto munmap_back;
@@ -1305,8 +1328,6 @@ munmap_back:
1305 error = file->f_op->mmap(file, vma); 1328 error = file->f_op->mmap(file, vma);
1306 if (error) 1329 if (error)
1307 goto unmap_and_free_vma; 1330 goto unmap_and_free_vma;
1308 if (vm_flags & VM_EXECUTABLE)
1309 added_exe_file_vma(mm);
1310 1331
1311 /* Can addr have changed?? 1332 /* Can addr have changed??
1312 * 1333 *
@@ -1757,13 +1778,16 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
1757 if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { 1778 if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
1758 error = acct_stack_growth(vma, size, grow); 1779 error = acct_stack_growth(vma, size, grow);
1759 if (!error) { 1780 if (!error) {
1781 anon_vma_interval_tree_pre_update_vma(vma);
1760 vma->vm_end = address; 1782 vma->vm_end = address;
1783 anon_vma_interval_tree_post_update_vma(vma);
1761 perf_event_mmap(vma); 1784 perf_event_mmap(vma);
1762 } 1785 }
1763 } 1786 }
1764 } 1787 }
1765 vma_unlock_anon_vma(vma); 1788 vma_unlock_anon_vma(vma);
1766 khugepaged_enter_vma_merge(vma); 1789 khugepaged_enter_vma_merge(vma);
1790 validate_mm(vma->vm_mm);
1767 return error; 1791 return error;
1768} 1792}
1769#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ 1793#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
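
The growth paths now finish with validate_mm() (expand_downwards() gets the same treatment in the next hunk), and its strengthened body, shown in the first hunk of this file's diff, walks each vma's anon_vma_chain and re-checks the interval-tree invariants. That walk, pulled out as a standalone debug-style sketch:

	static void verify_anon_interval_trees(struct mm_struct *mm)
	{
		struct vm_area_struct *vma;

		for (vma = mm->mmap; vma; vma = vma->vm_next) {
			struct anon_vma_chain *avc;

			list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
				anon_vma_interval_tree_verify(avc);
		}
	}
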
@@ -1807,14 +1831,17 @@ int expand_downwards(struct vm_area_struct *vma,
1807 if (grow <= vma->vm_pgoff) { 1831 if (grow <= vma->vm_pgoff) {
1808 error = acct_stack_growth(vma, size, grow); 1832 error = acct_stack_growth(vma, size, grow);
1809 if (!error) { 1833 if (!error) {
1834 anon_vma_interval_tree_pre_update_vma(vma);
1810 vma->vm_start = address; 1835 vma->vm_start = address;
1811 vma->vm_pgoff -= grow; 1836 vma->vm_pgoff -= grow;
1837 anon_vma_interval_tree_post_update_vma(vma);
1812 perf_event_mmap(vma); 1838 perf_event_mmap(vma);
1813 } 1839 }
1814 } 1840 }
1815 } 1841 }
1816 vma_unlock_anon_vma(vma); 1842 vma_unlock_anon_vma(vma);
1817 khugepaged_enter_vma_merge(vma); 1843 khugepaged_enter_vma_merge(vma);
1844 validate_mm(vma->vm_mm);
1818 return error; 1845 return error;
1819} 1846}
1820 1847
@@ -1988,11 +2015,8 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
1988 if (anon_vma_clone(new, vma)) 2015 if (anon_vma_clone(new, vma))
1989 goto out_free_mpol; 2016 goto out_free_mpol;
1990 2017
1991 if (new->vm_file) { 2018 if (new->vm_file)
1992 get_file(new->vm_file); 2019 get_file(new->vm_file);
1993 if (vma->vm_flags & VM_EXECUTABLE)
1994 added_exe_file_vma(mm);
1995 }
1996 2020
1997 if (new->vm_ops && new->vm_ops->open) 2021 if (new->vm_ops && new->vm_ops->open)
1998 new->vm_ops->open(new); 2022 new->vm_ops->open(new);
@@ -2010,11 +2034,8 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
2010 /* Clean everything up if vma_adjust failed. */ 2034 /* Clean everything up if vma_adjust failed. */
2011 if (new->vm_ops && new->vm_ops->close) 2035 if (new->vm_ops && new->vm_ops->close)
2012 new->vm_ops->close(new); 2036 new->vm_ops->close(new);
2013 if (new->vm_file) { 2037 if (new->vm_file)
2014 if (vma->vm_flags & VM_EXECUTABLE)
2015 removed_exe_file_vma(mm);
2016 fput(new->vm_file); 2038 fput(new->vm_file);
2017 }
2018 unlink_anon_vmas(new); 2039 unlink_anon_vmas(new);
2019 out_free_mpol: 2040 out_free_mpol:
2020 mpol_put(pol); 2041 mpol_put(pol);
@@ -2199,8 +2220,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
2199 * Clear old maps. this also does some error checking for us 2220 * Clear old maps. this also does some error checking for us
2200 */ 2221 */
2201 munmap_back: 2222 munmap_back:
2202 vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); 2223 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) {
2203 if (vma && vma->vm_start < addr + len) {
2204 if (do_munmap(mm, addr, len)) 2224 if (do_munmap(mm, addr, len))
2205 return -ENOMEM; 2225 return -ENOMEM;
2206 goto munmap_back; 2226 goto munmap_back;
@@ -2314,10 +2334,10 @@ void exit_mmap(struct mm_struct *mm)
2314 * and into the inode's i_mmap tree. If vm_file is non-NULL 2334 * and into the inode's i_mmap tree. If vm_file is non-NULL
2315 * then i_mmap_mutex is taken here. 2335 * then i_mmap_mutex is taken here.
2316 */ 2336 */
2317int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) 2337int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
2318{ 2338{
2319 struct vm_area_struct * __vma, * prev; 2339 struct vm_area_struct *prev;
2320 struct rb_node ** rb_link, * rb_parent; 2340 struct rb_node **rb_link, *rb_parent;
2321 2341
2322 /* 2342 /*
2323 * The vm_pgoff of a purely anonymous vma should be irrelevant 2343 * The vm_pgoff of a purely anonymous vma should be irrelevant
@@ -2335,8 +2355,8 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
2335 BUG_ON(vma->anon_vma); 2355 BUG_ON(vma->anon_vma);
2336 vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; 2356 vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
2337 } 2357 }
2338 __vma = find_vma_prepare(mm,vma->vm_start,&prev,&rb_link,&rb_parent); 2358 if (find_vma_links(mm, vma->vm_start, vma->vm_end,
2339 if (__vma && __vma->vm_start < vma->vm_end) 2359 &prev, &rb_link, &rb_parent))
2340 return -ENOMEM; 2360 return -ENOMEM;
2341 if ((vma->vm_flags & VM_ACCOUNT) && 2361 if ((vma->vm_flags & VM_ACCOUNT) &&
2342 security_vm_enough_memory_mm(mm, vma_pages(vma))) 2362 security_vm_enough_memory_mm(mm, vma_pages(vma)))
@@ -2351,7 +2371,8 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
2351 * prior to moving page table entries, to effect an mremap move. 2371 * prior to moving page table entries, to effect an mremap move.
2352 */ 2372 */
2353struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, 2373struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2354 unsigned long addr, unsigned long len, pgoff_t pgoff) 2374 unsigned long addr, unsigned long len, pgoff_t pgoff,
2375 bool *need_rmap_locks)
2355{ 2376{
2356 struct vm_area_struct *vma = *vmap; 2377 struct vm_area_struct *vma = *vmap;
2357 unsigned long vma_start = vma->vm_start; 2378 unsigned long vma_start = vma->vm_start;
@@ -2370,7 +2391,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2370 faulted_in_anon_vma = false; 2391 faulted_in_anon_vma = false;
2371 } 2392 }
2372 2393
2373 find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); 2394 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
2395 return NULL; /* should never get here */
2374 new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, 2396 new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
2375 vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); 2397 vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
2376 if (new_vma) { 2398 if (new_vma) {
@@ -2392,32 +2414,29 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2392 * linear if there are no pages mapped yet. 2414 * linear if there are no pages mapped yet.
2393 */ 2415 */
2394 VM_BUG_ON(faulted_in_anon_vma); 2416 VM_BUG_ON(faulted_in_anon_vma);
2395 *vmap = new_vma; 2417 *vmap = vma = new_vma;
2396 } else 2418 }
2397 anon_vma_moveto_tail(new_vma); 2419 *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
2398 } else { 2420 } else {
2399 new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); 2421 new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
2400 if (new_vma) { 2422 if (new_vma) {
2401 *new_vma = *vma; 2423 *new_vma = *vma;
2424 new_vma->vm_start = addr;
2425 new_vma->vm_end = addr + len;
2426 new_vma->vm_pgoff = pgoff;
2402 pol = mpol_dup(vma_policy(vma)); 2427 pol = mpol_dup(vma_policy(vma));
2403 if (IS_ERR(pol)) 2428 if (IS_ERR(pol))
2404 goto out_free_vma; 2429 goto out_free_vma;
2430 vma_set_policy(new_vma, pol);
2405 INIT_LIST_HEAD(&new_vma->anon_vma_chain); 2431 INIT_LIST_HEAD(&new_vma->anon_vma_chain);
2406 if (anon_vma_clone(new_vma, vma)) 2432 if (anon_vma_clone(new_vma, vma))
2407 goto out_free_mempol; 2433 goto out_free_mempol;
2408 vma_set_policy(new_vma, pol); 2434 if (new_vma->vm_file)
2409 new_vma->vm_start = addr;
2410 new_vma->vm_end = addr + len;
2411 new_vma->vm_pgoff = pgoff;
2412 if (new_vma->vm_file) {
2413 get_file(new_vma->vm_file); 2435 get_file(new_vma->vm_file);
2414
2415 if (vma->vm_flags & VM_EXECUTABLE)
2416 added_exe_file_vma(mm);
2417 }
2418 if (new_vma->vm_ops && new_vma->vm_ops->open) 2436 if (new_vma->vm_ops && new_vma->vm_ops->open)
2419 new_vma->vm_ops->open(new_vma); 2437 new_vma->vm_ops->open(new_vma);
2420 vma_link(mm, new_vma, prev, rb_link, rb_parent); 2438 vma_link(mm, new_vma, prev, rb_link, rb_parent);
2439 *need_rmap_locks = false;
2421 } 2440 }
2422 } 2441 }
2423 return new_vma; 2442 return new_vma;
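
The new need_rmap_locks out-parameter matters only to the mover: when copy_vma() merged into a vma that does not come strictly after the source in rmap traversal order (new_vma->vm_pgoff <= vma->vm_pgoff), the ordering argument spelled out in the move_ptes() comment further down no longer applies and the pte copy must take the rmap locks itself. Condensed from the move_vma() hunk in mm/mremap.c below (error handling trimmed; not a drop-in function):

	static unsigned long move_vma_sketch(struct vm_area_struct *vma,
			unsigned long old_addr, unsigned long old_len,
			unsigned long new_len, unsigned long new_addr)
	{
		unsigned long new_pgoff = vma->vm_pgoff +
					  ((old_addr - vma->vm_start) >> PAGE_SHIFT);
		struct vm_area_struct *new_vma;
		bool need_rmap_locks;

		new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
				   &need_rmap_locks);
		if (!new_vma)
			return -ENOMEM;

		/* the locking decision travels with the copy */
		return move_page_tables(vma, old_addr, new_vma, new_addr,
					old_len, need_rmap_locks);
	}
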
@@ -2535,7 +2554,7 @@ static DEFINE_MUTEX(mm_all_locks_mutex);
2535 2554
2536static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) 2555static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
2537{ 2556{
2538 if (!test_bit(0, (unsigned long *) &anon_vma->root->head.next)) { 2557 if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) {
2539 /* 2558 /*
2540 * The LSB of head.next can't change from under us 2559 * The LSB of head.next can't change from under us
2541 * because we hold the mm_all_locks_mutex. 2560 * because we hold the mm_all_locks_mutex.
@@ -2551,7 +2570,7 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
2551 * anon_vma->root->mutex. 2570 * anon_vma->root->mutex.
2552 */ 2571 */
2553 if (__test_and_set_bit(0, (unsigned long *) 2572 if (__test_and_set_bit(0, (unsigned long *)
2554 &anon_vma->root->head.next)) 2573 &anon_vma->root->rb_root.rb_node))
2555 BUG(); 2574 BUG();
2556 } 2575 }
2557} 2576}
@@ -2592,7 +2611,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
2592 * A single task can't take more than one mm_take_all_locks() in a row 2611 * A single task can't take more than one mm_take_all_locks() in a row
2593 * or it would deadlock. 2612 * or it would deadlock.
2594 * 2613 *
2595 * The LSB in anon_vma->head.next and the AS_MM_ALL_LOCKS bitflag in 2614 * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in
2596 * mapping->flags avoid to take the same lock twice, if more than one 2615 * mapping->flags avoid to take the same lock twice, if more than one
2597 * vma in this mm is backed by the same anon_vma or address_space. 2616 * vma in this mm is backed by the same anon_vma or address_space.
2598 * 2617 *
@@ -2639,13 +2658,13 @@ out_unlock:
2639 2658
2640static void vm_unlock_anon_vma(struct anon_vma *anon_vma) 2659static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
2641{ 2660{
2642 if (test_bit(0, (unsigned long *) &anon_vma->root->head.next)) { 2661 if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) {
2643 /* 2662 /*
2644 * The LSB of head.next can't change to 0 from under 2663 * The LSB of head.next can't change to 0 from under
2645 * us because we hold the mm_all_locks_mutex. 2664 * us because we hold the mm_all_locks_mutex.
2646 * 2665 *
2647 * We must however clear the bitflag before unlocking 2666 * We must however clear the bitflag before unlocking
2648 * the vma so the users using the anon_vma->head will 2667 * the vma so the users using the anon_vma->rb_root will
2649 * never see our bitflag. 2668 * never see our bitflag.
2650 * 2669 *
2651 * No need of atomic instructions here, head.next 2670 * No need of atomic instructions here, head.next
@@ -2653,7 +2672,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
2653 * anon_vma->root->mutex. 2672 * anon_vma->root->mutex.
2654 */ 2673 */
2655 if (!__test_and_clear_bit(0, (unsigned long *) 2674 if (!__test_and_clear_bit(0, (unsigned long *)
2656 &anon_vma->root->head.next)) 2675 &anon_vma->root->rb_root.rb_node))
2657 BUG(); 2676 BUG();
2658 anon_vma_unlock(anon_vma); 2677 anon_vma_unlock(anon_vma);
2659 } 2678 }
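
With the prio_tree list_head gone from struct anon_vma, mm_take_all_locks() keeps its per-root "already locked" marker in bit 0 of root->rb_root.rb_node instead of head.next. A pointer to an aligned structure always has that bit clear, so it can double as a flag for as long as mm_all_locks_mutex serializes the walk. The same low-bit trick in plain C, purely as an illustration (the kernel does this with test_bit()/__test_and_set_bit() on that word, not with a helper like this):

	#include <stdint.h>

	/* Return 1 the first time a slot is visited, 0 on every later visit. */
	static int mark_once(void **slot)
	{
		uintptr_t v = (uintptr_t)*slot;

		if (v & 1)
			return 0;		/* low bit already set: seen before */
		*slot = (void *)(v | 1);	/* stash the flag in the low bit */
		return 1;
	}
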
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 862b60822d9f..479a1e751a73 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -14,10 +14,14 @@
14#include <linux/export.h> 14#include <linux/export.h>
15#include <linux/mm.h> 15#include <linux/mm.h>
16#include <linux/err.h> 16#include <linux/err.h>
17#include <linux/srcu.h>
17#include <linux/rcupdate.h> 18#include <linux/rcupdate.h>
18#include <linux/sched.h> 19#include <linux/sched.h>
19#include <linux/slab.h> 20#include <linux/slab.h>
20 21
22/* global SRCU for all MMs */
23static struct srcu_struct srcu;
24
21/* 25/*
22 * This function can't run concurrently against mmu_notifier_register 26 * This function can't run concurrently against mmu_notifier_register
23 * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap 27 * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap
@@ -25,8 +29,8 @@
25 * in parallel despite there being no task using this mm any more, 29 * in parallel despite there being no task using this mm any more,
26 * through the vmas outside of the exit_mmap context, such as with 30 * through the vmas outside of the exit_mmap context, such as with
27 * vmtruncate. This serializes against mmu_notifier_unregister with 31 * vmtruncate. This serializes against mmu_notifier_unregister with
28 * the mmu_notifier_mm->lock in addition to RCU and it serializes 32 * the mmu_notifier_mm->lock in addition to SRCU and it serializes
29 * against the other mmu notifiers with RCU. struct mmu_notifier_mm 33 * against the other mmu notifiers with SRCU. struct mmu_notifier_mm
30 * can't go away from under us as exit_mmap holds an mm_count pin 34 * can't go away from under us as exit_mmap holds an mm_count pin
31 * itself. 35 * itself.
32 */ 36 */
@@ -34,12 +38,13 @@ void __mmu_notifier_release(struct mm_struct *mm)
34{ 38{
35 struct mmu_notifier *mn; 39 struct mmu_notifier *mn;
36 struct hlist_node *n; 40 struct hlist_node *n;
41 int id;
37 42
38 /* 43 /*
39 * RCU here will block mmu_notifier_unregister until 44 * SRCU here will block mmu_notifier_unregister until
40 * ->release returns. 45 * ->release returns.
41 */ 46 */
42 rcu_read_lock(); 47 id = srcu_read_lock(&srcu);
43 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) 48 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist)
44 /* 49 /*
45 * if ->release runs before mmu_notifier_unregister it 50 * if ->release runs before mmu_notifier_unregister it
@@ -50,7 +55,7 @@ void __mmu_notifier_release(struct mm_struct *mm)
50 */ 55 */
51 if (mn->ops->release) 56 if (mn->ops->release)
52 mn->ops->release(mn, mm); 57 mn->ops->release(mn, mm);
53 rcu_read_unlock(); 58 srcu_read_unlock(&srcu, id);
54 59
55 spin_lock(&mm->mmu_notifier_mm->lock); 60 spin_lock(&mm->mmu_notifier_mm->lock);
56 while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { 61 while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
@@ -68,7 +73,7 @@ void __mmu_notifier_release(struct mm_struct *mm)
68 spin_unlock(&mm->mmu_notifier_mm->lock); 73 spin_unlock(&mm->mmu_notifier_mm->lock);
69 74
70 /* 75 /*
71 * synchronize_rcu here prevents mmu_notifier_release to 76 * synchronize_srcu here prevents mmu_notifier_release to
72 * return to exit_mmap (which would proceed freeing all pages 77 * return to exit_mmap (which would proceed freeing all pages
73 * in the mm) until the ->release method returns, if it was 78 * in the mm) until the ->release method returns, if it was
74 * invoked by mmu_notifier_unregister. 79 * invoked by mmu_notifier_unregister.
@@ -76,7 +81,7 @@ void __mmu_notifier_release(struct mm_struct *mm)
76 * The mmu_notifier_mm can't go away from under us because one 81 * The mmu_notifier_mm can't go away from under us because one
77 * mm_count is hold by exit_mmap. 82 * mm_count is hold by exit_mmap.
78 */ 83 */
79 synchronize_rcu(); 84 synchronize_srcu(&srcu);
80} 85}
81 86
82/* 87/*
@@ -89,14 +94,14 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
89{ 94{
90 struct mmu_notifier *mn; 95 struct mmu_notifier *mn;
91 struct hlist_node *n; 96 struct hlist_node *n;
92 int young = 0; 97 int young = 0, id;
93 98
94 rcu_read_lock(); 99 id = srcu_read_lock(&srcu);
95 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { 100 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
96 if (mn->ops->clear_flush_young) 101 if (mn->ops->clear_flush_young)
97 young |= mn->ops->clear_flush_young(mn, mm, address); 102 young |= mn->ops->clear_flush_young(mn, mm, address);
98 } 103 }
99 rcu_read_unlock(); 104 srcu_read_unlock(&srcu, id);
100 105
101 return young; 106 return young;
102} 107}
@@ -106,9 +111,9 @@ int __mmu_notifier_test_young(struct mm_struct *mm,
106{ 111{
107 struct mmu_notifier *mn; 112 struct mmu_notifier *mn;
108 struct hlist_node *n; 113 struct hlist_node *n;
109 int young = 0; 114 int young = 0, id;
110 115
111 rcu_read_lock(); 116 id = srcu_read_lock(&srcu);
112 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { 117 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
113 if (mn->ops->test_young) { 118 if (mn->ops->test_young) {
114 young = mn->ops->test_young(mn, mm, address); 119 young = mn->ops->test_young(mn, mm, address);
@@ -116,7 +121,7 @@ int __mmu_notifier_test_young(struct mm_struct *mm,
116 break; 121 break;
117 } 122 }
118 } 123 }
119 rcu_read_unlock(); 124 srcu_read_unlock(&srcu, id);
120 125
121 return young; 126 return young;
122} 127}
@@ -126,19 +131,14 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
126{ 131{
127 struct mmu_notifier *mn; 132 struct mmu_notifier *mn;
128 struct hlist_node *n; 133 struct hlist_node *n;
134 int id;
129 135
130 rcu_read_lock(); 136 id = srcu_read_lock(&srcu);
131 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { 137 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
132 if (mn->ops->change_pte) 138 if (mn->ops->change_pte)
133 mn->ops->change_pte(mn, mm, address, pte); 139 mn->ops->change_pte(mn, mm, address, pte);
134 /*
135 * Some drivers don't have change_pte,
136 * so we must call invalidate_page in that case.
137 */
138 else if (mn->ops->invalidate_page)
139 mn->ops->invalidate_page(mn, mm, address);
140 } 140 }
141 rcu_read_unlock(); 141 srcu_read_unlock(&srcu, id);
142} 142}
143 143
144void __mmu_notifier_invalidate_page(struct mm_struct *mm, 144void __mmu_notifier_invalidate_page(struct mm_struct *mm,
@@ -146,13 +146,14 @@ void __mmu_notifier_invalidate_page(struct mm_struct *mm,
146{ 146{
147 struct mmu_notifier *mn; 147 struct mmu_notifier *mn;
148 struct hlist_node *n; 148 struct hlist_node *n;
149 int id;
149 150
150 rcu_read_lock(); 151 id = srcu_read_lock(&srcu);
151 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { 152 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
152 if (mn->ops->invalidate_page) 153 if (mn->ops->invalidate_page)
153 mn->ops->invalidate_page(mn, mm, address); 154 mn->ops->invalidate_page(mn, mm, address);
154 } 155 }
155 rcu_read_unlock(); 156 srcu_read_unlock(&srcu, id);
156} 157}
157 158
158void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, 159void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
@@ -160,13 +161,14 @@ void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
160{ 161{
161 struct mmu_notifier *mn; 162 struct mmu_notifier *mn;
162 struct hlist_node *n; 163 struct hlist_node *n;
164 int id;
163 165
164 rcu_read_lock(); 166 id = srcu_read_lock(&srcu);
165 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { 167 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
166 if (mn->ops->invalidate_range_start) 168 if (mn->ops->invalidate_range_start)
167 mn->ops->invalidate_range_start(mn, mm, start, end); 169 mn->ops->invalidate_range_start(mn, mm, start, end);
168 } 170 }
169 rcu_read_unlock(); 171 srcu_read_unlock(&srcu, id);
170} 172}
171 173
172void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, 174void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
@@ -174,13 +176,14 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
174{ 176{
175 struct mmu_notifier *mn; 177 struct mmu_notifier *mn;
176 struct hlist_node *n; 178 struct hlist_node *n;
179 int id;
177 180
178 rcu_read_lock(); 181 id = srcu_read_lock(&srcu);
179 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { 182 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
180 if (mn->ops->invalidate_range_end) 183 if (mn->ops->invalidate_range_end)
181 mn->ops->invalidate_range_end(mn, mm, start, end); 184 mn->ops->invalidate_range_end(mn, mm, start, end);
182 } 185 }
183 rcu_read_unlock(); 186 srcu_read_unlock(&srcu, id);
184} 187}
185 188
186static int do_mmu_notifier_register(struct mmu_notifier *mn, 189static int do_mmu_notifier_register(struct mmu_notifier *mn,
@@ -192,22 +195,29 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn,
192 195
193 BUG_ON(atomic_read(&mm->mm_users) <= 0); 196 BUG_ON(atomic_read(&mm->mm_users) <= 0);
194 197
195 ret = -ENOMEM; 198 /*
196 mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL); 199 * Verify that mmu_notifier_init() already run and the global srcu is
197 if (unlikely(!mmu_notifier_mm)) 200 * initialized.
198 goto out; 201 */
202 BUG_ON(!srcu.per_cpu_ref);
199 203
200 if (take_mmap_sem) 204 if (take_mmap_sem)
201 down_write(&mm->mmap_sem); 205 down_write(&mm->mmap_sem);
202 ret = mm_take_all_locks(mm); 206 ret = mm_take_all_locks(mm);
203 if (unlikely(ret)) 207 if (unlikely(ret))
204 goto out_cleanup; 208 goto out;
205 209
206 if (!mm_has_notifiers(mm)) { 210 if (!mm_has_notifiers(mm)) {
211 mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm),
212 GFP_KERNEL);
213 if (unlikely(!mmu_notifier_mm)) {
214 ret = -ENOMEM;
215 goto out_of_mem;
216 }
207 INIT_HLIST_HEAD(&mmu_notifier_mm->list); 217 INIT_HLIST_HEAD(&mmu_notifier_mm->list);
208 spin_lock_init(&mmu_notifier_mm->lock); 218 spin_lock_init(&mmu_notifier_mm->lock);
219
209 mm->mmu_notifier_mm = mmu_notifier_mm; 220 mm->mmu_notifier_mm = mmu_notifier_mm;
210 mmu_notifier_mm = NULL;
211 } 221 }
212 atomic_inc(&mm->mm_count); 222 atomic_inc(&mm->mm_count);
213 223
@@ -223,13 +233,12 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn,
223 hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list); 233 hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list);
224 spin_unlock(&mm->mmu_notifier_mm->lock); 234 spin_unlock(&mm->mmu_notifier_mm->lock);
225 235
236out_of_mem:
226 mm_drop_all_locks(mm); 237 mm_drop_all_locks(mm);
227out_cleanup: 238out:
228 if (take_mmap_sem) 239 if (take_mmap_sem)
229 up_write(&mm->mmap_sem); 240 up_write(&mm->mmap_sem);
230 /* kfree() does nothing if mmu_notifier_mm is NULL */ 241
231 kfree(mmu_notifier_mm);
232out:
233 BUG_ON(atomic_read(&mm->mm_users) <= 0); 242 BUG_ON(atomic_read(&mm->mm_users) <= 0);
234 return ret; 243 return ret;
235} 244}
@@ -274,8 +283,8 @@ void __mmu_notifier_mm_destroy(struct mm_struct *mm)
274/* 283/*
275 * This releases the mm_count pin automatically and frees the mm 284 * This releases the mm_count pin automatically and frees the mm
276 * structure if it was the last user of it. It serializes against 285 * structure if it was the last user of it. It serializes against
277 * running mmu notifiers with RCU and against mmu_notifier_unregister 286 * running mmu notifiers with SRCU and against mmu_notifier_unregister
278 * with the unregister lock + RCU. All sptes must be dropped before 287 * with the unregister lock + SRCU. All sptes must be dropped before
279 * calling mmu_notifier_unregister. ->release or any other notifier 288 * calling mmu_notifier_unregister. ->release or any other notifier
280 * method may be invoked concurrently with mmu_notifier_unregister, 289 * method may be invoked concurrently with mmu_notifier_unregister,
281 * and only after mmu_notifier_unregister returned we're guaranteed 290 * and only after mmu_notifier_unregister returned we're guaranteed
@@ -287,11 +296,12 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
287 296
288 if (!hlist_unhashed(&mn->hlist)) { 297 if (!hlist_unhashed(&mn->hlist)) {
289 /* 298 /*
290 * RCU here will force exit_mmap to wait ->release to finish 299 * SRCU here will force exit_mmap to wait ->release to finish
291 * before freeing the pages. 300 * before freeing the pages.
292 */ 301 */
293 rcu_read_lock(); 302 int id;
294 303
304 id = srcu_read_lock(&srcu);
295 /* 305 /*
296 * exit_mmap will block in mmu_notifier_release to 306 * exit_mmap will block in mmu_notifier_release to
297 * guarantee ->release is called before freeing the 307 * guarantee ->release is called before freeing the
@@ -299,7 +309,7 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
299 */ 309 */
300 if (mn->ops->release) 310 if (mn->ops->release)
301 mn->ops->release(mn, mm); 311 mn->ops->release(mn, mm);
302 rcu_read_unlock(); 312 srcu_read_unlock(&srcu, id);
303 313
304 spin_lock(&mm->mmu_notifier_mm->lock); 314 spin_lock(&mm->mmu_notifier_mm->lock);
305 hlist_del_rcu(&mn->hlist); 315 hlist_del_rcu(&mn->hlist);
@@ -310,10 +320,17 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
 310 * Wait for any running method to finish, of course including 320
 311 * ->release if it was run by mmu_notifier_release instead of us. 321
312 */ 322 */
313 synchronize_rcu(); 323 synchronize_srcu(&srcu);
314 324
315 BUG_ON(atomic_read(&mm->mm_count) <= 0); 325 BUG_ON(atomic_read(&mm->mm_count) <= 0);
316 326
317 mmdrop(mm); 327 mmdrop(mm);
318} 328}
319EXPORT_SYMBOL_GPL(mmu_notifier_unregister); 329EXPORT_SYMBOL_GPL(mmu_notifier_unregister);
330
331static int __init mmu_notifier_init(void)
332{
333 return init_srcu_struct(&srcu);
334}
335
336module_init(mmu_notifier_init);
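
Every dispatch hook in this file now has the same shape: srcu_read_lock() on the file-global srcu, an RCU-protected walk of the registered notifiers, the op called only if the notifier provides it, then srcu_read_unlock() with the index the lock returned. For reference, that pattern with a hypothetical example_op callback (not a real member of struct mmu_notifier_ops):

	void __mmu_notifier_example_op(struct mm_struct *mm, unsigned long address)
	{
		struct mmu_notifier *mn;
		struct hlist_node *n;
		int id;

		id = srcu_read_lock(&srcu);
		hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
			if (mn->ops->example_op)	/* hypothetical op */
				mn->ops->example_op(mn, mm, address);
		}
		srcu_read_unlock(&srcu, id);
	}
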
diff --git a/mm/mremap.c b/mm/mremap.c
index cc06d0e48d05..1b61c2d3307a 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -71,22 +71,41 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
71static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, 71static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
72 unsigned long old_addr, unsigned long old_end, 72 unsigned long old_addr, unsigned long old_end,
73 struct vm_area_struct *new_vma, pmd_t *new_pmd, 73 struct vm_area_struct *new_vma, pmd_t *new_pmd,
74 unsigned long new_addr) 74 unsigned long new_addr, bool need_rmap_locks)
75{ 75{
76 struct address_space *mapping = NULL; 76 struct address_space *mapping = NULL;
77 struct anon_vma *anon_vma = NULL;
77 struct mm_struct *mm = vma->vm_mm; 78 struct mm_struct *mm = vma->vm_mm;
78 pte_t *old_pte, *new_pte, pte; 79 pte_t *old_pte, *new_pte, pte;
79 spinlock_t *old_ptl, *new_ptl; 80 spinlock_t *old_ptl, *new_ptl;
80 81
81 if (vma->vm_file) { 82 /*
82 /* 83 * When need_rmap_locks is true, we take the i_mmap_mutex and anon_vma
83 * Subtle point from Rajesh Venkatasubramanian: before 84 * locks to ensure that rmap will always observe either the old or the
84 * moving file-based ptes, we must lock truncate_pagecache 85 * new ptes. This is the easiest way to avoid races with
85 * out, since it might clean the dst vma before the src vma, 86 * truncate_pagecache(), page migration, etc...
86 * and we propagate stale pages into the dst afterward. 87 *
87 */ 88 * When need_rmap_locks is false, we use other ways to avoid
88 mapping = vma->vm_file->f_mapping; 89 * such races:
89 mutex_lock(&mapping->i_mmap_mutex); 90 *
91 * - During exec() shift_arg_pages(), we use a specially tagged vma
92 * which rmap call sites look for using is_vma_temporary_stack().
93 *
94 * - During mremap(), new_vma is often known to be placed after vma
95 * in rmap traversal order. This ensures rmap will always observe
96 * either the old pte, or the new pte, or both (the page table locks
97 * serialize access to individual ptes, but only rmap traversal
98 * order guarantees that we won't miss both the old and new ptes).
99 */
100 if (need_rmap_locks) {
101 if (vma->vm_file) {
102 mapping = vma->vm_file->f_mapping;
103 mutex_lock(&mapping->i_mmap_mutex);
104 }
105 if (vma->anon_vma) {
106 anon_vma = vma->anon_vma;
107 anon_vma_lock(anon_vma);
108 }
90 } 109 }
91 110
92 /* 111 /*
@@ -114,6 +133,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
114 spin_unlock(new_ptl); 133 spin_unlock(new_ptl);
115 pte_unmap(new_pte - 1); 134 pte_unmap(new_pte - 1);
116 pte_unmap_unlock(old_pte - 1, old_ptl); 135 pte_unmap_unlock(old_pte - 1, old_ptl);
136 if (anon_vma)
137 anon_vma_unlock(anon_vma);
117 if (mapping) 138 if (mapping)
118 mutex_unlock(&mapping->i_mmap_mutex); 139 mutex_unlock(&mapping->i_mmap_mutex);
119} 140}
@@ -122,16 +143,21 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
122 143
123unsigned long move_page_tables(struct vm_area_struct *vma, 144unsigned long move_page_tables(struct vm_area_struct *vma,
124 unsigned long old_addr, struct vm_area_struct *new_vma, 145 unsigned long old_addr, struct vm_area_struct *new_vma,
125 unsigned long new_addr, unsigned long len) 146 unsigned long new_addr, unsigned long len,
147 bool need_rmap_locks)
126{ 148{
127 unsigned long extent, next, old_end; 149 unsigned long extent, next, old_end;
128 pmd_t *old_pmd, *new_pmd; 150 pmd_t *old_pmd, *new_pmd;
129 bool need_flush = false; 151 bool need_flush = false;
152 unsigned long mmun_start; /* For mmu_notifiers */
153 unsigned long mmun_end; /* For mmu_notifiers */
130 154
131 old_end = old_addr + len; 155 old_end = old_addr + len;
132 flush_cache_range(vma, old_addr, old_end); 156 flush_cache_range(vma, old_addr, old_end);
133 157
134 mmu_notifier_invalidate_range_start(vma->vm_mm, old_addr, old_end); 158 mmun_start = old_addr;
159 mmun_end = old_end;
160 mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
135 161
136 for (; old_addr < old_end; old_addr += extent, new_addr += extent) { 162 for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
137 cond_resched(); 163 cond_resched();
@@ -169,13 +195,13 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
169 if (extent > LATENCY_LIMIT) 195 if (extent > LATENCY_LIMIT)
170 extent = LATENCY_LIMIT; 196 extent = LATENCY_LIMIT;
171 move_ptes(vma, old_pmd, old_addr, old_addr + extent, 197 move_ptes(vma, old_pmd, old_addr, old_addr + extent,
172 new_vma, new_pmd, new_addr); 198 new_vma, new_pmd, new_addr, need_rmap_locks);
173 need_flush = true; 199 need_flush = true;
174 } 200 }
175 if (likely(need_flush)) 201 if (likely(need_flush))
176 flush_tlb_range(vma, old_end-len, old_addr); 202 flush_tlb_range(vma, old_end-len, old_addr);
177 203
178 mmu_notifier_invalidate_range_end(vma->vm_mm, old_end-len, old_end); 204 mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
179 205
180 return len + old_addr - old_end; /* how much done */ 206 return len + old_addr - old_end; /* how much done */
181} 207}
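
The hunk above mostly renames the bounds into mmun_start/mmun_end (the convention this series uses for values that exist only for the mmu notifier calls), but the bracketing is the part worth copying: every batched pte move sits between invalidate_range_start and a matching invalidate_range_end over the same range. A minimal sketch of that bracket (the pte work itself is elided):

	static void update_range_sketch(struct vm_area_struct *vma,
					unsigned long start, unsigned long end)
	{
		unsigned long mmun_start = start;	/* For mmu_notifiers */
		unsigned long mmun_end = end;		/* For mmu_notifiers */

		mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
		/* ... modify, move or unmap ptes in [start, end) here ... */
		mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
	}
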
@@ -193,6 +219,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
193 unsigned long hiwater_vm; 219 unsigned long hiwater_vm;
194 int split = 0; 220 int split = 0;
195 int err; 221 int err;
222 bool need_rmap_locks;
196 223
197 /* 224 /*
198 * We'd prefer to avoid failure later on in do_munmap: 225 * We'd prefer to avoid failure later on in do_munmap:
@@ -214,27 +241,21 @@ static unsigned long move_vma(struct vm_area_struct *vma,
214 return err; 241 return err;
215 242
216 new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); 243 new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
217 new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff); 244 new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
245 &need_rmap_locks);
218 if (!new_vma) 246 if (!new_vma)
219 return -ENOMEM; 247 return -ENOMEM;
220 248
221 moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len); 249 moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
250 need_rmap_locks);
222 if (moved_len < old_len) { 251 if (moved_len < old_len) {
223 /* 252 /*
224 * Before moving the page tables from the new vma to
225 * the old vma, we need to be sure the old vma is
226 * queued after new vma in the same_anon_vma list to
227 * prevent SMP races with rmap_walk (that could lead
228 * rmap_walk to miss some page table).
229 */
230 anon_vma_moveto_tail(vma);
231
232 /*
233 * On error, move entries back from new area to old, 253 * On error, move entries back from new area to old,
234 * which will succeed since page tables still there, 254 * which will succeed since page tables still there,
235 * and then proceed to unmap new area instead of old. 255 * and then proceed to unmap new area instead of old.
236 */ 256 */
237 move_page_tables(new_vma, new_addr, vma, old_addr, moved_len); 257 move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
258 true);
238 vma = new_vma; 259 vma = new_vma;
239 old_len = new_len; 260 old_len = new_len;
240 old_addr = new_addr; 261 old_addr = new_addr;
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 405573010f99..714d5d650470 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -116,6 +116,8 @@ static unsigned long __init __free_memory_core(phys_addr_t start,
116 return 0; 116 return 0;
117 117
118 __free_pages_memory(start_pfn, end_pfn); 118 __free_pages_memory(start_pfn, end_pfn);
119 fixup_zone_present_pages(pfn_to_nid(start >> PAGE_SHIFT),
120 start_pfn, end_pfn);
119 121
120 return end_pfn - start_pfn; 122 return end_pfn - start_pfn;
121} 123}
@@ -126,6 +128,7 @@ unsigned long __init free_low_memory_core_early(int nodeid)
126 phys_addr_t start, end, size; 128 phys_addr_t start, end, size;
127 u64 i; 129 u64 i;
128 130
131 reset_zone_present_pages();
129 for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) 132 for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL)
130 count += __free_memory_core(start, end); 133 count += __free_memory_core(start, end);
131 134
@@ -162,8 +165,6 @@ unsigned long __init free_all_bootmem(void)
162 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id 165 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id
163 * because in some case like Node0 doesn't have RAM installed 166 * because in some case like Node0 doesn't have RAM installed
164 * low ram will be on Node1 167 * low ram will be on Node1
165 * Use MAX_NUMNODES will make sure all ranges in early_node_map[]
166 * will be used instead of only Node0 related
167 */ 168 */
168 return free_low_memory_core_early(MAX_NUMNODES); 169 return free_low_memory_core_early(MAX_NUMNODES);
169} 170}
diff --git a/mm/nommu.c b/mm/nommu.c
index dee2ff89fd58..45131b41bcdb 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -698,7 +698,7 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
698 698
699 mutex_lock(&mapping->i_mmap_mutex); 699 mutex_lock(&mapping->i_mmap_mutex);
700 flush_dcache_mmap_lock(mapping); 700 flush_dcache_mmap_lock(mapping);
701 vma_prio_tree_insert(vma, &mapping->i_mmap); 701 vma_interval_tree_insert(vma, &mapping->i_mmap);
702 flush_dcache_mmap_unlock(mapping); 702 flush_dcache_mmap_unlock(mapping);
703 mutex_unlock(&mapping->i_mmap_mutex); 703 mutex_unlock(&mapping->i_mmap_mutex);
704 } 704 }
@@ -764,7 +764,7 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
764 764
765 mutex_lock(&mapping->i_mmap_mutex); 765 mutex_lock(&mapping->i_mmap_mutex);
766 flush_dcache_mmap_lock(mapping); 766 flush_dcache_mmap_lock(mapping);
767 vma_prio_tree_remove(vma, &mapping->i_mmap); 767 vma_interval_tree_remove(vma, &mapping->i_mmap);
768 flush_dcache_mmap_unlock(mapping); 768 flush_dcache_mmap_unlock(mapping);
769 mutex_unlock(&mapping->i_mmap_mutex); 769 mutex_unlock(&mapping->i_mmap_mutex);
770 } 770 }
@@ -789,11 +789,8 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
789 kenter("%p", vma); 789 kenter("%p", vma);
790 if (vma->vm_ops && vma->vm_ops->close) 790 if (vma->vm_ops && vma->vm_ops->close)
791 vma->vm_ops->close(vma); 791 vma->vm_ops->close(vma);
792 if (vma->vm_file) { 792 if (vma->vm_file)
793 fput(vma->vm_file); 793 fput(vma->vm_file);
794 if (vma->vm_flags & VM_EXECUTABLE)
795 removed_exe_file_vma(mm);
796 }
797 put_nommu_region(vma->vm_region); 794 put_nommu_region(vma->vm_region);
798 kmem_cache_free(vm_area_cachep, vma); 795 kmem_cache_free(vm_area_cachep, vma);
799} 796}
@@ -1284,10 +1281,6 @@ unsigned long do_mmap_pgoff(struct file *file,
1284 if (file) { 1281 if (file) {
1285 region->vm_file = get_file(file); 1282 region->vm_file = get_file(file);
1286 vma->vm_file = get_file(file); 1283 vma->vm_file = get_file(file);
1287 if (vm_flags & VM_EXECUTABLE) {
1288 added_exe_file_vma(current->mm);
1289 vma->vm_mm = current->mm;
1290 }
1291 } 1284 }
1292 1285
1293 down_write(&nommu_region_sem); 1286 down_write(&nommu_region_sem);
@@ -1440,8 +1433,6 @@ error:
1440 kmem_cache_free(vm_region_jar, region); 1433 kmem_cache_free(vm_region_jar, region);
1441 if (vma->vm_file) 1434 if (vma->vm_file)
1442 fput(vma->vm_file); 1435 fput(vma->vm_file);
1443 if (vma->vm_flags & VM_EXECUTABLE)
1444 removed_exe_file_vma(vma->vm_mm);
1445 kmem_cache_free(vm_area_cachep, vma); 1436 kmem_cache_free(vm_area_cachep, vma);
1446 kleave(" = %d", ret); 1437 kleave(" = %d", ret);
1447 return ret; 1438 return ret;
@@ -1820,7 +1811,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1820 if (addr != (pfn << PAGE_SHIFT)) 1811 if (addr != (pfn << PAGE_SHIFT))
1821 return -EINVAL; 1812 return -EINVAL;
1822 1813
1823 vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; 1814 vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
1824 return 0; 1815 return 0;
1825} 1816}
1826EXPORT_SYMBOL(remap_pfn_range); 1817EXPORT_SYMBOL(remap_pfn_range);
@@ -1961,6 +1952,14 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1961} 1952}
1962EXPORT_SYMBOL(filemap_fault); 1953EXPORT_SYMBOL(filemap_fault);
1963 1954
1955int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr,
1956 unsigned long size, pgoff_t pgoff)
1957{
1958 BUG();
1959 return 0;
1960}
1961EXPORT_SYMBOL(generic_file_remap_pages);
1962
1964static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, 1963static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
1965 unsigned long addr, void *buf, int len, int write) 1964 unsigned long addr, void *buf, int len, int write)
1966{ 1965{
@@ -2045,7 +2044,6 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
2045 size_t newsize) 2044 size_t newsize)
2046{ 2045{
2047 struct vm_area_struct *vma; 2046 struct vm_area_struct *vma;
2048 struct prio_tree_iter iter;
2049 struct vm_region *region; 2047 struct vm_region *region;
2050 pgoff_t low, high; 2048 pgoff_t low, high;
2051 size_t r_size, r_top; 2049 size_t r_size, r_top;
@@ -2057,8 +2055,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
2057 mutex_lock(&inode->i_mapping->i_mmap_mutex); 2055 mutex_lock(&inode->i_mapping->i_mmap_mutex);
2058 2056
2059 /* search for VMAs that fall within the dead zone */ 2057 /* search for VMAs that fall within the dead zone */
2060 vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, 2058 vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, low, high) {
2061 low, high) {
2062 /* found one - only interested if it's shared out of the page 2059 /* found one - only interested if it's shared out of the page
2063 * cache */ 2060 * cache */
2064 if (vma->vm_flags & VM_SHARED) { 2061 if (vma->vm_flags & VM_SHARED) {
@@ -2074,8 +2071,8 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
2074 * we don't check for any regions that start beyond the EOF as there 2071 * we don't check for any regions that start beyond the EOF as there
2075 * shouldn't be any 2072 * shouldn't be any
2076 */ 2073 */
2077 vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, 2074 vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap,
2078 0, ULONG_MAX) { 2075 0, ULONG_MAX) {
2079 if (!(vma->vm_flags & VM_SHARED)) 2076 if (!(vma->vm_flags & VM_SHARED))
2080 continue; 2077 continue;
2081 2078
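
vma_prio_tree_foreach() needed a separate prio_tree_iter cursor; its interval-tree replacement takes just the vma, the i_mmap root and a page-offset range (inclusive at both ends, as the 0, ULONG_MAX call above suggests). A usage sketch under i_mmap_mutex (the helper name is illustrative):

	static void walk_file_range(struct address_space *mapping,
				    pgoff_t first, pgoff_t last)
	{
		struct vm_area_struct *vma;

		mutex_lock(&mapping->i_mmap_mutex);
		vma_interval_tree_foreach(vma, &mapping->i_mmap, first, last) {
			/* vma maps at least one page in [first, last] */
		}
		mutex_unlock(&mapping->i_mmap_mutex);
	}
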
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 198600861638..79e0f3e24831 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -428,8 +428,8 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
428{ 428{
429 task_lock(current); 429 task_lock(current);
430 pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " 430 pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
431 "oom_adj=%d, oom_score_adj=%d\n", 431 "oom_score_adj=%d\n",
432 current->comm, gfp_mask, order, current->signal->oom_adj, 432 current->comm, gfp_mask, order,
433 current->signal->oom_score_adj); 433 current->signal->oom_score_adj);
434 cpuset_print_task_mems_allowed(current); 434 cpuset_print_task_mems_allowed(current);
435 task_unlock(current); 435 task_unlock(current);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c13ea7538891..bb90971182bd 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -558,7 +558,8 @@ static inline void __free_one_page(struct page *page,
558 if (page_is_guard(buddy)) { 558 if (page_is_guard(buddy)) {
559 clear_page_guard_flag(buddy); 559 clear_page_guard_flag(buddy);
560 set_page_private(page, 0); 560 set_page_private(page, 0);
561 __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); 561 __mod_zone_freepage_state(zone, 1 << order,
562 migratetype);
562 } else { 563 } else {
563 list_del(&buddy->lru); 564 list_del(&buddy->lru);
564 zone->free_area[order].nr_free--; 565 zone->free_area[order].nr_free--;
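
__mod_zone_freepage_state() is not defined anywhere in this diff; from the way every call pairs a page count with a migratetype, it is presumably a small vmstat helper along these lines (assumed sketch, not taken from the patch):

	/* Assumed shape of the helper used throughout the page_alloc hunks. */
	static inline void __mod_zone_freepage_state(struct zone *zone,
						     int nr_pages, int migratetype)
	{
		__mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages);
		if (is_migrate_cma(migratetype))
			__mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages);
	}
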
@@ -597,17 +598,6 @@ out:
597 zone->free_area[order].nr_free++; 598 zone->free_area[order].nr_free++;
598} 599}
599 600
600/*
601 * free_page_mlock() -- clean up attempts to free and mlocked() page.
602 * Page should not be on lru, so no need to fix that up.
603 * free_pages_check() will verify...
604 */
605static inline void free_page_mlock(struct page *page)
606{
607 __dec_zone_page_state(page, NR_MLOCK);
608 __count_vm_event(UNEVICTABLE_MLOCKFREED);
609}
610
611static inline int free_pages_check(struct page *page) 601static inline int free_pages_check(struct page *page)
612{ 602{
613 if (unlikely(page_mapcount(page) | 603 if (unlikely(page_mapcount(page) |
@@ -668,12 +658,17 @@ static void free_pcppages_bulk(struct zone *zone, int count,
668 batch_free = to_free; 658 batch_free = to_free;
669 659
670 do { 660 do {
661 int mt; /* migratetype of the to-be-freed page */
662
671 page = list_entry(list->prev, struct page, lru); 663 page = list_entry(list->prev, struct page, lru);
672 /* must delete as __free_one_page list manipulates */ 664 /* must delete as __free_one_page list manipulates */
673 list_del(&page->lru); 665 list_del(&page->lru);
666 mt = get_freepage_migratetype(page);
674 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ 667 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
675 __free_one_page(page, zone, 0, page_private(page)); 668 __free_one_page(page, zone, 0, mt);
676 trace_mm_page_pcpu_drain(page, 0, page_private(page)); 669 trace_mm_page_pcpu_drain(page, 0, mt);
670 if (is_migrate_cma(mt))
671 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
677 } while (--to_free && --batch_free && !list_empty(list)); 672 } while (--to_free && --batch_free && !list_empty(list));
678 } 673 }
679 __mod_zone_page_state(zone, NR_FREE_PAGES, count); 674 __mod_zone_page_state(zone, NR_FREE_PAGES, count);
@@ -688,7 +683,8 @@ static void free_one_page(struct zone *zone, struct page *page, int order,
688 zone->pages_scanned = 0; 683 zone->pages_scanned = 0;
689 684
690 __free_one_page(page, zone, order, migratetype); 685 __free_one_page(page, zone, order, migratetype);
691 __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); 686 if (unlikely(migratetype != MIGRATE_ISOLATE))
687 __mod_zone_freepage_state(zone, 1 << order, migratetype);
692 spin_unlock(&zone->lock); 688 spin_unlock(&zone->lock);
693} 689}
694 690
@@ -721,17 +717,16 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
721static void __free_pages_ok(struct page *page, unsigned int order) 717static void __free_pages_ok(struct page *page, unsigned int order)
722{ 718{
723 unsigned long flags; 719 unsigned long flags;
724 int wasMlocked = __TestClearPageMlocked(page); 720 int migratetype;
725 721
726 if (!free_pages_prepare(page, order)) 722 if (!free_pages_prepare(page, order))
727 return; 723 return;
728 724
729 local_irq_save(flags); 725 local_irq_save(flags);
730 if (unlikely(wasMlocked))
731 free_page_mlock(page);
732 __count_vm_events(PGFREE, 1 << order); 726 __count_vm_events(PGFREE, 1 << order);
733 free_one_page(page_zone(page), page, order, 727 migratetype = get_pageblock_migratetype(page);
734 get_pageblock_migratetype(page)); 728 set_freepage_migratetype(page, migratetype);
729 free_one_page(page_zone(page), page, order, migratetype);
735 local_irq_restore(flags); 730 local_irq_restore(flags);
736} 731}
737 732
@@ -811,7 +806,8 @@ static inline void expand(struct zone *zone, struct page *page,
811 set_page_guard_flag(&page[size]); 806 set_page_guard_flag(&page[size]);
812 set_page_private(&page[size], high); 807 set_page_private(&page[size], high);
813 /* Guard pages are not available for any usage */ 808 /* Guard pages are not available for any usage */
814 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << high)); 809 __mod_zone_freepage_state(zone, -(1 << high),
810 migratetype);
815 continue; 811 continue;
816 } 812 }
817#endif 813#endif
@@ -915,7 +911,7 @@ static int fallbacks[MIGRATE_TYPES][4] = {
915 * Note that start_page and end_pages are not aligned on a pageblock 911 * Note that start_page and end_pages are not aligned on a pageblock
916 * boundary. If alignment is required, use move_freepages_block() 912 * boundary. If alignment is required, use move_freepages_block()
917 */ 913 */
918static int move_freepages(struct zone *zone, 914int move_freepages(struct zone *zone,
919 struct page *start_page, struct page *end_page, 915 struct page *start_page, struct page *end_page,
920 int migratetype) 916 int migratetype)
921{ 917{
@@ -951,6 +947,7 @@ static int move_freepages(struct zone *zone,
951 order = page_order(page); 947 order = page_order(page);
952 list_move(&page->lru, 948 list_move(&page->lru,
953 &zone->free_area[order].free_list[migratetype]); 949 &zone->free_area[order].free_list[migratetype]);
950 set_freepage_migratetype(page, migratetype);
954 page += 1 << order; 951 page += 1 << order;
955 pages_moved += 1 << order; 952 pages_moved += 1 << order;
956 } 953 }
@@ -1135,8 +1132,11 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
1135 if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE) 1132 if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE)
1136 mt = migratetype; 1133 mt = migratetype;
1137 } 1134 }
1138 set_page_private(page, mt); 1135 set_freepage_migratetype(page, mt);
1139 list = &page->lru; 1136 list = &page->lru;
1137 if (is_migrate_cma(mt))
1138 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
1139 -(1 << order));
1140 } 1140 }
1141 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); 1141 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
1142 spin_unlock(&zone->lock); 1142 spin_unlock(&zone->lock);
@@ -1296,16 +1296,13 @@ void free_hot_cold_page(struct page *page, int cold)
1296 struct per_cpu_pages *pcp; 1296 struct per_cpu_pages *pcp;
1297 unsigned long flags; 1297 unsigned long flags;
1298 int migratetype; 1298 int migratetype;
1299 int wasMlocked = __TestClearPageMlocked(page);
1300 1299
1301 if (!free_pages_prepare(page, 0)) 1300 if (!free_pages_prepare(page, 0))
1302 return; 1301 return;
1303 1302
1304 migratetype = get_pageblock_migratetype(page); 1303 migratetype = get_pageblock_migratetype(page);
1305 set_page_private(page, migratetype); 1304 set_freepage_migratetype(page, migratetype);
1306 local_irq_save(flags); 1305 local_irq_save(flags);
1307 if (unlikely(wasMlocked))
1308 free_page_mlock(page);
1309 __count_vm_event(PGFREE); 1306 __count_vm_event(PGFREE);
1310 1307
1311 /* 1308 /*
@@ -1380,20 +1377,16 @@ void split_page(struct page *page, unsigned int order)
1380} 1377}
1381 1378
1382/* 1379/*
1383 * Similar to split_page except the page is already free. As this is only 1380 * Similar to the split_page family of functions except that the page
1384 * being used for migration, the migratetype of the block also changes. 1381 * required at the given order and being isolated now to prevent races
1385 * As this is called with interrupts disabled, the caller is responsible 1382 * with parallel allocators
1386 * for calling arch_alloc_page() and kernel_map_page() after interrupts
1387 * are enabled.
1388 *
1389 * Note: this is probably too low level an operation for use in drivers.
1390 * Please consult with lkml before using this in your driver.
1391 */ 1383 */
1392int split_free_page(struct page *page) 1384int capture_free_page(struct page *page, int alloc_order, int migratetype)
1393{ 1385{
1394 unsigned int order; 1386 unsigned int order;
1395 unsigned long watermark; 1387 unsigned long watermark;
1396 struct zone *zone; 1388 struct zone *zone;
1389 int mt;
1397 1390
1398 BUG_ON(!PageBuddy(page)); 1391 BUG_ON(!PageBuddy(page));
1399 1392
@@ -1409,12 +1402,16 @@ int split_free_page(struct page *page)
1409 list_del(&page->lru); 1402 list_del(&page->lru);
1410 zone->free_area[order].nr_free--; 1403 zone->free_area[order].nr_free--;
1411 rmv_page_order(page); 1404 rmv_page_order(page);
1412 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order));
1413 1405
1414 /* Split into individual pages */ 1406 mt = get_pageblock_migratetype(page);
1415 set_page_refcounted(page); 1407 if (unlikely(mt != MIGRATE_ISOLATE))
1416 split_page(page, order); 1408 __mod_zone_freepage_state(zone, -(1UL << order), mt);
1417 1409
1410 if (alloc_order != order)
1411 expand(zone, page, alloc_order, order,
1412 &zone->free_area[order], migratetype);
1413
1414 /* Set the pageblock if the captured page is at least a pageblock */
1418 if (order >= pageblock_order - 1) { 1415 if (order >= pageblock_order - 1) {
1419 struct page *endpage = page + (1 << order) - 1; 1416 struct page *endpage = page + (1 << order) - 1;
1420 for (; page < endpage; page += pageblock_nr_pages) { 1417 for (; page < endpage; page += pageblock_nr_pages) {
@@ -1425,7 +1422,35 @@ int split_free_page(struct page *page)
1425 } 1422 }
1426 } 1423 }
1427 1424
1428 return 1 << order; 1425 return 1UL << order;
1426}
1427
1428/*
1429 * Similar to split_page except the page is already free. As this is only
1430 * being used for migration, the migratetype of the block also changes.
1431 * As this is called with interrupts disabled, the caller is responsible
 1432 * for calling arch_alloc_page() and kernel_map_pages() after interrupts
1433 * are enabled.
1434 *
1435 * Note: this is probably too low level an operation for use in drivers.
1436 * Please consult with lkml before using this in your driver.
1437 */
1438int split_free_page(struct page *page)
1439{
1440 unsigned int order;
1441 int nr_pages;
1442
1443 BUG_ON(!PageBuddy(page));
1444 order = page_order(page);
1445
1446 nr_pages = capture_free_page(page, order, 0);
1447 if (!nr_pages)
1448 return 0;
1449
1450 /* Split into individual pages */
1451 set_page_refcounted(page);
1452 split_page(page, order);
1453 return nr_pages;
1429} 1454}
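capture_free_page() above calls expand() when the captured buddy block is larger than the requested alloc_order, handing the trimmed halves back to the free lists, while split_free_page() keeps its old behaviour by capturing at the page's own order and then splitting to order-0. The standalone sketch below models only the halving step so the remainder bookkeeping is easy to follow; the array stands in for zone->free_area[] and is not the kernel's data structure.

#include <stdio.h>

#define MAX_ORDER 11

/* Count of free blocks per order; a stand-in for zone->free_area[]. */
static int free_count[MAX_ORDER];

/*
 * Split a block of 'order' down to 'alloc_order', returning each unused
 * buddy half to the free list of its own order (the "expand" idea).
 */
static void expand_model(int alloc_order, int order)
{
	while (order > alloc_order) {
		order--;
		free_count[order]++;	/* the trimmed buddy half */
		printf("returned one order-%d block to the free list\n", order);
	}
}

int main(void)
{
	expand_model(0, 3);	/* capture an order-3 block for an order-0 request */
	return 0;
}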
1430 1455
1431/* 1456/*
@@ -1484,7 +1509,8 @@ again:
1484 spin_unlock(&zone->lock); 1509 spin_unlock(&zone->lock);
1485 if (!page) 1510 if (!page)
1486 goto failed; 1511 goto failed;
1487 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order)); 1512 __mod_zone_freepage_state(zone, -(1 << order),
1513 get_pageblock_migratetype(page));
1488 } 1514 }
1489 1515
1490 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1516 __count_zone_vm_events(PGALLOC, zone, 1 << order);
@@ -1501,19 +1527,6 @@ failed:
1501 return NULL; 1527 return NULL;
1502} 1528}
1503 1529
1504/* The ALLOC_WMARK bits are used as an index to zone->watermark */
1505#define ALLOC_WMARK_MIN WMARK_MIN
1506#define ALLOC_WMARK_LOW WMARK_LOW
1507#define ALLOC_WMARK_HIGH WMARK_HIGH
1508#define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */
1509
1510/* Mask to get the watermark bits */
1511#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1)
1512
1513#define ALLOC_HARDER 0x10 /* try to alloc harder */
1514#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
1515#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
1516
1517#ifdef CONFIG_FAIL_PAGE_ALLOC 1530#ifdef CONFIG_FAIL_PAGE_ALLOC
1518 1531
1519static struct { 1532static struct {
@@ -1608,7 +1621,11 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1608 min -= min / 2; 1621 min -= min / 2;
1609 if (alloc_flags & ALLOC_HARDER) 1622 if (alloc_flags & ALLOC_HARDER)
1610 min -= min / 4; 1623 min -= min / 4;
1611 1624#ifdef CONFIG_CMA
1625 /* If allocation can't use CMA areas don't use free CMA pages */
1626 if (!(alloc_flags & ALLOC_CMA))
1627 free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
1628#endif
1612 if (free_pages <= min + lowmem_reserve) 1629 if (free_pages <= min + lowmem_reserve)
1613 return false; 1630 return false;
1614 for (o = 0; o < order; o++) { 1631 for (o = 0; o < order; o++) {
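The CONFIG_CMA check added above excludes free CMA pages from the watermark calculation for callers that lack ALLOC_CMA. A worked example with made-up numbers shows the point: with 1000 free pages, 600 of them in CMA pageblocks and a minimum of 500, a movable allocation passes while an unmovable one correctly fails, since only 400 pages are really usable to it. The toy check below is an illustration and ignores lowmem_reserve; it is not the kernel's __zone_watermark_ok().

#include <stdbool.h>
#include <stdio.h>

/* Example flag value only; not the kernel's ALLOC_CMA definition. */
#define ALLOC_CMA 0x80

static bool watermark_ok(long free_pages, long free_cma, long min, int alloc_flags)
{
	if (!(alloc_flags & ALLOC_CMA))
		free_pages -= free_cma;	/* CMA pages are off limits to this caller */
	return free_pages > min;
}

int main(void)
{
	printf("movable (can use CMA): %d\n", watermark_ok(1000, 600, 500, ALLOC_CMA));
	printf("unmovable            : %d\n", watermark_ok(1000, 600, 500, 0));
	return 0;
}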
@@ -1782,6 +1799,22 @@ static void zlc_clear_zones_full(struct zonelist *zonelist)
1782 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 1799 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1783} 1800}
1784 1801
1802static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
1803{
1804 return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes);
1805}
1806
1807static void __paginginit init_zone_allows_reclaim(int nid)
1808{
1809 int i;
1810
1811 for_each_online_node(i)
1812 if (node_distance(nid, i) <= RECLAIM_DISTANCE) {
1813 node_set(i, NODE_DATA(nid)->reclaim_nodes);
1814 zone_reclaim_mode = 1;
1815 }
1816}
1817
1785#else /* CONFIG_NUMA */ 1818#else /* CONFIG_NUMA */
1786 1819
1787static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1820static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
@@ -1802,6 +1835,15 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1802static void zlc_clear_zones_full(struct zonelist *zonelist) 1835static void zlc_clear_zones_full(struct zonelist *zonelist)
1803{ 1836{
1804} 1837}
1838
1839static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
1840{
1841 return true;
1842}
1843
1844static inline void init_zone_allows_reclaim(int nid)
1845{
1846}
1805#endif /* CONFIG_NUMA */ 1847#endif /* CONFIG_NUMA */
1806 1848
1807/* 1849/*
@@ -1886,7 +1928,8 @@ zonelist_scan:
1886 did_zlc_setup = 1; 1928 did_zlc_setup = 1;
1887 } 1929 }
1888 1930
1889 if (zone_reclaim_mode == 0) 1931 if (zone_reclaim_mode == 0 ||
1932 !zone_allows_reclaim(preferred_zone, zone))
1890 goto this_zone_full; 1933 goto this_zone_full;
1891 1934
1892 /* 1935 /*
@@ -2105,7 +2148,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2105 bool *contended_compaction, bool *deferred_compaction, 2148 bool *contended_compaction, bool *deferred_compaction,
2106 unsigned long *did_some_progress) 2149 unsigned long *did_some_progress)
2107{ 2150{
2108 struct page *page; 2151 struct page *page = NULL;
2109 2152
2110 if (!order) 2153 if (!order)
2111 return NULL; 2154 return NULL;
@@ -2118,10 +2161,16 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2118 current->flags |= PF_MEMALLOC; 2161 current->flags |= PF_MEMALLOC;
2119 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, 2162 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
2120 nodemask, sync_migration, 2163 nodemask, sync_migration,
2121 contended_compaction); 2164 contended_compaction, &page);
2122 current->flags &= ~PF_MEMALLOC; 2165 current->flags &= ~PF_MEMALLOC;
2123 if (*did_some_progress != COMPACT_SKIPPED) {
2124 2166
2167 /* If compaction captured a page, prep and use it */
2168 if (page) {
2169 prep_new_page(page, order, gfp_mask);
2170 goto got_page;
2171 }
2172
2173 if (*did_some_progress != COMPACT_SKIPPED) {
2125 /* Page migration frees to the PCP lists but we want merging */ 2174 /* Page migration frees to the PCP lists but we want merging */
2126 drain_pages(get_cpu()); 2175 drain_pages(get_cpu());
2127 put_cpu(); 2176 put_cpu();
@@ -2131,6 +2180,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2131 alloc_flags & ~ALLOC_NO_WATERMARKS, 2180 alloc_flags & ~ALLOC_NO_WATERMARKS,
2132 preferred_zone, migratetype); 2181 preferred_zone, migratetype);
2133 if (page) { 2182 if (page) {
2183got_page:
2184 preferred_zone->compact_blockskip_flush = false;
2134 preferred_zone->compact_considered = 0; 2185 preferred_zone->compact_considered = 0;
2135 preferred_zone->compact_defer_shift = 0; 2186 preferred_zone->compact_defer_shift = 0;
2136 if (order >= preferred_zone->compact_order_failed) 2187 if (order >= preferred_zone->compact_order_failed)
@@ -2315,7 +2366,10 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
2315 unlikely(test_thread_flag(TIF_MEMDIE)))) 2366 unlikely(test_thread_flag(TIF_MEMDIE))))
2316 alloc_flags |= ALLOC_NO_WATERMARKS; 2367 alloc_flags |= ALLOC_NO_WATERMARKS;
2317 } 2368 }
2318 2369#ifdef CONFIG_CMA
2370 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
2371 alloc_flags |= ALLOC_CMA;
2372#endif
2319 return alloc_flags; 2373 return alloc_flags;
2320} 2374}
2321 2375
@@ -2362,9 +2416,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2362 goto nopage; 2416 goto nopage;
2363 2417
2364restart: 2418restart:
2365 if (!(gfp_mask & __GFP_NO_KSWAPD)) 2419 wake_all_kswapd(order, zonelist, high_zoneidx,
2366 wake_all_kswapd(order, zonelist, high_zoneidx, 2420 zone_idx(preferred_zone));
2367 zone_idx(preferred_zone));
2368 2421
2369 /* 2422 /*
2370 * OK, we're below the kswapd watermark and have kicked background 2423 * OK, we're below the kswapd watermark and have kicked background
@@ -2441,7 +2494,7 @@ rebalance:
2441 * system then fail the allocation instead of entering direct reclaim. 2494 * system then fail the allocation instead of entering direct reclaim.
2442 */ 2495 */
2443 if ((deferred_compaction || contended_compaction) && 2496 if ((deferred_compaction || contended_compaction) &&
2444 (gfp_mask & __GFP_NO_KSWAPD)) 2497 (gfp_mask & (__GFP_MOVABLE|__GFP_REPEAT)) == __GFP_MOVABLE)
2445 goto nopage; 2498 goto nopage;
2446 2499
2447 /* Try direct reclaim and then allocating */ 2500 /* Try direct reclaim and then allocating */
@@ -2541,6 +2594,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2541 struct page *page = NULL; 2594 struct page *page = NULL;
2542 int migratetype = allocflags_to_migratetype(gfp_mask); 2595 int migratetype = allocflags_to_migratetype(gfp_mask);
2543 unsigned int cpuset_mems_cookie; 2596 unsigned int cpuset_mems_cookie;
2597 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET;
2544 2598
2545 gfp_mask &= gfp_allowed_mask; 2599 gfp_mask &= gfp_allowed_mask;
2546 2600
@@ -2569,9 +2623,13 @@ retry_cpuset:
2569 if (!preferred_zone) 2623 if (!preferred_zone)
2570 goto out; 2624 goto out;
2571 2625
2626#ifdef CONFIG_CMA
2627 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
2628 alloc_flags |= ALLOC_CMA;
2629#endif
2572 /* First allocation attempt */ 2630 /* First allocation attempt */
2573 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 2631 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
2574 zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET, 2632 zonelist, high_zoneidx, alloc_flags,
2575 preferred_zone, migratetype); 2633 preferred_zone, migratetype);
2576 if (unlikely(!page)) 2634 if (unlikely(!page))
2577 page = __alloc_pages_slowpath(gfp_mask, order, 2635 page = __alloc_pages_slowpath(gfp_mask, order,
@@ -2852,7 +2910,8 @@ void show_free_areas(unsigned int filter)
2852 " unevictable:%lu" 2910 " unevictable:%lu"
2853 " dirty:%lu writeback:%lu unstable:%lu\n" 2911 " dirty:%lu writeback:%lu unstable:%lu\n"
2854 " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" 2912 " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
2855 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n", 2913 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
2914 " free_cma:%lu\n",
2856 global_page_state(NR_ACTIVE_ANON), 2915 global_page_state(NR_ACTIVE_ANON),
2857 global_page_state(NR_INACTIVE_ANON), 2916 global_page_state(NR_INACTIVE_ANON),
2858 global_page_state(NR_ISOLATED_ANON), 2917 global_page_state(NR_ISOLATED_ANON),
@@ -2869,7 +2928,8 @@ void show_free_areas(unsigned int filter)
2869 global_page_state(NR_FILE_MAPPED), 2928 global_page_state(NR_FILE_MAPPED),
2870 global_page_state(NR_SHMEM), 2929 global_page_state(NR_SHMEM),
2871 global_page_state(NR_PAGETABLE), 2930 global_page_state(NR_PAGETABLE),
2872 global_page_state(NR_BOUNCE)); 2931 global_page_state(NR_BOUNCE),
2932 global_page_state(NR_FREE_CMA_PAGES));
2873 2933
2874 for_each_populated_zone(zone) { 2934 for_each_populated_zone(zone) {
2875 int i; 2935 int i;
@@ -2901,6 +2961,7 @@ void show_free_areas(unsigned int filter)
2901 " pagetables:%lukB" 2961 " pagetables:%lukB"
2902 " unstable:%lukB" 2962 " unstable:%lukB"
2903 " bounce:%lukB" 2963 " bounce:%lukB"
2964 " free_cma:%lukB"
2904 " writeback_tmp:%lukB" 2965 " writeback_tmp:%lukB"
2905 " pages_scanned:%lu" 2966 " pages_scanned:%lu"
2906 " all_unreclaimable? %s" 2967 " all_unreclaimable? %s"
@@ -2930,6 +2991,7 @@ void show_free_areas(unsigned int filter)
2930 K(zone_page_state(zone, NR_PAGETABLE)), 2991 K(zone_page_state(zone, NR_PAGETABLE)),
2931 K(zone_page_state(zone, NR_UNSTABLE_NFS)), 2992 K(zone_page_state(zone, NR_UNSTABLE_NFS)),
2932 K(zone_page_state(zone, NR_BOUNCE)), 2993 K(zone_page_state(zone, NR_BOUNCE)),
2994 K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
2933 K(zone_page_state(zone, NR_WRITEBACK_TEMP)), 2995 K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
2934 zone->pages_scanned, 2996 zone->pages_scanned,
2935 (zone->all_unreclaimable ? "yes" : "no") 2997 (zone->all_unreclaimable ? "yes" : "no")
@@ -3328,21 +3390,13 @@ static void build_zonelists(pg_data_t *pgdat)
3328 j = 0; 3390 j = 0;
3329 3391
3330 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { 3392 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
3331 int distance = node_distance(local_node, node);
3332
3333 /*
3334 * If another node is sufficiently far away then it is better
3335 * to reclaim pages in a zone before going off node.
3336 */
3337 if (distance > RECLAIM_DISTANCE)
3338 zone_reclaim_mode = 1;
3339
3340 /* 3393 /*
3341 * We don't want to pressure a particular node. 3394 * We don't want to pressure a particular node.
3342 * So adding penalty to the first node in same 3395 * So adding penalty to the first node in same
3343 * distance group to make it round-robin. 3396 * distance group to make it round-robin.
3344 */ 3397 */
3345 if (distance != node_distance(local_node, prev_node)) 3398 if (node_distance(local_node, node) !=
3399 node_distance(local_node, prev_node))
3346 node_load[node] = load; 3400 node_load[node] = load;
3347 3401
3348 prev_node = node; 3402 prev_node = node;
@@ -4438,11 +4492,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4438 4492
4439 zone->spanned_pages = size; 4493 zone->spanned_pages = size;
4440 zone->present_pages = realsize; 4494 zone->present_pages = realsize;
4441#if defined CONFIG_COMPACTION || defined CONFIG_CMA
4442 zone->compact_cached_free_pfn = zone->zone_start_pfn +
4443 zone->spanned_pages;
4444 zone->compact_cached_free_pfn &= ~(pageblock_nr_pages-1);
4445#endif
4446#ifdef CONFIG_NUMA 4495#ifdef CONFIG_NUMA
4447 zone->node = nid; 4496 zone->node = nid;
4448 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) 4497 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
@@ -4521,6 +4570,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
4521 4570
4522 pgdat->node_id = nid; 4571 pgdat->node_id = nid;
4523 pgdat->node_start_pfn = node_start_pfn; 4572 pgdat->node_start_pfn = node_start_pfn;
4573 init_zone_allows_reclaim(nid);
4524 calculate_node_totalpages(pgdat, zones_size, zholes_size); 4574 calculate_node_totalpages(pgdat, zones_size, zholes_size);
4525 4575
4526 alloc_node_mem_map(pgdat); 4576 alloc_node_mem_map(pgdat);
@@ -4879,7 +4929,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
4879 zone_movable_pfn[i] << PAGE_SHIFT); 4929 zone_movable_pfn[i] << PAGE_SHIFT);
4880 } 4930 }
4881 4931
4882 /* Print out the early_node_map[] */ 4932 /* Print out the early node map */
4883 printk("Early memory node ranges\n"); 4933 printk("Early memory node ranges\n");
4884 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) 4934 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
4885 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid, 4935 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid,
@@ -5619,47 +5669,28 @@ static unsigned long pfn_max_align_up(unsigned long pfn)
5619 pageblock_nr_pages)); 5669 pageblock_nr_pages));
5620} 5670}
5621 5671
5622static struct page *
5623__alloc_contig_migrate_alloc(struct page *page, unsigned long private,
5624 int **resultp)
5625{
5626 gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
5627
5628 if (PageHighMem(page))
5629 gfp_mask |= __GFP_HIGHMEM;
5630
5631 return alloc_page(gfp_mask);
5632}
5633
5634/* [start, end) must belong to a single zone. */ 5672/* [start, end) must belong to a single zone. */
5635static int __alloc_contig_migrate_range(unsigned long start, unsigned long end) 5673static int __alloc_contig_migrate_range(struct compact_control *cc,
5674 unsigned long start, unsigned long end)
5636{ 5675{
5637 /* This function is based on compact_zone() from compaction.c. */ 5676 /* This function is based on compact_zone() from compaction.c. */
5638 5677 unsigned long nr_reclaimed;
5639 unsigned long pfn = start; 5678 unsigned long pfn = start;
5640 unsigned int tries = 0; 5679 unsigned int tries = 0;
5641 int ret = 0; 5680 int ret = 0;
5642 5681
5643 struct compact_control cc = {
5644 .nr_migratepages = 0,
5645 .order = -1,
5646 .zone = page_zone(pfn_to_page(start)),
5647 .sync = true,
5648 };
5649 INIT_LIST_HEAD(&cc.migratepages);
5650
5651 migrate_prep_local(); 5682 migrate_prep_local();
5652 5683
5653 while (pfn < end || !list_empty(&cc.migratepages)) { 5684 while (pfn < end || !list_empty(&cc->migratepages)) {
5654 if (fatal_signal_pending(current)) { 5685 if (fatal_signal_pending(current)) {
5655 ret = -EINTR; 5686 ret = -EINTR;
5656 break; 5687 break;
5657 } 5688 }
5658 5689
5659 if (list_empty(&cc.migratepages)) { 5690 if (list_empty(&cc->migratepages)) {
5660 cc.nr_migratepages = 0; 5691 cc->nr_migratepages = 0;
5661 pfn = isolate_migratepages_range(cc.zone, &cc, 5692 pfn = isolate_migratepages_range(cc->zone, cc,
5662 pfn, end); 5693 pfn, end, true);
5663 if (!pfn) { 5694 if (!pfn) {
5664 ret = -EINTR; 5695 ret = -EINTR;
5665 break; 5696 break;
@@ -5670,12 +5701,16 @@ static int __alloc_contig_migrate_range(unsigned long start, unsigned long end)
5670 break; 5701 break;
5671 } 5702 }
5672 5703
5673 ret = migrate_pages(&cc.migratepages, 5704 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
5674 __alloc_contig_migrate_alloc, 5705 &cc->migratepages);
5706 cc->nr_migratepages -= nr_reclaimed;
5707
5708 ret = migrate_pages(&cc->migratepages,
5709 alloc_migrate_target,
5675 0, false, MIGRATE_SYNC); 5710 0, false, MIGRATE_SYNC);
5676 } 5711 }
5677 5712
5678 putback_lru_pages(&cc.migratepages); 5713 putback_lru_pages(&cc->migratepages);
5679 return ret > 0 ? 0 : ret; 5714 return ret > 0 ? 0 : ret;
5680} 5715}
5681 5716
@@ -5754,6 +5789,15 @@ int alloc_contig_range(unsigned long start, unsigned long end,
5754 unsigned long outer_start, outer_end; 5789 unsigned long outer_start, outer_end;
5755 int ret = 0, order; 5790 int ret = 0, order;
5756 5791
5792 struct compact_control cc = {
5793 .nr_migratepages = 0,
5794 .order = -1,
5795 .zone = page_zone(pfn_to_page(start)),
5796 .sync = true,
5797 .ignore_skip_hint = true,
5798 };
5799 INIT_LIST_HEAD(&cc.migratepages);
5800
5757 /* 5801 /*
5758 * What we do here is we mark all pageblocks in range as 5802 * What we do here is we mark all pageblocks in range as
5759 * MIGRATE_ISOLATE. Because pageblock and max order pages may 5803 * MIGRATE_ISOLATE. Because pageblock and max order pages may
@@ -5783,7 +5827,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
5783 if (ret) 5827 if (ret)
5784 goto done; 5828 goto done;
5785 5829
5786 ret = __alloc_contig_migrate_range(start, end); 5830 ret = __alloc_contig_migrate_range(&cc, start, end);
5787 if (ret) 5831 if (ret)
5788 goto done; 5832 goto done;
5789 5833
@@ -5832,7 +5876,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
5832 __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start); 5876 __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start);
5833 5877
5834 /* Grab isolated pages from freelists. */ 5878 /* Grab isolated pages from freelists. */
5835 outer_end = isolate_freepages_range(outer_start, end); 5879 outer_end = isolate_freepages_range(&cc, outer_start, end);
5836 if (!outer_end) { 5880 if (!outer_end) {
5837 ret = -EBUSY; 5881 ret = -EBUSY;
5838 goto done; 5882 goto done;
@@ -5874,6 +5918,7 @@ static int __meminit __zone_pcp_update(void *data)
5874 local_irq_save(flags); 5918 local_irq_save(flags);
5875 if (pcp->count > 0) 5919 if (pcp->count > 0)
5876 free_pcppages_bulk(zone, pcp->count, pcp); 5920 free_pcppages_bulk(zone, pcp->count, pcp);
5921 drain_zonestat(zone, pset);
5877 setup_pageset(pset, batch); 5922 setup_pageset(pset, batch);
5878 local_irq_restore(flags); 5923 local_irq_restore(flags);
5879 } 5924 }
@@ -5890,10 +5935,16 @@ void __meminit zone_pcp_update(struct zone *zone)
5890void zone_pcp_reset(struct zone *zone) 5935void zone_pcp_reset(struct zone *zone)
5891{ 5936{
5892 unsigned long flags; 5937 unsigned long flags;
5938 int cpu;
5939 struct per_cpu_pageset *pset;
5893 5940
5894 /* avoid races with drain_pages() */ 5941 /* avoid races with drain_pages() */
5895 local_irq_save(flags); 5942 local_irq_save(flags);
5896 if (zone->pageset != &boot_pageset) { 5943 if (zone->pageset != &boot_pageset) {
5944 for_each_online_cpu(cpu) {
5945 pset = per_cpu_ptr(zone->pageset, cpu);
5946 drain_zonestat(zone, pset);
5947 }
5897 free_percpu(zone->pageset); 5948 free_percpu(zone->pageset);
5898 zone->pageset = &boot_pageset; 5949 zone->pageset = &boot_pageset;
5899 } 5950 }
@@ -6047,3 +6098,37 @@ void dump_page(struct page *page)
6047 dump_page_flags(page->flags); 6098 dump_page_flags(page->flags);
6048 mem_cgroup_print_bad_page(page); 6099 mem_cgroup_print_bad_page(page);
6049} 6100}
6101
6102/* reset zone->present_pages */
6103void reset_zone_present_pages(void)
6104{
6105 struct zone *z;
6106 int i, nid;
6107
6108 for_each_node_state(nid, N_HIGH_MEMORY) {
6109 for (i = 0; i < MAX_NR_ZONES; i++) {
6110 z = NODE_DATA(nid)->node_zones + i;
6111 z->present_pages = 0;
6112 }
6113 }
6114}
6115
6116/* calculate zone's present pages in buddy system */
6117void fixup_zone_present_pages(int nid, unsigned long start_pfn,
6118 unsigned long end_pfn)
6119{
6120 struct zone *z;
6121 unsigned long zone_start_pfn, zone_end_pfn;
6122 int i;
6123
6124 for (i = 0; i < MAX_NR_ZONES; i++) {
6125 z = NODE_DATA(nid)->node_zones + i;
6126 zone_start_pfn = z->zone_start_pfn;
6127 zone_end_pfn = zone_start_pfn + z->spanned_pages;
6128
6129 /* if the two regions intersect */
6130 if (!(zone_start_pfn >= end_pfn || zone_end_pfn <= start_pfn))
6131 z->present_pages += min(end_pfn, zone_end_pfn) -
6132 max(start_pfn, zone_start_pfn);
6133 }
6134}
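fixup_zone_present_pages() above credits a zone only with the portion of [start_pfn, end_pfn) that falls inside the zone's own span, i.e. min(end_pfn, zone_end_pfn) - max(start_pfn, zone_start_pfn) whenever the two ranges intersect. The snippet below is that interval arithmetic pulled out on its own with sample pfn values, so the overlap rule can be sanity-checked in isolation; it is an illustration, not kernel code.

#include <stdio.h>

static unsigned long overlap(unsigned long s1, unsigned long e1,
			     unsigned long s2, unsigned long e2)
{
	unsigned long lo = s1 > s2 ? s1 : s2;	/* max of the starts */
	unsigned long hi = e1 < e2 ? e1 : e2;	/* min of the ends */

	return hi > lo ? hi - lo : 0;		/* 0 when the ranges miss */
}

int main(void)
{
	/* zone spans pfns [4096, 8192); the new range is [6000, 9000) */
	printf("pages credited: %lu\n", overlap(4096, 8192, 6000, 9000));
	return 0;
}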
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 247d1f175739..f2f5b4818e94 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -76,8 +76,13 @@ int set_migratetype_isolate(struct page *page)
76 76
77out: 77out:
78 if (!ret) { 78 if (!ret) {
79 unsigned long nr_pages;
80 int migratetype = get_pageblock_migratetype(page);
81
79 set_pageblock_isolate(page); 82 set_pageblock_isolate(page);
80 move_freepages_block(zone, page, MIGRATE_ISOLATE); 83 nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE);
84
85 __mod_zone_freepage_state(zone, -nr_pages, migratetype);
81 } 86 }
82 87
83 spin_unlock_irqrestore(&zone->lock, flags); 88 spin_unlock_irqrestore(&zone->lock, flags);
@@ -89,12 +94,14 @@ out:
89void unset_migratetype_isolate(struct page *page, unsigned migratetype) 94void unset_migratetype_isolate(struct page *page, unsigned migratetype)
90{ 95{
91 struct zone *zone; 96 struct zone *zone;
92 unsigned long flags; 97 unsigned long flags, nr_pages;
98
93 zone = page_zone(page); 99 zone = page_zone(page);
94 spin_lock_irqsave(&zone->lock, flags); 100 spin_lock_irqsave(&zone->lock, flags);
95 if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) 101 if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
96 goto out; 102 goto out;
97 move_freepages_block(zone, page, migratetype); 103 nr_pages = move_freepages_block(zone, page, migratetype);
104 __mod_zone_freepage_state(zone, nr_pages, migratetype);
98 restore_pageblock_isolate(page, migratetype); 105 restore_pageblock_isolate(page, migratetype);
99out: 106out:
100 spin_unlock_irqrestore(&zone->lock, flags); 107 spin_unlock_irqrestore(&zone->lock, flags);
@@ -193,10 +200,25 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
193 continue; 200 continue;
194 } 201 }
195 page = pfn_to_page(pfn); 202 page = pfn_to_page(pfn);
196 if (PageBuddy(page)) 203 if (PageBuddy(page)) {
204 /*
205 * If race between isolatation and allocation happens,
206 * some free pages could be in MIGRATE_MOVABLE list
207 * although pageblock's migratation type of the page
208 * is MIGRATE_ISOLATE. Catch it and move the page into
209 * MIGRATE_ISOLATE list.
210 */
211 if (get_freepage_migratetype(page) != MIGRATE_ISOLATE) {
212 struct page *end_page;
213
214 end_page = page + (1 << page_order(page)) - 1;
215 move_freepages(page_zone(page), page, end_page,
216 MIGRATE_ISOLATE);
217 }
197 pfn += 1 << page_order(page); 218 pfn += 1 << page_order(page);
219 }
198 else if (page_count(page) == 0 && 220 else if (page_count(page) == 0 &&
199 page_private(page) == MIGRATE_ISOLATE) 221 get_freepage_migratetype(page) == MIGRATE_ISOLATE)
200 pfn += 1; 222 pfn += 1;
201 else 223 else
202 break; 224 break;
@@ -233,3 +255,14 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
233 spin_unlock_irqrestore(&zone->lock, flags); 255 spin_unlock_irqrestore(&zone->lock, flags);
234 return ret ? 0 : -EBUSY; 256 return ret ? 0 : -EBUSY;
235} 257}
258
259struct page *alloc_migrate_target(struct page *page, unsigned long private,
260 int **resultp)
261{
262 gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
263
264 if (PageHighMem(page))
265 gfp_mask |= __GFP_HIGHMEM;
266
267 return alloc_page(gfp_mask);
268}
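alloc_migrate_target() above becomes the allocation callback that __alloc_contig_migrate_range() passes to migrate_pages() earlier in this diff: each page migrated out of the target range gets a movable destination page, with __GFP_HIGHMEM added when the source page is highmem so highmem contents stay in highmem. The userspace model below shows only the mask selection; the flag values are illustrative placeholders, not the kernel's gfp bit definitions.

#include <stdio.h>

#define GFP_USER	0x01	/* illustrative values, not the kernel's */
#define __GFP_MOVABLE	0x02
#define __GFP_HIGHMEM	0x04

static unsigned int migrate_target_gfp(int page_is_highmem)
{
	unsigned int gfp_mask = GFP_USER | __GFP_MOVABLE;

	if (page_is_highmem)
		gfp_mask |= __GFP_HIGHMEM;	/* keep highmem pages in highmem */
	return gfp_mask;
}

int main(void)
{
	printf("lowmem source : gfp=%#x\n", migrate_target_gfp(0));
	printf("highmem source: gfp=%#x\n", migrate_target_gfp(1));
	return 0;
}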
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 74c0ddaa6fa0..e642627da6b7 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -120,3 +120,53 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
120} 120}
121#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 121#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
122#endif 122#endif
123
124#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
125#ifdef CONFIG_TRANSPARENT_HUGEPAGE
126void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable)
127{
128 assert_spin_locked(&mm->page_table_lock);
129
130 /* FIFO */
131 if (!mm->pmd_huge_pte)
132 INIT_LIST_HEAD(&pgtable->lru);
133 else
134 list_add(&pgtable->lru, &mm->pmd_huge_pte->lru);
135 mm->pmd_huge_pte = pgtable;
136}
137#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
138#endif
139
140#ifndef __HAVE_ARCH_PGTABLE_WITHDRAW
141#ifdef CONFIG_TRANSPARENT_HUGEPAGE
142/* no "address" argument so destroys page coloring of some arch */
143pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm)
144{
145 pgtable_t pgtable;
146
147 assert_spin_locked(&mm->page_table_lock);
148
149 /* FIFO */
150 pgtable = mm->pmd_huge_pte;
151 if (list_empty(&pgtable->lru))
152 mm->pmd_huge_pte = NULL;
153 else {
154 mm->pmd_huge_pte = list_entry(pgtable->lru.next,
155 struct page, lru);
156 list_del(&pgtable->lru);
157 }
158 return pgtable;
159}
160#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
161#endif
162
163#ifndef __HAVE_ARCH_PMDP_INVALIDATE
164#ifdef CONFIG_TRANSPARENT_HUGEPAGE
165void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
166 pmd_t *pmdp)
167{
168 set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(*pmdp));
169 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
170}
171#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
172#endif
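The deposit/withdraw helpers above park pre-allocated page tables on a per-mm list so that splitting a huge pmd never has to allocate, and their comments describe the discipline as FIFO. The standalone sketch below models just that queue discipline with an array-backed ring, to make the deposit/withdraw pairing visible; it has none of the page_table_lock assertions or struct page plumbing of the real helpers.

#include <assert.h>
#include <stdio.h>

#define DEPOSIT_MAX 8

/* Toy per-"mm" FIFO of deposited table slots. */
static int deposited[DEPOSIT_MAX];
static int head, tail;

static void deposit(int pgtable_id)
{
	assert(tail - head < DEPOSIT_MAX);
	deposited[tail++ % DEPOSIT_MAX] = pgtable_id;
}

static int withdraw(void)
{
	assert(tail > head);
	return deposited[head++ % DEPOSIT_MAX];	/* oldest deposit first */
}

int main(void)
{
	deposit(1);
	deposit(2);
	printf("withdraw -> %d, then %d\n", withdraw(), withdraw());
	return 0;
}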
diff --git a/mm/prio_tree.c b/mm/prio_tree.c
deleted file mode 100644
index 799dcfd7cd8c..000000000000
--- a/mm/prio_tree.c
+++ /dev/null
@@ -1,208 +0,0 @@
1/*
2 * mm/prio_tree.c - priority search tree for mapping->i_mmap
3 *
4 * Copyright (C) 2004, Rajesh Venkatasubramanian <vrajesh@umich.edu>
5 *
6 * This file is released under the GPL v2.
7 *
8 * Based on the radix priority search tree proposed by Edward M. McCreight
9 * SIAM Journal of Computing, vol. 14, no.2, pages 257-276, May 1985
10 *
11 * 02Feb2004 Initial version
12 */
13
14#include <linux/mm.h>
15#include <linux/prio_tree.h>
16#include <linux/prefetch.h>
17
18/*
19 * See lib/prio_tree.c for details on the general radix priority search tree
20 * code.
21 */
22
23/*
24 * The following #defines are mirrored from lib/prio_tree.c. They're only used
25 * for debugging, and should be removed (along with the debugging code using
26 * them) when switching also VMAs to the regular prio_tree code.
27 */
28
29#define RADIX_INDEX(vma) ((vma)->vm_pgoff)
30#define VMA_SIZE(vma) (((vma)->vm_end - (vma)->vm_start) >> PAGE_SHIFT)
31/* avoid overflow */
32#define HEAP_INDEX(vma) ((vma)->vm_pgoff + (VMA_SIZE(vma) - 1))
33
34/*
35 * Radix priority search tree for address_space->i_mmap
36 *
37 * For each vma that map a unique set of file pages i.e., unique [radix_index,
38 * heap_index] value, we have a corresponding priority search tree node. If
39 * multiple vmas have identical [radix_index, heap_index] value, then one of
40 * them is used as a tree node and others are stored in a vm_set list. The tree
41 * node points to the first vma (head) of the list using vm_set.head.
42 *
43 * prio_tree_root
44 * |
45 * A vm_set.head
46 * / \ /
47 * L R -> H-I-J-K-M-N-O-P-Q-S
48 * ^ ^ <-- vm_set.list -->
49 * tree nodes
50 *
51 * We need some way to identify whether a vma is a tree node, head of a vm_set
52 * list, or just a member of a vm_set list. We cannot use vm_flags to store
53 * such information. The reason is, in the above figure, it is possible that
54 * vm_flags' of R and H are covered by the different mmap_sems. When R is
55 * removed under R->mmap_sem, H replaces R as a tree node. Since we do not hold
56 * H->mmap_sem, we cannot use H->vm_flags for marking that H is a tree node now.
57 * That's why some trick involving shared.vm_set.parent is used for identifying
58 * tree nodes and list head nodes.
59 *
60 * vma radix priority search tree node rules:
61 *
62 * vma->shared.vm_set.parent != NULL ==> a tree node
63 * vma->shared.vm_set.head != NULL ==> list of others mapping same range
64 * vma->shared.vm_set.head == NULL ==> no others map the same range
65 *
66 * vma->shared.vm_set.parent == NULL
67 * vma->shared.vm_set.head != NULL ==> list head of vmas mapping same range
68 * vma->shared.vm_set.head == NULL ==> a list node
69 */
70
71/*
72 * Add a new vma known to map the same set of pages as the old vma:
73 * useful for fork's dup_mmap as well as vma_prio_tree_insert below.
74 * Note that it just happens to work correctly on i_mmap_nonlinear too.
75 */
76void vma_prio_tree_add(struct vm_area_struct *vma, struct vm_area_struct *old)
77{
78 /* Leave these BUG_ONs till prio_tree patch stabilizes */
79 BUG_ON(RADIX_INDEX(vma) != RADIX_INDEX(old));
80 BUG_ON(HEAP_INDEX(vma) != HEAP_INDEX(old));
81
82 vma->shared.vm_set.head = NULL;
83 vma->shared.vm_set.parent = NULL;
84
85 if (!old->shared.vm_set.parent)
86 list_add(&vma->shared.vm_set.list,
87 &old->shared.vm_set.list);
88 else if (old->shared.vm_set.head)
89 list_add_tail(&vma->shared.vm_set.list,
90 &old->shared.vm_set.head->shared.vm_set.list);
91 else {
92 INIT_LIST_HEAD(&vma->shared.vm_set.list);
93 vma->shared.vm_set.head = old;
94 old->shared.vm_set.head = vma;
95 }
96}
97
98void vma_prio_tree_insert(struct vm_area_struct *vma,
99 struct prio_tree_root *root)
100{
101 struct prio_tree_node *ptr;
102 struct vm_area_struct *old;
103
104 vma->shared.vm_set.head = NULL;
105
106 ptr = raw_prio_tree_insert(root, &vma->shared.prio_tree_node);
107 if (ptr != (struct prio_tree_node *) &vma->shared.prio_tree_node) {
108 old = prio_tree_entry(ptr, struct vm_area_struct,
109 shared.prio_tree_node);
110 vma_prio_tree_add(vma, old);
111 }
112}
113
114void vma_prio_tree_remove(struct vm_area_struct *vma,
115 struct prio_tree_root *root)
116{
117 struct vm_area_struct *node, *head, *new_head;
118
119 if (!vma->shared.vm_set.head) {
120 if (!vma->shared.vm_set.parent)
121 list_del_init(&vma->shared.vm_set.list);
122 else
123 raw_prio_tree_remove(root, &vma->shared.prio_tree_node);
124 } else {
125 /* Leave this BUG_ON till prio_tree patch stabilizes */
126 BUG_ON(vma->shared.vm_set.head->shared.vm_set.head != vma);
127 if (vma->shared.vm_set.parent) {
128 head = vma->shared.vm_set.head;
129 if (!list_empty(&head->shared.vm_set.list)) {
130 new_head = list_entry(
131 head->shared.vm_set.list.next,
132 struct vm_area_struct,
133 shared.vm_set.list);
134 list_del_init(&head->shared.vm_set.list);
135 } else
136 new_head = NULL;
137
138 raw_prio_tree_replace(root, &vma->shared.prio_tree_node,
139 &head->shared.prio_tree_node);
140 head->shared.vm_set.head = new_head;
141 if (new_head)
142 new_head->shared.vm_set.head = head;
143
144 } else {
145 node = vma->shared.vm_set.head;
146 if (!list_empty(&vma->shared.vm_set.list)) {
147 new_head = list_entry(
148 vma->shared.vm_set.list.next,
149 struct vm_area_struct,
150 shared.vm_set.list);
151 list_del_init(&vma->shared.vm_set.list);
152 node->shared.vm_set.head = new_head;
153 new_head->shared.vm_set.head = node;
154 } else
155 node->shared.vm_set.head = NULL;
156 }
157 }
158}
159
160/*
161 * Helper function to enumerate vmas that map a given file page or a set of
162 * contiguous file pages. The function returns vmas that at least map a single
163 * page in the given range of contiguous file pages.
164 */
165struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma,
166 struct prio_tree_iter *iter)
167{
168 struct prio_tree_node *ptr;
169 struct vm_area_struct *next;
170
171 if (!vma) {
172 /*
173 * First call is with NULL vma
174 */
175 ptr = prio_tree_next(iter);
176 if (ptr) {
177 next = prio_tree_entry(ptr, struct vm_area_struct,
178 shared.prio_tree_node);
179 prefetch(next->shared.vm_set.head);
180 return next;
181 } else
182 return NULL;
183 }
184
185 if (vma->shared.vm_set.parent) {
186 if (vma->shared.vm_set.head) {
187 next = vma->shared.vm_set.head;
188 prefetch(next->shared.vm_set.list.next);
189 return next;
190 }
191 } else {
192 next = list_entry(vma->shared.vm_set.list.next,
193 struct vm_area_struct, shared.vm_set.list);
194 if (!next->shared.vm_set.head) {
195 prefetch(next->shared.vm_set.list.next);
196 return next;
197 }
198 }
199
200 ptr = prio_tree_next(iter);
201 if (ptr) {
202 next = prio_tree_entry(ptr, struct vm_area_struct,
203 shared.prio_tree_node);
204 prefetch(next->shared.vm_set.head);
205 return next;
206 } else
207 return NULL;
208}
diff --git a/mm/rmap.c b/mm/rmap.c
index 0f3b7cda2a24..7df7984d476c 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -127,12 +127,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
127 avc->vma = vma; 127 avc->vma = vma;
128 avc->anon_vma = anon_vma; 128 avc->anon_vma = anon_vma;
129 list_add(&avc->same_vma, &vma->anon_vma_chain); 129 list_add(&avc->same_vma, &vma->anon_vma_chain);
130 130 anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
131 /*
132 * It's critical to add new vmas to the tail of the anon_vma,
133 * see comment in huge_memory.c:__split_huge_page().
134 */
135 list_add_tail(&avc->same_anon_vma, &anon_vma->head);
136} 131}
137 132
138/** 133/**
@@ -269,51 +264,6 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
269} 264}
270 265
271/* 266/*
272 * Some rmap walk that needs to find all ptes/hugepmds without false
273 * negatives (like migrate and split_huge_page) running concurrent
274 * with operations that copy or move pagetables (like mremap() and
275 * fork()) to be safe. They depend on the anon_vma "same_anon_vma"
276 * list to be in a certain order: the dst_vma must be placed after the
277 * src_vma in the list. This is always guaranteed by fork() but
278 * mremap() needs to call this function to enforce it in case the
279 * dst_vma isn't newly allocated and chained with the anon_vma_clone()
280 * function but just an extension of a pre-existing vma through
281 * vma_merge.
282 *
283 * NOTE: the same_anon_vma list can still be changed by other
284 * processes while mremap runs because mremap doesn't hold the
285 * anon_vma mutex to prevent modifications to the list while it
286 * runs. All we need to enforce is that the relative order of this
287 * process vmas isn't changing (we don't care about other vmas
288 * order). Each vma corresponds to an anon_vma_chain structure so
289 * there's no risk that other processes calling anon_vma_moveto_tail()
290 * and changing the same_anon_vma list under mremap() will screw with
291 * the relative order of this process vmas in the list, because we
292 * they can't alter the order of any vma that belongs to this
293 * process. And there can't be another anon_vma_moveto_tail() running
294 * concurrently with mremap() coming from this process because we hold
295 * the mmap_sem for the whole mremap(). fork() ordering dependency
296 * also shouldn't be affected because fork() only cares that the
297 * parent vmas are placed in the list before the child vmas and
298 * anon_vma_moveto_tail() won't reorder vmas from either the fork()
299 * parent or child.
300 */
301void anon_vma_moveto_tail(struct vm_area_struct *dst)
302{
303 struct anon_vma_chain *pavc;
304 struct anon_vma *root = NULL;
305
306 list_for_each_entry_reverse(pavc, &dst->anon_vma_chain, same_vma) {
307 struct anon_vma *anon_vma = pavc->anon_vma;
308 VM_BUG_ON(pavc->vma != dst);
309 root = lock_anon_vma_root(root, anon_vma);
310 list_del(&pavc->same_anon_vma);
311 list_add_tail(&pavc->same_anon_vma, &anon_vma->head);
312 }
313 unlock_anon_vma_root(root);
314}
315
316/*
317 * Attach vma to its own anon_vma, as well as to the anon_vmas that 267 * Attach vma to its own anon_vma, as well as to the anon_vmas that
318 * the corresponding VMA in the parent process is attached to. 268 * the corresponding VMA in the parent process is attached to.
319 * Returns 0 on success, non-zero on failure. 269 * Returns 0 on success, non-zero on failure.
@@ -381,13 +331,13 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
381 struct anon_vma *anon_vma = avc->anon_vma; 331 struct anon_vma *anon_vma = avc->anon_vma;
382 332
383 root = lock_anon_vma_root(root, anon_vma); 333 root = lock_anon_vma_root(root, anon_vma);
384 list_del(&avc->same_anon_vma); 334 anon_vma_interval_tree_remove(avc, &anon_vma->rb_root);
385 335
386 /* 336 /*
387 * Leave empty anon_vmas on the list - we'll need 337 * Leave empty anon_vmas on the list - we'll need
388 * to free them outside the lock. 338 * to free them outside the lock.
389 */ 339 */
390 if (list_empty(&anon_vma->head)) 340 if (RB_EMPTY_ROOT(&anon_vma->rb_root))
391 continue; 341 continue;
392 342
393 list_del(&avc->same_vma); 343 list_del(&avc->same_vma);
@@ -416,7 +366,7 @@ static void anon_vma_ctor(void *data)
416 366
417 mutex_init(&anon_vma->mutex); 367 mutex_init(&anon_vma->mutex);
418 atomic_set(&anon_vma->refcount, 0); 368 atomic_set(&anon_vma->refcount, 0);
419 INIT_LIST_HEAD(&anon_vma->head); 369 anon_vma->rb_root = RB_ROOT;
420} 370}
421 371
422void __init anon_vma_init(void) 372void __init anon_vma_init(void)
@@ -560,22 +510,26 @@ void page_unlock_anon_vma(struct anon_vma *anon_vma)
560 510
561/* 511/*
562 * At what user virtual address is page expected in @vma? 512 * At what user virtual address is page expected in @vma?
563 * Returns virtual address or -EFAULT if page's index/offset is not
564 * within the range mapped the @vma.
565 */ 513 */
566inline unsigned long 514static inline unsigned long
567vma_address(struct page *page, struct vm_area_struct *vma) 515__vma_address(struct page *page, struct vm_area_struct *vma)
568{ 516{
569 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 517 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
570 unsigned long address;
571 518
572 if (unlikely(is_vm_hugetlb_page(vma))) 519 if (unlikely(is_vm_hugetlb_page(vma)))
573 pgoff = page->index << huge_page_order(page_hstate(page)); 520 pgoff = page->index << huge_page_order(page_hstate(page));
574 address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); 521
575 if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { 522 return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
576 /* page should be within @vma mapping range */ 523}
577 return -EFAULT; 524
578 } 525inline unsigned long
526vma_address(struct page *page, struct vm_area_struct *vma)
527{
528 unsigned long address = __vma_address(page, vma);
529
530 /* page should be within @vma mapping range */
531 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
532
579 return address; 533 return address;
580} 534}
581 535
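__vma_address() above is a single line of arithmetic: address = vm_start + ((page_index - vm_pgoff) << PAGE_SHIFT); the exported vma_address() now asserts the result lies inside the vma, while callers that can legitimately see an out-of-range page do their own bounds check instead of testing for -EFAULT. A worked example with made-up numbers and a PAGE_SHIFT of 12 (4 KiB pages):

#include <stdio.h>

#define PAGE_SHIFT 12	/* 4 KiB pages, as on most architectures */

static unsigned long __vma_address_model(unsigned long vm_start,
					 unsigned long vm_pgoff,
					 unsigned long page_index)
{
	return vm_start + ((page_index - vm_pgoff) << PAGE_SHIFT);
}

int main(void)
{
	/* vma maps file page offset 0x10 at 0x400000; the page's index is 0x13 */
	unsigned long addr = __vma_address_model(0x400000, 0x10, 0x13);

	printf("expected user address: %#lx\n", addr);	/* 0x403000 */
	return 0;
}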
@@ -585,6 +539,7 @@ vma_address(struct page *page, struct vm_area_struct *vma)
585 */ 539 */
586unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) 540unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
587{ 541{
542 unsigned long address;
588 if (PageAnon(page)) { 543 if (PageAnon(page)) {
589 struct anon_vma *page__anon_vma = page_anon_vma(page); 544 struct anon_vma *page__anon_vma = page_anon_vma(page);
590 /* 545 /*
@@ -600,7 +555,10 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
600 return -EFAULT; 555 return -EFAULT;
601 } else 556 } else
602 return -EFAULT; 557 return -EFAULT;
603 return vma_address(page, vma); 558 address = __vma_address(page, vma);
559 if (unlikely(address < vma->vm_start || address >= vma->vm_end))
560 return -EFAULT;
561 return address;
604} 562}
605 563
606/* 564/*
@@ -674,8 +632,8 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
674 pte_t *pte; 632 pte_t *pte;
675 spinlock_t *ptl; 633 spinlock_t *ptl;
676 634
677 address = vma_address(page, vma); 635 address = __vma_address(page, vma);
678 if (address == -EFAULT) /* out of vma range */ 636 if (unlikely(address < vma->vm_start || address >= vma->vm_end))
679 return 0; 637 return 0;
680 pte = page_check_address(page, vma->vm_mm, address, &ptl, 1); 638 pte = page_check_address(page, vma->vm_mm, address, &ptl, 1);
681 if (!pte) /* the page is not in this mm */ 639 if (!pte) /* the page is not in this mm */
@@ -769,6 +727,7 @@ static int page_referenced_anon(struct page *page,
769{ 727{
770 unsigned int mapcount; 728 unsigned int mapcount;
771 struct anon_vma *anon_vma; 729 struct anon_vma *anon_vma;
730 pgoff_t pgoff;
772 struct anon_vma_chain *avc; 731 struct anon_vma_chain *avc;
773 int referenced = 0; 732 int referenced = 0;
774 733
@@ -777,11 +736,10 @@ static int page_referenced_anon(struct page *page,
777 return referenced; 736 return referenced;
778 737
779 mapcount = page_mapcount(page); 738 mapcount = page_mapcount(page);
780 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { 739 pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
740 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
781 struct vm_area_struct *vma = avc->vma; 741 struct vm_area_struct *vma = avc->vma;
782 unsigned long address = vma_address(page, vma); 742 unsigned long address = vma_address(page, vma);
783 if (address == -EFAULT)
784 continue;
785 /* 743 /*
786 * If we are reclaiming on behalf of a cgroup, skip 744 * If we are reclaiming on behalf of a cgroup, skip
787 * counting on behalf of references from different 745 * counting on behalf of references from different
@@ -820,7 +778,6 @@ static int page_referenced_file(struct page *page,
820 struct address_space *mapping = page->mapping; 778 struct address_space *mapping = page->mapping;
821 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 779 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
822 struct vm_area_struct *vma; 780 struct vm_area_struct *vma;
823 struct prio_tree_iter iter;
824 int referenced = 0; 781 int referenced = 0;
825 782
826 /* 783 /*
@@ -846,10 +803,8 @@ static int page_referenced_file(struct page *page,
846 */ 803 */
847 mapcount = page_mapcount(page); 804 mapcount = page_mapcount(page);
848 805
849 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 806 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
850 unsigned long address = vma_address(page, vma); 807 unsigned long address = vma_address(page, vma);
851 if (address == -EFAULT)
852 continue;
853 /* 808 /*
854 * If we are reclaiming on behalf of a cgroup, skip 809 * If we are reclaiming on behalf of a cgroup, skip
855 * counting on behalf of references from different 810 * counting on behalf of references from different
@@ -929,7 +884,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
929 pte_t entry; 884 pte_t entry;
930 885
931 flush_cache_page(vma, address, pte_pfn(*pte)); 886 flush_cache_page(vma, address, pte_pfn(*pte));
932 entry = ptep_clear_flush_notify(vma, address, pte); 887 entry = ptep_clear_flush(vma, address, pte);
933 entry = pte_wrprotect(entry); 888 entry = pte_wrprotect(entry);
934 entry = pte_mkclean(entry); 889 entry = pte_mkclean(entry);
935 set_pte_at(mm, address, pte, entry); 890 set_pte_at(mm, address, pte, entry);
@@ -937,6 +892,9 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
937 } 892 }
938 893
939 pte_unmap_unlock(pte, ptl); 894 pte_unmap_unlock(pte, ptl);
895
896 if (ret)
897 mmu_notifier_invalidate_page(mm, address);
940out: 898out:
941 return ret; 899 return ret;
942} 900}
@@ -945,17 +903,14 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page)
945{ 903{
946 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 904 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
947 struct vm_area_struct *vma; 905 struct vm_area_struct *vma;
948 struct prio_tree_iter iter;
949 int ret = 0; 906 int ret = 0;
950 907
951 BUG_ON(PageAnon(page)); 908 BUG_ON(PageAnon(page));
952 909
953 mutex_lock(&mapping->i_mmap_mutex); 910 mutex_lock(&mapping->i_mmap_mutex);
954 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 911 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
955 if (vma->vm_flags & VM_SHARED) { 912 if (vma->vm_flags & VM_SHARED) {
956 unsigned long address = vma_address(page, vma); 913 unsigned long address = vma_address(page, vma);
957 if (address == -EFAULT)
958 continue;
959 ret += page_mkclean_one(page, vma, address); 914 ret += page_mkclean_one(page, vma, address);
960 } 915 }
961 } 916 }
@@ -1128,7 +1083,7 @@ void page_add_new_anon_rmap(struct page *page,
1128 else 1083 else
1129 __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); 1084 __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
1130 __page_set_anon_rmap(page, vma, address, 1); 1085 __page_set_anon_rmap(page, vma, address, 1);
1131 if (page_evictable(page, vma)) 1086 if (!mlocked_vma_newpage(vma, page))
1132 lru_cache_add_lru(page, LRU_ACTIVE_ANON); 1087 lru_cache_add_lru(page, LRU_ACTIVE_ANON);
1133 else 1088 else
1134 add_page_to_unevictable_list(page); 1089 add_page_to_unevictable_list(page);
@@ -1203,7 +1158,10 @@ void page_remove_rmap(struct page *page)
1203 } else { 1158 } else {
1204 __dec_zone_page_state(page, NR_FILE_MAPPED); 1159 __dec_zone_page_state(page, NR_FILE_MAPPED);
1205 mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED); 1160 mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED);
1161 mem_cgroup_end_update_page_stat(page, &locked, &flags);
1206 } 1162 }
1163 if (unlikely(PageMlocked(page)))
1164 clear_page_mlock(page);
1207 /* 1165 /*
1208 * It would be tidy to reset the PageAnon mapping here, 1166 * It would be tidy to reset the PageAnon mapping here,
1209 * but that might overwrite a racing page_add_anon_rmap 1167 * but that might overwrite a racing page_add_anon_rmap
@@ -1213,6 +1171,7 @@ void page_remove_rmap(struct page *page)
1213 * Leaving it set also helps swapoff to reinstate ptes 1171 * Leaving it set also helps swapoff to reinstate ptes
1214 * faster for those pages still in swapcache. 1172 * faster for those pages still in swapcache.
1215 */ 1173 */
1174 return;
1216out: 1175out:
1217 if (!anon) 1176 if (!anon)
1218 mem_cgroup_end_update_page_stat(page, &locked, &flags); 1177 mem_cgroup_end_update_page_stat(page, &locked, &flags);
@@ -1256,7 +1215,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1256 1215
1257 /* Nuke the page table entry. */ 1216 /* Nuke the page table entry. */
1258 flush_cache_page(vma, address, page_to_pfn(page)); 1217 flush_cache_page(vma, address, page_to_pfn(page));
1259 pteval = ptep_clear_flush_notify(vma, address, pte); 1218 pteval = ptep_clear_flush(vma, address, pte);
1260 1219
1261 /* Move the dirty bit to the physical page now the pte is gone. */ 1220 /* Move the dirty bit to the physical page now the pte is gone. */
1262 if (pte_dirty(pteval)) 1221 if (pte_dirty(pteval))
@@ -1318,6 +1277,8 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1318 1277
1319out_unmap: 1278out_unmap:
1320 pte_unmap_unlock(pte, ptl); 1279 pte_unmap_unlock(pte, ptl);
1280 if (ret != SWAP_FAIL)
1281 mmu_notifier_invalidate_page(mm, address);
1321out: 1282out:
1322 return ret; 1283 return ret;
1323 1284
@@ -1382,6 +1343,8 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1382 spinlock_t *ptl; 1343 spinlock_t *ptl;
1383 struct page *page; 1344 struct page *page;
1384 unsigned long address; 1345 unsigned long address;
1346 unsigned long mmun_start; /* For mmu_notifiers */
1347 unsigned long mmun_end; /* For mmu_notifiers */
1385 unsigned long end; 1348 unsigned long end;
1386 int ret = SWAP_AGAIN; 1349 int ret = SWAP_AGAIN;
1387 int locked_vma = 0; 1350 int locked_vma = 0;
@@ -1405,6 +1368,10 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1405 if (!pmd_present(*pmd)) 1368 if (!pmd_present(*pmd))
1406 return ret; 1369 return ret;
1407 1370
1371 mmun_start = address;
1372 mmun_end = end;
1373 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
1374
1408 /* 1375 /*
1409 * If we can acquire the mmap_sem for read, and vma is VM_LOCKED, 1376 * If we can acquire the mmap_sem for read, and vma is VM_LOCKED,
1410 * keep the sem while scanning the cluster for mlocking pages. 1377 * keep the sem while scanning the cluster for mlocking pages.
@@ -1438,7 +1405,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1438 1405
1439 /* Nuke the page table entry. */ 1406 /* Nuke the page table entry. */
1440 flush_cache_page(vma, address, pte_pfn(*pte)); 1407 flush_cache_page(vma, address, pte_pfn(*pte));
1441 pteval = ptep_clear_flush_notify(vma, address, pte); 1408 pteval = ptep_clear_flush(vma, address, pte);
1442 1409
1443 /* If nonlinear, store the file page offset in the pte. */ 1410 /* If nonlinear, store the file page offset in the pte. */
1444 if (page->index != linear_page_index(vma, address)) 1411 if (page->index != linear_page_index(vma, address))
@@ -1454,6 +1421,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1454 (*mapcount)--; 1421 (*mapcount)--;
1455 } 1422 }
1456 pte_unmap_unlock(pte - 1, ptl); 1423 pte_unmap_unlock(pte - 1, ptl);
1424 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1457 if (locked_vma) 1425 if (locked_vma)
1458 up_read(&vma->vm_mm->mmap_sem); 1426 up_read(&vma->vm_mm->mmap_sem);
1459 return ret; 1427 return ret;
@@ -1492,6 +1460,7 @@ bool is_vma_temporary_stack(struct vm_area_struct *vma)
1492static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) 1460static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
1493{ 1461{
1494 struct anon_vma *anon_vma; 1462 struct anon_vma *anon_vma;
1463 pgoff_t pgoff;
1495 struct anon_vma_chain *avc; 1464 struct anon_vma_chain *avc;
1496 int ret = SWAP_AGAIN; 1465 int ret = SWAP_AGAIN;
1497 1466
@@ -1499,7 +1468,8 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
1499 if (!anon_vma) 1468 if (!anon_vma)
1500 return ret; 1469 return ret;
1501 1470
1502 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { 1471 pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
1472 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
1503 struct vm_area_struct *vma = avc->vma; 1473 struct vm_area_struct *vma = avc->vma;
1504 unsigned long address; 1474 unsigned long address;
1505 1475
@@ -1516,8 +1486,6 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
1516 continue; 1486 continue;
1517 1487
1518 address = vma_address(page, vma); 1488 address = vma_address(page, vma);
1519 if (address == -EFAULT)
1520 continue;
1521 ret = try_to_unmap_one(page, vma, address, flags); 1489 ret = try_to_unmap_one(page, vma, address, flags);
1522 if (ret != SWAP_AGAIN || !page_mapped(page)) 1490 if (ret != SWAP_AGAIN || !page_mapped(page))
1523 break; 1491 break;
@@ -1547,7 +1515,6 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1547 struct address_space *mapping = page->mapping; 1515 struct address_space *mapping = page->mapping;
1548 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 1516 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
1549 struct vm_area_struct *vma; 1517 struct vm_area_struct *vma;
1550 struct prio_tree_iter iter;
1551 int ret = SWAP_AGAIN; 1518 int ret = SWAP_AGAIN;
1552 unsigned long cursor; 1519 unsigned long cursor;
1553 unsigned long max_nl_cursor = 0; 1520 unsigned long max_nl_cursor = 0;
@@ -1555,10 +1522,8 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1555 unsigned int mapcount; 1522 unsigned int mapcount;
1556 1523
1557 mutex_lock(&mapping->i_mmap_mutex); 1524 mutex_lock(&mapping->i_mmap_mutex);
1558 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 1525 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
1559 unsigned long address = vma_address(page, vma); 1526 unsigned long address = vma_address(page, vma);
1560 if (address == -EFAULT)
1561 continue;
1562 ret = try_to_unmap_one(page, vma, address, flags); 1527 ret = try_to_unmap_one(page, vma, address, flags);
1563 if (ret != SWAP_AGAIN || !page_mapped(page)) 1528 if (ret != SWAP_AGAIN || !page_mapped(page))
1564 goto out; 1529 goto out;
@@ -1576,7 +1541,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1576 goto out; 1541 goto out;
1577 1542
1578 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 1543 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
1579 shared.vm_set.list) { 1544 shared.nonlinear) {
1580 cursor = (unsigned long) vma->vm_private_data; 1545 cursor = (unsigned long) vma->vm_private_data;
1581 if (cursor > max_nl_cursor) 1546 if (cursor > max_nl_cursor)
1582 max_nl_cursor = cursor; 1547 max_nl_cursor = cursor;
@@ -1608,7 +1573,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1608 1573
1609 do { 1574 do {
1610 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 1575 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
1611 shared.vm_set.list) { 1576 shared.nonlinear) {
1612 cursor = (unsigned long) vma->vm_private_data; 1577 cursor = (unsigned long) vma->vm_private_data;
1613 while ( cursor < max_nl_cursor && 1578 while ( cursor < max_nl_cursor &&
1614 cursor < vma->vm_end - vma->vm_start) { 1579 cursor < vma->vm_end - vma->vm_start) {
@@ -1631,7 +1596,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1631 * in locked vmas). Reset cursor on all unreserved nonlinear 1596 * in locked vmas). Reset cursor on all unreserved nonlinear
1632 * vmas, now forgetting on which ones it had fallen behind. 1597 * vmas, now forgetting on which ones it had fallen behind.
1633 */ 1598 */
1634 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) 1599 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear)
1635 vma->vm_private_data = NULL; 1600 vma->vm_private_data = NULL;
1636out: 1601out:
1637 mutex_unlock(&mapping->i_mmap_mutex); 1602 mutex_unlock(&mapping->i_mmap_mutex);
@@ -1716,6 +1681,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1716 struct vm_area_struct *, unsigned long, void *), void *arg) 1681 struct vm_area_struct *, unsigned long, void *), void *arg)
1717{ 1682{
1718 struct anon_vma *anon_vma; 1683 struct anon_vma *anon_vma;
1684 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
1719 struct anon_vma_chain *avc; 1685 struct anon_vma_chain *avc;
1720 int ret = SWAP_AGAIN; 1686 int ret = SWAP_AGAIN;
1721 1687
@@ -1729,11 +1695,9 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1729 if (!anon_vma) 1695 if (!anon_vma)
1730 return ret; 1696 return ret;
1731 anon_vma_lock(anon_vma); 1697 anon_vma_lock(anon_vma);
1732 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { 1698 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
1733 struct vm_area_struct *vma = avc->vma; 1699 struct vm_area_struct *vma = avc->vma;
1734 unsigned long address = vma_address(page, vma); 1700 unsigned long address = vma_address(page, vma);
1735 if (address == -EFAULT)
1736 continue;
1737 ret = rmap_one(page, vma, address, arg); 1701 ret = rmap_one(page, vma, address, arg);
1738 if (ret != SWAP_AGAIN) 1702 if (ret != SWAP_AGAIN)
1739 break; 1703 break;
@@ -1748,16 +1712,13 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *,
1748 struct address_space *mapping = page->mapping; 1712 struct address_space *mapping = page->mapping;
1749 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 1713 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
1750 struct vm_area_struct *vma; 1714 struct vm_area_struct *vma;
1751 struct prio_tree_iter iter;
1752 int ret = SWAP_AGAIN; 1715 int ret = SWAP_AGAIN;
1753 1716
1754 if (!mapping) 1717 if (!mapping)
1755 return ret; 1718 return ret;
1756 mutex_lock(&mapping->i_mmap_mutex); 1719 mutex_lock(&mapping->i_mmap_mutex);
1757 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 1720 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
1758 unsigned long address = vma_address(page, vma); 1721 unsigned long address = vma_address(page, vma);
1759 if (address == -EFAULT)
1760 continue;
1761 ret = rmap_one(page, vma, address, arg); 1722 ret = rmap_one(page, vma, address, arg);
1762 if (ret != SWAP_AGAIN) 1723 if (ret != SWAP_AGAIN)
1763 break; 1724 break;
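The rmap hunks above replace the prio-tree walk with vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff), which only visits VMAs whose file range actually covers pgoff; that is why the old "if (address == -EFAULT) continue;" guard after vma_address() disappears in these hunks. Below is a userspace sketch of the same idea, not kernel code: every name in it (struct vma_like, addr_of_pgoff(), the sample addresses) is invented for illustration. Once the lookup is restricted to overlapping ranges, the computed address is always inside the mapping.

#include <stdio.h>

#define PAGE_SHIFT 12

struct vma_like {
	unsigned long start;	/* first mapped virtual address */
	unsigned long end;	/* one past the last mapped address */
	unsigned long pgoff;	/* file page offset backing 'start' */
};

/* Same arithmetic as vma_address(): file page offset -> virtual address. */
static unsigned long addr_of_pgoff(const struct vma_like *v, unsigned long pgoff)
{
	return v->start + ((pgoff - v->pgoff) << PAGE_SHIFT);
}

int main(void)
{
	/* Two mappings of one file at different offsets and sizes. */
	struct vma_like vmas[] = {
		{ 0x400000UL, 0x402000UL, 0 },	/* covers file pages 0-1 */
		{ 0x500000UL, 0x501000UL, 4 },	/* covers file page 4 only */
	};
	unsigned long pgoff = 4;		/* the page being unmapped */
	unsigned int i;

	for (i = 0; i < sizeof(vmas) / sizeof(vmas[0]); i++) {
		const struct vma_like *v = &vmas[i];
		unsigned long last = v->pgoff +
			((v->end - v->start) >> PAGE_SHIFT) - 1;

		/* A [pgoff, pgoff] interval query would yield only these. */
		if (pgoff < v->pgoff || pgoff > last)
			continue;

		/* Guaranteed to lie inside [start, end): no -EFAULT check. */
		printf("vma %u maps file page %lu at 0x%lx\n",
		       i, pgoff, addr_of_pgoff(v, pgoff));
	}
	return 0;
}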
diff --git a/mm/shmem.c b/mm/shmem.c
index d3752110c8c7..cc12072f8787 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1339,7 +1339,6 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
1339{ 1339{
1340 file_accessed(file); 1340 file_accessed(file);
1341 vma->vm_ops = &shmem_vm_ops; 1341 vma->vm_ops = &shmem_vm_ops;
1342 vma->vm_flags |= VM_CAN_NONLINEAR;
1343 return 0; 1342 return 0;
1344} 1343}
1345 1344
@@ -2643,6 +2642,7 @@ static const struct vm_operations_struct shmem_vm_ops = {
2643 .set_policy = shmem_set_policy, 2642 .set_policy = shmem_set_policy,
2644 .get_policy = shmem_get_policy, 2643 .get_policy = shmem_get_policy,
2645#endif 2644#endif
2645 .remap_pages = generic_file_remap_pages,
2646}; 2646};
2647 2647
2648static struct dentry *shmem_mount(struct file_system_type *fs_type, 2648static struct dentry *shmem_mount(struct file_system_type *fs_type,
@@ -2836,7 +2836,6 @@ int shmem_zero_setup(struct vm_area_struct *vma)
2836 fput(vma->vm_file); 2836 fput(vma->vm_file);
2837 vma->vm_file = file; 2837 vma->vm_file = file;
2838 vma->vm_ops = &shmem_vm_ops; 2838 vma->vm_ops = &shmem_vm_ops;
2839 vma->vm_flags |= VM_CAN_NONLINEAR;
2840 return 0; 2839 return 0;
2841} 2840}
2842 2841
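In the shmem hunks the VM_CAN_NONLINEAR flag goes away and support for nonlinear remapping is instead advertised by giving the vm_operations a .remap_pages method (generic_file_remap_pages here). A minimal userspace sketch of that pattern follows; struct ops_like, do_remap() and the sample values are made up for the example, but the point is the same: the capability is implied by a non-NULL method rather than by a separate flag.

#include <stdio.h>
#include <stddef.h>

struct ops_like {
	/* NULL means "nonlinear remap not supported"; no extra flag needed. */
	int (*remap_pages)(unsigned long start, unsigned long size);
};

static int generic_remap_pages(unsigned long start, unsigned long size)
{
	printf("remapping %lu bytes at 0x%lx\n", size, start);
	return 0;
}

static int do_remap(const struct ops_like *ops, unsigned long start,
		    unsigned long size)
{
	if (!ops->remap_pages)	/* capability check by method presence */
		return -1;
	return ops->remap_pages(start, size);
}

int main(void)
{
	struct ops_like shmem_like = { .remap_pages = generic_remap_pages };
	struct ops_like plain = { .remap_pages = NULL };

	do_remap(&shmem_like, 0x400000UL, 4096);
	if (do_remap(&plain, 0x400000UL, 4096) != 0)
		printf("nonlinear remap rejected for plain mapping\n");
	return 0;
}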
diff --git a/mm/swap.c b/mm/swap.c
index 77825883298f..6310dc2008ff 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -446,13 +446,22 @@ void mark_page_accessed(struct page *page)
446} 446}
447EXPORT_SYMBOL(mark_page_accessed); 447EXPORT_SYMBOL(mark_page_accessed);
448 448
449/*
450 * Order of operations is important: flush the pagevec when it's already
451 * full, not when adding the last page, to make sure that last page is
452 * not added to the LRU directly when passed to this function. Because
453 * mark_page_accessed() (called after this when writing) only activates
454 * pages that are on the LRU, linear writes in subpage chunks would see
455 * every PAGEVEC_SIZE page activated, which is unexpected.
456 */
449void __lru_cache_add(struct page *page, enum lru_list lru) 457void __lru_cache_add(struct page *page, enum lru_list lru)
450{ 458{
451 struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru]; 459 struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru];
452 460
453 page_cache_get(page); 461 page_cache_get(page);
454 if (!pagevec_add(pvec, page)) 462 if (!pagevec_space(pvec))
455 __pagevec_lru_add(pvec, lru); 463 __pagevec_lru_add(pvec, lru);
464 pagevec_add(pvec, page);
456 put_cpu_var(lru_add_pvecs); 465 put_cpu_var(lru_add_pvecs);
457} 466}
458EXPORT_SYMBOL(__lru_cache_add); 467EXPORT_SYMBOL(__lru_cache_add);
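The comment added in this hunk explains why __lru_cache_add() now drains a full pagevec before inserting the new page rather than after: the page being added must still be off the LRU when mark_page_accessed() runs later in the write path. A standalone sketch of that ordering is below; struct batch, drain() and the item values are invented for illustration, but the invariant mirrors pagevec_space()/pagevec_add(): the most recently added item always stays buffered.

#include <stdio.h>

#define BATCH_SIZE 4

struct batch {
	int item[BATCH_SIZE];
	int count;
};

static void drain(struct batch *b)
{
	int i;

	for (i = 0; i < b->count; i++)
		printf("flushing item %d\n", b->item[i]);
	b->count = 0;
}

/* Drain when already full, *then* add the new item. */
static void add(struct batch *b, int v)
{
	if (b->count == BATCH_SIZE)
		drain(b);
	b->item[b->count++] = v;
}

int main(void)
{
	struct batch b = { .count = 0 };
	int i;

	for (i = 1; i <= 10; i++)
		add(&b, i);

	/* Item 10 is still buffered here, like the page that must not hit
	 * the LRU before mark_page_accessed() sees it. */
	printf("%d items still buffered\n", b.count);
	return 0;
}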
@@ -742,7 +751,7 @@ void lru_add_page_tail(struct page *page, struct page *page_tail,
742 751
743 SetPageLRU(page_tail); 752 SetPageLRU(page_tail);
744 753
745 if (page_evictable(page_tail, NULL)) { 754 if (page_evictable(page_tail)) {
746 if (PageActive(page)) { 755 if (PageActive(page)) {
747 SetPageActive(page_tail); 756 SetPageActive(page_tail);
748 active = 1; 757 active = 1;
diff --git a/mm/truncate.c b/mm/truncate.c
index 75801acdaac7..d51ce92d6e83 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -107,7 +107,6 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
107 107
108 cancel_dirty_page(page, PAGE_CACHE_SIZE); 108 cancel_dirty_page(page, PAGE_CACHE_SIZE);
109 109
110 clear_page_mlock(page);
111 ClearPageMappedToDisk(page); 110 ClearPageMappedToDisk(page);
112 delete_from_page_cache(page); 111 delete_from_page_cache(page);
113 return 0; 112 return 0;
@@ -132,7 +131,6 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
132 if (page_has_private(page) && !try_to_release_page(page, 0)) 131 if (page_has_private(page) && !try_to_release_page(page, 0))
133 return 0; 132 return 0;
134 133
135 clear_page_mlock(page);
136 ret = remove_mapping(mapping, page); 134 ret = remove_mapping(mapping, page);
137 135
138 return ret; 136 return ret;
@@ -398,7 +396,6 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
398 if (PageDirty(page)) 396 if (PageDirty(page))
399 goto failed; 397 goto failed;
400 398
401 clear_page_mlock(page);
402 BUG_ON(page_has_private(page)); 399 BUG_ON(page_has_private(page));
403 __delete_from_page_cache(page); 400 __delete_from_page_cache(page);
404 spin_unlock_irq(&mapping->tree_lock); 401 spin_unlock_irq(&mapping->tree_lock);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 2bb90b1d241c..78e08300db21 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2163,8 +2163,7 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
2163 usize -= PAGE_SIZE; 2163 usize -= PAGE_SIZE;
2164 } while (usize > 0); 2164 } while (usize > 0);
2165 2165
2166 /* Prevent "things" like memory migration? VM_flags need a cleanup... */ 2166 vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
2167 vma->vm_flags |= VM_RESERVED;
2168 2167
2169 return 0; 2168 return 0;
2170} 2169}
@@ -2572,7 +2571,7 @@ static int s_show(struct seq_file *m, void *p)
2572{ 2571{
2573 struct vm_struct *v = p; 2572 struct vm_struct *v = p;
2574 2573
2575 seq_printf(m, "0x%p-0x%p %7ld", 2574 seq_printf(m, "0x%pK-0x%pK %7ld",
2576 v->addr, v->addr + v->size, v->size); 2575 v->addr, v->addr + v->size, v->size);
2577 2576
2578 if (v->caller) 2577 if (v->caller)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 99b434b674c0..2624edcfb420 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -553,7 +553,7 @@ void putback_lru_page(struct page *page)
553redo: 553redo:
554 ClearPageUnevictable(page); 554 ClearPageUnevictable(page);
555 555
556 if (page_evictable(page, NULL)) { 556 if (page_evictable(page)) {
557 /* 557 /*
558 * For evictable pages, we can use the cache. 558 * For evictable pages, we can use the cache.
559 * In event of a race, worst case is we end up with an 559 * In event of a race, worst case is we end up with an
@@ -587,7 +587,7 @@ redo:
587 * page is on unevictable list, it never be freed. To avoid that, 587 * page is on unevictable list, it never be freed. To avoid that,
588 * check after we added it to the list, again. 588 * check after we added it to the list, again.
589 */ 589 */
590 if (lru == LRU_UNEVICTABLE && page_evictable(page, NULL)) { 590 if (lru == LRU_UNEVICTABLE && page_evictable(page)) {
591 if (!isolate_lru_page(page)) { 591 if (!isolate_lru_page(page)) {
592 put_page(page); 592 put_page(page);
593 goto redo; 593 goto redo;
@@ -674,8 +674,10 @@ static enum page_references page_check_references(struct page *page,
674static unsigned long shrink_page_list(struct list_head *page_list, 674static unsigned long shrink_page_list(struct list_head *page_list,
675 struct zone *zone, 675 struct zone *zone,
676 struct scan_control *sc, 676 struct scan_control *sc,
677 enum ttu_flags ttu_flags,
677 unsigned long *ret_nr_dirty, 678 unsigned long *ret_nr_dirty,
678 unsigned long *ret_nr_writeback) 679 unsigned long *ret_nr_writeback,
680 bool force_reclaim)
679{ 681{
680 LIST_HEAD(ret_pages); 682 LIST_HEAD(ret_pages);
681 LIST_HEAD(free_pages); 683 LIST_HEAD(free_pages);
@@ -689,10 +691,10 @@ static unsigned long shrink_page_list(struct list_head *page_list,
689 691
690 mem_cgroup_uncharge_start(); 692 mem_cgroup_uncharge_start();
691 while (!list_empty(page_list)) { 693 while (!list_empty(page_list)) {
692 enum page_references references;
693 struct address_space *mapping; 694 struct address_space *mapping;
694 struct page *page; 695 struct page *page;
695 int may_enter_fs; 696 int may_enter_fs;
697 enum page_references references = PAGEREF_RECLAIM_CLEAN;
696 698
697 cond_resched(); 699 cond_resched();
698 700
@@ -707,7 +709,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
707 709
708 sc->nr_scanned++; 710 sc->nr_scanned++;
709 711
710 if (unlikely(!page_evictable(page, NULL))) 712 if (unlikely(!page_evictable(page)))
711 goto cull_mlocked; 713 goto cull_mlocked;
712 714
713 if (!sc->may_unmap && page_mapped(page)) 715 if (!sc->may_unmap && page_mapped(page))
@@ -758,7 +760,9 @@ static unsigned long shrink_page_list(struct list_head *page_list,
758 wait_on_page_writeback(page); 760 wait_on_page_writeback(page);
759 } 761 }
760 762
761 references = page_check_references(page, sc); 763 if (!force_reclaim)
764 references = page_check_references(page, sc);
765
762 switch (references) { 766 switch (references) {
763 case PAGEREF_ACTIVATE: 767 case PAGEREF_ACTIVATE:
764 goto activate_locked; 768 goto activate_locked;
@@ -788,7 +792,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
788 * processes. Try to unmap it here. 792 * processes. Try to unmap it here.
789 */ 793 */
790 if (page_mapped(page) && mapping) { 794 if (page_mapped(page) && mapping) {
791 switch (try_to_unmap(page, TTU_UNMAP)) { 795 switch (try_to_unmap(page, ttu_flags)) {
792 case SWAP_FAIL: 796 case SWAP_FAIL:
793 goto activate_locked; 797 goto activate_locked;
794 case SWAP_AGAIN: 798 case SWAP_AGAIN:
@@ -960,6 +964,33 @@ keep:
960 return nr_reclaimed; 964 return nr_reclaimed;
961} 965}
962 966
967unsigned long reclaim_clean_pages_from_list(struct zone *zone,
968 struct list_head *page_list)
969{
970 struct scan_control sc = {
971 .gfp_mask = GFP_KERNEL,
972 .priority = DEF_PRIORITY,
973 .may_unmap = 1,
974 };
975 unsigned long ret, dummy1, dummy2;
976 struct page *page, *next;
977 LIST_HEAD(clean_pages);
978
979 list_for_each_entry_safe(page, next, page_list, lru) {
980 if (page_is_file_cache(page) && !PageDirty(page)) {
981 ClearPageActive(page);
982 list_move(&page->lru, &clean_pages);
983 }
984 }
985
986 ret = shrink_page_list(&clean_pages, zone, &sc,
987 TTU_UNMAP|TTU_IGNORE_ACCESS,
988 &dummy1, &dummy2, true);
989 list_splice(&clean_pages, page_list);
990 __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret);
991 return ret;
992}
993
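reclaim_clean_pages_from_list() above carves the clean file pages out of an isolated page list, pushes only those through shrink_page_list() with TTU_UNMAP|TTU_IGNORE_ACCESS, and splices whatever was not reclaimed back onto the caller's list. The userspace sketch below shows just the carve-out step on a plain singly linked list; struct page_like and split_clean() are invented names and this is not the kernel's list_head API.

#include <stdio.h>

struct page_like {
	int id;
	int dirty;			/* 1 = dirty, 0 = clean file page */
	struct page_like *next;
};

/* Move clean entries from *list onto *clean, keeping dirty ones in place. */
static void split_clean(struct page_like **list, struct page_like **clean)
{
	struct page_like **pp = list;

	while (*pp) {
		if (!(*pp)->dirty) {
			struct page_like *p = *pp;

			*pp = p->next;		/* unlink from the main list */
			p->next = *clean;	/* push onto the clean list */
			*clean = p;
		} else {
			pp = &(*pp)->next;
		}
	}
}

int main(void)
{
	struct page_like pages[5];
	struct page_like *list = NULL, *clean = NULL, *p;
	int i;

	for (i = 4; i >= 0; i--) {
		pages[i].id = i;
		pages[i].dirty = i & 1;
		pages[i].next = list;
		list = &pages[i];
	}

	split_clean(&list, &clean);

	for (p = clean; p; p = p->next)
		printf("reclaim clean page %d\n", p->id);
	for (p = list; p; p = p->next)
		printf("keep dirty page %d\n", p->id);
	return 0;
}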
963/* 994/*
964 * Attempt to remove the specified page from its LRU. Only take this page 995 * Attempt to remove the specified page from its LRU. Only take this page
965 * if it is of the appropriate PageActive status. Pages which are being 996 * if it is of the appropriate PageActive status. Pages which are being
@@ -978,8 +1009,8 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode)
978 if (!PageLRU(page)) 1009 if (!PageLRU(page))
979 return ret; 1010 return ret;
980 1011
981 /* Do not give back unevictable pages for compaction */ 1012 /* Compaction should not handle unevictable pages but CMA can do so */
982 if (PageUnevictable(page)) 1013 if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE))
983 return ret; 1014 return ret;
984 1015
985 ret = -EBUSY; 1016 ret = -EBUSY;
@@ -1186,7 +1217,7 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
1186 1217
1187 VM_BUG_ON(PageLRU(page)); 1218 VM_BUG_ON(PageLRU(page));
1188 list_del(&page->lru); 1219 list_del(&page->lru);
1189 if (unlikely(!page_evictable(page, NULL))) { 1220 if (unlikely(!page_evictable(page))) {
1190 spin_unlock_irq(&zone->lru_lock); 1221 spin_unlock_irq(&zone->lru_lock);
1191 putback_lru_page(page); 1222 putback_lru_page(page);
1192 spin_lock_irq(&zone->lru_lock); 1223 spin_lock_irq(&zone->lru_lock);
@@ -1278,8 +1309,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1278 if (nr_taken == 0) 1309 if (nr_taken == 0)
1279 return 0; 1310 return 0;
1280 1311
1281 nr_reclaimed = shrink_page_list(&page_list, zone, sc, 1312 nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
1282 &nr_dirty, &nr_writeback); 1313 &nr_dirty, &nr_writeback, false);
1283 1314
1284 spin_lock_irq(&zone->lru_lock); 1315 spin_lock_irq(&zone->lru_lock);
1285 1316
@@ -1439,7 +1470,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
1439 page = lru_to_page(&l_hold); 1470 page = lru_to_page(&l_hold);
1440 list_del(&page->lru); 1471 list_del(&page->lru);
1441 1472
1442 if (unlikely(!page_evictable(page, NULL))) { 1473 if (unlikely(!page_evictable(page))) {
1443 putback_lru_page(page); 1474 putback_lru_page(page);
1444 continue; 1475 continue;
1445 } 1476 }
@@ -1729,6 +1760,28 @@ static bool in_reclaim_compaction(struct scan_control *sc)
1729 return false; 1760 return false;
1730} 1761}
1731 1762
1763#ifdef CONFIG_COMPACTION
1764/*
1765 * If compaction is deferred for sc->order then scale the number of pages
1766 * reclaimed based on the number of consecutive allocation failures
1767 */
1768static unsigned long scale_for_compaction(unsigned long pages_for_compaction,
1769 struct lruvec *lruvec, struct scan_control *sc)
1770{
1771 struct zone *zone = lruvec_zone(lruvec);
1772
1773 if (zone->compact_order_failed <= sc->order)
1774 pages_for_compaction <<= zone->compact_defer_shift;
1775 return pages_for_compaction;
1776}
1777#else
1778static unsigned long scale_for_compaction(unsigned long pages_for_compaction,
1779 struct lruvec *lruvec, struct scan_control *sc)
1780{
1781 return pages_for_compaction;
1782}
1783#endif
1784
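scale_for_compaction() above raises the reclaim target by the zone's compact_defer_shift when compaction has recently failed at this allocation order, so the target roughly doubles with each consecutive deferral. A trivial standalone illustration of that scaling follows; scale_target() and the sample order are made up here, and the real deferral bookkeeping lives in the compaction code.

#include <stdio.h>

/* Each consecutive compaction deferral doubles the reclaim target. */
static unsigned long scale_target(unsigned long base, unsigned int defer_shift)
{
	return base << defer_shift;
}

int main(void)
{
	unsigned long base = 2UL << 3;	/* 2UL << sc->order, with order == 3 */
	unsigned int shift;

	for (shift = 0; shift <= 4; shift++)
		printf("defer_shift=%u -> reclaim %lu pages before compacting\n",
		       shift, scale_target(base, shift));
	return 0;
}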
1732/* 1785/*
1733 * Reclaim/compaction is used for high-order allocation requests. It reclaims 1786 * Reclaim/compaction is used for high-order allocation requests. It reclaims
1734 * order-0 pages before compacting the zone. should_continue_reclaim() returns 1787 * order-0 pages before compacting the zone. should_continue_reclaim() returns
@@ -1776,6 +1829,9 @@ static inline bool should_continue_reclaim(struct lruvec *lruvec,
1776 * inactive lists are large enough, continue reclaiming 1829 * inactive lists are large enough, continue reclaiming
1777 */ 1830 */
1778 pages_for_compaction = (2UL << sc->order); 1831 pages_for_compaction = (2UL << sc->order);
1832
1833 pages_for_compaction = scale_for_compaction(pages_for_compaction,
1834 lruvec, sc);
1779 inactive_lru_pages = get_lru_size(lruvec, LRU_INACTIVE_FILE); 1835 inactive_lru_pages = get_lru_size(lruvec, LRU_INACTIVE_FILE);
1780 if (nr_swap_pages > 0) 1836 if (nr_swap_pages > 0)
1781 inactive_lru_pages += get_lru_size(lruvec, LRU_INACTIVE_ANON); 1837 inactive_lru_pages += get_lru_size(lruvec, LRU_INACTIVE_ANON);
@@ -2839,6 +2895,14 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
2839 */ 2895 */
2840 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); 2896 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
2841 2897
2898 /*
2899 * Compaction records what page blocks it recently failed to
2900 * isolate pages from and skips them in the future scanning.
2901 * When kswapd is going to sleep, it is reasonable to assume
2902 * that pages and compaction may succeed so reset the cache.
2903 */
2904 reset_isolation_suitable(pgdat);
2905
2842 if (!kthread_should_stop()) 2906 if (!kthread_should_stop())
2843 schedule(); 2907 schedule();
2844 2908
@@ -3101,9 +3165,9 @@ int kswapd_run(int nid)
3101 if (IS_ERR(pgdat->kswapd)) { 3165 if (IS_ERR(pgdat->kswapd)) {
3102 /* failure at boot is fatal */ 3166 /* failure at boot is fatal */
3103 BUG_ON(system_state == SYSTEM_BOOTING); 3167 BUG_ON(system_state == SYSTEM_BOOTING);
3104 printk("Failed to start kswapd on node %d\n",nid);
3105 pgdat->kswapd = NULL; 3168 pgdat->kswapd = NULL;
3106 ret = -1; 3169 pr_err("Failed to start kswapd on node %d\n", nid);
3170 ret = PTR_ERR(pgdat->kswapd);
3107 } 3171 }
3108 return ret; 3172 return ret;
3109} 3173}
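The kswapd_run() hunk stops returning a hard-coded -1 and instead propagates the error that kthread_run() encoded in its returned pointer, via PTR_ERR(). The sketch below re-creates the ERR_PTR()/IS_ERR()/PTR_ERR() idiom in userspace purely for illustration; the helpers and start_worker() are re-implemented here, not taken from kernel headers.

#include <stdio.h>
#include <errno.h>

#define MAX_ERRNO 4095

/* Userspace re-creation of the kernel's pointer/errno encoding. */
static inline void *ERR_PTR(long error)
{
	return (void *)error;
}

static inline long PTR_ERR(const void *ptr)
{
	return (long)ptr;
}

static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

static int worker_storage;

/* Stand-in for kthread_run(): returns a valid pointer or ERR_PTR(). */
static void *start_worker(int fail)
{
	if (fail)
		return ERR_PTR(-ENOMEM);
	return &worker_storage;
}

int main(void)
{
	void *w = start_worker(1);
	long ret = 0;

	if (IS_ERR(w)) {
		ret = PTR_ERR(w);	/* the real errno, not a generic -1 */
		fprintf(stderr, "failed to start worker: %ld\n", ret);
	}
	return ret ? 1 : 0;
}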
@@ -3350,27 +3414,18 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3350/* 3414/*
3351 * page_evictable - test whether a page is evictable 3415 * page_evictable - test whether a page is evictable
3352 * @page: the page to test 3416 * @page: the page to test
3353 * @vma: the VMA in which the page is or will be mapped, may be NULL
3354 * 3417 *
3355 * Test whether page is evictable--i.e., should be placed on active/inactive 3418 * Test whether page is evictable--i.e., should be placed on active/inactive
3356 * lists vs unevictable list. The vma argument is !NULL when called from the 3419 * lists vs unevictable list.
3357 * fault path to determine how to instantate a new page.
3358 * 3420 *
3359 * Reasons page might not be evictable: 3421 * Reasons page might not be evictable:
3360 * (1) page's mapping marked unevictable 3422 * (1) page's mapping marked unevictable
3361 * (2) page is part of an mlocked VMA 3423 * (2) page is part of an mlocked VMA
3362 * 3424 *
3363 */ 3425 */
3364int page_evictable(struct page *page, struct vm_area_struct *vma) 3426int page_evictable(struct page *page)
3365{ 3427{
3366 3428 return !mapping_unevictable(page_mapping(page)) && !PageMlocked(page);
3367 if (mapping_unevictable(page_mapping(page)))
3368 return 0;
3369
3370 if (PageMlocked(page) || (vma && mlocked_vma_newpage(vma, page)))
3371 return 0;
3372
3373 return 1;
3374} 3429}
3375 3430
3376#ifdef CONFIG_SHMEM 3431#ifdef CONFIG_SHMEM
@@ -3408,7 +3463,7 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages)
3408 if (!PageLRU(page) || !PageUnevictable(page)) 3463 if (!PageLRU(page) || !PageUnevictable(page))
3409 continue; 3464 continue;
3410 3465
3411 if (page_evictable(page, NULL)) { 3466 if (page_evictable(page)) {
3412 enum lru_list lru = page_lru_base_type(page); 3467 enum lru_list lru = page_lru_base_type(page);
3413 3468
3414 VM_BUG_ON(PageActive(page)); 3469 VM_BUG_ON(PageActive(page));
diff --git a/mm/vmstat.c b/mm/vmstat.c
index b3e3b9d525d0..c7370579111b 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -495,6 +495,18 @@ void refresh_cpu_vm_stats(int cpu)
495 atomic_long_add(global_diff[i], &vm_stat[i]); 495 atomic_long_add(global_diff[i], &vm_stat[i]);
496} 496}
497 497
498void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
499{
500 int i;
501
502 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
503 if (pset->vm_stat_diff[i]) {
504 int v = pset->vm_stat_diff[i];
505 pset->vm_stat_diff[i] = 0;
506 atomic_long_add(v, &zone->vm_stat[i]);
507 atomic_long_add(v, &vm_stat[i]);
508 }
509}
498#endif 510#endif
499 511
500#ifdef CONFIG_NUMA 512#ifdef CONFIG_NUMA
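drain_zonestat() above folds a per-cpu pageset's outstanding counter deltas into the zone and global counters and zeroes them, so statistics are not lost when a pageset is drained. The sketch below models the same fold with plain arrays; the names (percpu_diff, drain_cpu()) and values are invented for the example, and the kernel performs the global-side additions with atomic_long_add().

#include <stdio.h>

#define NR_CPUS   4
#define NR_ITEMS  2

/* Outstanding per-cpu deltas not yet folded into the global counters. */
static long percpu_diff[NR_CPUS][NR_ITEMS];
static long global_stat[NR_ITEMS];

static void drain_cpu(int cpu)
{
	int i;

	for (i = 0; i < NR_ITEMS; i++) {
		long v = percpu_diff[cpu][i];

		if (v) {
			percpu_diff[cpu][i] = 0;	/* consume the delta once */
			global_stat[i] += v;		/* atomic in the kernel */
		}
	}
}

int main(void)
{
	int cpu;

	percpu_diff[0][0] = 3;
	percpu_diff[2][0] = -1;
	percpu_diff[1][1] = 7;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		drain_cpu(cpu);

	printf("item0=%ld item1=%ld\n", global_stat[0], global_stat[1]);
	return 0;
}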
@@ -722,6 +734,7 @@ const char * const vmstat_text[] = {
722 "numa_other", 734 "numa_other",
723#endif 735#endif
724 "nr_anon_transparent_hugepages", 736 "nr_anon_transparent_hugepages",
737 "nr_free_cma",
725 "nr_dirty_threshold", 738 "nr_dirty_threshold",
726 "nr_dirty_background_threshold", 739 "nr_dirty_background_threshold",
727 740
@@ -781,7 +794,6 @@ const char * const vmstat_text[] = {
781 "unevictable_pgs_munlocked", 794 "unevictable_pgs_munlocked",
782 "unevictable_pgs_cleared", 795 "unevictable_pgs_cleared",
783 "unevictable_pgs_stranded", 796 "unevictable_pgs_stranded",
784 "unevictable_pgs_mlockfreed",
785 797
786#ifdef CONFIG_TRANSPARENT_HUGEPAGE 798#ifdef CONFIG_TRANSPARENT_HUGEPAGE
787 "thp_fault_alloc", 799 "thp_fault_alloc",
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index ccbdfbba9e53..c1d756cc7448 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -221,7 +221,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
221 kref_init(&req->r_kref); 221 kref_init(&req->r_kref);
222 init_completion(&req->r_completion); 222 init_completion(&req->r_completion);
223 init_completion(&req->r_safe_completion); 223 init_completion(&req->r_safe_completion);
224 rb_init_node(&req->r_node);
225 INIT_LIST_HEAD(&req->r_unsafe_item); 224 INIT_LIST_HEAD(&req->r_unsafe_item);
226 INIT_LIST_HEAD(&req->r_linger_item); 225 INIT_LIST_HEAD(&req->r_linger_item);
227 INIT_LIST_HEAD(&req->r_linger_osd); 226 INIT_LIST_HEAD(&req->r_linger_osd);
diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c
index 55af8c5b57e6..3a6e8731646c 100644
--- a/security/selinux/selinuxfs.c
+++ b/security/selinux/selinuxfs.c
@@ -485,7 +485,7 @@ static int sel_mmap_policy(struct file *filp, struct vm_area_struct *vma)
485 return -EACCES; 485 return -EACCES;
486 } 486 }
487 487
488 vma->vm_flags |= VM_RESERVED; 488 vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
489 vma->vm_ops = &sel_mmap_policy_ops; 489 vma->vm_ops = &sel_mmap_policy_ops;
490 490
491 return 0; 491 return 0;
diff --git a/security/tomoyo/util.c b/security/tomoyo/util.c
index 867558c98334..2952ba576fb9 100644
--- a/security/tomoyo/util.c
+++ b/security/tomoyo/util.c
@@ -949,18 +949,13 @@ bool tomoyo_path_matches_pattern(const struct tomoyo_path_info *filename,
949const char *tomoyo_get_exe(void) 949const char *tomoyo_get_exe(void)
950{ 950{
951 struct mm_struct *mm = current->mm; 951 struct mm_struct *mm = current->mm;
952 struct vm_area_struct *vma;
953 const char *cp = NULL; 952 const char *cp = NULL;
954 953
955 if (!mm) 954 if (!mm)
956 return NULL; 955 return NULL;
957 down_read(&mm->mmap_sem); 956 down_read(&mm->mmap_sem);
958 for (vma = mm->mmap; vma; vma = vma->vm_next) { 957 if (mm->exe_file)
959 if ((vma->vm_flags & VM_EXECUTABLE) && vma->vm_file) { 958 cp = tomoyo_realpath_from_path(&mm->exe_file->f_path);
960 cp = tomoyo_realpath_from_path(&vma->vm_file->f_path);
961 break;
962 }
963 }
964 up_read(&mm->mmap_sem); 959 up_read(&mm->mmap_sem);
965 return cp; 960 return cp;
966} 961}
diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
index 20554eff5a21..5e12e5bacbba 100644
--- a/sound/core/pcm_native.c
+++ b/sound/core/pcm_native.c
@@ -3039,7 +3039,7 @@ static int snd_pcm_mmap_status(struct snd_pcm_substream *substream, struct file
3039 return -EINVAL; 3039 return -EINVAL;
3040 area->vm_ops = &snd_pcm_vm_ops_status; 3040 area->vm_ops = &snd_pcm_vm_ops_status;
3041 area->vm_private_data = substream; 3041 area->vm_private_data = substream;
3042 area->vm_flags |= VM_RESERVED; 3042 area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
3043 return 0; 3043 return 0;
3044} 3044}
3045 3045
@@ -3076,7 +3076,7 @@ static int snd_pcm_mmap_control(struct snd_pcm_substream *substream, struct file
3076 return -EINVAL; 3076 return -EINVAL;
3077 area->vm_ops = &snd_pcm_vm_ops_control; 3077 area->vm_ops = &snd_pcm_vm_ops_control;
3078 area->vm_private_data = substream; 3078 area->vm_private_data = substream;
3079 area->vm_flags |= VM_RESERVED; 3079 area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
3080 return 0; 3080 return 0;
3081} 3081}
3082#else /* ! coherent mmap */ 3082#else /* ! coherent mmap */
@@ -3170,7 +3170,7 @@ static const struct vm_operations_struct snd_pcm_vm_ops_data_fault = {
3170int snd_pcm_lib_default_mmap(struct snd_pcm_substream *substream, 3170int snd_pcm_lib_default_mmap(struct snd_pcm_substream *substream,
3171 struct vm_area_struct *area) 3171 struct vm_area_struct *area)
3172{ 3172{
3173 area->vm_flags |= VM_RESERVED; 3173 area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
3174#ifdef ARCH_HAS_DMA_MMAP_COHERENT 3174#ifdef ARCH_HAS_DMA_MMAP_COHERENT
3175 if (!substream->ops->page && 3175 if (!substream->ops->page &&
3176 substream->dma_buffer.dev.type == SNDRV_DMA_TYPE_DEV) 3176 substream->dma_buffer.dev.type == SNDRV_DMA_TYPE_DEV)
diff --git a/sound/usb/usx2y/us122l.c b/sound/usb/usx2y/us122l.c
index c4fd3b1d9592..d0323a693ba2 100644
--- a/sound/usb/usx2y/us122l.c
+++ b/sound/usb/usx2y/us122l.c
@@ -262,7 +262,7 @@ static int usb_stream_hwdep_mmap(struct snd_hwdep *hw,
262 } 262 }
263 263
264 area->vm_ops = &usb_stream_hwdep_vm_ops; 264 area->vm_ops = &usb_stream_hwdep_vm_ops;
265 area->vm_flags |= VM_RESERVED; 265 area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
266 area->vm_private_data = us122l; 266 area->vm_private_data = us122l;
267 atomic_inc(&us122l->mmap_count); 267 atomic_inc(&us122l->mmap_count);
268out: 268out:
diff --git a/sound/usb/usx2y/usX2Yhwdep.c b/sound/usb/usx2y/usX2Yhwdep.c
index 04aafb43a13c..0b34dbc8f302 100644
--- a/sound/usb/usx2y/usX2Yhwdep.c
+++ b/sound/usb/usx2y/usX2Yhwdep.c
@@ -82,7 +82,7 @@ static int snd_us428ctls_mmap(struct snd_hwdep * hw, struct file *filp, struct v
82 us428->us428ctls_sharedmem->CtlSnapShotLast = -2; 82 us428->us428ctls_sharedmem->CtlSnapShotLast = -2;
83 } 83 }
84 area->vm_ops = &us428ctls_vm_ops; 84 area->vm_ops = &us428ctls_vm_ops;
85 area->vm_flags |= VM_RESERVED | VM_DONTEXPAND; 85 area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
86 area->vm_private_data = hw->private_data; 86 area->vm_private_data = hw->private_data;
87 return 0; 87 return 0;
88} 88}
diff --git a/sound/usb/usx2y/usx2yhwdeppcm.c b/sound/usb/usx2y/usx2yhwdeppcm.c
index 8e40b6e67e9e..cc56007791e0 100644
--- a/sound/usb/usx2y/usx2yhwdeppcm.c
+++ b/sound/usb/usx2y/usx2yhwdeppcm.c
@@ -723,7 +723,7 @@ static int snd_usX2Y_hwdep_pcm_mmap(struct snd_hwdep * hw, struct file *filp, st
723 return -ENODEV; 723 return -ENODEV;
724 } 724 }
725 area->vm_ops = &snd_usX2Y_hwdep_pcm_vm_ops; 725 area->vm_ops = &snd_usX2Y_hwdep_pcm_vm_ops;
726 area->vm_flags |= VM_RESERVED | VM_DONTEXPAND; 726 area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
727 area->vm_private_data = hw->private_data; 727 area->vm_private_data = hw->private_data;
728 return 0; 728 return 0;
729} 729}
diff --git a/tools/perf/util/include/linux/rbtree.h b/tools/perf/util/include/linux/rbtree.h
index 2a030c5af3aa..9bcdc844b330 100644
--- a/tools/perf/util/include/linux/rbtree.h
+++ b/tools/perf/util/include/linux/rbtree.h
@@ -1,2 +1,3 @@
1#include <stdbool.h> 1#include <stdbool.h>
2#include <stdbool.h>
2#include "../../../../include/linux/rbtree.h" 3#include "../../../../include/linux/rbtree.h"