author    Linus Torvalds <torvalds@linux-foundation.org>  2015-04-14 19:49:17 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2015-04-14 19:49:17 -0400
commit    1dcf58d6e6e6eb7ec10e9abc56887b040205b06f
tree      c03e7a25ef13eea62f1547914a76e5c68f3f4c28
parent    80dcc31fbe55932ac9204daee5f2ebc0c49b6da3
parent    e4b0db72be2487bae0e3251c22f82c104f7c1cfd
Merge branch 'akpm' (patches from Andrew)
Merge first patchbomb from Andrew Morton:

 - arch/sh updates
 - ocfs2 updates
 - kernel/watchdog feature
 - about half of mm/

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (122 commits)
  Documentation: update arch list in the 'memtest' entry
  Kconfig: memtest: update number of test patterns up to 17
  arm: add support for memtest
  arm64: add support for memtest
  memtest: use phys_addr_t for physical addresses
  mm: move memtest under mm
  mm, hugetlb: abort __get_user_pages if current has been oom killed
  mm, mempool: do not allow atomic resizing
  memcg: print cgroup information when system panics due to panic_on_oom
  mm: numa: remove migrate_ratelimited
  mm: fold arch_randomize_brk into ARCH_HAS_ELF_RANDOMIZE
  mm: split ET_DYN ASLR from mmap ASLR
  s390: redefine randomize_et_dyn for ELF_ET_DYN_BASE
  mm: expose arch_mmap_rnd when available
  s390: standardize mmap_rnd() usage
  powerpc: standardize mmap_rnd() usage
  mips: extract logic for mmap_rnd()
  arm64: standardize mmap_rnd() usage
  x86: standardize mmap_rnd() usage
  arm: factor out mmap ASLR into mmap_rnd
  ...
-rw-r--r--  Documentation/cma/debugfs.txt | 21
-rw-r--r--  Documentation/kernel-parameters.txt | 10
-rw-r--r--  Documentation/sysctl/kernel.txt | 62
-rw-r--r--  Documentation/vm/cleancache.txt | 4
-rw-r--r--  Documentation/vm/unevictable-lru.txt | 26
-rw-r--r--  arch/Kconfig | 15
-rw-r--r--  arch/alpha/Kconfig | 4
-rw-r--r--  arch/arm/Kconfig | 7
-rw-r--r--  arch/arm/include/asm/elf.h | 4
-rw-r--r--  arch/arm/mm/init.c | 3
-rw-r--r--  arch/arm/mm/mmap.c | 16
-rw-r--r--  arch/arm64/Kconfig | 16
-rw-r--r--  arch/arm64/include/asm/elf.h | 5
-rw-r--r--  arch/arm64/include/asm/kvm_mmu.h | 4
-rw-r--r--  arch/arm64/include/asm/page.h | 4
-rw-r--r--  arch/arm64/include/asm/pgalloc.h | 8
-rw-r--r--  arch/arm64/include/asm/pgtable-hwdef.h | 6
-rw-r--r--  arch/arm64/include/asm/pgtable-types.h | 12
-rw-r--r--  arch/arm64/include/asm/pgtable.h | 8
-rw-r--r--  arch/arm64/include/asm/tlb.h | 4
-rw-r--r--  arch/arm64/mm/init.c | 2
-rw-r--r--  arch/arm64/mm/mmap.c | 20
-rw-r--r--  arch/arm64/mm/mmu.c | 4
-rw-r--r--  arch/ia64/Kconfig | 18
-rw-r--r--  arch/ia64/include/asm/page.h | 4
-rw-r--r--  arch/ia64/include/asm/pgalloc.h | 4
-rw-r--r--  arch/ia64/include/asm/pgtable.h | 12
-rw-r--r--  arch/ia64/kernel/ivt.S | 12
-rw-r--r--  arch/ia64/kernel/machine_kexec.c | 4
-rw-r--r--  arch/m68k/Kconfig | 4
-rw-r--r--  arch/mips/Kconfig | 7
-rw-r--r--  arch/mips/include/asm/elf.h | 4
-rw-r--r--  arch/mips/mm/mmap.c | 24
-rw-r--r--  arch/parisc/Kconfig | 5
-rw-r--r--  arch/parisc/include/asm/pgalloc.h | 2
-rw-r--r--  arch/parisc/include/asm/pgtable.h | 16
-rw-r--r--  arch/parisc/kernel/entry.S | 4
-rw-r--r--  arch/parisc/kernel/head.S | 4
-rw-r--r--  arch/parisc/mm/init.c | 2
-rw-r--r--  arch/powerpc/Kconfig | 8
-rw-r--r--  arch/powerpc/include/asm/elf.h | 4
-rw-r--r--  arch/powerpc/mm/mmap.c | 28
-rw-r--r--  arch/s390/Kconfig | 6
-rw-r--r--  arch/s390/include/asm/elf.h | 12
-rw-r--r--  arch/s390/mm/mmap.c | 41
-rw-r--r--  arch/sh/Kconfig | 4
-rw-r--r--  arch/sh/kernel/dwarf.c | 18
-rw-r--r--  arch/sparc/Kconfig | 4
-rw-r--r--  arch/sparc/kernel/mdesc.c | 22
-rw-r--r--  arch/tile/Kconfig | 5
-rw-r--r--  arch/um/Kconfig.um | 5
-rw-r--r--  arch/x86/Kconfig | 20
-rw-r--r--  arch/x86/include/asm/e820.h | 8
-rw-r--r--  arch/x86/include/asm/elf.h | 3
-rw-r--r--  arch/x86/include/asm/page_types.h | 2
-rw-r--r--  arch/x86/include/asm/paravirt.h | 8
-rw-r--r--  arch/x86/include/asm/paravirt_types.h | 8
-rw-r--r--  arch/x86/include/asm/pgalloc.h | 8
-rw-r--r--  arch/x86/include/asm/pgtable-2level_types.h | 1
-rw-r--r--  arch/x86/include/asm/pgtable-3level_types.h | 2
-rw-r--r--  arch/x86/include/asm/pgtable.h | 8
-rw-r--r--  arch/x86/include/asm/pgtable_64_types.h | 1
-rw-r--r--  arch/x86/include/asm/pgtable_types.h | 4
-rw-r--r--  arch/x86/kernel/kvm.c | 2
-rw-r--r--  arch/x86/kernel/paravirt.c | 6
-rw-r--r--  arch/x86/mm/Makefile | 2
-rw-r--r--  arch/x86/mm/ioremap.c | 23
-rw-r--r--  arch/x86/mm/mmap.c | 38
-rw-r--r--  arch/x86/mm/pgtable.c | 79
-rw-r--r--  arch/x86/xen/mmu.c | 14
-rw-r--r--  drivers/base/memory.c | 21
-rw-r--r--  drivers/s390/scsi/zfcp_erp.c | 4
-rw-r--r--  drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h | 4
-rw-r--r--  drivers/xen/tmem.c | 16
-rw-r--r--  fs/Kconfig.binfmt | 3
-rw-r--r--  fs/binfmt_elf.c | 31
-rw-r--r--  fs/buffer.c | 4
-rw-r--r--  fs/cifs/connect.c | 6
-rw-r--r--  fs/hugetlbfs/inode.c | 2
-rw-r--r--  fs/nfs/write.c | 5
-rw-r--r--  fs/ocfs2/alloc.c | 48
-rw-r--r--  fs/ocfs2/aops.c | 155
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 42
-rw-r--r--  fs/ocfs2/cluster/masklog.h | 5
-rw-r--r--  fs/ocfs2/dir.c | 15
-rw-r--r--  fs/ocfs2/dlmglue.c | 7
-rw-r--r--  fs/ocfs2/export.c | 2
-rw-r--r--  fs/ocfs2/inode.c | 4
-rw-r--r--  fs/ocfs2/localalloc.c | 4
-rw-r--r--  fs/ocfs2/namei.c | 6
-rw-r--r--  fs/ocfs2/refcounttree.c | 2
-rw-r--r--  fs/ocfs2/slot_map.c | 4
-rw-r--r--  fs/ocfs2/stack_o2cb.c | 2
-rw-r--r--  fs/ocfs2/stack_user.c | 8
-rw-r--r--  fs/ocfs2/suballoc.c | 2
-rw-r--r--  fs/ocfs2/super.c | 46
-rw-r--r--  fs/ocfs2/xattr.c | 8
-rw-r--r--  fs/super.c | 2
-rw-r--r--  include/asm-generic/pgtable.h | 30
-rw-r--r--  include/linux/cleancache.h | 13
-rw-r--r--  include/linux/cma.h | 12
-rw-r--r--  include/linux/elf-randomize.h | 22
-rw-r--r--  include/linux/gfp.h | 16
-rw-r--r--  include/linux/io.h | 8
-rw-r--r--  include/linux/memblock.h | 8
-rw-r--r--  include/linux/memory_hotplug.h | 6
-rw-r--r--  include/linux/mempool.h | 2
-rw-r--r--  include/linux/migrate.h | 5
-rw-r--r--  include/linux/mm.h | 4
-rw-r--r--  include/linux/mm_types.h | 2
-rw-r--r--  include/linux/nmi.h | 21
-rw-r--r--  include/linux/oom.h | 3
-rw-r--r--  include/linux/page-flags.h | 2
-rw-r--r--  include/linux/slab.h | 2
-rw-r--r--  include/trace/events/xen.h | 2
-rw-r--r--  init/main.c | 2
-rw-r--r--  kernel/cpuset.c | 18
-rw-r--r--  kernel/sysctl.c | 35
-rw-r--r--  kernel/watchdog.c | 289
-rw-r--r--  lib/Kconfig.debug | 12
-rw-r--r--  lib/ioremap.c | 53
-rw-r--r--  mm/Kconfig | 6
-rw-r--r--  mm/Makefile | 2
-rw-r--r--  mm/cleancache.c | 276
-rw-r--r--  mm/cma.c | 49
-rw-r--r--  mm/cma.h | 24
-rw-r--r--  mm/cma_debug.c | 170
-rw-r--r--  mm/compaction.c | 15
-rw-r--r--  mm/filemap.c | 15
-rw-r--r--  mm/gup.c | 124
-rw-r--r--  mm/huge_memory.c | 39
-rw-r--r--  mm/hugetlb.c | 12
-rw-r--r--  mm/internal.h | 4
-rw-r--r--  mm/memblock.c | 4
-rw-r--r--  mm/memcontrol.c | 194
-rw-r--r--  mm/memory.c | 371
-rw-r--r--  mm/memory_hotplug.c | 35
-rw-r--r--  mm/mempolicy.c | 6
-rw-r--r--  mm/mempool.c | 10
-rw-r--r--  mm/memtest.c (renamed from arch/x86/mm/memtest.c) | 16
-rw-r--r--  mm/migrate.c | 37
-rw-r--r--  mm/mlock.c | 131
-rw-r--r--  mm/mmap.c | 4
-rw-r--r--  mm/oom_kill.c | 7
-rw-r--r--  mm/page-writeback.c | 19
-rw-r--r--  mm/page_alloc.c | 247
-rw-r--r--  mm/slab.c | 22
-rw-r--r--  mm/slob.c | 3
-rw-r--r--  mm/slub.c | 28
-rw-r--r--  mm/truncate.c | 37
-rw-r--r--  mm/vmalloc.c | 8
-rw-r--r--  net/openvswitch/flow.c | 4
-rw-r--r--  scripts/coccinelle/misc/bugon.cocci | 2
153 files changed, 2312 insertions, 1419 deletions
diff --git a/Documentation/cma/debugfs.txt b/Documentation/cma/debugfs.txt
new file mode 100644
index 000000000000..6cef20a8cedc
--- /dev/null
+++ b/Documentation/cma/debugfs.txt
@@ -0,0 +1,21 @@
1The CMA debugfs interface is useful to retrieve basic information out of the
2different CMA areas and to test allocation/release in each of the areas.
3
4Each CMA zone represents a directory under <debugfs>/cma/, indexed by the
5kernel's CMA index. So the first CMA zone would be:
6
7 <debugfs>/cma/cma-0
8
9The structure of the files created under that directory is as follows:
10
11 - [RO] base_pfn: The base PFN (Page Frame Number) of the zone.
12 - [RO] count: Amount of memory in the CMA area.
13 - [RO] order_per_bit: Order of pages represented by one bit.
14 - [RO] bitmap: The bitmap of page states in the zone.
15 - [WO] alloc: Allocate N pages from that CMA area. For example:
16
17 echo 5 > <debugfs>/cma/cma-2/alloc
18
19would try to allocate 5 pages from the cma-2 area.
20
21 - [WO] free: Free N pages from that CMA area, similar to the above.
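
For illustration only (not part of the patch itself), a minimal shell sketch of the interface documented above, assuming debugfs is mounted at /sys/kernel/debug and using the first CMA area; the page count is an arbitrary example:

    cat /sys/kernel/debug/cma/cma-0/count        # read-only: amount of memory in the area
    echo 5 > /sys/kernel/debug/cma/cma-0/alloc   # try to allocate 5 pages from it
    echo 5 > /sys/kernel/debug/cma/cma-0/free    # free 5 pages from it again
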
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 05c36118f8d7..327556349757 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1989,7 +1989,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
1989 seconds. Use this parameter to check at some 1989 seconds. Use this parameter to check at some
1990 other rate. 0 disables periodic checking. 1990 other rate. 0 disables periodic checking.
1991 1991
1992 memtest= [KNL,X86] Enable memtest 1992 memtest= [KNL,X86,ARM] Enable memtest
1993 Format: <integer> 1993 Format: <integer>
1994 default : 0 <disable> 1994 default : 0 <disable>
1995 Specifies the number of memtest passes to be 1995 Specifies the number of memtest passes to be
@@ -2236,8 +2236,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
2236 2236
2237 nmi_watchdog= [KNL,BUGS=X86] Debugging features for SMP kernels 2237 nmi_watchdog= [KNL,BUGS=X86] Debugging features for SMP kernels
2238 Format: [panic,][nopanic,][num] 2238 Format: [panic,][nopanic,][num]
2239 Valid num: 0 2239 Valid num: 0 or 1
2240 0 - turn nmi_watchdog off 2240 0 - turn nmi_watchdog off
2241 1 - turn nmi_watchdog on
2241 When panic is specified, panic when an NMI watchdog 2242 When panic is specified, panic when an NMI watchdog
2242 timeout occurs (or 'nopanic' to override the opposite 2243 timeout occurs (or 'nopanic' to override the opposite
2243 default). 2244 default).
@@ -2322,6 +2323,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
2322 register save and restore. The kernel will only save 2323 register save and restore. The kernel will only save
2323 legacy floating-point registers on task switch. 2324 legacy floating-point registers on task switch.
2324 2325
2326 nohugeiomap [KNL,x86] Disable kernel huge I/O mappings.
2327
2325 noxsave [BUGS=X86] Disables x86 extended register state save 2328 noxsave [BUGS=X86] Disables x86 extended register state save
2326 and restore using xsave. The kernel will fallback to 2329 and restore using xsave. The kernel will fallback to
2327 enabling legacy floating-point and sse state. 2330 enabling legacy floating-point and sse state.
@@ -2464,7 +2467,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
2464 2467
2465 nousb [USB] Disable the USB subsystem 2468 nousb [USB] Disable the USB subsystem
2466 2469
2467 nowatchdog [KNL] Disable the lockup detector (NMI watchdog). 2470 nowatchdog [KNL] Disable both lockup detectors, i.e.
2471 soft-lockup and NMI watchdog (hard-lockup).
2468 2472
2469 nowb [ARM] 2473 nowb [ARM]
2470 2474
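
For illustration only (not taken from the patch), a hypothetical boot command-line fragment combining the parameters documented above; the memtest pass count is an arbitrary example:

    memtest=4 nmi_watchdog=panic,1 nohugeiomap
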
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 83ab25660fc9..99d7eb3a1416 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -77,12 +77,14 @@ show up in /proc/sys/kernel:
77- shmmax [ sysv ipc ] 77- shmmax [ sysv ipc ]
78- shmmni 78- shmmni
79- softlockup_all_cpu_backtrace 79- softlockup_all_cpu_backtrace
80- soft_watchdog
80- stop-a [ SPARC only ] 81- stop-a [ SPARC only ]
81- sysrq ==> Documentation/sysrq.txt 82- sysrq ==> Documentation/sysrq.txt
82- sysctl_writes_strict 83- sysctl_writes_strict
83- tainted 84- tainted
84- threads-max 85- threads-max
85- unknown_nmi_panic 86- unknown_nmi_panic
87- watchdog
86- watchdog_thresh 88- watchdog_thresh
87- version 89- version
88 90
@@ -417,16 +419,23 @@ successful IPC object allocation.
417 419
418nmi_watchdog: 420nmi_watchdog:
419 421
420Enables/Disables the NMI watchdog on x86 systems. When the value is 422This parameter can be used to control the NMI watchdog
421non-zero the NMI watchdog is enabled and will continuously test all 423(i.e. the hard lockup detector) on x86 systems.
422online cpus to determine whether or not they are still functioning
423properly. Currently, passing "nmi_watchdog=" parameter at boot time is
424required for this function to work.
425 424
426If LAPIC NMI watchdog method is in use (nmi_watchdog=2 kernel 425 0 - disable the hard lockup detector
427parameter), the NMI watchdog shares registers with oprofile. By 426 1 - enable the hard lockup detector
428disabling the NMI watchdog, oprofile may have more registers to 427
429utilize. 428The hard lockup detector monitors each CPU for its ability to respond to
429timer interrupts. The mechanism utilizes CPU performance counter registers
430that are programmed to generate Non-Maskable Interrupts (NMIs) periodically
431while a CPU is busy. Hence, the alternative name 'NMI watchdog'.
432
433The NMI watchdog is disabled by default if the kernel is running as a guest
434in a KVM virtual machine. This default can be overridden by adding
435
436 nmi_watchdog=1
437
438to the guest kernel command line (see Documentation/kernel-parameters.txt).
430 439
431============================================================== 440==============================================================
432 441
@@ -816,6 +825,22 @@ NMI.
816 825
817============================================================== 826==============================================================
818 827
828soft_watchdog
829
830This parameter can be used to control the soft lockup detector.
831
832 0 - disable the soft lockup detector
833 1 - enable the soft lockup detector
834
835The soft lockup detector monitors CPUs for threads that are hogging the CPUs
836without rescheduling voluntarily, and thus prevent the 'watchdog/N' threads
837from running. The mechanism depends on the CPUs ability to respond to timer
838interrupts which are needed for the 'watchdog/N' threads to be woken up by
839the watchdog timer function, otherwise the NMI watchdog - if enabled - can
840detect a hard lockup condition.
841
842==============================================================
843
819tainted: 844tainted:
820 845
821Non-zero if the kernel has been tainted. Numeric values, which 846Non-zero if the kernel has been tainted. Numeric values, which
@@ -858,6 +883,25 @@ example. If a system hangs up, try pressing the NMI switch.
858 883
859============================================================== 884==============================================================
860 885
886watchdog:
887
888This parameter can be used to disable or enable the soft lockup detector
889_and_ the NMI watchdog (i.e. the hard lockup detector) at the same time.
890
891 0 - disable both lockup detectors
892 1 - enable both lockup detectors
893
894The soft lockup detector and the NMI watchdog can also be disabled or
895enabled individually, using the soft_watchdog and nmi_watchdog parameters.
896If the watchdog parameter is read, for example by executing
897
898 cat /proc/sys/kernel/watchdog
899
900the output of this command (0 or 1) shows the logical OR of soft_watchdog
901and nmi_watchdog.
902
903==============================================================
904
861watchdog_thresh: 905watchdog_thresh:
862 906
863This value can be used to control the frequency of hrtimer and NMI 907This value can be used to control the frequency of hrtimer and NMI
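
For illustration only (not part of the patch), a shell sketch of the lockup-detector controls documented above, using the standard /proc/sys/kernel paths; the values written are arbitrary examples:

    cat /proc/sys/kernel/watchdog              # logical OR of soft_watchdog and nmi_watchdog
    echo 0 > /proc/sys/kernel/nmi_watchdog     # disable only the hard lockup detector
    echo 1 > /proc/sys/kernel/soft_watchdog    # leave the soft lockup detector enabled
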
diff --git a/Documentation/vm/cleancache.txt b/Documentation/vm/cleancache.txt
index 01d76282444e..e4b49df7a048 100644
--- a/Documentation/vm/cleancache.txt
+++ b/Documentation/vm/cleancache.txt
@@ -28,9 +28,7 @@ IMPLEMENTATION OVERVIEW
28A cleancache "backend" that provides transcendent memory registers itself 28A cleancache "backend" that provides transcendent memory registers itself
29to the kernel's cleancache "frontend" by calling cleancache_register_ops, 29to the kernel's cleancache "frontend" by calling cleancache_register_ops,
30passing a pointer to a cleancache_ops structure with funcs set appropriately. 30passing a pointer to a cleancache_ops structure with funcs set appropriately.
31Note that cleancache_register_ops returns the previous settings so that 31The functions provided must conform to certain semantics as follows:
32chaining can be performed if desired. The functions provided must conform to
33certain semantics as follows:
34 32
35Most important, cleancache is "ephemeral". Pages which are copied into 33Most important, cleancache is "ephemeral". Pages which are copied into
36cleancache have an indefinite lifetime which is completely unknowable 34cleancache have an indefinite lifetime which is completely unknowable
diff --git a/Documentation/vm/unevictable-lru.txt b/Documentation/vm/unevictable-lru.txt
index 744f82f86c58..86cb4624fc5a 100644
--- a/Documentation/vm/unevictable-lru.txt
+++ b/Documentation/vm/unevictable-lru.txt
@@ -317,7 +317,7 @@ If the VMA passes some filtering as described in "Filtering Special Vmas"
317below, mlock_fixup() will attempt to merge the VMA with its neighbors or split 317below, mlock_fixup() will attempt to merge the VMA with its neighbors or split
318off a subset of the VMA if the range does not cover the entire VMA. Once the 318off a subset of the VMA if the range does not cover the entire VMA. Once the
319VMA has been merged or split or neither, mlock_fixup() will call 319VMA has been merged or split or neither, mlock_fixup() will call
320__mlock_vma_pages_range() to fault in the pages via get_user_pages() and to 320populate_vma_page_range() to fault in the pages via get_user_pages() and to
321mark the pages as mlocked via mlock_vma_page(). 321mark the pages as mlocked via mlock_vma_page().
322 322
323Note that the VMA being mlocked might be mapped with PROT_NONE. In this case, 323Note that the VMA being mlocked might be mapped with PROT_NONE. In this case,
@@ -327,7 +327,7 @@ fault path or in vmscan.
327 327
328Also note that a page returned by get_user_pages() could be truncated or 328Also note that a page returned by get_user_pages() could be truncated or
329migrated out from under us, while we're trying to mlock it. To detect this, 329migrated out from under us, while we're trying to mlock it. To detect this,
330__mlock_vma_pages_range() checks page_mapping() after acquiring the page lock. 330populate_vma_page_range() checks page_mapping() after acquiring the page lock.
331If the page is still associated with its mapping, we'll go ahead and call 331If the page is still associated with its mapping, we'll go ahead and call
332mlock_vma_page(). If the mapping is gone, we just unlock the page and move on. 332mlock_vma_page(). If the mapping is gone, we just unlock the page and move on.
333In the worst case, this will result in a page mapped in a VM_LOCKED VMA 333In the worst case, this will result in a page mapped in a VM_LOCKED VMA
@@ -392,7 +392,7 @@ ignored for munlock.
392 392
393If the VMA is VM_LOCKED, mlock_fixup() again attempts to merge or split off the 393If the VMA is VM_LOCKED, mlock_fixup() again attempts to merge or split off the
394specified range. The range is then munlocked via the function 394specified range. The range is then munlocked via the function
395__mlock_vma_pages_range() - the same function used to mlock a VMA range - 395populate_vma_page_range() - the same function used to mlock a VMA range -
396passing a flag to indicate that munlock() is being performed. 396passing a flag to indicate that munlock() is being performed.
397 397
398Because the VMA access protections could have been changed to PROT_NONE after 398Because the VMA access protections could have been changed to PROT_NONE after
@@ -402,7 +402,7 @@ get_user_pages() was enhanced to accept a flag to ignore the permissions when
402fetching the pages - all of which should be resident as a result of previous 402fetching the pages - all of which should be resident as a result of previous
403mlocking. 403mlocking.
404 404
405For munlock(), __mlock_vma_pages_range() unlocks individual pages by calling 405For munlock(), populate_vma_page_range() unlocks individual pages by calling
406munlock_vma_page(). munlock_vma_page() unconditionally clears the PG_mlocked 406munlock_vma_page(). munlock_vma_page() unconditionally clears the PG_mlocked
407flag using TestClearPageMlocked(). As with mlock_vma_page(), 407flag using TestClearPageMlocked(). As with mlock_vma_page(),
408munlock_vma_page() use the Test*PageMlocked() function to handle the case where 408munlock_vma_page() use the Test*PageMlocked() function to handle the case where
@@ -463,21 +463,11 @@ populate the page table.
463 463
464To mlock a range of memory under the unevictable/mlock infrastructure, the 464To mlock a range of memory under the unevictable/mlock infrastructure, the
465mmap() handler and task address space expansion functions call 465mmap() handler and task address space expansion functions call
466mlock_vma_pages_range() specifying the vma and the address range to mlock. 466populate_vma_page_range() specifying the vma and the address range to mlock.
467mlock_vma_pages_range() filters VMAs like mlock_fixup(), as described above in 467
468"Filtering Special VMAs". It will clear the VM_LOCKED flag, which will have 468The callers of populate_vma_page_range() will have already added the memory range
469already been set by the caller, in filtered VMAs. Thus these VMA's need not be
470visited for munlock when the region is unmapped.
471
472For "normal" VMAs, mlock_vma_pages_range() calls __mlock_vma_pages_range() to
473fault/allocate the pages and mlock them. Again, like mlock_fixup(),
474mlock_vma_pages_range() downgrades the mmap semaphore to read mode before
475attempting to fault/allocate and mlock the pages and "upgrades" the semaphore
476back to write mode before returning.
477
478The callers of mlock_vma_pages_range() will have already added the memory range
479to be mlocked to the task's "locked_vm". To account for filtered VMAs, 469to be mlocked to the task's "locked_vm". To account for filtered VMAs,
480mlock_vma_pages_range() returns the number of pages NOT mlocked. All of the 470populate_vma_page_range() returns the number of pages NOT mlocked. All of the
481callers then subtract a non-negative return value from the task's locked_vm. A 471callers then subtract a non-negative return value from the task's locked_vm. A
482negative return value represent an error - for example, from get_user_pages() 472negative return value represent an error - for example, from get_user_pages()
483attempting to fault in a VMA with PROT_NONE access. In this case, we leave the 473attempting to fault in a VMA with PROT_NONE access. In this case, we leave the
diff --git a/arch/Kconfig b/arch/Kconfig
index 05d7a8a458d5..e1068987bad1 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -446,6 +446,9 @@ config HAVE_IRQ_TIME_ACCOUNTING
446config HAVE_ARCH_TRANSPARENT_HUGEPAGE 446config HAVE_ARCH_TRANSPARENT_HUGEPAGE
447 bool 447 bool
448 448
449config HAVE_ARCH_HUGE_VMAP
450 bool
451
449config HAVE_ARCH_SOFT_DIRTY 452config HAVE_ARCH_SOFT_DIRTY
450 bool 453 bool
451 454
@@ -484,6 +487,18 @@ config HAVE_IRQ_EXIT_ON_IRQ_STACK
484 This spares a stack switch and improves cache usage on softirq 487 This spares a stack switch and improves cache usage on softirq
485 processing. 488 processing.
486 489
490config PGTABLE_LEVELS
491 int
492 default 2
493
494config ARCH_HAS_ELF_RANDOMIZE
495 bool
496 help
497 An architecture supports choosing randomized locations for
498 stack, mmap, brk, and ET_DYN. Defined functions:
499 - arch_mmap_rnd()
500 - arch_randomize_brk()
501
487# 502#
488# ABI hall of shame 503# ABI hall of shame
489# 504#
diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index b7ff9a318c31..bf9e9d3b3792 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -76,6 +76,10 @@ config GENERIC_ISA_DMA
76 bool 76 bool
77 default y 77 default y
78 78
79config PGTABLE_LEVELS
80 int
81 default 3
82
79source "init/Kconfig" 83source "init/Kconfig"
80source "kernel/Kconfig.freezer" 84source "kernel/Kconfig.freezer"
81 85
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index cf4c0c99aa25..4b62f4caf0ce 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1,8 +1,8 @@
1config ARM 1config ARM
2 bool 2 bool
3 default y 3 default y
4 select ARCH_BINFMT_ELF_RANDOMIZE_PIE
5 select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE 4 select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
5 select ARCH_HAS_ELF_RANDOMIZE
6 select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST 6 select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
7 select ARCH_HAVE_CUSTOM_GPIO_H 7 select ARCH_HAVE_CUSTOM_GPIO_H
8 select ARCH_HAS_GCOV_PROFILE_ALL 8 select ARCH_HAS_GCOV_PROFILE_ALL
@@ -286,6 +286,11 @@ config GENERIC_BUG
286 def_bool y 286 def_bool y
287 depends on BUG 287 depends on BUG
288 288
289config PGTABLE_LEVELS
290 int
291 default 3 if ARM_LPAE
292 default 2
293
289source "init/Kconfig" 294source "init/Kconfig"
290 295
291source "kernel/Kconfig.freezer" 296source "kernel/Kconfig.freezer"
diff --git a/arch/arm/include/asm/elf.h b/arch/arm/include/asm/elf.h
index afb9cafd3786..c1ff8ab12914 100644
--- a/arch/arm/include/asm/elf.h
+++ b/arch/arm/include/asm/elf.h
@@ -125,10 +125,6 @@ int dump_task_regs(struct task_struct *t, elf_gregset_t *elfregs);
125extern void elf_set_personality(const struct elf32_hdr *); 125extern void elf_set_personality(const struct elf32_hdr *);
126#define SET_PERSONALITY(ex) elf_set_personality(&(ex)) 126#define SET_PERSONALITY(ex) elf_set_personality(&(ex))
127 127
128struct mm_struct;
129extern unsigned long arch_randomize_brk(struct mm_struct *mm);
130#define arch_randomize_brk arch_randomize_brk
131
132#ifdef CONFIG_MMU 128#ifdef CONFIG_MMU
133#define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 129#define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1
134struct linux_binprm; 130struct linux_binprm;
diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index 1609b022a72f..3d0e9aed4b40 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -335,6 +335,9 @@ void __init bootmem_init(void)
335 335
336 find_limits(&min, &max_low, &max_high); 336 find_limits(&min, &max_low, &max_high);
337 337
338 early_memtest((phys_addr_t)min << PAGE_SHIFT,
339 (phys_addr_t)max_low << PAGE_SHIFT);
340
338 /* 341 /*
339 * Sparsemem tries to allocate bootmem in memory_present(), 342 * Sparsemem tries to allocate bootmem in memory_present(),
340 * so must be done after the fixed reservations 343 * so must be done after the fixed reservations
diff --git a/arch/arm/mm/mmap.c b/arch/arm/mm/mmap.c
index 5e85ed371364..407dc786583a 100644
--- a/arch/arm/mm/mmap.c
+++ b/arch/arm/mm/mmap.c
@@ -169,14 +169,22 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
169 return addr; 169 return addr;
170} 170}
171 171
172unsigned long arch_mmap_rnd(void)
173{
174 unsigned long rnd;
175
176 /* 8 bits of randomness in 20 address space bits */
177 rnd = (unsigned long)get_random_int() % (1 << 8);
178
179 return rnd << PAGE_SHIFT;
180}
181
172void arch_pick_mmap_layout(struct mm_struct *mm) 182void arch_pick_mmap_layout(struct mm_struct *mm)
173{ 183{
174 unsigned long random_factor = 0UL; 184 unsigned long random_factor = 0UL;
175 185
176 /* 8 bits of randomness in 20 address space bits */ 186 if (current->flags & PF_RANDOMIZE)
177 if ((current->flags & PF_RANDOMIZE) && 187 random_factor = arch_mmap_rnd();
178 !(current->personality & ADDR_NO_RANDOMIZE))
179 random_factor = (get_random_int() % (1 << 8)) << PAGE_SHIFT;
180 188
181 if (mmap_is_legacy()) { 189 if (mmap_is_legacy()) {
182 mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; 190 mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 1b8e97331ffb..34f487d5d84e 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1,7 +1,7 @@
1config ARM64 1config ARM64
2 def_bool y 2 def_bool y
3 select ARCH_BINFMT_ELF_RANDOMIZE_PIE
4 select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE 3 select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
4 select ARCH_HAS_ELF_RANDOMIZE
5 select ARCH_HAS_GCOV_PROFILE_ALL 5 select ARCH_HAS_GCOV_PROFILE_ALL
6 select ARCH_HAS_SG_CHAIN 6 select ARCH_HAS_SG_CHAIN
7 select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST 7 select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
@@ -143,6 +143,13 @@ config KERNEL_MODE_NEON
143config FIX_EARLYCON_MEM 143config FIX_EARLYCON_MEM
144 def_bool y 144 def_bool y
145 145
146config PGTABLE_LEVELS
147 int
148 default 2 if ARM64_64K_PAGES && ARM64_VA_BITS_42
149 default 3 if ARM64_64K_PAGES && ARM64_VA_BITS_48
150 default 3 if ARM64_4K_PAGES && ARM64_VA_BITS_39
151 default 4 if ARM64_4K_PAGES && ARM64_VA_BITS_48
152
146source "init/Kconfig" 153source "init/Kconfig"
147 154
148source "kernel/Kconfig.freezer" 155source "kernel/Kconfig.freezer"
@@ -413,13 +420,6 @@ config ARM64_VA_BITS
413 default 42 if ARM64_VA_BITS_42 420 default 42 if ARM64_VA_BITS_42
414 default 48 if ARM64_VA_BITS_48 421 default 48 if ARM64_VA_BITS_48
415 422
416config ARM64_PGTABLE_LEVELS
417 int
418 default 2 if ARM64_64K_PAGES && ARM64_VA_BITS_42
419 default 3 if ARM64_64K_PAGES && ARM64_VA_BITS_48
420 default 3 if ARM64_4K_PAGES && ARM64_VA_BITS_39
421 default 4 if ARM64_4K_PAGES && ARM64_VA_BITS_48
422
423config CPU_BIG_ENDIAN 423config CPU_BIG_ENDIAN
424 bool "Build big-endian kernel" 424 bool "Build big-endian kernel"
425 help 425 help
diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h
index 1f65be393139..faad6df49e5b 100644
--- a/arch/arm64/include/asm/elf.h
+++ b/arch/arm64/include/asm/elf.h
@@ -125,7 +125,6 @@ typedef struct user_fpsimd_state elf_fpregset_t;
125 * the loader. We need to make sure that it is out of the way of the program 125 * the loader. We need to make sure that it is out of the way of the program
126 * that it will "exec", and that there is sufficient room for the brk. 126 * that it will "exec", and that there is sufficient room for the brk.
127 */ 127 */
128extern unsigned long randomize_et_dyn(unsigned long base);
129#define ELF_ET_DYN_BASE (2 * TASK_SIZE_64 / 3) 128#define ELF_ET_DYN_BASE (2 * TASK_SIZE_64 / 3)
130 129
131/* 130/*
@@ -157,10 +156,6 @@ extern int arch_setup_additional_pages(struct linux_binprm *bprm,
157#define STACK_RND_MASK (0x3ffff >> (PAGE_SHIFT - 12)) 156#define STACK_RND_MASK (0x3ffff >> (PAGE_SHIFT - 12))
158#endif 157#endif
159 158
160struct mm_struct;
161extern unsigned long arch_randomize_brk(struct mm_struct *mm);
162#define arch_randomize_brk arch_randomize_brk
163
164#ifdef CONFIG_COMPAT 159#ifdef CONFIG_COMPAT
165 160
166#ifdef __AARCH64EB__ 161#ifdef __AARCH64EB__
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index bbfb600fa822..36250705dc4c 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -163,12 +163,12 @@ static inline bool kvm_s2pmd_readonly(pmd_t *pmd)
163/* 163/*
164 * If we are concatenating first level stage-2 page tables, we would have less 164 * If we are concatenating first level stage-2 page tables, we would have less
165 * than or equal to 16 pointers in the fake PGD, because that's what the 165 * than or equal to 16 pointers in the fake PGD, because that's what the
166 * architecture allows. In this case, (4 - CONFIG_ARM64_PGTABLE_LEVELS) 166 * architecture allows. In this case, (4 - CONFIG_PGTABLE_LEVELS)
167 * represents the first level for the host, and we add 1 to go to the next 167 * represents the first level for the host, and we add 1 to go to the next
168 * level (which uses contatenation) for the stage-2 tables. 168 * level (which uses contatenation) for the stage-2 tables.
169 */ 169 */
170#if PTRS_PER_S2_PGD <= 16 170#if PTRS_PER_S2_PGD <= 16
171#define KVM_PREALLOC_LEVEL (4 - CONFIG_ARM64_PGTABLE_LEVELS + 1) 171#define KVM_PREALLOC_LEVEL (4 - CONFIG_PGTABLE_LEVELS + 1)
172#else 172#else
173#define KVM_PREALLOC_LEVEL (0) 173#define KVM_PREALLOC_LEVEL (0)
174#endif 174#endif
diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h
index 22b16232bd60..8fc8fa280e92 100644
--- a/arch/arm64/include/asm/page.h
+++ b/arch/arm64/include/asm/page.h
@@ -36,9 +36,9 @@
36 * for more information). 36 * for more information).
37 */ 37 */
38#ifdef CONFIG_ARM64_64K_PAGES 38#ifdef CONFIG_ARM64_64K_PAGES
39#define SWAPPER_PGTABLE_LEVELS (CONFIG_ARM64_PGTABLE_LEVELS) 39#define SWAPPER_PGTABLE_LEVELS (CONFIG_PGTABLE_LEVELS)
40#else 40#else
41#define SWAPPER_PGTABLE_LEVELS (CONFIG_ARM64_PGTABLE_LEVELS - 1) 41#define SWAPPER_PGTABLE_LEVELS (CONFIG_PGTABLE_LEVELS - 1)
42#endif 42#endif
43 43
44#define SWAPPER_DIR_SIZE (SWAPPER_PGTABLE_LEVELS * PAGE_SIZE) 44#define SWAPPER_DIR_SIZE (SWAPPER_PGTABLE_LEVELS * PAGE_SIZE)
diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h
index e20df38a8ff3..76420568d66a 100644
--- a/arch/arm64/include/asm/pgalloc.h
+++ b/arch/arm64/include/asm/pgalloc.h
@@ -28,7 +28,7 @@
28 28
29#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO) 29#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO)
30 30
31#if CONFIG_ARM64_PGTABLE_LEVELS > 2 31#if CONFIG_PGTABLE_LEVELS > 2
32 32
33static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) 33static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
34{ 34{
@@ -46,9 +46,9 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
46 set_pud(pud, __pud(__pa(pmd) | PMD_TYPE_TABLE)); 46 set_pud(pud, __pud(__pa(pmd) | PMD_TYPE_TABLE));
47} 47}
48 48
49#endif /* CONFIG_ARM64_PGTABLE_LEVELS > 2 */ 49#endif /* CONFIG_PGTABLE_LEVELS > 2 */
50 50
51#if CONFIG_ARM64_PGTABLE_LEVELS > 3 51#if CONFIG_PGTABLE_LEVELS > 3
52 52
53static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) 53static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
54{ 54{
@@ -66,7 +66,7 @@ static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
66 set_pgd(pgd, __pgd(__pa(pud) | PUD_TYPE_TABLE)); 66 set_pgd(pgd, __pgd(__pa(pud) | PUD_TYPE_TABLE));
67} 67}
68 68
69#endif /* CONFIG_ARM64_PGTABLE_LEVELS > 3 */ 69#endif /* CONFIG_PGTABLE_LEVELS > 3 */
70 70
71extern pgd_t *pgd_alloc(struct mm_struct *mm); 71extern pgd_t *pgd_alloc(struct mm_struct *mm);
72extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); 72extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h
index 5f930cc9ea83..80f3d241cff8 100644
--- a/arch/arm64/include/asm/pgtable-hwdef.h
+++ b/arch/arm64/include/asm/pgtable-hwdef.h
@@ -21,7 +21,7 @@
21/* 21/*
22 * PMD_SHIFT determines the size a level 2 page table entry can map. 22 * PMD_SHIFT determines the size a level 2 page table entry can map.
23 */ 23 */
24#if CONFIG_ARM64_PGTABLE_LEVELS > 2 24#if CONFIG_PGTABLE_LEVELS > 2
25#define PMD_SHIFT ((PAGE_SHIFT - 3) * 2 + 3) 25#define PMD_SHIFT ((PAGE_SHIFT - 3) * 2 + 3)
26#define PMD_SIZE (_AC(1, UL) << PMD_SHIFT) 26#define PMD_SIZE (_AC(1, UL) << PMD_SHIFT)
27#define PMD_MASK (~(PMD_SIZE-1)) 27#define PMD_MASK (~(PMD_SIZE-1))
@@ -31,7 +31,7 @@
31/* 31/*
32 * PUD_SHIFT determines the size a level 1 page table entry can map. 32 * PUD_SHIFT determines the size a level 1 page table entry can map.
33 */ 33 */
34#if CONFIG_ARM64_PGTABLE_LEVELS > 3 34#if CONFIG_PGTABLE_LEVELS > 3
35#define PUD_SHIFT ((PAGE_SHIFT - 3) * 3 + 3) 35#define PUD_SHIFT ((PAGE_SHIFT - 3) * 3 + 3)
36#define PUD_SIZE (_AC(1, UL) << PUD_SHIFT) 36#define PUD_SIZE (_AC(1, UL) << PUD_SHIFT)
37#define PUD_MASK (~(PUD_SIZE-1)) 37#define PUD_MASK (~(PUD_SIZE-1))
@@ -42,7 +42,7 @@
42 * PGDIR_SHIFT determines the size a top-level page table entry can map 42 * PGDIR_SHIFT determines the size a top-level page table entry can map
43 * (depending on the configuration, this level can be 0, 1 or 2). 43 * (depending on the configuration, this level can be 0, 1 or 2).
44 */ 44 */
45#define PGDIR_SHIFT ((PAGE_SHIFT - 3) * CONFIG_ARM64_PGTABLE_LEVELS + 3) 45#define PGDIR_SHIFT ((PAGE_SHIFT - 3) * CONFIG_PGTABLE_LEVELS + 3)
46#define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT) 46#define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT)
47#define PGDIR_MASK (~(PGDIR_SIZE-1)) 47#define PGDIR_MASK (~(PGDIR_SIZE-1))
48#define PTRS_PER_PGD (1 << (VA_BITS - PGDIR_SHIFT)) 48#define PTRS_PER_PGD (1 << (VA_BITS - PGDIR_SHIFT))
diff --git a/arch/arm64/include/asm/pgtable-types.h b/arch/arm64/include/asm/pgtable-types.h
index ca9df80af896..2b1bd7e52c3b 100644
--- a/arch/arm64/include/asm/pgtable-types.h
+++ b/arch/arm64/include/asm/pgtable-types.h
@@ -38,13 +38,13 @@ typedef struct { pteval_t pte; } pte_t;
38#define pte_val(x) ((x).pte) 38#define pte_val(x) ((x).pte)
39#define __pte(x) ((pte_t) { (x) } ) 39#define __pte(x) ((pte_t) { (x) } )
40 40
41#if CONFIG_ARM64_PGTABLE_LEVELS > 2 41#if CONFIG_PGTABLE_LEVELS > 2
42typedef struct { pmdval_t pmd; } pmd_t; 42typedef struct { pmdval_t pmd; } pmd_t;
43#define pmd_val(x) ((x).pmd) 43#define pmd_val(x) ((x).pmd)
44#define __pmd(x) ((pmd_t) { (x) } ) 44#define __pmd(x) ((pmd_t) { (x) } )
45#endif 45#endif
46 46
47#if CONFIG_ARM64_PGTABLE_LEVELS > 3 47#if CONFIG_PGTABLE_LEVELS > 3
48typedef struct { pudval_t pud; } pud_t; 48typedef struct { pudval_t pud; } pud_t;
49#define pud_val(x) ((x).pud) 49#define pud_val(x) ((x).pud)
50#define __pud(x) ((pud_t) { (x) } ) 50#define __pud(x) ((pud_t) { (x) } )
@@ -64,13 +64,13 @@ typedef pteval_t pte_t;
64#define pte_val(x) (x) 64#define pte_val(x) (x)
65#define __pte(x) (x) 65#define __pte(x) (x)
66 66
67#if CONFIG_ARM64_PGTABLE_LEVELS > 2 67#if CONFIG_PGTABLE_LEVELS > 2
68typedef pmdval_t pmd_t; 68typedef pmdval_t pmd_t;
69#define pmd_val(x) (x) 69#define pmd_val(x) (x)
70#define __pmd(x) (x) 70#define __pmd(x) (x)
71#endif 71#endif
72 72
73#if CONFIG_ARM64_PGTABLE_LEVELS > 3 73#if CONFIG_PGTABLE_LEVELS > 3
74typedef pudval_t pud_t; 74typedef pudval_t pud_t;
75#define pud_val(x) (x) 75#define pud_val(x) (x)
76#define __pud(x) (x) 76#define __pud(x) (x)
@@ -86,9 +86,9 @@ typedef pteval_t pgprot_t;
86 86
87#endif /* STRICT_MM_TYPECHECKS */ 87#endif /* STRICT_MM_TYPECHECKS */
88 88
89#if CONFIG_ARM64_PGTABLE_LEVELS == 2 89#if CONFIG_PGTABLE_LEVELS == 2
90#include <asm-generic/pgtable-nopmd.h> 90#include <asm-generic/pgtable-nopmd.h>
91#elif CONFIG_ARM64_PGTABLE_LEVELS == 3 91#elif CONFIG_PGTABLE_LEVELS == 3
92#include <asm-generic/pgtable-nopud.h> 92#include <asm-generic/pgtable-nopud.h>
93#endif 93#endif
94 94
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 800ec0e87ed9..56283f8a675c 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -374,7 +374,7 @@ static inline pte_t *pmd_page_vaddr(pmd_t pmd)
374 */ 374 */
375#define mk_pte(page,prot) pfn_pte(page_to_pfn(page),prot) 375#define mk_pte(page,prot) pfn_pte(page_to_pfn(page),prot)
376 376
377#if CONFIG_ARM64_PGTABLE_LEVELS > 2 377#if CONFIG_PGTABLE_LEVELS > 2
378 378
379#define pmd_ERROR(pmd) __pmd_error(__FILE__, __LINE__, pmd_val(pmd)) 379#define pmd_ERROR(pmd) __pmd_error(__FILE__, __LINE__, pmd_val(pmd))
380 380
@@ -409,9 +409,9 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr)
409 409
410#define pud_page(pud) pfn_to_page(__phys_to_pfn(pud_val(pud) & PHYS_MASK)) 410#define pud_page(pud) pfn_to_page(__phys_to_pfn(pud_val(pud) & PHYS_MASK))
411 411
412#endif /* CONFIG_ARM64_PGTABLE_LEVELS > 2 */ 412#endif /* CONFIG_PGTABLE_LEVELS > 2 */
413 413
414#if CONFIG_ARM64_PGTABLE_LEVELS > 3 414#if CONFIG_PGTABLE_LEVELS > 3
415 415
416#define pud_ERROR(pud) __pud_error(__FILE__, __LINE__, pud_val(pud)) 416#define pud_ERROR(pud) __pud_error(__FILE__, __LINE__, pud_val(pud))
417 417
@@ -445,7 +445,7 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long addr)
445 445
446#define pgd_page(pgd) pfn_to_page(__phys_to_pfn(pgd_val(pgd) & PHYS_MASK)) 446#define pgd_page(pgd) pfn_to_page(__phys_to_pfn(pgd_val(pgd) & PHYS_MASK))
447 447
448#endif /* CONFIG_ARM64_PGTABLE_LEVELS > 3 */ 448#endif /* CONFIG_PGTABLE_LEVELS > 3 */
449 449
450#define pgd_ERROR(pgd) __pgd_error(__FILE__, __LINE__, pgd_val(pgd)) 450#define pgd_ERROR(pgd) __pgd_error(__FILE__, __LINE__, pgd_val(pgd))
451 451
diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h
index 53d9c354219f..3a0242c7eb8d 100644
--- a/arch/arm64/include/asm/tlb.h
+++ b/arch/arm64/include/asm/tlb.h
@@ -53,7 +53,7 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte,
53 tlb_remove_entry(tlb, pte); 53 tlb_remove_entry(tlb, pte);
54} 54}
55 55
56#if CONFIG_ARM64_PGTABLE_LEVELS > 2 56#if CONFIG_PGTABLE_LEVELS > 2
57static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp, 57static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp,
58 unsigned long addr) 58 unsigned long addr)
59{ 59{
@@ -62,7 +62,7 @@ static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp,
62} 62}
63#endif 63#endif
64 64
65#if CONFIG_ARM64_PGTABLE_LEVELS > 3 65#if CONFIG_PGTABLE_LEVELS > 3
66static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pudp, 66static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pudp,
67 unsigned long addr) 67 unsigned long addr)
68{ 68{
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index ae85da6307bb..597831bdddf3 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -190,6 +190,8 @@ void __init bootmem_init(void)
190 min = PFN_UP(memblock_start_of_DRAM()); 190 min = PFN_UP(memblock_start_of_DRAM());
191 max = PFN_DOWN(memblock_end_of_DRAM()); 191 max = PFN_DOWN(memblock_end_of_DRAM());
192 192
193 early_memtest(min << PAGE_SHIFT, max << PAGE_SHIFT);
194
193 /* 195 /*
194 * Sparsemem tries to allocate bootmem in memory_present(), so must be 196 * Sparsemem tries to allocate bootmem in memory_present(), so must be
195 * done after the fixed reservations. 197 * done after the fixed reservations.
diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c
index 54922d1275b8..ed177475dd8c 100644
--- a/arch/arm64/mm/mmap.c
+++ b/arch/arm64/mm/mmap.c
@@ -47,17 +47,16 @@ static int mmap_is_legacy(void)
47 return sysctl_legacy_va_layout; 47 return sysctl_legacy_va_layout;
48} 48}
49 49
50static unsigned long mmap_rnd(void) 50unsigned long arch_mmap_rnd(void)
51{ 51{
52 unsigned long rnd = 0; 52 unsigned long rnd;
53 53
54 if (current->flags & PF_RANDOMIZE) 54 rnd = (unsigned long)get_random_int() & STACK_RND_MASK;
55 rnd = (long)get_random_int() & STACK_RND_MASK;
56 55
57 return rnd << PAGE_SHIFT; 56 return rnd << PAGE_SHIFT;
58} 57}
59 58
60static unsigned long mmap_base(void) 59static unsigned long mmap_base(unsigned long rnd)
61{ 60{
62 unsigned long gap = rlimit(RLIMIT_STACK); 61 unsigned long gap = rlimit(RLIMIT_STACK);
63 62
@@ -66,7 +65,7 @@ static unsigned long mmap_base(void)
66 else if (gap > MAX_GAP) 65 else if (gap > MAX_GAP)
67 gap = MAX_GAP; 66 gap = MAX_GAP;
68 67
69 return PAGE_ALIGN(STACK_TOP - gap - mmap_rnd()); 68 return PAGE_ALIGN(STACK_TOP - gap - rnd);
70} 69}
71 70
72/* 71/*
@@ -75,15 +74,20 @@ static unsigned long mmap_base(void)
75 */ 74 */
76void arch_pick_mmap_layout(struct mm_struct *mm) 75void arch_pick_mmap_layout(struct mm_struct *mm)
77{ 76{
77 unsigned long random_factor = 0UL;
78
79 if (current->flags & PF_RANDOMIZE)
80 random_factor = arch_mmap_rnd();
81
78 /* 82 /*
79 * Fall back to the standard layout if the personality bit is set, or 83 * Fall back to the standard layout if the personality bit is set, or
80 * if the expected stack growth is unlimited: 84 * if the expected stack growth is unlimited:
81 */ 85 */
82 if (mmap_is_legacy()) { 86 if (mmap_is_legacy()) {
83 mm->mmap_base = TASK_UNMAPPED_BASE; 87 mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
84 mm->get_unmapped_area = arch_get_unmapped_area; 88 mm->get_unmapped_area = arch_get_unmapped_area;
85 } else { 89 } else {
86 mm->mmap_base = mmap_base(); 90 mm->mmap_base = mmap_base(random_factor);
87 mm->get_unmapped_area = arch_get_unmapped_area_topdown; 91 mm->get_unmapped_area = arch_get_unmapped_area_topdown;
88 } 92 }
89} 93}
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index c6daaf6c6f97..79e01163a981 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -550,10 +550,10 @@ void vmemmap_free(unsigned long start, unsigned long end)
550#endif /* CONFIG_SPARSEMEM_VMEMMAP */ 550#endif /* CONFIG_SPARSEMEM_VMEMMAP */
551 551
552static pte_t bm_pte[PTRS_PER_PTE] __page_aligned_bss; 552static pte_t bm_pte[PTRS_PER_PTE] __page_aligned_bss;
553#if CONFIG_ARM64_PGTABLE_LEVELS > 2 553#if CONFIG_PGTABLE_LEVELS > 2
554static pmd_t bm_pmd[PTRS_PER_PMD] __page_aligned_bss; 554static pmd_t bm_pmd[PTRS_PER_PMD] __page_aligned_bss;
555#endif 555#endif
556#if CONFIG_ARM64_PGTABLE_LEVELS > 3 556#if CONFIG_PGTABLE_LEVELS > 3
557static pud_t bm_pud[PTRS_PER_PUD] __page_aligned_bss; 557static pud_t bm_pud[PTRS_PER_PUD] __page_aligned_bss;
558#endif 558#endif
559 559
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 074e52bf815c..4f9a6661491b 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -1,3 +1,8 @@
1config PGTABLE_LEVELS
2 int "Page Table Levels" if !IA64_PAGE_SIZE_64KB
3 range 3 4 if !IA64_PAGE_SIZE_64KB
4 default 3
5
1source "init/Kconfig" 6source "init/Kconfig"
2 7
3source "kernel/Kconfig.freezer" 8source "kernel/Kconfig.freezer"
@@ -286,19 +291,6 @@ config IA64_PAGE_SIZE_64KB
286 291
287endchoice 292endchoice
288 293
289choice
290 prompt "Page Table Levels"
291 default PGTABLE_3
292
293config PGTABLE_3
294 bool "3 Levels"
295
296config PGTABLE_4
297 depends on !IA64_PAGE_SIZE_64KB
298 bool "4 Levels"
299
300endchoice
301
302if IA64_HP_SIM 294if IA64_HP_SIM
303config HZ 295config HZ
304 default 32 296 default 32
diff --git a/arch/ia64/include/asm/page.h b/arch/ia64/include/asm/page.h
index 1f1bf144fe62..ec48bb9f95e1 100644
--- a/arch/ia64/include/asm/page.h
+++ b/arch/ia64/include/asm/page.h
@@ -173,7 +173,7 @@ get_order (unsigned long size)
173 */ 173 */
174 typedef struct { unsigned long pte; } pte_t; 174 typedef struct { unsigned long pte; } pte_t;
175 typedef struct { unsigned long pmd; } pmd_t; 175 typedef struct { unsigned long pmd; } pmd_t;
176#ifdef CONFIG_PGTABLE_4 176#if CONFIG_PGTABLE_LEVELS == 4
177 typedef struct { unsigned long pud; } pud_t; 177 typedef struct { unsigned long pud; } pud_t;
178#endif 178#endif
179 typedef struct { unsigned long pgd; } pgd_t; 179 typedef struct { unsigned long pgd; } pgd_t;
@@ -182,7 +182,7 @@ get_order (unsigned long size)
182 182
183# define pte_val(x) ((x).pte) 183# define pte_val(x) ((x).pte)
184# define pmd_val(x) ((x).pmd) 184# define pmd_val(x) ((x).pmd)
185#ifdef CONFIG_PGTABLE_4 185#if CONFIG_PGTABLE_LEVELS == 4
186# define pud_val(x) ((x).pud) 186# define pud_val(x) ((x).pud)
187#endif 187#endif
188# define pgd_val(x) ((x).pgd) 188# define pgd_val(x) ((x).pgd)
diff --git a/arch/ia64/include/asm/pgalloc.h b/arch/ia64/include/asm/pgalloc.h
index 5767cdfc08db..f5e70e961948 100644
--- a/arch/ia64/include/asm/pgalloc.h
+++ b/arch/ia64/include/asm/pgalloc.h
@@ -32,7 +32,7 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
32 quicklist_free(0, NULL, pgd); 32 quicklist_free(0, NULL, pgd);
33} 33}
34 34
35#ifdef CONFIG_PGTABLE_4 35#if CONFIG_PGTABLE_LEVELS == 4
36static inline void 36static inline void
37pgd_populate(struct mm_struct *mm, pgd_t * pgd_entry, pud_t * pud) 37pgd_populate(struct mm_struct *mm, pgd_t * pgd_entry, pud_t * pud)
38{ 38{
@@ -49,7 +49,7 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud)
49 quicklist_free(0, NULL, pud); 49 quicklist_free(0, NULL, pud);
50} 50}
51#define __pud_free_tlb(tlb, pud, address) pud_free((tlb)->mm, pud) 51#define __pud_free_tlb(tlb, pud, address) pud_free((tlb)->mm, pud)
52#endif /* CONFIG_PGTABLE_4 */ 52#endif /* CONFIG_PGTABLE_LEVELS == 4 */
53 53
54static inline void 54static inline void
55pud_populate(struct mm_struct *mm, pud_t * pud_entry, pmd_t * pmd) 55pud_populate(struct mm_struct *mm, pud_t * pud_entry, pmd_t * pmd)
diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h
index 7b6f8801df57..9f3ed9ee8f13 100644
--- a/arch/ia64/include/asm/pgtable.h
+++ b/arch/ia64/include/asm/pgtable.h
@@ -99,7 +99,7 @@
99#define PMD_MASK (~(PMD_SIZE-1)) 99#define PMD_MASK (~(PMD_SIZE-1))
100#define PTRS_PER_PMD (1UL << (PTRS_PER_PTD_SHIFT)) 100#define PTRS_PER_PMD (1UL << (PTRS_PER_PTD_SHIFT))
101 101
102#ifdef CONFIG_PGTABLE_4 102#if CONFIG_PGTABLE_LEVELS == 4
103/* 103/*
104 * Definitions for second level: 104 * Definitions for second level:
105 * 105 *
@@ -117,7 +117,7 @@
117 * 117 *
118 * PGDIR_SHIFT determines what a first-level page table entry can map. 118 * PGDIR_SHIFT determines what a first-level page table entry can map.
119 */ 119 */
120#ifdef CONFIG_PGTABLE_4 120#if CONFIG_PGTABLE_LEVELS == 4
121#define PGDIR_SHIFT (PUD_SHIFT + (PTRS_PER_PTD_SHIFT)) 121#define PGDIR_SHIFT (PUD_SHIFT + (PTRS_PER_PTD_SHIFT))
122#else 122#else
123#define PGDIR_SHIFT (PMD_SHIFT + (PTRS_PER_PTD_SHIFT)) 123#define PGDIR_SHIFT (PMD_SHIFT + (PTRS_PER_PTD_SHIFT))
@@ -180,7 +180,7 @@
180#define __S111 __pgprot(__ACCESS_BITS | _PAGE_PL_3 | _PAGE_AR_RWX) 180#define __S111 __pgprot(__ACCESS_BITS | _PAGE_PL_3 | _PAGE_AR_RWX)
181 181
182#define pgd_ERROR(e) printk("%s:%d: bad pgd %016lx.\n", __FILE__, __LINE__, pgd_val(e)) 182#define pgd_ERROR(e) printk("%s:%d: bad pgd %016lx.\n", __FILE__, __LINE__, pgd_val(e))
183#ifdef CONFIG_PGTABLE_4 183#if CONFIG_PGTABLE_LEVELS == 4
184#define pud_ERROR(e) printk("%s:%d: bad pud %016lx.\n", __FILE__, __LINE__, pud_val(e)) 184#define pud_ERROR(e) printk("%s:%d: bad pud %016lx.\n", __FILE__, __LINE__, pud_val(e))
185#endif 185#endif
186#define pmd_ERROR(e) printk("%s:%d: bad pmd %016lx.\n", __FILE__, __LINE__, pmd_val(e)) 186#define pmd_ERROR(e) printk("%s:%d: bad pmd %016lx.\n", __FILE__, __LINE__, pmd_val(e))
@@ -281,7 +281,7 @@ extern unsigned long VMALLOC_END;
281#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & _PFN_MASK)) 281#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & _PFN_MASK))
282#define pud_page(pud) virt_to_page((pud_val(pud) + PAGE_OFFSET)) 282#define pud_page(pud) virt_to_page((pud_val(pud) + PAGE_OFFSET))
283 283
284#ifdef CONFIG_PGTABLE_4 284#if CONFIG_PGTABLE_LEVELS == 4
285#define pgd_none(pgd) (!pgd_val(pgd)) 285#define pgd_none(pgd) (!pgd_val(pgd))
286#define pgd_bad(pgd) (!ia64_phys_addr_valid(pgd_val(pgd))) 286#define pgd_bad(pgd) (!ia64_phys_addr_valid(pgd_val(pgd)))
287#define pgd_present(pgd) (pgd_val(pgd) != 0UL) 287#define pgd_present(pgd) (pgd_val(pgd) != 0UL)
@@ -384,7 +384,7 @@ pgd_offset (const struct mm_struct *mm, unsigned long address)
384 here. */ 384 here. */
385#define pgd_offset_gate(mm, addr) pgd_offset_k(addr) 385#define pgd_offset_gate(mm, addr) pgd_offset_k(addr)
386 386
387#ifdef CONFIG_PGTABLE_4 387#if CONFIG_PGTABLE_LEVELS == 4
388/* Find an entry in the second-level page table.. */ 388/* Find an entry in the second-level page table.. */
389#define pud_offset(dir,addr) \ 389#define pud_offset(dir,addr) \
390 ((pud_t *) pgd_page_vaddr(*(dir)) + (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))) 390 ((pud_t *) pgd_page_vaddr(*(dir)) + (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1)))
@@ -586,7 +586,7 @@ extern struct page *zero_page_memmap_ptr;
586#define __HAVE_ARCH_PGD_OFFSET_GATE 586#define __HAVE_ARCH_PGD_OFFSET_GATE
587 587
588 588
589#ifndef CONFIG_PGTABLE_4 589#if CONFIG_PGTABLE_LEVELS == 3
590#include <asm-generic/pgtable-nopud.h> 590#include <asm-generic/pgtable-nopud.h>
591#endif 591#endif
592#include <asm-generic/pgtable.h> 592#include <asm-generic/pgtable.h>
diff --git a/arch/ia64/kernel/ivt.S b/arch/ia64/kernel/ivt.S
index 18e794a57248..e42bf7a913f3 100644
--- a/arch/ia64/kernel/ivt.S
+++ b/arch/ia64/kernel/ivt.S
@@ -146,7 +146,7 @@ ENTRY(vhpt_miss)
146(p6) dep r17=r18,r19,3,(PAGE_SHIFT-3) // r17=pgd_offset for region 5 146(p6) dep r17=r18,r19,3,(PAGE_SHIFT-3) // r17=pgd_offset for region 5
147(p7) dep r17=r18,r17,3,(PAGE_SHIFT-6) // r17=pgd_offset for region[0-4] 147(p7) dep r17=r18,r17,3,(PAGE_SHIFT-6) // r17=pgd_offset for region[0-4]
148 cmp.eq p7,p6=0,r21 // unused address bits all zeroes? 148 cmp.eq p7,p6=0,r21 // unused address bits all zeroes?
149#ifdef CONFIG_PGTABLE_4 149#if CONFIG_PGTABLE_LEVELS == 4
150 shr.u r28=r22,PUD_SHIFT // shift pud index into position 150 shr.u r28=r22,PUD_SHIFT // shift pud index into position
151#else 151#else
152 shr.u r18=r22,PMD_SHIFT // shift pmd index into position 152 shr.u r18=r22,PMD_SHIFT // shift pmd index into position
@@ -155,7 +155,7 @@ ENTRY(vhpt_miss)
155 ld8 r17=[r17] // get *pgd (may be 0) 155 ld8 r17=[r17] // get *pgd (may be 0)
156 ;; 156 ;;
157(p7) cmp.eq p6,p7=r17,r0 // was pgd_present(*pgd) == NULL? 157(p7) cmp.eq p6,p7=r17,r0 // was pgd_present(*pgd) == NULL?
158#ifdef CONFIG_PGTABLE_4 158#if CONFIG_PGTABLE_LEVELS == 4
159 dep r28=r28,r17,3,(PAGE_SHIFT-3) // r28=pud_offset(pgd,addr) 159 dep r28=r28,r17,3,(PAGE_SHIFT-3) // r28=pud_offset(pgd,addr)
160 ;; 160 ;;
161 shr.u r18=r22,PMD_SHIFT // shift pmd index into position 161 shr.u r18=r22,PMD_SHIFT // shift pmd index into position
@@ -222,13 +222,13 @@ ENTRY(vhpt_miss)
222 */ 222 */
223 ld8 r25=[r21] // read *pte again 223 ld8 r25=[r21] // read *pte again
224 ld8 r26=[r17] // read *pmd again 224 ld8 r26=[r17] // read *pmd again
225#ifdef CONFIG_PGTABLE_4 225#if CONFIG_PGTABLE_LEVELS == 4
226 ld8 r19=[r28] // read *pud again 226 ld8 r19=[r28] // read *pud again
227#endif 227#endif
228 cmp.ne p6,p7=r0,r0 228 cmp.ne p6,p7=r0,r0
229 ;; 229 ;;
230 cmp.ne.or.andcm p6,p7=r26,r20 // did *pmd change 230 cmp.ne.or.andcm p6,p7=r26,r20 // did *pmd change
231#ifdef CONFIG_PGTABLE_4 231#if CONFIG_PGTABLE_LEVELS == 4
232 cmp.ne.or.andcm p6,p7=r19,r29 // did *pud change 232 cmp.ne.or.andcm p6,p7=r19,r29 // did *pud change
233#endif 233#endif
234 mov r27=PAGE_SHIFT<<2 234 mov r27=PAGE_SHIFT<<2
@@ -476,7 +476,7 @@ ENTRY(nested_dtlb_miss)
476(p6) dep r17=r18,r19,3,(PAGE_SHIFT-3) // r17=pgd_offset for region 5 476(p6) dep r17=r18,r19,3,(PAGE_SHIFT-3) // r17=pgd_offset for region 5
477(p7) dep r17=r18,r17,3,(PAGE_SHIFT-6) // r17=pgd_offset for region[0-4] 477(p7) dep r17=r18,r17,3,(PAGE_SHIFT-6) // r17=pgd_offset for region[0-4]
478 cmp.eq p7,p6=0,r21 // unused address bits all zeroes? 478 cmp.eq p7,p6=0,r21 // unused address bits all zeroes?
479#ifdef CONFIG_PGTABLE_4 479#if CONFIG_PGTABLE_LEVELS == 4
480 shr.u r18=r22,PUD_SHIFT // shift pud index into position 480 shr.u r18=r22,PUD_SHIFT // shift pud index into position
481#else 481#else
482 shr.u r18=r22,PMD_SHIFT // shift pmd index into position 482 shr.u r18=r22,PMD_SHIFT // shift pmd index into position
@@ -487,7 +487,7 @@ ENTRY(nested_dtlb_miss)
487(p7) cmp.eq p6,p7=r17,r0 // was pgd_present(*pgd) == NULL? 487(p7) cmp.eq p6,p7=r17,r0 // was pgd_present(*pgd) == NULL?
488 dep r17=r18,r17,3,(PAGE_SHIFT-3) // r17=p[u|m]d_offset(pgd,addr) 488 dep r17=r18,r17,3,(PAGE_SHIFT-3) // r17=p[u|m]d_offset(pgd,addr)
489 ;; 489 ;;
490#ifdef CONFIG_PGTABLE_4 490#if CONFIG_PGTABLE_LEVELS == 4
491(p7) ld8 r17=[r17] // get *pud (may be 0) 491(p7) ld8 r17=[r17] // get *pud (may be 0)
492 shr.u r18=r22,PMD_SHIFT // shift pmd index into position 492 shr.u r18=r22,PMD_SHIFT // shift pmd index into position
493 ;; 493 ;;
diff --git a/arch/ia64/kernel/machine_kexec.c b/arch/ia64/kernel/machine_kexec.c
index 5151a649c96b..b72cd7a07222 100644
--- a/arch/ia64/kernel/machine_kexec.c
+++ b/arch/ia64/kernel/machine_kexec.c
@@ -156,9 +156,9 @@ void arch_crash_save_vmcoreinfo(void)
156 VMCOREINFO_OFFSET(node_memblk_s, start_paddr); 156 VMCOREINFO_OFFSET(node_memblk_s, start_paddr);
157 VMCOREINFO_OFFSET(node_memblk_s, size); 157 VMCOREINFO_OFFSET(node_memblk_s, size);
158#endif 158#endif
159#ifdef CONFIG_PGTABLE_3 159#if CONFIG_PGTABLE_LEVELS == 3
160 VMCOREINFO_CONFIG(PGTABLE_3); 160 VMCOREINFO_CONFIG(PGTABLE_3);
161#elif defined(CONFIG_PGTABLE_4) 161#elif CONFIG_PGTABLE_LEVELS == 4
162 VMCOREINFO_CONFIG(PGTABLE_4); 162 VMCOREINFO_CONFIG(PGTABLE_4);
163#endif 163#endif
164} 164}
diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig
index 87b7c7581b1d..2dd8f63bfbbb 100644
--- a/arch/m68k/Kconfig
+++ b/arch/m68k/Kconfig
@@ -67,6 +67,10 @@ config HZ
67 default 1000 if CLEOPATRA 67 default 1000 if CLEOPATRA
68 default 100 68 default 100
69 69
70config PGTABLE_LEVELS
71 default 2 if SUN3 || COLDFIRE
72 default 3
73
70source "init/Kconfig" 74source "init/Kconfig"
71 75
72source "kernel/Kconfig.freezer" 76source "kernel/Kconfig.freezer"
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index c7a16904cd03..a326c4cb8cf0 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -23,7 +23,7 @@ config MIPS
23 select HAVE_KRETPROBES 23 select HAVE_KRETPROBES
24 select HAVE_DEBUG_KMEMLEAK 24 select HAVE_DEBUG_KMEMLEAK
25 select HAVE_SYSCALL_TRACEPOINTS 25 select HAVE_SYSCALL_TRACEPOINTS
26 select ARCH_BINFMT_ELF_RANDOMIZE_PIE 26 select ARCH_HAS_ELF_RANDOMIZE
27 select HAVE_ARCH_TRANSPARENT_HUGEPAGE if CPU_SUPPORTS_HUGEPAGES && 64BIT 27 select HAVE_ARCH_TRANSPARENT_HUGEPAGE if CPU_SUPPORTS_HUGEPAGES && 64BIT
28 select RTC_LIB if !MACH_LOONGSON 28 select RTC_LIB if !MACH_LOONGSON
29 select GENERIC_ATOMIC64 if !64BIT 29 select GENERIC_ATOMIC64 if !64BIT
@@ -2600,6 +2600,11 @@ config STACKTRACE_SUPPORT
2600 bool 2600 bool
2601 default y 2601 default y
2602 2602
2603config PGTABLE_LEVELS
2604 int
2605 default 3 if 64BIT && !PAGE_SIZE_64KB
2606 default 2
2607
2603source "init/Kconfig" 2608source "init/Kconfig"
2604 2609
2605source "kernel/Kconfig.freezer" 2610source "kernel/Kconfig.freezer"
diff --git a/arch/mips/include/asm/elf.h b/arch/mips/include/asm/elf.h
index 535f196ffe02..31d747d46a23 100644
--- a/arch/mips/include/asm/elf.h
+++ b/arch/mips/include/asm/elf.h
@@ -410,10 +410,6 @@ struct linux_binprm;
410extern int arch_setup_additional_pages(struct linux_binprm *bprm, 410extern int arch_setup_additional_pages(struct linux_binprm *bprm,
411 int uses_interp); 411 int uses_interp);
412 412
413struct mm_struct;
414extern unsigned long arch_randomize_brk(struct mm_struct *mm);
415#define arch_randomize_brk arch_randomize_brk
416
417struct arch_elf_state { 413struct arch_elf_state {
418 int fp_abi; 414 int fp_abi;
419 int interp_fp_abi; 415 int interp_fp_abi;
diff --git a/arch/mips/mm/mmap.c b/arch/mips/mm/mmap.c
index f1baadd56e82..5c81fdd032c3 100644
--- a/arch/mips/mm/mmap.c
+++ b/arch/mips/mm/mmap.c
@@ -142,18 +142,26 @@ unsigned long arch_get_unmapped_area_topdown(struct file *filp,
142 addr0, len, pgoff, flags, DOWN); 142 addr0, len, pgoff, flags, DOWN);
143} 143}
144 144
145unsigned long arch_mmap_rnd(void)
146{
147 unsigned long rnd;
148
149 rnd = (unsigned long)get_random_int();
150 rnd <<= PAGE_SHIFT;
151 if (TASK_IS_32BIT_ADDR)
152 rnd &= 0xfffffful;
153 else
154 rnd &= 0xffffffful;
155
156 return rnd;
157}
158
145void arch_pick_mmap_layout(struct mm_struct *mm) 159void arch_pick_mmap_layout(struct mm_struct *mm)
146{ 160{
147 unsigned long random_factor = 0UL; 161 unsigned long random_factor = 0UL;
148 162
149 if (current->flags & PF_RANDOMIZE) { 163 if (current->flags & PF_RANDOMIZE)
150 random_factor = get_random_int(); 164 random_factor = arch_mmap_rnd();
151 random_factor = random_factor << PAGE_SHIFT;
152 if (TASK_IS_32BIT_ADDR)
153 random_factor &= 0xfffffful;
154 else
155 random_factor &= 0xffffffful;
156 }
157 165
158 if (mmap_is_legacy()) { 166 if (mmap_is_legacy()) {
159 mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; 167 mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
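
The hunk above moves the MIPS randomization math out of arch_pick_mmap_layout() and into the new arch_mmap_rnd() helper, leaving only the PF_RANDOMIZE check at the call site. A minimal user-space model of that mask arithmetic, assuming 4 KB pages and using rand() as a crude stand-in for get_random_int():

    #include <stdio.h>
    #include <stdlib.h>

    #define PAGE_SHIFT 12UL   /* assumption: 4 KB pages */

    /* Model of the masking done by the new MIPS arch_mmap_rnd(). */
    static unsigned long model_mmap_rnd(int is_32bit)
    {
        unsigned long rnd = (unsigned long)rand();  /* stand-in for get_random_int() */

        rnd <<= PAGE_SHIFT;
        if (is_32bit)
            rnd &= 0xfffffful;   /* page-aligned offsets below 16 MB */
        else
            rnd &= 0xffffffful;  /* page-aligned offsets below 256 MB */
        return rnd;
    }

    int main(void)
    {
        printf("32-bit offset: %#lx\n", model_mmap_rnd(1));
        printf("64-bit offset: %#lx\n", model_mmap_rnd(0));
        return 0;
    }
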
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index 8014727a2743..c36546959e86 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -103,6 +103,11 @@ config ARCH_MAY_HAVE_PC_FDC
103 depends on BROKEN 103 depends on BROKEN
104 default y 104 default y
105 105
106config PGTABLE_LEVELS
107 int
108 default 3 if 64BIT && PARISC_PAGE_SIZE_4KB
109 default 2
110
106source "init/Kconfig" 111source "init/Kconfig"
107 112
108source "kernel/Kconfig.freezer" 113source "kernel/Kconfig.freezer"
diff --git a/arch/parisc/include/asm/pgalloc.h b/arch/parisc/include/asm/pgalloc.h
index d17437238a2c..1ba29369257c 100644
--- a/arch/parisc/include/asm/pgalloc.h
+++ b/arch/parisc/include/asm/pgalloc.h
@@ -51,7 +51,7 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
51 free_pages((unsigned long)pgd, PGD_ALLOC_ORDER); 51 free_pages((unsigned long)pgd, PGD_ALLOC_ORDER);
52} 52}
53 53
54#if PT_NLEVELS == 3 54#if CONFIG_PGTABLE_LEVELS == 3
55 55
56/* Three Level Page Table Support for pmd's */ 56/* Three Level Page Table Support for pmd's */
57 57
diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h
index 15207b9362bf..0a183756d6ec 100644
--- a/arch/parisc/include/asm/pgtable.h
+++ b/arch/parisc/include/asm/pgtable.h
@@ -68,13 +68,11 @@ extern void purge_tlb_entries(struct mm_struct *, unsigned long);
68#define KERNEL_INITIAL_ORDER 24 /* 0 to 1<<24 = 16MB */ 68#define KERNEL_INITIAL_ORDER 24 /* 0 to 1<<24 = 16MB */
69#define KERNEL_INITIAL_SIZE (1 << KERNEL_INITIAL_ORDER) 69#define KERNEL_INITIAL_SIZE (1 << KERNEL_INITIAL_ORDER)
70 70
71#if defined(CONFIG_64BIT) && defined(CONFIG_PARISC_PAGE_SIZE_4KB) 71#if CONFIG_PGTABLE_LEVELS == 3
72#define PT_NLEVELS 3
73#define PGD_ORDER 1 /* Number of pages per pgd */ 72#define PGD_ORDER 1 /* Number of pages per pgd */
74#define PMD_ORDER 1 /* Number of pages per pmd */ 73#define PMD_ORDER 1 /* Number of pages per pmd */
75#define PGD_ALLOC_ORDER 2 /* first pgd contains pmd */ 74#define PGD_ALLOC_ORDER 2 /* first pgd contains pmd */
76#else 75#else
77#define PT_NLEVELS 2
78#define PGD_ORDER 1 /* Number of pages per pgd */ 76#define PGD_ORDER 1 /* Number of pages per pgd */
79#define PGD_ALLOC_ORDER PGD_ORDER 77#define PGD_ALLOC_ORDER PGD_ORDER
80#endif 78#endif
@@ -93,7 +91,7 @@ extern void purge_tlb_entries(struct mm_struct *, unsigned long);
93#define PMD_SHIFT (PLD_SHIFT + BITS_PER_PTE) 91#define PMD_SHIFT (PLD_SHIFT + BITS_PER_PTE)
94#define PMD_SIZE (1UL << PMD_SHIFT) 92#define PMD_SIZE (1UL << PMD_SHIFT)
95#define PMD_MASK (~(PMD_SIZE-1)) 93#define PMD_MASK (~(PMD_SIZE-1))
96#if PT_NLEVELS == 3 94#if CONFIG_PGTABLE_LEVELS == 3
97#define BITS_PER_PMD (PAGE_SHIFT + PMD_ORDER - BITS_PER_PMD_ENTRY) 95#define BITS_PER_PMD (PAGE_SHIFT + PMD_ORDER - BITS_PER_PMD_ENTRY)
98#else 96#else
99#define __PAGETABLE_PMD_FOLDED 97#define __PAGETABLE_PMD_FOLDED
@@ -277,7 +275,7 @@ extern unsigned long *empty_zero_page;
277#define pgd_flag(x) (pgd_val(x) & PxD_FLAG_MASK) 275#define pgd_flag(x) (pgd_val(x) & PxD_FLAG_MASK)
278#define pgd_address(x) ((unsigned long)(pgd_val(x) &~ PxD_FLAG_MASK) << PxD_VALUE_SHIFT) 276#define pgd_address(x) ((unsigned long)(pgd_val(x) &~ PxD_FLAG_MASK) << PxD_VALUE_SHIFT)
279 277
280#if PT_NLEVELS == 3 278#if CONFIG_PGTABLE_LEVELS == 3
281/* The first entry of the permanent pmd is not there if it contains 279/* The first entry of the permanent pmd is not there if it contains
282 * the gateway marker */ 280 * the gateway marker */
283#define pmd_none(x) (!pmd_val(x) || pmd_flag(x) == PxD_FLAG_ATTACHED) 281#define pmd_none(x) (!pmd_val(x) || pmd_flag(x) == PxD_FLAG_ATTACHED)
@@ -287,7 +285,7 @@ extern unsigned long *empty_zero_page;
287#define pmd_bad(x) (!(pmd_flag(x) & PxD_FLAG_VALID)) 285#define pmd_bad(x) (!(pmd_flag(x) & PxD_FLAG_VALID))
288#define pmd_present(x) (pmd_flag(x) & PxD_FLAG_PRESENT) 286#define pmd_present(x) (pmd_flag(x) & PxD_FLAG_PRESENT)
289static inline void pmd_clear(pmd_t *pmd) { 287static inline void pmd_clear(pmd_t *pmd) {
290#if PT_NLEVELS == 3 288#if CONFIG_PGTABLE_LEVELS == 3
291 if (pmd_flag(*pmd) & PxD_FLAG_ATTACHED) 289 if (pmd_flag(*pmd) & PxD_FLAG_ATTACHED)
292 /* This is the entry pointing to the permanent pmd 290 /* This is the entry pointing to the permanent pmd
293 * attached to the pgd; cannot clear it */ 291 * attached to the pgd; cannot clear it */
@@ -299,7 +297,7 @@ static inline void pmd_clear(pmd_t *pmd) {
299 297
300 298
301 299
302#if PT_NLEVELS == 3 300#if CONFIG_PGTABLE_LEVELS == 3
303#define pgd_page_vaddr(pgd) ((unsigned long) __va(pgd_address(pgd))) 301#define pgd_page_vaddr(pgd) ((unsigned long) __va(pgd_address(pgd)))
304#define pgd_page(pgd) virt_to_page((void *)pgd_page_vaddr(pgd)) 302#define pgd_page(pgd) virt_to_page((void *)pgd_page_vaddr(pgd))
305 303
@@ -309,7 +307,7 @@ static inline void pmd_clear(pmd_t *pmd) {
309#define pgd_bad(x) (!(pgd_flag(x) & PxD_FLAG_VALID)) 307#define pgd_bad(x) (!(pgd_flag(x) & PxD_FLAG_VALID))
310#define pgd_present(x) (pgd_flag(x) & PxD_FLAG_PRESENT) 308#define pgd_present(x) (pgd_flag(x) & PxD_FLAG_PRESENT)
311static inline void pgd_clear(pgd_t *pgd) { 309static inline void pgd_clear(pgd_t *pgd) {
312#if PT_NLEVELS == 3 310#if CONFIG_PGTABLE_LEVELS == 3
313 if(pgd_flag(*pgd) & PxD_FLAG_ATTACHED) 311 if(pgd_flag(*pgd) & PxD_FLAG_ATTACHED)
314 /* This is the permanent pmd attached to the pgd; cannot 312 /* This is the permanent pmd attached to the pgd; cannot
315 * free it */ 313 * free it */
@@ -393,7 +391,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
393 391
394/* Find an entry in the second-level page table.. */ 392/* Find an entry in the second-level page table.. */
395 393
396#if PT_NLEVELS == 3 394#if CONFIG_PGTABLE_LEVELS == 3
397#define pmd_offset(dir,address) \ 395#define pmd_offset(dir,address) \
398((pmd_t *) pgd_page_vaddr(*(dir)) + (((address)>>PMD_SHIFT) & (PTRS_PER_PMD-1))) 396((pmd_t *) pgd_page_vaddr(*(dir)) + (((address)>>PMD_SHIFT) & (PTRS_PER_PMD-1)))
399#else 397#else
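
Throughout the parisc headers above, the locally defined PT_NLEVELS is dropped in favour of CONFIG_PGTABLE_LEVELS, which Kconfig now provides as an integer the preprocessor can compare directly. A stand-alone sketch of the pattern, with the value hard-coded here only for illustration:

    #include <stdio.h>

    /* Assumption for the sketch: pretend Kconfig selected a 3-level layout. */
    #define CONFIG_PGTABLE_LEVELS 3

    #if CONFIG_PGTABLE_LEVELS == 3
    #define PGD_ORDER 1   /* pages per pgd, mirroring the parisc header */
    #define PMD_ORDER 1   /* pages per pmd */
    #else
    #define PGD_ORDER 1
    #endif

    int main(void)
    {
        printf("levels=%d pgd_order=%d\n", CONFIG_PGTABLE_LEVELS, PGD_ORDER);
    #if CONFIG_PGTABLE_LEVELS == 3
        printf("pmd_order=%d\n", PMD_ORDER);
    #endif
        return 0;
    }
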
diff --git a/arch/parisc/kernel/entry.S b/arch/parisc/kernel/entry.S
index 2ab16bb160a8..75819617f93b 100644
--- a/arch/parisc/kernel/entry.S
+++ b/arch/parisc/kernel/entry.S
@@ -398,7 +398,7 @@
398 * can address up to 1TB 398 * can address up to 1TB
399 */ 399 */
400 .macro L2_ptep pmd,pte,index,va,fault 400 .macro L2_ptep pmd,pte,index,va,fault
401#if PT_NLEVELS == 3 401#if CONFIG_PGTABLE_LEVELS == 3
402 extru \va,31-ASM_PMD_SHIFT,ASM_BITS_PER_PMD,\index 402 extru \va,31-ASM_PMD_SHIFT,ASM_BITS_PER_PMD,\index
403#else 403#else
404# if defined(CONFIG_64BIT) 404# if defined(CONFIG_64BIT)
@@ -436,7 +436,7 @@
436 * all ILP32 processes and all the kernel for machines with 436 * all ILP32 processes and all the kernel for machines with
437 * under 4GB of memory) */ 437 * under 4GB of memory) */
438 .macro L3_ptep pgd,pte,index,va,fault 438 .macro L3_ptep pgd,pte,index,va,fault
439#if PT_NLEVELS == 3 /* we might have a 2-Level scheme, e.g. with 16kb page size */ 439#if CONFIG_PGTABLE_LEVELS == 3 /* we might have a 2-Level scheme, e.g. with 16kb page size */
440 extrd,u \va,63-ASM_PGDIR_SHIFT,ASM_BITS_PER_PGD,\index 440 extrd,u \va,63-ASM_PGDIR_SHIFT,ASM_BITS_PER_PGD,\index
441 copy %r0,\pte 441 copy %r0,\pte
442 extrd,u,*= \va,63-ASM_PGDIR_SHIFT,64-ASM_PGDIR_SHIFT,%r0 442 extrd,u,*= \va,63-ASM_PGDIR_SHIFT,64-ASM_PGDIR_SHIFT,%r0
diff --git a/arch/parisc/kernel/head.S b/arch/parisc/kernel/head.S
index d4dc588c0dc1..e7d64527aff9 100644
--- a/arch/parisc/kernel/head.S
+++ b/arch/parisc/kernel/head.S
@@ -74,7 +74,7 @@ $bss_loop:
74 mtctl %r4,%cr24 /* Initialize kernel root pointer */ 74 mtctl %r4,%cr24 /* Initialize kernel root pointer */
75 mtctl %r4,%cr25 /* Initialize user root pointer */ 75 mtctl %r4,%cr25 /* Initialize user root pointer */
76 76
77#if PT_NLEVELS == 3 77#if CONFIG_PGTABLE_LEVELS == 3
78 /* Set pmd in pgd */ 78 /* Set pmd in pgd */
79 load32 PA(pmd0),%r5 79 load32 PA(pmd0),%r5
80 shrd %r5,PxD_VALUE_SHIFT,%r3 80 shrd %r5,PxD_VALUE_SHIFT,%r3
@@ -97,7 +97,7 @@ $bss_loop:
97 stw %r3,0(%r4) 97 stw %r3,0(%r4)
98 ldo (PAGE_SIZE >> PxD_VALUE_SHIFT)(%r3),%r3 98 ldo (PAGE_SIZE >> PxD_VALUE_SHIFT)(%r3),%r3
99 addib,> -1,%r1,1b 99 addib,> -1,%r1,1b
100#if PT_NLEVELS == 3 100#if CONFIG_PGTABLE_LEVELS == 3
101 ldo ASM_PMD_ENTRY_SIZE(%r4),%r4 101 ldo ASM_PMD_ENTRY_SIZE(%r4),%r4
102#else 102#else
103 ldo ASM_PGD_ENTRY_SIZE(%r4),%r4 103 ldo ASM_PGD_ENTRY_SIZE(%r4),%r4
diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c
index 15dbe81cf5f3..c229427fa546 100644
--- a/arch/parisc/mm/init.c
+++ b/arch/parisc/mm/init.c
@@ -34,7 +34,7 @@
34extern int data_start; 34extern int data_start;
35extern void parisc_kernel_start(void); /* Kernel entry point in head.S */ 35extern void parisc_kernel_start(void); /* Kernel entry point in head.S */
36 36
37#if PT_NLEVELS == 3 37#if CONFIG_PGTABLE_LEVELS == 3
38/* NOTE: This layout exactly conforms to the hybrid L2/L3 page table layout 38/* NOTE: This layout exactly conforms to the hybrid L2/L3 page table layout
39 * with the first pmd adjacent to the pgd and below it. gcc doesn't actually 39 * with the first pmd adjacent to the pgd and below it. gcc doesn't actually
40 * guarantee that global objects will be laid out in memory in the same order 40 * guarantee that global objects will be laid out in memory in the same order
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 22b0940494bb..e99014adf017 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -88,7 +88,7 @@ config PPC
88 select ARCH_MIGHT_HAVE_PC_PARPORT 88 select ARCH_MIGHT_HAVE_PC_PARPORT
89 select ARCH_MIGHT_HAVE_PC_SERIO 89 select ARCH_MIGHT_HAVE_PC_SERIO
90 select BINFMT_ELF 90 select BINFMT_ELF
91 select ARCH_BINFMT_ELF_RANDOMIZE_PIE 91 select ARCH_HAS_ELF_RANDOMIZE
92 select OF 92 select OF
93 select OF_EARLY_FLATTREE 93 select OF_EARLY_FLATTREE
94 select OF_RESERVED_MEM 94 select OF_RESERVED_MEM
@@ -297,6 +297,12 @@ config ZONE_DMA32
297 bool 297 bool
298 default y if PPC64 298 default y if PPC64
299 299
300config PGTABLE_LEVELS
301 int
302 default 2 if !PPC64
303 default 3 if PPC_64K_PAGES
304 default 4
305
300source "init/Kconfig" 306source "init/Kconfig"
301 307
302source "kernel/Kconfig.freezer" 308source "kernel/Kconfig.freezer"
diff --git a/arch/powerpc/include/asm/elf.h b/arch/powerpc/include/asm/elf.h
index 57d289acb803..ee46ffef608e 100644
--- a/arch/powerpc/include/asm/elf.h
+++ b/arch/powerpc/include/asm/elf.h
@@ -128,10 +128,6 @@ extern int arch_setup_additional_pages(struct linux_binprm *bprm,
128 (0x7ff >> (PAGE_SHIFT - 12)) : \ 128 (0x7ff >> (PAGE_SHIFT - 12)) : \
129 (0x3ffff >> (PAGE_SHIFT - 12))) 129 (0x3ffff >> (PAGE_SHIFT - 12)))
130 130
131extern unsigned long arch_randomize_brk(struct mm_struct *mm);
132#define arch_randomize_brk arch_randomize_brk
133
134
135#ifdef CONFIG_SPU_BASE 131#ifdef CONFIG_SPU_BASE
136/* Notes used in ET_CORE. Note name is "SPU/<fd>/<filename>". */ 132/* Notes used in ET_CORE. Note name is "SPU/<fd>/<filename>". */
137#define NT_SPU 1 133#define NT_SPU 1
diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c
index cb8bdbe4972f..0f0502e12f6c 100644
--- a/arch/powerpc/mm/mmap.c
+++ b/arch/powerpc/mm/mmap.c
@@ -53,21 +53,20 @@ static inline int mmap_is_legacy(void)
53 return sysctl_legacy_va_layout; 53 return sysctl_legacy_va_layout;
54} 54}
55 55
56static unsigned long mmap_rnd(void) 56unsigned long arch_mmap_rnd(void)
57{ 57{
58 unsigned long rnd = 0; 58 unsigned long rnd;
59
60 /* 8MB for 32bit, 1GB for 64bit */
61 if (is_32bit_task())
62 rnd = (unsigned long)get_random_int() % (1<<(23-PAGE_SHIFT));
63 else
64 rnd = (unsigned long)get_random_int() % (1<<(30-PAGE_SHIFT));
59 65
60 if (current->flags & PF_RANDOMIZE) {
61 /* 8MB for 32bit, 1GB for 64bit */
62 if (is_32bit_task())
63 rnd = (long)(get_random_int() % (1<<(23-PAGE_SHIFT)));
64 else
65 rnd = (long)(get_random_int() % (1<<(30-PAGE_SHIFT)));
66 }
67 return rnd << PAGE_SHIFT; 66 return rnd << PAGE_SHIFT;
68} 67}
69 68
70static inline unsigned long mmap_base(void) 69static inline unsigned long mmap_base(unsigned long rnd)
71{ 70{
72 unsigned long gap = rlimit(RLIMIT_STACK); 71 unsigned long gap = rlimit(RLIMIT_STACK);
73 72
@@ -76,7 +75,7 @@ static inline unsigned long mmap_base(void)
76 else if (gap > MAX_GAP) 75 else if (gap > MAX_GAP)
77 gap = MAX_GAP; 76 gap = MAX_GAP;
78 77
79 return PAGE_ALIGN(TASK_SIZE - gap - mmap_rnd()); 78 return PAGE_ALIGN(TASK_SIZE - gap - rnd);
80} 79}
81 80
82/* 81/*
@@ -85,6 +84,11 @@ static inline unsigned long mmap_base(void)
85 */ 84 */
86void arch_pick_mmap_layout(struct mm_struct *mm) 85void arch_pick_mmap_layout(struct mm_struct *mm)
87{ 86{
87 unsigned long random_factor = 0UL;
88
89 if (current->flags & PF_RANDOMIZE)
90 random_factor = arch_mmap_rnd();
91
88 /* 92 /*
89 * Fall back to the standard layout if the personality 93 * Fall back to the standard layout if the personality
90 * bit is set, or if the expected stack growth is unlimited: 94 * bit is set, or if the expected stack growth is unlimited:
@@ -93,7 +97,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
93 mm->mmap_base = TASK_UNMAPPED_BASE; 97 mm->mmap_base = TASK_UNMAPPED_BASE;
94 mm->get_unmapped_area = arch_get_unmapped_area; 98 mm->get_unmapped_area = arch_get_unmapped_area;
95 } else { 99 } else {
96 mm->mmap_base = mmap_base(); 100 mm->mmap_base = mmap_base(random_factor);
97 mm->get_unmapped_area = arch_get_unmapped_area_topdown; 101 mm->get_unmapped_area = arch_get_unmapped_area_topdown;
98 } 102 }
99} 103}
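
As on MIPS, the powerpc randomization is split out into arch_mmap_rnd(), which now assumes the caller has already checked PF_RANDOMIZE. A small stand-alone program working out the window sizes implied by the moduli above, assuming 4 KB pages:

    #include <stdio.h>

    #define PAGE_SHIFT 12UL   /* assumption: 4 KB pages */

    int main(void)
    {
        /* Ranges used by the new powerpc arch_mmap_rnd(): the random page
         * count is taken modulo these, then shifted back by PAGE_SHIFT. */
        unsigned long pages_32 = 1UL << (23 - PAGE_SHIFT);  /* 8 MB window */
        unsigned long pages_64 = 1UL << (30 - PAGE_SHIFT);  /* 1 GB window */

        printf("32-bit: %lu pages, %lu MB\n", pages_32,
               (pages_32 << PAGE_SHIFT) >> 20);
        printf("64-bit: %lu pages, %lu MB\n", pages_64,
               (pages_64 << PAGE_SHIFT) >> 20);
        return 0;
    }
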
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index b2d7ec1669b4..6321fd8bf813 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -65,6 +65,7 @@ config S390
65 def_bool y 65 def_bool y
66 select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE 66 select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
67 select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS 67 select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS
68 select ARCH_HAS_ELF_RANDOMIZE
68 select ARCH_HAS_GCOV_PROFILE_ALL 69 select ARCH_HAS_GCOV_PROFILE_ALL
69 select ARCH_HAS_SG_CHAIN 70 select ARCH_HAS_SG_CHAIN
70 select ARCH_HAVE_NMI_SAFE_CMPXCHG 71 select ARCH_HAVE_NMI_SAFE_CMPXCHG
@@ -156,6 +157,11 @@ config S390
156config SCHED_OMIT_FRAME_POINTER 157config SCHED_OMIT_FRAME_POINTER
157 def_bool y 158 def_bool y
158 159
160config PGTABLE_LEVELS
161 int
162 default 4 if 64BIT
163 default 2
164
159source "init/Kconfig" 165source "init/Kconfig"
160 166
161source "kernel/Kconfig.freezer" 167source "kernel/Kconfig.freezer"
diff --git a/arch/s390/include/asm/elf.h b/arch/s390/include/asm/elf.h
index c9c875d9ed31..a5c4978462c1 100644
--- a/arch/s390/include/asm/elf.h
+++ b/arch/s390/include/asm/elf.h
@@ -161,10 +161,11 @@ extern unsigned int vdso_enabled;
161/* This is the location that an ET_DYN program is loaded if exec'ed. Typical 161/* This is the location that an ET_DYN program is loaded if exec'ed. Typical
162 use of this is to invoke "./ld.so someprog" to test out a new version of 162 use of this is to invoke "./ld.so someprog" to test out a new version of
163 the loader. We need to make sure that it is out of the way of the program 163 the loader. We need to make sure that it is out of the way of the program
164 that it will "exec", and that there is sufficient room for the brk. */ 164 that it will "exec", and that there is sufficient room for the brk. 64-bit
165 165 tasks are aligned to 4GB. */
166extern unsigned long randomize_et_dyn(void); 166#define ELF_ET_DYN_BASE (is_32bit_task() ? \
167#define ELF_ET_DYN_BASE randomize_et_dyn() 167 (STACK_TOP / 3 * 2) : \
168 (STACK_TOP / 3 * 2) & ~((1UL << 32) - 1))
168 169
169/* This yields a mask that user programs can use to figure out what 170/* This yields a mask that user programs can use to figure out what
170 instruction set this CPU supports. */ 171 instruction set this CPU supports. */
@@ -225,9 +226,6 @@ struct linux_binprm;
225#define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 226#define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1
226int arch_setup_additional_pages(struct linux_binprm *, int); 227int arch_setup_additional_pages(struct linux_binprm *, int);
227 228
228extern unsigned long arch_randomize_brk(struct mm_struct *mm);
229#define arch_randomize_brk arch_randomize_brk
230
231void *fill_cpu_elf_notes(void *ptr, struct save_area *sa, __vector128 *vxrs); 229void *fill_cpu_elf_notes(void *ptr, struct save_area *sa, __vector128 *vxrs);
232 230
233#endif 231#endif
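
With randomize_et_dyn() gone, ELF_ET_DYN_BASE becomes a plain expression: two thirds of STACK_TOP, rounded down to a 4 GB boundary for 64-bit tasks, with no random offset applied in the macro itself. A user-space sketch of the alignment step, using a made-up STACK_TOP value purely for illustration:

    #include <stdio.h>

    int main(void)
    {
        /* Assumption: an illustrative 64-bit STACK_TOP value. */
        unsigned long long stack_top = 0x20000000000ULL;
        unsigned long long base = stack_top / 3 * 2;

        /* 64-bit tasks: align the ET_DYN base down to a 4 GB boundary,
         * as the new ELF_ET_DYN_BASE definition does. */
        unsigned long long aligned = base & ~((1ULL << 32) - 1);

        printf("base=%#llx aligned=%#llx\n", base, aligned);
        return 0;
    }
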
diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c
index 179a2c20b01f..bb3367c5cb0b 100644
--- a/arch/s390/mm/mmap.c
+++ b/arch/s390/mm/mmap.c
@@ -60,22 +60,20 @@ static inline int mmap_is_legacy(void)
60 return sysctl_legacy_va_layout; 60 return sysctl_legacy_va_layout;
61} 61}
62 62
63static unsigned long mmap_rnd(void) 63unsigned long arch_mmap_rnd(void)
64{ 64{
65 if (!(current->flags & PF_RANDOMIZE))
66 return 0;
67 if (is_32bit_task()) 65 if (is_32bit_task())
68 return (get_random_int() & 0x7ff) << PAGE_SHIFT; 66 return (get_random_int() & 0x7ff) << PAGE_SHIFT;
69 else 67 else
70 return (get_random_int() & mmap_rnd_mask) << PAGE_SHIFT; 68 return (get_random_int() & mmap_rnd_mask) << PAGE_SHIFT;
71} 69}
72 70
73static unsigned long mmap_base_legacy(void) 71static unsigned long mmap_base_legacy(unsigned long rnd)
74{ 72{
75 return TASK_UNMAPPED_BASE + mmap_rnd(); 73 return TASK_UNMAPPED_BASE + rnd;
76} 74}
77 75
78static inline unsigned long mmap_base(void) 76static inline unsigned long mmap_base(unsigned long rnd)
79{ 77{
80 unsigned long gap = rlimit(RLIMIT_STACK); 78 unsigned long gap = rlimit(RLIMIT_STACK);
81 79
@@ -84,7 +82,7 @@ static inline unsigned long mmap_base(void)
84 else if (gap > MAX_GAP) 82 else if (gap > MAX_GAP)
85 gap = MAX_GAP; 83 gap = MAX_GAP;
86 gap &= PAGE_MASK; 84 gap &= PAGE_MASK;
87 return STACK_TOP - stack_maxrandom_size() - mmap_rnd() - gap; 85 return STACK_TOP - stack_maxrandom_size() - rnd - gap;
88} 86}
89 87
90unsigned long 88unsigned long
@@ -179,17 +177,6 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
179 return addr; 177 return addr;
180} 178}
181 179
182unsigned long randomize_et_dyn(void)
183{
184 unsigned long base;
185
186 base = STACK_TOP / 3 * 2;
187 if (!is_32bit_task())
188 /* Align to 4GB */
189 base &= ~((1UL << 32) - 1);
190 return base + mmap_rnd();
191}
192
193#ifndef CONFIG_64BIT 180#ifndef CONFIG_64BIT
194 181
195/* 182/*
@@ -198,15 +185,20 @@ unsigned long randomize_et_dyn(void)
198 */ 185 */
199void arch_pick_mmap_layout(struct mm_struct *mm) 186void arch_pick_mmap_layout(struct mm_struct *mm)
200{ 187{
188 unsigned long random_factor = 0UL;
189
190 if (current->flags & PF_RANDOMIZE)
191 random_factor = arch_mmap_rnd();
192
201 /* 193 /*
202 * Fall back to the standard layout if the personality 194 * Fall back to the standard layout if the personality
203 * bit is set, or if the expected stack growth is unlimited: 195 * bit is set, or if the expected stack growth is unlimited:
204 */ 196 */
205 if (mmap_is_legacy()) { 197 if (mmap_is_legacy()) {
206 mm->mmap_base = mmap_base_legacy(); 198 mm->mmap_base = mmap_base_legacy(random_factor);
207 mm->get_unmapped_area = arch_get_unmapped_area; 199 mm->get_unmapped_area = arch_get_unmapped_area;
208 } else { 200 } else {
209 mm->mmap_base = mmap_base(); 201 mm->mmap_base = mmap_base(random_factor);
210 mm->get_unmapped_area = arch_get_unmapped_area_topdown; 202 mm->get_unmapped_area = arch_get_unmapped_area_topdown;
211 } 203 }
212} 204}
@@ -273,15 +265,20 @@ s390_get_unmapped_area_topdown(struct file *filp, const unsigned long addr,
273 */ 265 */
274void arch_pick_mmap_layout(struct mm_struct *mm) 266void arch_pick_mmap_layout(struct mm_struct *mm)
275{ 267{
268 unsigned long random_factor = 0UL;
269
270 if (current->flags & PF_RANDOMIZE)
271 random_factor = arch_mmap_rnd();
272
276 /* 273 /*
277 * Fall back to the standard layout if the personality 274 * Fall back to the standard layout if the personality
278 * bit is set, or if the expected stack growth is unlimited: 275 * bit is set, or if the expected stack growth is unlimited:
279 */ 276 */
280 if (mmap_is_legacy()) { 277 if (mmap_is_legacy()) {
281 mm->mmap_base = mmap_base_legacy(); 278 mm->mmap_base = mmap_base_legacy(random_factor);
282 mm->get_unmapped_area = s390_get_unmapped_area; 279 mm->get_unmapped_area = s390_get_unmapped_area;
283 } else { 280 } else {
284 mm->mmap_base = mmap_base(); 281 mm->mmap_base = mmap_base(random_factor);
285 mm->get_unmapped_area = s390_get_unmapped_area_topdown; 282 mm->get_unmapped_area = s390_get_unmapped_area_topdown;
286 } 283 }
287} 284}
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index eb4ef274ae9b..50057fed819d 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -162,6 +162,10 @@ config NEED_DMA_MAP_STATE
162config NEED_SG_DMA_LENGTH 162config NEED_SG_DMA_LENGTH
163 def_bool y 163 def_bool y
164 164
165config PGTABLE_LEVELS
166 default 3 if X2TLB
167 default 2
168
165source "init/Kconfig" 169source "init/Kconfig"
166 170
167source "kernel/Kconfig.freezer" 171source "kernel/Kconfig.freezer"
diff --git a/arch/sh/kernel/dwarf.c b/arch/sh/kernel/dwarf.c
index 67a049e75ec1..9d209a07235e 100644
--- a/arch/sh/kernel/dwarf.c
+++ b/arch/sh/kernel/dwarf.c
@@ -993,7 +993,7 @@ static struct unwinder dwarf_unwinder = {
993 .rating = 150, 993 .rating = 150,
994}; 994};
995 995
996static void dwarf_unwinder_cleanup(void) 996static void __init dwarf_unwinder_cleanup(void)
997{ 997{
998 struct dwarf_fde *fde, *next_fde; 998 struct dwarf_fde *fde, *next_fde;
999 struct dwarf_cie *cie, *next_cie; 999 struct dwarf_cie *cie, *next_cie;
@@ -1009,6 +1009,10 @@ static void dwarf_unwinder_cleanup(void)
1009 rbtree_postorder_for_each_entry_safe(cie, next_cie, &cie_root, node) 1009 rbtree_postorder_for_each_entry_safe(cie, next_cie, &cie_root, node)
1010 kfree(cie); 1010 kfree(cie);
1011 1011
1012 if (dwarf_reg_pool)
1013 mempool_destroy(dwarf_reg_pool);
1014 if (dwarf_frame_pool)
1015 mempool_destroy(dwarf_frame_pool);
1012 kmem_cache_destroy(dwarf_reg_cachep); 1016 kmem_cache_destroy(dwarf_reg_cachep);
1013 kmem_cache_destroy(dwarf_frame_cachep); 1017 kmem_cache_destroy(dwarf_frame_cachep);
1014} 1018}
@@ -1176,17 +1180,13 @@ static int __init dwarf_unwinder_init(void)
1176 sizeof(struct dwarf_reg), 0, 1180 sizeof(struct dwarf_reg), 0,
1177 SLAB_PANIC | SLAB_HWCACHE_ALIGN | SLAB_NOTRACK, NULL); 1181 SLAB_PANIC | SLAB_HWCACHE_ALIGN | SLAB_NOTRACK, NULL);
1178 1182
1179 dwarf_frame_pool = mempool_create(DWARF_FRAME_MIN_REQ, 1183 dwarf_frame_pool = mempool_create_slab_pool(DWARF_FRAME_MIN_REQ,
1180 mempool_alloc_slab, 1184 dwarf_frame_cachep);
1181 mempool_free_slab,
1182 dwarf_frame_cachep);
1183 if (!dwarf_frame_pool) 1185 if (!dwarf_frame_pool)
1184 goto out; 1186 goto out;
1185 1187
1186 dwarf_reg_pool = mempool_create(DWARF_REG_MIN_REQ, 1188 dwarf_reg_pool = mempool_create_slab_pool(DWARF_REG_MIN_REQ,
1187 mempool_alloc_slab, 1189 dwarf_reg_cachep);
1188 mempool_free_slab,
1189 dwarf_reg_cachep);
1190 if (!dwarf_reg_pool) 1190 if (!dwarf_reg_pool)
1191 goto out; 1191 goto out;
1192 1192
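
mempool_create_slab_pool() is shorthand for mempool_create() with the mempool_alloc_slab/mempool_free_slab pair, which is exactly the substitution made above. A kernel-context sketch (not a stand-alone program) of the two equivalent forms, using an illustrative cache of hypothetical struct frame objects:

    /* Kernel-context sketch: building a mempool on top of a slab cache,
     * as the dwarf unwinder does above.  The cache name and struct are
     * illustrative. */
    #include <linux/errno.h>
    #include <linux/mempool.h>
    #include <linux/slab.h>

    struct frame { unsigned long pc; };

    static struct kmem_cache *frame_cachep;
    static mempool_t *frame_pool;

    static int frame_pool_init(void)
    {
        frame_cachep = kmem_cache_create("frame", sizeof(struct frame),
                                         0, 0, NULL);
        if (!frame_cachep)
            return -ENOMEM;

        /* New shorthand used by the patch ... */
        frame_pool = mempool_create_slab_pool(4, frame_cachep);
        /* ... equivalent to the older open-coded form:
         *   mempool_create(4, mempool_alloc_slab, mempool_free_slab,
         *                  frame_cachep);
         */
        if (!frame_pool) {
            kmem_cache_destroy(frame_cachep);
            return -ENOMEM;
        }
        return 0;
    }
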
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index efb00ec75805..e49502acbab4 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -146,6 +146,10 @@ config GENERIC_ISA_DMA
146config ARCH_SUPPORTS_DEBUG_PAGEALLOC 146config ARCH_SUPPORTS_DEBUG_PAGEALLOC
147 def_bool y if SPARC64 147 def_bool y if SPARC64
148 148
149config PGTABLE_LEVELS
150 default 4 if 64BIT
151 default 3
152
149source "init/Kconfig" 153source "init/Kconfig"
150 154
151source "kernel/Kconfig.freezer" 155source "kernel/Kconfig.freezer"
diff --git a/arch/sparc/kernel/mdesc.c b/arch/sparc/kernel/mdesc.c
index 99632a87e697..26c80e18d7b1 100644
--- a/arch/sparc/kernel/mdesc.c
+++ b/arch/sparc/kernel/mdesc.c
@@ -130,26 +130,26 @@ static struct mdesc_mem_ops memblock_mdesc_ops = {
130static struct mdesc_handle *mdesc_kmalloc(unsigned int mdesc_size) 130static struct mdesc_handle *mdesc_kmalloc(unsigned int mdesc_size)
131{ 131{
132 unsigned int handle_size; 132 unsigned int handle_size;
133 struct mdesc_handle *hp;
134 unsigned long addr;
133 void *base; 135 void *base;
134 136
135 handle_size = (sizeof(struct mdesc_handle) - 137 handle_size = (sizeof(struct mdesc_handle) -
136 sizeof(struct mdesc_hdr) + 138 sizeof(struct mdesc_hdr) +
137 mdesc_size); 139 mdesc_size);
138 140
141 /*
142 * Allocation has to succeed because mdesc update would be missed
143 * and such events are not retransmitted.
144 */
139 base = kmalloc(handle_size + 15, GFP_KERNEL | __GFP_NOFAIL); 145 base = kmalloc(handle_size + 15, GFP_KERNEL | __GFP_NOFAIL);
140 if (base) { 146 addr = (unsigned long)base;
141 struct mdesc_handle *hp; 147 addr = (addr + 15UL) & ~15UL;
142 unsigned long addr; 148 hp = (struct mdesc_handle *) addr;
143
144 addr = (unsigned long)base;
145 addr = (addr + 15UL) & ~15UL;
146 hp = (struct mdesc_handle *) addr;
147 149
148 mdesc_handle_init(hp, handle_size, base); 150 mdesc_handle_init(hp, handle_size, base);
149 return hp;
150 }
151 151
152 return NULL; 152 return hp;
153} 153}
154 154
155static void mdesc_kfree(struct mdesc_handle *hp) 155static void mdesc_kfree(struct mdesc_handle *hp)
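
Because the allocation above now passes __GFP_NOFAIL, kmalloc() cannot return NULL here, so the error branch is dropped; the 15-byte over-allocation and round-up keep the handle 16-byte aligned. A stand-alone model of that alignment trick using malloc():

    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
        size_t handle_size = 100;               /* arbitrary example size */

        /* Over-allocate by 15 bytes, then round the address up to a
         * 16-byte boundary, mirroring mdesc_kmalloc() above. */
        void *base = malloc(handle_size + 15);
        unsigned long addr = (unsigned long)base;

        addr = (addr + 15UL) & ~15UL;
        printf("base=%p aligned=%#lx\n", base, addr);
        free(base);
        return 0;
    }
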
diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig
index 7cca41842a9e..0142d578b5a8 100644
--- a/arch/tile/Kconfig
+++ b/arch/tile/Kconfig
@@ -147,6 +147,11 @@ config ARCH_DEFCONFIG
147 default "arch/tile/configs/tilepro_defconfig" if !TILEGX 147 default "arch/tile/configs/tilepro_defconfig" if !TILEGX
148 default "arch/tile/configs/tilegx_defconfig" if TILEGX 148 default "arch/tile/configs/tilegx_defconfig" if TILEGX
149 149
150config PGTABLE_LEVELS
151 int
152 default 3 if 64BIT
153 default 2
154
150source "init/Kconfig" 155source "init/Kconfig"
151 156
152source "kernel/Kconfig.freezer" 157source "kernel/Kconfig.freezer"
diff --git a/arch/um/Kconfig.um b/arch/um/Kconfig.um
index a7520c90f62d..5dbfe3d9107c 100644
--- a/arch/um/Kconfig.um
+++ b/arch/um/Kconfig.um
@@ -155,3 +155,8 @@ config MMAPPER
155 155
156config NO_DMA 156config NO_DMA
157 def_bool y 157 def_bool y
158
159config PGTABLE_LEVELS
160 int
161 default 3 if 3_LEVEL_PGTABLES
162 default 2
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index faff6934c05a..d43e7e1c784b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -87,7 +87,7 @@ config X86
87 select HAVE_ARCH_KMEMCHECK 87 select HAVE_ARCH_KMEMCHECK
88 select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP 88 select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP
89 select HAVE_USER_RETURN_NOTIFIER 89 select HAVE_USER_RETURN_NOTIFIER
90 select ARCH_BINFMT_ELF_RANDOMIZE_PIE 90 select ARCH_HAS_ELF_RANDOMIZE
91 select HAVE_ARCH_JUMP_LABEL 91 select HAVE_ARCH_JUMP_LABEL
92 select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE 92 select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
93 select SPARSE_IRQ 93 select SPARSE_IRQ
@@ -99,6 +99,7 @@ config X86
99 select IRQ_FORCED_THREADING 99 select IRQ_FORCED_THREADING
100 select HAVE_BPF_JIT if X86_64 100 select HAVE_BPF_JIT if X86_64
101 select HAVE_ARCH_TRANSPARENT_HUGEPAGE 101 select HAVE_ARCH_TRANSPARENT_HUGEPAGE
102 select HAVE_ARCH_HUGE_VMAP if X86_64 || (X86_32 && X86_PAE)
102 select ARCH_HAS_SG_CHAIN 103 select ARCH_HAS_SG_CHAIN
103 select CLKEVT_I8253 104 select CLKEVT_I8253
104 select ARCH_HAVE_NMI_SAFE_CMPXCHG 105 select ARCH_HAVE_NMI_SAFE_CMPXCHG
@@ -277,6 +278,12 @@ config ARCH_SUPPORTS_UPROBES
277config FIX_EARLYCON_MEM 278config FIX_EARLYCON_MEM
278 def_bool y 279 def_bool y
279 280
281config PGTABLE_LEVELS
282 int
283 default 4 if X86_64
284 default 3 if X86_PAE
285 default 2
286
280source "init/Kconfig" 287source "init/Kconfig"
281source "kernel/Kconfig.freezer" 288source "kernel/Kconfig.freezer"
282 289
@@ -714,17 +721,6 @@ endif #HYPERVISOR_GUEST
714config NO_BOOTMEM 721config NO_BOOTMEM
715 def_bool y 722 def_bool y
716 723
717config MEMTEST
718 bool "Memtest"
719 ---help---
720 This option adds a kernel parameter 'memtest', which allows memtest
721 to be set.
722 memtest=0, mean disabled; -- default
723 memtest=1, mean do 1 test pattern;
724 ...
725 memtest=4, mean do 4 test patterns.
726 If you are unsure how to answer this question, answer N.
727
728source "arch/x86/Kconfig.cpu" 724source "arch/x86/Kconfig.cpu"
729 725
730config HPET_TIMER 726config HPET_TIMER
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index 779c2efe2e97..3ab0537872fb 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -40,14 +40,6 @@ static inline void e820_mark_nosave_regions(unsigned long limit_pfn)
40} 40}
41#endif 41#endif
42 42
43#ifdef CONFIG_MEMTEST
44extern void early_memtest(unsigned long start, unsigned long end);
45#else
46static inline void early_memtest(unsigned long start, unsigned long end)
47{
48}
49#endif
50
51extern unsigned long e820_end_of_ram_pfn(void); 43extern unsigned long e820_end_of_ram_pfn(void);
52extern unsigned long e820_end_of_low_ram_pfn(void); 44extern unsigned long e820_end_of_low_ram_pfn(void);
53extern u64 early_reserve_e820(u64 sizet, u64 align); 45extern u64 early_reserve_e820(u64 sizet, u64 align);
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 935588d95c82..f161c189c27b 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -339,9 +339,6 @@ extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
339 int uses_interp); 339 int uses_interp);
340#define compat_arch_setup_additional_pages compat_arch_setup_additional_pages 340#define compat_arch_setup_additional_pages compat_arch_setup_additional_pages
341 341
342extern unsigned long arch_randomize_brk(struct mm_struct *mm);
343#define arch_randomize_brk arch_randomize_brk
344
345/* 342/*
346 * True on X86_32 or when emulating IA32 on X86_64 343 * True on X86_32 or when emulating IA32 on X86_64
347 */ 344 */
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index f97fbe3abb67..c7c712f2648b 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -40,8 +40,10 @@
40 40
41#ifdef CONFIG_X86_64 41#ifdef CONFIG_X86_64
42#include <asm/page_64_types.h> 42#include <asm/page_64_types.h>
43#define IOREMAP_MAX_ORDER (PUD_SHIFT)
43#else 44#else
44#include <asm/page_32_types.h> 45#include <asm/page_32_types.h>
46#define IOREMAP_MAX_ORDER (PMD_SHIFT)
45#endif /* CONFIG_X86_64 */ 47#endif /* CONFIG_X86_64 */
46 48
47#ifndef __ASSEMBLY__ 49#ifndef __ASSEMBLY__
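
IOREMAP_MAX_ORDER caps how large a mapping ioremap() will attempt: PUD-sized on 64-bit, PMD-sized on 32-bit. A small stand-alone calculation of those caps, assuming the usual x86 shift values (21 for PMDs, 30 for PUDs):

    #include <stdio.h>

    /* Assumption: the usual x86 shift values (2 MB PMDs, 1 GB PUDs). */
    #define PMD_SHIFT 21
    #define PUD_SHIFT 30

    int main(void)
    {
        /* Largest mapping ioremap() may try under the new definitions:
         * PUD-sized on 64-bit, PMD-sized on 32-bit. */
        printf("64-bit cap: %lu MB\n", (1UL << PUD_SHIFT) >> 20);
        printf("32-bit cap: %lu MB\n", (1UL << PMD_SHIFT) >> 20);
        return 0;
    }
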
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 5f6051d5d139..8957810ad7d1 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -545,7 +545,7 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
545 PVOP_VCALL2(pv_mmu_ops.set_pmd, pmdp, val); 545 PVOP_VCALL2(pv_mmu_ops.set_pmd, pmdp, val);
546} 546}
547 547
548#if PAGETABLE_LEVELS >= 3 548#if CONFIG_PGTABLE_LEVELS >= 3
549static inline pmd_t __pmd(pmdval_t val) 549static inline pmd_t __pmd(pmdval_t val)
550{ 550{
551 pmdval_t ret; 551 pmdval_t ret;
@@ -585,7 +585,7 @@ static inline void set_pud(pud_t *pudp, pud_t pud)
585 PVOP_VCALL2(pv_mmu_ops.set_pud, pudp, 585 PVOP_VCALL2(pv_mmu_ops.set_pud, pudp,
586 val); 586 val);
587} 587}
588#if PAGETABLE_LEVELS == 4 588#if CONFIG_PGTABLE_LEVELS == 4
589static inline pud_t __pud(pudval_t val) 589static inline pud_t __pud(pudval_t val)
590{ 590{
591 pudval_t ret; 591 pudval_t ret;
@@ -636,9 +636,9 @@ static inline void pud_clear(pud_t *pudp)
636 set_pud(pudp, __pud(0)); 636 set_pud(pudp, __pud(0));
637} 637}
638 638
639#endif /* PAGETABLE_LEVELS == 4 */ 639#endif /* CONFIG_PGTABLE_LEVELS == 4 */
640 640
641#endif /* PAGETABLE_LEVELS >= 3 */ 641#endif /* CONFIG_PGTABLE_LEVELS >= 3 */
642 642
643#ifdef CONFIG_X86_PAE 643#ifdef CONFIG_X86_PAE
644/* Special-case pte-setting operations for PAE, which can't update a 644/* Special-case pte-setting operations for PAE, which can't update a
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 7549b8b369e4..f7b0b5c112f2 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -294,7 +294,7 @@ struct pv_mmu_ops {
294 struct paravirt_callee_save pgd_val; 294 struct paravirt_callee_save pgd_val;
295 struct paravirt_callee_save make_pgd; 295 struct paravirt_callee_save make_pgd;
296 296
297#if PAGETABLE_LEVELS >= 3 297#if CONFIG_PGTABLE_LEVELS >= 3
298#ifdef CONFIG_X86_PAE 298#ifdef CONFIG_X86_PAE
299 void (*set_pte_atomic)(pte_t *ptep, pte_t pteval); 299 void (*set_pte_atomic)(pte_t *ptep, pte_t pteval);
300 void (*pte_clear)(struct mm_struct *mm, unsigned long addr, 300 void (*pte_clear)(struct mm_struct *mm, unsigned long addr,
@@ -308,13 +308,13 @@ struct pv_mmu_ops {
308 struct paravirt_callee_save pmd_val; 308 struct paravirt_callee_save pmd_val;
309 struct paravirt_callee_save make_pmd; 309 struct paravirt_callee_save make_pmd;
310 310
311#if PAGETABLE_LEVELS == 4 311#if CONFIG_PGTABLE_LEVELS == 4
312 struct paravirt_callee_save pud_val; 312 struct paravirt_callee_save pud_val;
313 struct paravirt_callee_save make_pud; 313 struct paravirt_callee_save make_pud;
314 314
315 void (*set_pgd)(pgd_t *pudp, pgd_t pgdval); 315 void (*set_pgd)(pgd_t *pudp, pgd_t pgdval);
316#endif /* PAGETABLE_LEVELS == 4 */ 316#endif /* CONFIG_PGTABLE_LEVELS == 4 */
317#endif /* PAGETABLE_LEVELS >= 3 */ 317#endif /* CONFIG_PGTABLE_LEVELS >= 3 */
318 318
319 struct pv_lazy_ops lazy_mode; 319 struct pv_lazy_ops lazy_mode;
320 320
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index c4412e972bbd..bf7f8b55b0f9 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -77,7 +77,7 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
77 77
78#define pmd_pgtable(pmd) pmd_page(pmd) 78#define pmd_pgtable(pmd) pmd_page(pmd)
79 79
80#if PAGETABLE_LEVELS > 2 80#if CONFIG_PGTABLE_LEVELS > 2
81static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) 81static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
82{ 82{
83 struct page *page; 83 struct page *page;
@@ -116,7 +116,7 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
116} 116}
117#endif /* CONFIG_X86_PAE */ 117#endif /* CONFIG_X86_PAE */
118 118
119#if PAGETABLE_LEVELS > 3 119#if CONFIG_PGTABLE_LEVELS > 3
120static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) 120static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
121{ 121{
122 paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT); 122 paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT);
@@ -142,7 +142,7 @@ static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
142 ___pud_free_tlb(tlb, pud); 142 ___pud_free_tlb(tlb, pud);
143} 143}
144 144
145#endif /* PAGETABLE_LEVELS > 3 */ 145#endif /* CONFIG_PGTABLE_LEVELS > 3 */
146#endif /* PAGETABLE_LEVELS > 2 */ 146#endif /* CONFIG_PGTABLE_LEVELS > 2 */
147 147
148#endif /* _ASM_X86_PGALLOC_H */ 148#endif /* _ASM_X86_PGALLOC_H */
diff --git a/arch/x86/include/asm/pgtable-2level_types.h b/arch/x86/include/asm/pgtable-2level_types.h
index daacc23e3fb9..392576433e77 100644
--- a/arch/x86/include/asm/pgtable-2level_types.h
+++ b/arch/x86/include/asm/pgtable-2level_types.h
@@ -17,7 +17,6 @@ typedef union {
17#endif /* !__ASSEMBLY__ */ 17#endif /* !__ASSEMBLY__ */
18 18
19#define SHARED_KERNEL_PMD 0 19#define SHARED_KERNEL_PMD 0
20#define PAGETABLE_LEVELS 2
21 20
22/* 21/*
23 * traditional i386 two-level paging structure: 22 * traditional i386 two-level paging structure:
diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h
index 1bd5876c8649..bcc89625ebe5 100644
--- a/arch/x86/include/asm/pgtable-3level_types.h
+++ b/arch/x86/include/asm/pgtable-3level_types.h
@@ -24,8 +24,6 @@ typedef union {
24#define SHARED_KERNEL_PMD 1 24#define SHARED_KERNEL_PMD 1
25#endif 25#endif
26 26
27#define PAGETABLE_LEVELS 3
28
29/* 27/*
30 * PGDIR_SHIFT determines what a top-level page table entry can map 28 * PGDIR_SHIFT determines what a top-level page table entry can map
31 */ 29 */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index a0c35bf6cb92..fe57e7a98839 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -551,7 +551,7 @@ static inline unsigned long pages_to_mb(unsigned long npg)
551 return npg >> (20 - PAGE_SHIFT); 551 return npg >> (20 - PAGE_SHIFT);
552} 552}
553 553
554#if PAGETABLE_LEVELS > 2 554#if CONFIG_PGTABLE_LEVELS > 2
555static inline int pud_none(pud_t pud) 555static inline int pud_none(pud_t pud)
556{ 556{
557 return native_pud_val(pud) == 0; 557 return native_pud_val(pud) == 0;
@@ -594,9 +594,9 @@ static inline int pud_large(pud_t pud)
594{ 594{
595 return 0; 595 return 0;
596} 596}
597#endif /* PAGETABLE_LEVELS > 2 */ 597#endif /* CONFIG_PGTABLE_LEVELS > 2 */
598 598
599#if PAGETABLE_LEVELS > 3 599#if CONFIG_PGTABLE_LEVELS > 3
600static inline int pgd_present(pgd_t pgd) 600static inline int pgd_present(pgd_t pgd)
601{ 601{
602 return pgd_flags(pgd) & _PAGE_PRESENT; 602 return pgd_flags(pgd) & _PAGE_PRESENT;
@@ -633,7 +633,7 @@ static inline int pgd_none(pgd_t pgd)
633{ 633{
634 return !native_pgd_val(pgd); 634 return !native_pgd_val(pgd);
635} 635}
636#endif /* PAGETABLE_LEVELS > 3 */ 636#endif /* CONFIG_PGTABLE_LEVELS > 3 */
637 637
638#endif /* __ASSEMBLY__ */ 638#endif /* __ASSEMBLY__ */
639 639
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 602b6028c5b6..e6844dfb4471 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -20,7 +20,6 @@ typedef struct { pteval_t pte; } pte_t;
20#endif /* !__ASSEMBLY__ */ 20#endif /* !__ASSEMBLY__ */
21 21
22#define SHARED_KERNEL_PMD 0 22#define SHARED_KERNEL_PMD 0
23#define PAGETABLE_LEVELS 4
24 23
25/* 24/*
26 * PGDIR_SHIFT determines what a top-level page table entry can map 25 * PGDIR_SHIFT determines what a top-level page table entry can map
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 8c7c10802e9c..78f0c8cbe316 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -234,7 +234,7 @@ static inline pgdval_t pgd_flags(pgd_t pgd)
234 return native_pgd_val(pgd) & PTE_FLAGS_MASK; 234 return native_pgd_val(pgd) & PTE_FLAGS_MASK;
235} 235}
236 236
237#if PAGETABLE_LEVELS > 3 237#if CONFIG_PGTABLE_LEVELS > 3
238typedef struct { pudval_t pud; } pud_t; 238typedef struct { pudval_t pud; } pud_t;
239 239
240static inline pud_t native_make_pud(pmdval_t val) 240static inline pud_t native_make_pud(pmdval_t val)
@@ -255,7 +255,7 @@ static inline pudval_t native_pud_val(pud_t pud)
255} 255}
256#endif 256#endif
257 257
258#if PAGETABLE_LEVELS > 2 258#if CONFIG_PGTABLE_LEVELS > 2
259typedef struct { pmdval_t pmd; } pmd_t; 259typedef struct { pmdval_t pmd; } pmd_t;
260 260
261static inline pmd_t native_make_pmd(pmdval_t val) 261static inline pmd_t native_make_pmd(pmdval_t val)
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index e354cc6446ab..9435620062df 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -513,7 +513,7 @@ void __init kvm_guest_init(void)
513 * can get false positives too easily, for example if the host is 513 * can get false positives too easily, for example if the host is
514 * overcommitted. 514 * overcommitted.
515 */ 515 */
516 watchdog_enable_hardlockup_detector(false); 516 hardlockup_detector_disable();
517} 517}
518 518
519static noinline uint32_t __kvm_cpuid_base(void) 519static noinline uint32_t __kvm_cpuid_base(void)
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 548d25f00c90..c614dd492f5f 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -443,7 +443,7 @@ struct pv_mmu_ops pv_mmu_ops = {
443 .ptep_modify_prot_start = __ptep_modify_prot_start, 443 .ptep_modify_prot_start = __ptep_modify_prot_start,
444 .ptep_modify_prot_commit = __ptep_modify_prot_commit, 444 .ptep_modify_prot_commit = __ptep_modify_prot_commit,
445 445
446#if PAGETABLE_LEVELS >= 3 446#if CONFIG_PGTABLE_LEVELS >= 3
447#ifdef CONFIG_X86_PAE 447#ifdef CONFIG_X86_PAE
448 .set_pte_atomic = native_set_pte_atomic, 448 .set_pte_atomic = native_set_pte_atomic,
449 .pte_clear = native_pte_clear, 449 .pte_clear = native_pte_clear,
@@ -454,13 +454,13 @@ struct pv_mmu_ops pv_mmu_ops = {
454 .pmd_val = PTE_IDENT, 454 .pmd_val = PTE_IDENT,
455 .make_pmd = PTE_IDENT, 455 .make_pmd = PTE_IDENT,
456 456
457#if PAGETABLE_LEVELS == 4 457#if CONFIG_PGTABLE_LEVELS == 4
458 .pud_val = PTE_IDENT, 458 .pud_val = PTE_IDENT,
459 .make_pud = PTE_IDENT, 459 .make_pud = PTE_IDENT,
460 460
461 .set_pgd = native_set_pgd, 461 .set_pgd = native_set_pgd,
462#endif 462#endif
463#endif /* PAGETABLE_LEVELS >= 3 */ 463#endif /* CONFIG_PGTABLE_LEVELS >= 3 */
464 464
465 .pte_val = PTE_IDENT, 465 .pte_val = PTE_IDENT,
466 .pgd_val = PTE_IDENT, 466 .pgd_val = PTE_IDENT,
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index c4cc74006c61..a482d105172b 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -32,6 +32,4 @@ obj-$(CONFIG_AMD_NUMA) += amdtopology.o
32obj-$(CONFIG_ACPI_NUMA) += srat.o 32obj-$(CONFIG_ACPI_NUMA) += srat.o
33obj-$(CONFIG_NUMA_EMU) += numa_emulation.o 33obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
34 34
35obj-$(CONFIG_MEMTEST) += memtest.o
36
37obj-$(CONFIG_X86_INTEL_MPX) += mpx.o 35obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index fdf617c00e2f..5ead4d6cf3a7 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -67,8 +67,13 @@ static int __ioremap_check_ram(unsigned long start_pfn, unsigned long nr_pages,
67 67
68/* 68/*
69 * Remap an arbitrary physical address space into the kernel virtual 69 * Remap an arbitrary physical address space into the kernel virtual
70 * address space. Needed when the kernel wants to access high addresses 70 * address space. It transparently creates kernel huge I/O mapping when
71 * directly. 71 * the physical address is aligned by a huge page size (1GB or 2MB) and
72 * the requested size is at least the huge page size.
73 *
74 * NOTE: MTRRs can override PAT memory types with a 4KB granularity.
75 * Therefore, the mapping code falls back to use a smaller page toward 4KB
76 * when a mapping range is covered by non-WB type of MTRRs.
72 * 77 *
73 * NOTE! We need to allow non-page-aligned mappings too: we will obviously 78 * NOTE! We need to allow non-page-aligned mappings too: we will obviously
74 * have to convert them into an offset in a page-aligned mapping, but the 79 * have to convert them into an offset in a page-aligned mapping, but the
@@ -326,6 +331,20 @@ void iounmap(volatile void __iomem *addr)
326} 331}
327EXPORT_SYMBOL(iounmap); 332EXPORT_SYMBOL(iounmap);
328 333
334int arch_ioremap_pud_supported(void)
335{
336#ifdef CONFIG_X86_64
337 return cpu_has_gbpages;
338#else
339 return 0;
340#endif
341}
342
343int arch_ioremap_pmd_supported(void)
344{
345 return cpu_has_pse;
346}
347
329/* 348/*
330 * Convert a physical pointer to a virtual kernel pointer for /dev/mem 349 * Convert a physical pointer to a virtual kernel pointer for /dev/mem
331 * access 350 * access
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index df4552bd239e..9d518d693b4b 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -65,24 +65,23 @@ static int mmap_is_legacy(void)
65 return sysctl_legacy_va_layout; 65 return sysctl_legacy_va_layout;
66} 66}
67 67
68static unsigned long mmap_rnd(void) 68unsigned long arch_mmap_rnd(void)
69{ 69{
70 unsigned long rnd = 0; 70 unsigned long rnd;
71 71
72 /* 72 /*
73 * 8 bits of randomness in 32bit mmaps, 20 address space bits 73 * 8 bits of randomness in 32bit mmaps, 20 address space bits
74 * 28 bits of randomness in 64bit mmaps, 40 address space bits 74 * 28 bits of randomness in 64bit mmaps, 40 address space bits
75 */ 75 */
76 if (current->flags & PF_RANDOMIZE) { 76 if (mmap_is_ia32())
77 if (mmap_is_ia32()) 77 rnd = (unsigned long)get_random_int() % (1<<8);
78 rnd = get_random_int() % (1<<8); 78 else
79 else 79 rnd = (unsigned long)get_random_int() % (1<<28);
80 rnd = get_random_int() % (1<<28); 80
81 }
82 return rnd << PAGE_SHIFT; 81 return rnd << PAGE_SHIFT;
83} 82}
84 83
85static unsigned long mmap_base(void) 84static unsigned long mmap_base(unsigned long rnd)
86{ 85{
87 unsigned long gap = rlimit(RLIMIT_STACK); 86 unsigned long gap = rlimit(RLIMIT_STACK);
88 87
@@ -91,19 +90,19 @@ static unsigned long mmap_base(void)
91 else if (gap > MAX_GAP) 90 else if (gap > MAX_GAP)
92 gap = MAX_GAP; 91 gap = MAX_GAP;
93 92
94 return PAGE_ALIGN(TASK_SIZE - gap - mmap_rnd()); 93 return PAGE_ALIGN(TASK_SIZE - gap - rnd);
95} 94}
96 95
97/* 96/*
98 * Bottom-up (legacy) layout on X86_32 did not support randomization, X86_64 97 * Bottom-up (legacy) layout on X86_32 did not support randomization, X86_64
99 * does, but not when emulating X86_32 98 * does, but not when emulating X86_32
100 */ 99 */
101static unsigned long mmap_legacy_base(void) 100static unsigned long mmap_legacy_base(unsigned long rnd)
102{ 101{
103 if (mmap_is_ia32()) 102 if (mmap_is_ia32())
104 return TASK_UNMAPPED_BASE; 103 return TASK_UNMAPPED_BASE;
105 else 104 else
106 return TASK_UNMAPPED_BASE + mmap_rnd(); 105 return TASK_UNMAPPED_BASE + rnd;
107} 106}
108 107
109/* 108/*
@@ -112,13 +111,18 @@ static unsigned long mmap_legacy_base(void)
112 */ 111 */
113void arch_pick_mmap_layout(struct mm_struct *mm) 112void arch_pick_mmap_layout(struct mm_struct *mm)
114{ 113{
115 mm->mmap_legacy_base = mmap_legacy_base(); 114 unsigned long random_factor = 0UL;
116 mm->mmap_base = mmap_base(); 115
116 if (current->flags & PF_RANDOMIZE)
117 random_factor = arch_mmap_rnd();
118
119 mm->mmap_legacy_base = mmap_legacy_base(random_factor);
117 120
118 if (mmap_is_legacy()) { 121 if (mmap_is_legacy()) {
119 mm->mmap_base = mm->mmap_legacy_base; 122 mm->mmap_base = mm->mmap_legacy_base;
120 mm->get_unmapped_area = arch_get_unmapped_area; 123 mm->get_unmapped_area = arch_get_unmapped_area;
121 } else { 124 } else {
125 mm->mmap_base = mmap_base(random_factor);
122 mm->get_unmapped_area = arch_get_unmapped_area_topdown; 126 mm->get_unmapped_area = arch_get_unmapped_area_topdown;
123 } 127 }
124} 128}
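
The x86 version keeps its existing entropy (8 random page bits for ia32, 28 for 64-bit) but, like the other architectures above, now leaves the PF_RANDOMIZE check to arch_pick_mmap_layout(). A stand-alone calculation of the resulting address-space spans, assuming 4 KB pages:

    #include <stdio.h>

    #define PAGE_SHIFT 12   /* assumption: 4 KB pages */

    int main(void)
    {
        /* 8 bits of page-granular randomness on ia32, 28 bits on 64-bit,
         * i.e. 20 and 40 bits of address space once shifted by PAGE_SHIFT. */
        unsigned long long span32 = (1ULL << 8)  << PAGE_SHIFT;
        unsigned long long span64 = (1ULL << 28) << PAGE_SHIFT;

        printf("ia32 span: %llu KB\n", span32 >> 10);
        printf("64-bit span: %llu GB\n", span64 >> 30);
        return 0;
    }
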
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 5a7e5252c878..0b97d2c75df3 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -4,6 +4,7 @@
4#include <asm/pgtable.h> 4#include <asm/pgtable.h>
5#include <asm/tlb.h> 5#include <asm/tlb.h>
6#include <asm/fixmap.h> 6#include <asm/fixmap.h>
7#include <asm/mtrr.h>
7 8
8#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO 9#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
9 10
@@ -58,7 +59,7 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
58 tlb_remove_page(tlb, pte); 59 tlb_remove_page(tlb, pte);
59} 60}
60 61
61#if PAGETABLE_LEVELS > 2 62#if CONFIG_PGTABLE_LEVELS > 2
62void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) 63void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
63{ 64{
64 struct page *page = virt_to_page(pmd); 65 struct page *page = virt_to_page(pmd);
@@ -74,14 +75,14 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
74 tlb_remove_page(tlb, page); 75 tlb_remove_page(tlb, page);
75} 76}
76 77
77#if PAGETABLE_LEVELS > 3 78#if CONFIG_PGTABLE_LEVELS > 3
78void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) 79void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
79{ 80{
80 paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); 81 paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
81 tlb_remove_page(tlb, virt_to_page(pud)); 82 tlb_remove_page(tlb, virt_to_page(pud));
82} 83}
83#endif /* PAGETABLE_LEVELS > 3 */ 84#endif /* CONFIG_PGTABLE_LEVELS > 3 */
84#endif /* PAGETABLE_LEVELS > 2 */ 85#endif /* CONFIG_PGTABLE_LEVELS > 2 */
85 86
86static inline void pgd_list_add(pgd_t *pgd) 87static inline void pgd_list_add(pgd_t *pgd)
87{ 88{
@@ -117,9 +118,9 @@ static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
117 /* If the pgd points to a shared pagetable level (either the 118 /* If the pgd points to a shared pagetable level (either the
118 ptes in non-PAE, or shared PMD in PAE), then just copy the 119 ptes in non-PAE, or shared PMD in PAE), then just copy the
119 references from swapper_pg_dir. */ 120 references from swapper_pg_dir. */
120 if (PAGETABLE_LEVELS == 2 || 121 if (CONFIG_PGTABLE_LEVELS == 2 ||
121 (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD) || 122 (CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
122 PAGETABLE_LEVELS == 4) { 123 CONFIG_PGTABLE_LEVELS == 4) {
123 clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, 124 clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
124 swapper_pg_dir + KERNEL_PGD_BOUNDARY, 125 swapper_pg_dir + KERNEL_PGD_BOUNDARY,
125 KERNEL_PGD_PTRS); 126 KERNEL_PGD_PTRS);
@@ -560,3 +561,67 @@ void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys,
560{ 561{
561 __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags)); 562 __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags));
562} 563}
564
565#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
566int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
567{
568 u8 mtrr;
569
570 /*
571 * Do not use a huge page when the range is covered by non-WB type
572 * of MTRRs.
573 */
574 mtrr = mtrr_type_lookup(addr, addr + PUD_SIZE);
575 if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != 0xFF))
576 return 0;
577
578 prot = pgprot_4k_2_large(prot);
579
580 set_pte((pte_t *)pud, pfn_pte(
581 (u64)addr >> PAGE_SHIFT,
582 __pgprot(pgprot_val(prot) | _PAGE_PSE)));
583
584 return 1;
585}
586
587int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
588{
589 u8 mtrr;
590
591 /*
592 * Do not use a huge page when the range is covered by non-WB type
593 * of MTRRs.
594 */
595 mtrr = mtrr_type_lookup(addr, addr + PMD_SIZE);
596 if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != 0xFF))
597 return 0;
598
599 prot = pgprot_4k_2_large(prot);
600
601 set_pte((pte_t *)pmd, pfn_pte(
602 (u64)addr >> PAGE_SHIFT,
603 __pgprot(pgprot_val(prot) | _PAGE_PSE)));
604
605 return 1;
606}
607
608int pud_clear_huge(pud_t *pud)
609{
610 if (pud_large(*pud)) {
611 pud_clear(pud);
612 return 1;
613 }
614
615 return 0;
616}
617
618int pmd_clear_huge(pmd_t *pmd)
619{
620 if (pmd_large(*pmd)) {
621 pmd_clear(pmd);
622 return 1;
623 }
624
625 return 0;
626}
627#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
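
pud_set_huge() and pmd_set_huge() refuse to create a huge mapping unless the MTRR lookup for the whole range reports write-back or 0xFF, as the checks above show. A user-space model of that guard (MTRR_TYPE_WRBACK is the architectural value 6; the MTRR_TYPE_INVALID name below is only illustrative):

    #include <stdio.h>

    #define MTRR_TYPE_WRBACK   6     /* x86 write-back memory type */
    #define MTRR_TYPE_INVALID  0xFF  /* illustrative name for the 0xFF result */

    /* Model of the guard in pud_set_huge()/pmd_set_huge(): only map huge
     * when the range is write-back, or when the lookup returns 0xFF. */
    static int may_map_huge(unsigned char mtrr)
    {
        return mtrr == MTRR_TYPE_WRBACK || mtrr == MTRR_TYPE_INVALID;
    }

    int main(void)
    {
        printf("write-back range -> %d\n", may_map_huge(MTRR_TYPE_WRBACK));
        printf("uncached range   -> %d\n", may_map_huge(0));
        printf("0xFF lookup      -> %d\n", may_map_huge(MTRR_TYPE_INVALID));
        return 0;
    }
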
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index adca9e2b6553..65083ad63b6f 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -502,7 +502,7 @@ __visible pmd_t xen_make_pmd(pmdval_t pmd)
502} 502}
503PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd); 503PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
504 504
505#if PAGETABLE_LEVELS == 4 505#if CONFIG_PGTABLE_LEVELS == 4
506__visible pudval_t xen_pud_val(pud_t pud) 506__visible pudval_t xen_pud_val(pud_t pud)
507{ 507{
508 return pte_mfn_to_pfn(pud.pud); 508 return pte_mfn_to_pfn(pud.pud);
@@ -589,7 +589,7 @@ static void xen_set_pgd(pgd_t *ptr, pgd_t val)
589 589
590 xen_mc_issue(PARAVIRT_LAZY_MMU); 590 xen_mc_issue(PARAVIRT_LAZY_MMU);
591} 591}
592#endif /* PAGETABLE_LEVELS == 4 */ 592#endif /* CONFIG_PGTABLE_LEVELS == 4 */
593 593
594/* 594/*
595 * (Yet another) pagetable walker. This one is intended for pinning a 595 * (Yet another) pagetable walker. This one is intended for pinning a
@@ -1628,7 +1628,7 @@ static void xen_release_pmd(unsigned long pfn)
1628 xen_release_ptpage(pfn, PT_PMD); 1628 xen_release_ptpage(pfn, PT_PMD);
1629} 1629}
1630 1630
1631#if PAGETABLE_LEVELS == 4 1631#if CONFIG_PGTABLE_LEVELS == 4
1632static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn) 1632static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
1633{ 1633{
1634 xen_alloc_ptpage(mm, pfn, PT_PUD); 1634 xen_alloc_ptpage(mm, pfn, PT_PUD);
@@ -2046,7 +2046,7 @@ static void __init xen_post_allocator_init(void)
2046 pv_mmu_ops.set_pte = xen_set_pte; 2046 pv_mmu_ops.set_pte = xen_set_pte;
2047 pv_mmu_ops.set_pmd = xen_set_pmd; 2047 pv_mmu_ops.set_pmd = xen_set_pmd;
2048 pv_mmu_ops.set_pud = xen_set_pud; 2048 pv_mmu_ops.set_pud = xen_set_pud;
2049#if PAGETABLE_LEVELS == 4 2049#if CONFIG_PGTABLE_LEVELS == 4
2050 pv_mmu_ops.set_pgd = xen_set_pgd; 2050 pv_mmu_ops.set_pgd = xen_set_pgd;
2051#endif 2051#endif
2052 2052
@@ -2056,7 +2056,7 @@ static void __init xen_post_allocator_init(void)
2056 pv_mmu_ops.alloc_pmd = xen_alloc_pmd; 2056 pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
2057 pv_mmu_ops.release_pte = xen_release_pte; 2057 pv_mmu_ops.release_pte = xen_release_pte;
2058 pv_mmu_ops.release_pmd = xen_release_pmd; 2058 pv_mmu_ops.release_pmd = xen_release_pmd;
2059#if PAGETABLE_LEVELS == 4 2059#if CONFIG_PGTABLE_LEVELS == 4
2060 pv_mmu_ops.alloc_pud = xen_alloc_pud; 2060 pv_mmu_ops.alloc_pud = xen_alloc_pud;
2061 pv_mmu_ops.release_pud = xen_release_pud; 2061 pv_mmu_ops.release_pud = xen_release_pud;
2062#endif 2062#endif
@@ -2122,14 +2122,14 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
2122 .make_pmd = PV_CALLEE_SAVE(xen_make_pmd), 2122 .make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
2123 .pmd_val = PV_CALLEE_SAVE(xen_pmd_val), 2123 .pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
2124 2124
2125#if PAGETABLE_LEVELS == 4 2125#if CONFIG_PGTABLE_LEVELS == 4
2126 .pud_val = PV_CALLEE_SAVE(xen_pud_val), 2126 .pud_val = PV_CALLEE_SAVE(xen_pud_val),
2127 .make_pud = PV_CALLEE_SAVE(xen_make_pud), 2127 .make_pud = PV_CALLEE_SAVE(xen_make_pud),
2128 .set_pgd = xen_set_pgd_hyper, 2128 .set_pgd = xen_set_pgd_hyper,
2129 2129
2130 .alloc_pud = xen_alloc_pmd_init, 2130 .alloc_pud = xen_alloc_pmd_init,
2131 .release_pud = xen_release_pmd_init, 2131 .release_pud = xen_release_pmd_init,
2132#endif /* PAGETABLE_LEVELS == 4 */ 2132#endif /* CONFIG_PGTABLE_LEVELS == 4 */
2133 2133
2134 .activate_mm = xen_activate_mm, 2134 .activate_mm = xen_activate_mm,
2135 .dup_mmap = xen_dup_mmap, 2135 .dup_mmap = xen_dup_mmap,
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index af9c911cd6b5..2804aed3f416 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -219,6 +219,7 @@ static bool pages_correctly_reserved(unsigned long start_pfn)
219/* 219/*
220 * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is 220 * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
221 * OK to have direct references to sparsemem variables in here. 221 * OK to have direct references to sparsemem variables in here.
222 * Must already be protected by mem_hotplug_begin().
222 */ 223 */
223static int 224static int
224memory_block_action(unsigned long phys_index, unsigned long action, int online_type) 225memory_block_action(unsigned long phys_index, unsigned long action, int online_type)
@@ -228,7 +229,7 @@ memory_block_action(unsigned long phys_index, unsigned long action, int online_t
228 struct page *first_page; 229 struct page *first_page;
229 int ret; 230 int ret;
230 231
231 start_pfn = phys_index << PFN_SECTION_SHIFT; 232 start_pfn = section_nr_to_pfn(phys_index);
232 first_page = pfn_to_page(start_pfn); 233 first_page = pfn_to_page(start_pfn);
233 234
234 switch (action) { 235 switch (action) {
@@ -286,6 +287,7 @@ static int memory_subsys_online(struct device *dev)
286 if (mem->online_type < 0) 287 if (mem->online_type < 0)
287 mem->online_type = MMOP_ONLINE_KEEP; 288 mem->online_type = MMOP_ONLINE_KEEP;
288 289
290 /* Already under protection of mem_hotplug_begin() */
289 ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE); 291 ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
290 292
291 /* clear online_type */ 293 /* clear online_type */
@@ -328,17 +330,19 @@ store_mem_state(struct device *dev,
328 goto err; 330 goto err;
329 } 331 }
330 332
333 /*
334 * Memory hotplug needs to hold mem_hotplug_begin() for probe to find
335 * the correct memory block to online before doing device_online(dev),
336 * which will take dev->mutex. Take the lock early to prevent an
337 * inversion, memory_subsys_online() callbacks will be implemented by
338 * assuming it's already protected.
339 */
340 mem_hotplug_begin();
341
331 switch (online_type) { 342 switch (online_type) {
332 case MMOP_ONLINE_KERNEL: 343 case MMOP_ONLINE_KERNEL:
333 case MMOP_ONLINE_MOVABLE: 344 case MMOP_ONLINE_MOVABLE:
334 case MMOP_ONLINE_KEEP: 345 case MMOP_ONLINE_KEEP:
335 /*
336 * mem->online_type is not protected so there can be a
337 * race here. However, when racing online, the first
338 * will succeed and the second will just return as the
339 * block will already be online. The online type
340 * could be either one, but that is expected.
341 */
342 mem->online_type = online_type; 346 mem->online_type = online_type;
343 ret = device_online(&mem->dev); 347 ret = device_online(&mem->dev);
344 break; 348 break;
@@ -349,6 +353,7 @@ store_mem_state(struct device *dev,
349 ret = -EINVAL; /* should never happen */ 353 ret = -EINVAL; /* should never happen */
350 } 354 }
351 355
356 mem_hotplug_done();
352err: 357err:
353 unlock_device_hotplug(); 358 unlock_device_hotplug();
354 359
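
For reference, the memory.c hunks above move the hotplug lock out to the sysfs store path so it is taken before device_online() acquires dev->mutex. A condensed sketch of the ordering store_mem_state() now establishes (error handling and the offline cases elided; to_memory_block() assumed from the same file):

    mem_hotplug_begin();                    /* hotplug lock first, before dev->mutex */

    mem->online_type = online_type;         /* now covered by the hotplug lock */
    ret = device_online(&mem->dev);         /* takes dev->mutex internally */

    mem_hotplug_done();
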
diff --git a/drivers/s390/scsi/zfcp_erp.c b/drivers/s390/scsi/zfcp_erp.c
index 2c5d4567d1da..acde3f5d6e9e 100644
--- a/drivers/s390/scsi/zfcp_erp.c
+++ b/drivers/s390/scsi/zfcp_erp.c
@@ -738,11 +738,11 @@ static int zfcp_erp_adapter_strategy_open_fsf(struct zfcp_erp_action *act)
738 return ZFCP_ERP_FAILED; 738 return ZFCP_ERP_FAILED;
739 739
740 if (mempool_resize(act->adapter->pool.sr_data, 740 if (mempool_resize(act->adapter->pool.sr_data,
741 act->adapter->stat_read_buf_num, GFP_KERNEL)) 741 act->adapter->stat_read_buf_num))
742 return ZFCP_ERP_FAILED; 742 return ZFCP_ERP_FAILED;
743 743
744 if (mempool_resize(act->adapter->pool.status_read_req, 744 if (mempool_resize(act->adapter->pool.status_read_req,
745 act->adapter->stat_read_buf_num, GFP_KERNEL)) 745 act->adapter->stat_read_buf_num))
746 return ZFCP_ERP_FAILED; 746 return ZFCP_ERP_FAILED;
747 747
748 atomic_set(&act->adapter->stat_miss, act->adapter->stat_read_buf_num); 748 atomic_set(&act->adapter->stat_miss, act->adapter->stat_read_buf_num);
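
The zfcp calls above (and the cifs ones further down) drop the gfp_t argument because mempool_resize() no longer accepts one; resizing is now always a sleeping GFP_KERNEL operation. A minimal caller-side sketch of the two-argument form, with pool and new_min_nr as placeholders:

    #include <linux/mempool.h>

    /* may sleep: mempool_resize() no longer takes a gfp_t */
    static int grow_pool(mempool_t *pool, int new_min_nr)
    {
            return mempool_resize(pool, new_min_nr);
    }
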
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h b/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h
index a260e99a4447..d72605864b0a 100644
--- a/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h
@@ -55,7 +55,9 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
55 if (PagePrivate(page)) 55 if (PagePrivate(page))
56 page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE); 56 page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
57 57
58 cancel_dirty_page(page, PAGE_SIZE); 58 if (TestClearPageDirty(page))
59 account_page_cleaned(page, mapping);
60
59 ClearPageMappedToDisk(page); 61 ClearPageMappedToDisk(page);
60 ll_delete_from_page_cache(page); 62 ll_delete_from_page_cache(page);
61} 63}
diff --git a/drivers/xen/tmem.c b/drivers/xen/tmem.c
index 8a65423bc696..c4211a31612d 100644
--- a/drivers/xen/tmem.c
+++ b/drivers/xen/tmem.c
@@ -397,13 +397,15 @@ static int __init xen_tmem_init(void)
397#ifdef CONFIG_CLEANCACHE 397#ifdef CONFIG_CLEANCACHE
398 BUG_ON(sizeof(struct cleancache_filekey) != sizeof(struct tmem_oid)); 398 BUG_ON(sizeof(struct cleancache_filekey) != sizeof(struct tmem_oid));
399 if (tmem_enabled && cleancache) { 399 if (tmem_enabled && cleancache) {
400 char *s = ""; 400 int err;
401 struct cleancache_ops *old_ops = 401
402 cleancache_register_ops(&tmem_cleancache_ops); 402 err = cleancache_register_ops(&tmem_cleancache_ops);
403 if (old_ops) 403 if (err)
404 s = " (WARNING: cleancache_ops overridden)"; 404 pr_warn("xen-tmem: failed to enable cleancache: %d\n",
405 pr_info("cleancache enabled, RAM provided by Xen Transcendent Memory%s\n", 405 err);
406 s); 406 else
407 pr_info("cleancache enabled, RAM provided by "
408 "Xen Transcendent Memory\n");
407 } 409 }
408#endif 410#endif
409#ifdef CONFIG_XEN_SELFBALLOONING 411#ifdef CONFIG_XEN_SELFBALLOONING
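
The tmem hunk reflects cleancache_register_ops() now returning an errno instead of the previously registered ops (the prototype change is in the include/linux/cleancache.h hunk further down). A minimal registration sketch; my_cleancache_ops and the init function name are placeholders:

    #include <linux/cleancache.h>
    #include <linux/printk.h>

    static struct cleancache_ops my_cleancache_ops;     /* callbacks omitted */

    static int __init my_backend_init(void)
    {
            int err = cleancache_register_ops(&my_cleancache_ops);

            if (err)
                    pr_warn("cleancache backend not enabled: %d\n", err);
            return err;
    }
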
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 270c48148f79..2d0cbbd14cfc 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -27,9 +27,6 @@ config COMPAT_BINFMT_ELF
27 bool 27 bool
28 depends on COMPAT && BINFMT_ELF 28 depends on COMPAT && BINFMT_ELF
29 29
30config ARCH_BINFMT_ELF_RANDOMIZE_PIE
31 bool
32
33config ARCH_BINFMT_ELF_STATE 30config ARCH_BINFMT_ELF_STATE
34 bool 31 bool
35 32
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 995986b8e36b..241ef68d2893 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -31,6 +31,7 @@
31#include <linux/security.h> 31#include <linux/security.h>
32#include <linux/random.h> 32#include <linux/random.h>
33#include <linux/elf.h> 33#include <linux/elf.h>
34#include <linux/elf-randomize.h>
34#include <linux/utsname.h> 35#include <linux/utsname.h>
35#include <linux/coredump.h> 36#include <linux/coredump.h>
36#include <linux/sched.h> 37#include <linux/sched.h>
@@ -862,6 +863,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
862 i < loc->elf_ex.e_phnum; i++, elf_ppnt++) { 863 i < loc->elf_ex.e_phnum; i++, elf_ppnt++) {
863 int elf_prot = 0, elf_flags; 864 int elf_prot = 0, elf_flags;
864 unsigned long k, vaddr; 865 unsigned long k, vaddr;
866 unsigned long total_size = 0;
865 867
866 if (elf_ppnt->p_type != PT_LOAD) 868 if (elf_ppnt->p_type != PT_LOAD)
867 continue; 869 continue;
@@ -909,25 +911,20 @@ static int load_elf_binary(struct linux_binprm *bprm)
909 * default mmap base, as well as whatever program they 911 * default mmap base, as well as whatever program they
910 * might try to exec. This is because the brk will 912 * might try to exec. This is because the brk will
911 * follow the loader, and is not movable. */ 913 * follow the loader, and is not movable. */
912#ifdef CONFIG_ARCH_BINFMT_ELF_RANDOMIZE_PIE 914 load_bias = ELF_ET_DYN_BASE - vaddr;
913 /* Memory randomization might have been switched off
914 * in runtime via sysctl or explicit setting of
915 * personality flags.
916 * If that is the case, retain the original non-zero
917 * load_bias value in order to establish proper
918 * non-randomized mappings.
919 */
920 if (current->flags & PF_RANDOMIZE) 915 if (current->flags & PF_RANDOMIZE)
921 load_bias = 0; 916 load_bias += arch_mmap_rnd();
922 else 917 load_bias = ELF_PAGESTART(load_bias);
923 load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr); 918 total_size = total_mapping_size(elf_phdata,
924#else 919 loc->elf_ex.e_phnum);
925 load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr); 920 if (!total_size) {
926#endif 921 error = -EINVAL;
922 goto out_free_dentry;
923 }
927 } 924 }
928 925
929 error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt, 926 error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt,
930 elf_prot, elf_flags, 0); 927 elf_prot, elf_flags, total_size);
931 if (BAD_ADDR(error)) { 928 if (BAD_ADDR(error)) {
932 retval = IS_ERR((void *)error) ? 929 retval = IS_ERR((void *)error) ?
933 PTR_ERR((void*)error) : -EINVAL; 930 PTR_ERR((void*)error) : -EINVAL;
@@ -1053,15 +1050,13 @@ static int load_elf_binary(struct linux_binprm *bprm)
1053 current->mm->end_data = end_data; 1050 current->mm->end_data = end_data;
1054 current->mm->start_stack = bprm->p; 1051 current->mm->start_stack = bprm->p;
1055 1052
1056#ifdef arch_randomize_brk
1057 if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1)) { 1053 if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1)) {
1058 current->mm->brk = current->mm->start_brk = 1054 current->mm->brk = current->mm->start_brk =
1059 arch_randomize_brk(current->mm); 1055 arch_randomize_brk(current->mm);
1060#ifdef CONFIG_COMPAT_BRK 1056#ifdef compat_brk_randomized
1061 current->brk_randomized = 1; 1057 current->brk_randomized = 1;
1062#endif 1058#endif
1063 } 1059 }
1064#endif
1065 1060
1066 if (current->personality & MMAP_PAGE_ZERO) { 1061 if (current->personality & MMAP_PAGE_ZERO) {
1067 /* Why this, you ask??? Well SVr4 maps page 0 as read-only, 1062 /* Why this, you ask??? Well SVr4 maps page 0 as read-only,
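
The PT_LOAD hunk above is the core of the split between ET_DYN ASLR and mmap ASLR: the PIE/interpreter base starts at ELF_ET_DYN_BASE, gains entropy only from the single arch_mmap_rnd() helper, and the whole program header range is reserved in one mapping. Condensed sketch of that first-segment path (surrounding loop and error handling elided):

    load_bias = ELF_ET_DYN_BASE - vaddr;
    if (current->flags & PF_RANDOMIZE)
            load_bias += arch_mmap_rnd();   /* single, arch-provided entropy source */
    load_bias = ELF_PAGESTART(load_bias);

    total_size = total_mapping_size(elf_phdata, loc->elf_ex.e_phnum);
    error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt,
                    elf_prot, elf_flags, total_size);
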
diff --git a/fs/buffer.c b/fs/buffer.c
index 20805db2c987..c7a5602d01ee 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3243,8 +3243,8 @@ int try_to_free_buffers(struct page *page)
3243 * to synchronise against __set_page_dirty_buffers and prevent the 3243 * to synchronise against __set_page_dirty_buffers and prevent the
3244 * dirty bit from being lost. 3244 * dirty bit from being lost.
3245 */ 3245 */
3246 if (ret) 3246 if (ret && TestClearPageDirty(page))
3247 cancel_dirty_page(page, PAGE_CACHE_SIZE); 3247 account_page_cleaned(page, mapping);
3248 spin_unlock(&mapping->private_lock); 3248 spin_unlock(&mapping->private_lock);
3249out: 3249out:
3250 if (buffers_to_free) { 3250 if (buffers_to_free) {
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 480cf9c81d50..f3bfe08e177b 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -773,8 +773,7 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server)
773 773
774 length = atomic_dec_return(&tcpSesAllocCount); 774 length = atomic_dec_return(&tcpSesAllocCount);
775 if (length > 0) 775 if (length > 0)
776 mempool_resize(cifs_req_poolp, length + cifs_min_rcv, 776 mempool_resize(cifs_req_poolp, length + cifs_min_rcv);
777 GFP_KERNEL);
778} 777}
779 778
780static int 779static int
@@ -848,8 +847,7 @@ cifs_demultiplex_thread(void *p)
848 847
849 length = atomic_inc_return(&tcpSesAllocCount); 848 length = atomic_inc_return(&tcpSesAllocCount);
850 if (length > 1) 849 if (length > 1)
851 mempool_resize(cifs_req_poolp, length + cifs_min_rcv, 850 mempool_resize(cifs_req_poolp, length + cifs_min_rcv);
852 GFP_KERNEL);
853 851
854 set_freezable(); 852 set_freezable();
855 while (server->tcpStatus != CifsExiting) { 853 while (server->tcpStatus != CifsExiting) {
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index c274aca8e8dc..db76cec3ce21 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -319,7 +319,7 @@ static int hugetlbfs_write_end(struct file *file, struct address_space *mapping,
319 319
320static void truncate_huge_page(struct page *page) 320static void truncate_huge_page(struct page *page)
321{ 321{
322 cancel_dirty_page(page, /* No IO accounting for huge pages? */0); 322 ClearPageDirty(page);
323 ClearPageUptodate(page); 323 ClearPageUptodate(page);
324 delete_from_page_cache(page); 324 delete_from_page_cache(page);
325} 325}
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 849ed784d6ac..759931088094 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1876,11 +1876,6 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)
1876 * request from the inode / page_private pointer and 1876 * request from the inode / page_private pointer and
1877 * release it */ 1877 * release it */
1878 nfs_inode_remove_request(req); 1878 nfs_inode_remove_request(req);
1879 /*
1880 * In case nfs_inode_remove_request has marked the
1881 * page as being dirty
1882 */
1883 cancel_dirty_page(page, PAGE_CACHE_SIZE);
1884 nfs_unlock_and_release_request(req); 1879 nfs_unlock_and_release_request(req);
1885 } 1880 }
1886 1881
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 044158bd22be..2d7f76e52c37 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -3370,7 +3370,7 @@ static int ocfs2_merge_rec_right(struct ocfs2_path *left_path,
3370 ret = ocfs2_get_right_path(et, left_path, &right_path); 3370 ret = ocfs2_get_right_path(et, left_path, &right_path);
3371 if (ret) { 3371 if (ret) {
3372 mlog_errno(ret); 3372 mlog_errno(ret);
3373 goto out; 3373 return ret;
3374 } 3374 }
3375 3375
3376 right_el = path_leaf_el(right_path); 3376 right_el = path_leaf_el(right_path);
@@ -3453,8 +3453,7 @@ static int ocfs2_merge_rec_right(struct ocfs2_path *left_path,
3453 subtree_index); 3453 subtree_index);
3454 } 3454 }
3455out: 3455out:
3456 if (right_path) 3456 ocfs2_free_path(right_path);
3457 ocfs2_free_path(right_path);
3458 return ret; 3457 return ret;
3459} 3458}
3460 3459
@@ -3536,7 +3535,7 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
3536 ret = ocfs2_get_left_path(et, right_path, &left_path); 3535 ret = ocfs2_get_left_path(et, right_path, &left_path);
3537 if (ret) { 3536 if (ret) {
3538 mlog_errno(ret); 3537 mlog_errno(ret);
3539 goto out; 3538 return ret;
3540 } 3539 }
3541 3540
3542 left_el = path_leaf_el(left_path); 3541 left_el = path_leaf_el(left_path);
@@ -3647,8 +3646,7 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
3647 right_path, subtree_index); 3646 right_path, subtree_index);
3648 } 3647 }
3649out: 3648out:
3650 if (left_path) 3649 ocfs2_free_path(left_path);
3651 ocfs2_free_path(left_path);
3652 return ret; 3650 return ret;
3653} 3651}
3654 3652
@@ -4334,17 +4332,17 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
4334 } else if (path->p_tree_depth > 0) { 4332 } else if (path->p_tree_depth > 0) {
4335 status = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos); 4333 status = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos);
4336 if (status) 4334 if (status)
4337 goto out; 4335 goto exit;
4338 4336
4339 if (left_cpos != 0) { 4337 if (left_cpos != 0) {
4340 left_path = ocfs2_new_path_from_path(path); 4338 left_path = ocfs2_new_path_from_path(path);
4341 if (!left_path) 4339 if (!left_path)
4342 goto out; 4340 goto exit;
4343 4341
4344 status = ocfs2_find_path(et->et_ci, left_path, 4342 status = ocfs2_find_path(et->et_ci, left_path,
4345 left_cpos); 4343 left_cpos);
4346 if (status) 4344 if (status)
4347 goto out; 4345 goto free_left_path;
4348 4346
4349 new_el = path_leaf_el(left_path); 4347 new_el = path_leaf_el(left_path);
4350 4348
@@ -4361,7 +4359,7 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
4361 le16_to_cpu(new_el->l_next_free_rec), 4359 le16_to_cpu(new_el->l_next_free_rec),
4362 le16_to_cpu(new_el->l_count)); 4360 le16_to_cpu(new_el->l_count));
4363 status = -EINVAL; 4361 status = -EINVAL;
4364 goto out; 4362 goto free_left_path;
4365 } 4363 }
4366 rec = &new_el->l_recs[ 4364 rec = &new_el->l_recs[
4367 le16_to_cpu(new_el->l_next_free_rec) - 1]; 4365 le16_to_cpu(new_el->l_next_free_rec) - 1];
@@ -4388,18 +4386,18 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
4388 path->p_tree_depth > 0) { 4386 path->p_tree_depth > 0) {
4389 status = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos); 4387 status = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos);
4390 if (status) 4388 if (status)
4391 goto out; 4389 goto free_left_path;
4392 4390
4393 if (right_cpos == 0) 4391 if (right_cpos == 0)
4394 goto out; 4392 goto free_left_path;
4395 4393
4396 right_path = ocfs2_new_path_from_path(path); 4394 right_path = ocfs2_new_path_from_path(path);
4397 if (!right_path) 4395 if (!right_path)
4398 goto out; 4396 goto free_left_path;
4399 4397
4400 status = ocfs2_find_path(et->et_ci, right_path, right_cpos); 4398 status = ocfs2_find_path(et->et_ci, right_path, right_cpos);
4401 if (status) 4399 if (status)
4402 goto out; 4400 goto free_right_path;
4403 4401
4404 new_el = path_leaf_el(right_path); 4402 new_el = path_leaf_el(right_path);
4405 rec = &new_el->l_recs[0]; 4403 rec = &new_el->l_recs[0];
@@ -4413,7 +4411,7 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
4413 (unsigned long long)le64_to_cpu(eb->h_blkno), 4411 (unsigned long long)le64_to_cpu(eb->h_blkno),
4414 le16_to_cpu(new_el->l_next_free_rec)); 4412 le16_to_cpu(new_el->l_next_free_rec));
4415 status = -EINVAL; 4413 status = -EINVAL;
4416 goto out; 4414 goto free_right_path;
4417 } 4415 }
4418 rec = &new_el->l_recs[1]; 4416 rec = &new_el->l_recs[1];
4419 } 4417 }
@@ -4430,12 +4428,11 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
4430 ret = contig_type; 4428 ret = contig_type;
4431 } 4429 }
4432 4430
4433out: 4431free_right_path:
4434 if (left_path) 4432 ocfs2_free_path(right_path);
4435 ocfs2_free_path(left_path); 4433free_left_path:
4436 if (right_path) 4434 ocfs2_free_path(left_path);
4437 ocfs2_free_path(right_path); 4435exit:
4438
4439 return ret; 4436 return ret;
4440} 4437}
4441 4438
@@ -6858,13 +6855,13 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
6858 if (pages == NULL) { 6855 if (pages == NULL) {
6859 ret = -ENOMEM; 6856 ret = -ENOMEM;
6860 mlog_errno(ret); 6857 mlog_errno(ret);
6861 goto out; 6858 return ret;
6862 } 6859 }
6863 6860
6864 ret = ocfs2_reserve_clusters(osb, 1, &data_ac); 6861 ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
6865 if (ret) { 6862 if (ret) {
6866 mlog_errno(ret); 6863 mlog_errno(ret);
6867 goto out; 6864 goto free_pages;
6868 } 6865 }
6869 } 6866 }
6870 6867
@@ -6996,9 +6993,8 @@ out_commit:
6996out: 6993out:
6997 if (data_ac) 6994 if (data_ac)
6998 ocfs2_free_alloc_context(data_ac); 6995 ocfs2_free_alloc_context(data_ac);
6999 if (pages) 6996free_pages:
7000 kfree(pages); 6997 kfree(pages);
7001
7002 return ret; 6998 return ret;
7003} 6999}
7004 7000
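
The alloc.c error paths above are reshaped into the usual "unwind in reverse order" goto ladder, relying on the free helper tolerating a NULL argument so the if checks can go. A stripped-down shape of the idiom; every helper below is hypothetical:

    extern void *acquire_left(void);
    extern void *acquire_right(void);
    extern int use_paths(void *l, void *r);
    extern void release(void *p);           /* must tolerate NULL, like ocfs2_free_path() */

    static int build_paths(void)
    {
            void *left, *right;
            int status;

            left = acquire_left();
            if (!left)
                    return -ENOMEM;
            right = acquire_right();
            if (!right) {
                    status = -ENOMEM;
                    goto free_left_path;
            }

            status = use_paths(left, right);

            release(right);                 /* unwind in reverse acquisition order */
    free_left_path:
            release(left);
            return status;
    }
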
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index e1bf18c5d25e..8d2bc840c288 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -664,6 +664,117 @@ static int ocfs2_is_overwrite(struct ocfs2_super *osb,
664 return 0; 664 return 0;
665} 665}
666 666
667static int ocfs2_direct_IO_zero_extend(struct ocfs2_super *osb,
668 struct inode *inode, loff_t offset,
669 u64 zero_len, int cluster_align)
670{
671 u32 p_cpos = 0;
672 u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, i_size_read(inode));
673 unsigned int num_clusters = 0;
674 unsigned int ext_flags = 0;
675 int ret = 0;
676
677 if (offset <= i_size_read(inode) || cluster_align)
678 return 0;
679
680 ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters,
681 &ext_flags);
682 if (ret < 0) {
683 mlog_errno(ret);
684 return ret;
685 }
686
687 if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
688 u64 s = i_size_read(inode);
689 sector_t sector = (p_cpos << (osb->s_clustersize_bits - 9)) +
690 (do_div(s, osb->s_clustersize) >> 9);
691
692 ret = blkdev_issue_zeroout(osb->sb->s_bdev, sector,
693 zero_len >> 9, GFP_NOFS, false);
694 if (ret < 0)
695 mlog_errno(ret);
696 }
697
698 return ret;
699}
700
701static int ocfs2_direct_IO_extend_no_holes(struct ocfs2_super *osb,
702 struct inode *inode, loff_t offset)
703{
704 u64 zero_start, zero_len, total_zero_len;
705 u32 p_cpos = 0, clusters_to_add;
706 u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, i_size_read(inode));
707 unsigned int num_clusters = 0;
708 unsigned int ext_flags = 0;
709 u32 size_div, offset_div;
710 int ret = 0;
711
712 {
713 u64 o = offset;
714 u64 s = i_size_read(inode);
715
716 offset_div = do_div(o, osb->s_clustersize);
717 size_div = do_div(s, osb->s_clustersize);
718 }
719
720 if (offset <= i_size_read(inode))
721 return 0;
722
723 clusters_to_add = ocfs2_bytes_to_clusters(inode->i_sb, offset) -
724 ocfs2_bytes_to_clusters(inode->i_sb, i_size_read(inode));
725 total_zero_len = offset - i_size_read(inode);
726 if (clusters_to_add)
727 total_zero_len -= offset_div;
728
729 /* Allocate clusters to fill out holes, and this is only needed
730 * when we add more than one clusters. Otherwise the cluster will
731 * be allocated during direct IO */
732 if (clusters_to_add > 1) {
733 ret = ocfs2_extend_allocation(inode,
734 OCFS2_I(inode)->ip_clusters,
735 clusters_to_add - 1, 0);
736 if (ret) {
737 mlog_errno(ret);
738 goto out;
739 }
740 }
741
742 while (total_zero_len) {
743 ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters,
744 &ext_flags);
745 if (ret < 0) {
746 mlog_errno(ret);
747 goto out;
748 }
749
750 zero_start = ocfs2_clusters_to_bytes(osb->sb, p_cpos) +
751 size_div;
752 zero_len = ocfs2_clusters_to_bytes(osb->sb, num_clusters) -
753 size_div;
754 zero_len = min(total_zero_len, zero_len);
755
756 if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
757 ret = blkdev_issue_zeroout(osb->sb->s_bdev,
758 zero_start >> 9, zero_len >> 9,
759 GFP_NOFS, false);
760 if (ret < 0) {
761 mlog_errno(ret);
762 goto out;
763 }
764 }
765
766 total_zero_len -= zero_len;
767 v_cpos += ocfs2_bytes_to_clusters(osb->sb, zero_len + size_div);
768
769 /* Only at first iteration can be cluster not aligned.
770 * So set size_div to 0 for the rest */
771 size_div = 0;
772 }
773
774out:
775 return ret;
776}
777
667static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, 778static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
668 struct iov_iter *iter, 779 struct iov_iter *iter,
669 loff_t offset) 780 loff_t offset)
@@ -678,8 +789,8 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
678 struct buffer_head *di_bh = NULL; 789 struct buffer_head *di_bh = NULL;
679 size_t count = iter->count; 790 size_t count = iter->count;
680 journal_t *journal = osb->journal->j_journal; 791 journal_t *journal = osb->journal->j_journal;
681 u32 zero_len; 792 u64 zero_len_head, zero_len_tail;
682 int cluster_align; 793 int cluster_align_head, cluster_align_tail;
683 loff_t final_size = offset + count; 794 loff_t final_size = offset + count;
684 int append_write = offset >= i_size_read(inode) ? 1 : 0; 795 int append_write = offset >= i_size_read(inode) ? 1 : 0;
685 unsigned int num_clusters = 0; 796 unsigned int num_clusters = 0;
@@ -687,9 +798,16 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
687 798
688 { 799 {
689 u64 o = offset; 800 u64 o = offset;
801 u64 s = i_size_read(inode);
802
803 zero_len_head = do_div(o, 1 << osb->s_clustersize_bits);
804 cluster_align_head = !zero_len_head;
690 805
691 zero_len = do_div(o, 1 << osb->s_clustersize_bits); 806 zero_len_tail = osb->s_clustersize -
692 cluster_align = !zero_len; 807 do_div(s, osb->s_clustersize);
808 if ((offset - i_size_read(inode)) < zero_len_tail)
809 zero_len_tail = offset - i_size_read(inode);
810 cluster_align_tail = !zero_len_tail;
693 } 811 }
694 812
695 /* 813 /*
@@ -707,21 +825,23 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
707 } 825 }
708 826
709 if (append_write) { 827 if (append_write) {
710 ret = ocfs2_inode_lock(inode, &di_bh, 1); 828 ret = ocfs2_inode_lock(inode, NULL, 1);
711 if (ret < 0) { 829 if (ret < 0) {
712 mlog_errno(ret); 830 mlog_errno(ret);
713 goto clean_orphan; 831 goto clean_orphan;
714 } 832 }
715 833
834 /* zeroing out the previously allocated cluster tail
835 * that but not zeroed */
716 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) 836 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
717 ret = ocfs2_zero_extend(inode, di_bh, offset); 837 ret = ocfs2_direct_IO_zero_extend(osb, inode, offset,
838 zero_len_tail, cluster_align_tail);
718 else 839 else
719 ret = ocfs2_extend_no_holes(inode, di_bh, offset, 840 ret = ocfs2_direct_IO_extend_no_holes(osb, inode,
720 offset); 841 offset);
721 if (ret < 0) { 842 if (ret < 0) {
722 mlog_errno(ret); 843 mlog_errno(ret);
723 ocfs2_inode_unlock(inode, 1); 844 ocfs2_inode_unlock(inode, 1);
724 brelse(di_bh);
725 goto clean_orphan; 845 goto clean_orphan;
726 } 846 }
727 847
@@ -729,13 +849,10 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
729 if (is_overwrite < 0) { 849 if (is_overwrite < 0) {
730 mlog_errno(is_overwrite); 850 mlog_errno(is_overwrite);
731 ocfs2_inode_unlock(inode, 1); 851 ocfs2_inode_unlock(inode, 1);
732 brelse(di_bh);
733 goto clean_orphan; 852 goto clean_orphan;
734 } 853 }
735 854
736 ocfs2_inode_unlock(inode, 1); 855 ocfs2_inode_unlock(inode, 1);
737 brelse(di_bh);
738 di_bh = NULL;
739 } 856 }
740 857
741 written = __blockdev_direct_IO(WRITE, iocb, inode, inode->i_sb->s_bdev, 858 written = __blockdev_direct_IO(WRITE, iocb, inode, inode->i_sb->s_bdev,
@@ -772,15 +889,23 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
772 if (ret < 0) 889 if (ret < 0)
773 mlog_errno(ret); 890 mlog_errno(ret);
774 } 891 }
775 } else if (written < 0 && append_write && !is_overwrite && 892 } else if (written > 0 && append_write && !is_overwrite &&
776 !cluster_align) { 893 !cluster_align_head) {
894 /* zeroing out the allocated cluster head */
777 u32 p_cpos = 0; 895 u32 p_cpos = 0;
778 u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset); 896 u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset);
779 897
898 ret = ocfs2_inode_lock(inode, NULL, 0);
899 if (ret < 0) {
900 mlog_errno(ret);
901 goto clean_orphan;
902 }
903
780 ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, 904 ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos,
781 &num_clusters, &ext_flags); 905 &num_clusters, &ext_flags);
782 if (ret < 0) { 906 if (ret < 0) {
783 mlog_errno(ret); 907 mlog_errno(ret);
908 ocfs2_inode_unlock(inode, 0);
784 goto clean_orphan; 909 goto clean_orphan;
785 } 910 }
786 911
@@ -788,9 +913,11 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
788 913
789 ret = blkdev_issue_zeroout(osb->sb->s_bdev, 914 ret = blkdev_issue_zeroout(osb->sb->s_bdev,
790 p_cpos << (osb->s_clustersize_bits - 9), 915 p_cpos << (osb->s_clustersize_bits - 9),
791 zero_len >> 9, GFP_KERNEL, false); 916 zero_len_head >> 9, GFP_NOFS, false);
792 if (ret < 0) 917 if (ret < 0)
793 mlog_errno(ret); 918 mlog_errno(ret);
919
920 ocfs2_inode_unlock(inode, 0);
794 } 921 }
795 922
796clean_orphan: 923clean_orphan:
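
The new ocfs2_direct_IO_zero_extend()/ocfs2_direct_IO_extend_no_holes() helpers above lean on do_div() in a way that is easy to misread: the macro divides the u64 operand in place and returns the remainder. A two-line reminder of those semantics (the wrapper name is only an illustration):

    #include <linux/types.h>
    #include <asm/div64.h>

    /* after the call, byte_offset holds the quotient (cluster index);
     * the return value is the remainder (offset within the cluster) */
    static inline u32 offset_in_cluster(u64 byte_offset, unsigned int clustersize)
    {
            return do_div(byte_offset, clustersize);
    }
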
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 16eff45727ee..8e19b9d7aba8 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1312,7 +1312,9 @@ static int o2hb_debug_init(void)
1312 int ret = -ENOMEM; 1312 int ret = -ENOMEM;
1313 1313
1314 o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL); 1314 o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL);
1315 if (!o2hb_debug_dir) { 1315 if (IS_ERR_OR_NULL(o2hb_debug_dir)) {
1316 ret = o2hb_debug_dir ?
1317 PTR_ERR(o2hb_debug_dir) : -ENOMEM;
1316 mlog_errno(ret); 1318 mlog_errno(ret);
1317 goto bail; 1319 goto bail;
1318 } 1320 }
@@ -1325,7 +1327,9 @@ static int o2hb_debug_init(void)
1325 sizeof(o2hb_live_node_bitmap), 1327 sizeof(o2hb_live_node_bitmap),
1326 O2NM_MAX_NODES, 1328 O2NM_MAX_NODES,
1327 o2hb_live_node_bitmap); 1329 o2hb_live_node_bitmap);
1328 if (!o2hb_debug_livenodes) { 1330 if (IS_ERR_OR_NULL(o2hb_debug_livenodes)) {
1331 ret = o2hb_debug_livenodes ?
1332 PTR_ERR(o2hb_debug_livenodes) : -ENOMEM;
1329 mlog_errno(ret); 1333 mlog_errno(ret);
1330 goto bail; 1334 goto bail;
1331 } 1335 }
@@ -1338,7 +1342,9 @@ static int o2hb_debug_init(void)
1338 sizeof(o2hb_live_region_bitmap), 1342 sizeof(o2hb_live_region_bitmap),
1339 O2NM_MAX_REGIONS, 1343 O2NM_MAX_REGIONS,
1340 o2hb_live_region_bitmap); 1344 o2hb_live_region_bitmap);
1341 if (!o2hb_debug_liveregions) { 1345 if (IS_ERR_OR_NULL(o2hb_debug_liveregions)) {
1346 ret = o2hb_debug_liveregions ?
1347 PTR_ERR(o2hb_debug_liveregions) : -ENOMEM;
1342 mlog_errno(ret); 1348 mlog_errno(ret);
1343 goto bail; 1349 goto bail;
1344 } 1350 }
@@ -1352,7 +1358,9 @@ static int o2hb_debug_init(void)
1352 sizeof(o2hb_quorum_region_bitmap), 1358 sizeof(o2hb_quorum_region_bitmap),
1353 O2NM_MAX_REGIONS, 1359 O2NM_MAX_REGIONS,
1354 o2hb_quorum_region_bitmap); 1360 o2hb_quorum_region_bitmap);
1355 if (!o2hb_debug_quorumregions) { 1361 if (IS_ERR_OR_NULL(o2hb_debug_quorumregions)) {
1362 ret = o2hb_debug_quorumregions ?
1363 PTR_ERR(o2hb_debug_quorumregions) : -ENOMEM;
1356 mlog_errno(ret); 1364 mlog_errno(ret);
1357 goto bail; 1365 goto bail;
1358 } 1366 }
@@ -1366,7 +1374,9 @@ static int o2hb_debug_init(void)
1366 sizeof(o2hb_failed_region_bitmap), 1374 sizeof(o2hb_failed_region_bitmap),
1367 O2NM_MAX_REGIONS, 1375 O2NM_MAX_REGIONS,
1368 o2hb_failed_region_bitmap); 1376 o2hb_failed_region_bitmap);
1369 if (!o2hb_debug_failedregions) { 1377 if (IS_ERR_OR_NULL(o2hb_debug_failedregions)) {
1378 ret = o2hb_debug_failedregions ?
1379 PTR_ERR(o2hb_debug_failedregions) : -ENOMEM;
1370 mlog_errno(ret); 1380 mlog_errno(ret);
1371 goto bail; 1381 goto bail;
1372 } 1382 }
@@ -2000,7 +2010,8 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir)
2000 2010
2001 reg->hr_debug_dir = 2011 reg->hr_debug_dir =
2002 debugfs_create_dir(config_item_name(&reg->hr_item), dir); 2012 debugfs_create_dir(config_item_name(&reg->hr_item), dir);
2003 if (!reg->hr_debug_dir) { 2013 if (IS_ERR_OR_NULL(reg->hr_debug_dir)) {
2014 ret = reg->hr_debug_dir ? PTR_ERR(reg->hr_debug_dir) : -ENOMEM;
2004 mlog_errno(ret); 2015 mlog_errno(ret);
2005 goto bail; 2016 goto bail;
2006 } 2017 }
@@ -2013,7 +2024,9 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir)
2013 O2HB_DB_TYPE_REGION_LIVENODES, 2024 O2HB_DB_TYPE_REGION_LIVENODES,
2014 sizeof(reg->hr_live_node_bitmap), 2025 sizeof(reg->hr_live_node_bitmap),
2015 O2NM_MAX_NODES, reg); 2026 O2NM_MAX_NODES, reg);
2016 if (!reg->hr_debug_livenodes) { 2027 if (IS_ERR_OR_NULL(reg->hr_debug_livenodes)) {
2028 ret = reg->hr_debug_livenodes ?
2029 PTR_ERR(reg->hr_debug_livenodes) : -ENOMEM;
2017 mlog_errno(ret); 2030 mlog_errno(ret);
2018 goto bail; 2031 goto bail;
2019 } 2032 }
@@ -2025,7 +2038,9 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir)
2025 sizeof(*(reg->hr_db_regnum)), 2038 sizeof(*(reg->hr_db_regnum)),
2026 O2HB_DB_TYPE_REGION_NUMBER, 2039 O2HB_DB_TYPE_REGION_NUMBER,
2027 0, O2NM_MAX_NODES, reg); 2040 0, O2NM_MAX_NODES, reg);
2028 if (!reg->hr_debug_regnum) { 2041 if (IS_ERR_OR_NULL(reg->hr_debug_regnum)) {
2042 ret = reg->hr_debug_regnum ?
2043 PTR_ERR(reg->hr_debug_regnum) : -ENOMEM;
2029 mlog_errno(ret); 2044 mlog_errno(ret);
2030 goto bail; 2045 goto bail;
2031 } 2046 }
@@ -2037,7 +2052,9 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir)
2037 sizeof(*(reg->hr_db_elapsed_time)), 2052 sizeof(*(reg->hr_db_elapsed_time)),
2038 O2HB_DB_TYPE_REGION_ELAPSED_TIME, 2053 O2HB_DB_TYPE_REGION_ELAPSED_TIME,
2039 0, 0, reg); 2054 0, 0, reg);
2040 if (!reg->hr_debug_elapsed_time) { 2055 if (IS_ERR_OR_NULL(reg->hr_debug_elapsed_time)) {
2056 ret = reg->hr_debug_elapsed_time ?
2057 PTR_ERR(reg->hr_debug_elapsed_time) : -ENOMEM;
2041 mlog_errno(ret); 2058 mlog_errno(ret);
2042 goto bail; 2059 goto bail;
2043 } 2060 }
@@ -2049,13 +2066,16 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir)
2049 sizeof(*(reg->hr_db_pinned)), 2066 sizeof(*(reg->hr_db_pinned)),
2050 O2HB_DB_TYPE_REGION_PINNED, 2067 O2HB_DB_TYPE_REGION_PINNED,
2051 0, 0, reg); 2068 0, 0, reg);
2052 if (!reg->hr_debug_pinned) { 2069 if (IS_ERR_OR_NULL(reg->hr_debug_pinned)) {
2070 ret = reg->hr_debug_pinned ?
2071 PTR_ERR(reg->hr_debug_pinned) : -ENOMEM;
2053 mlog_errno(ret); 2072 mlog_errno(ret);
2054 goto bail; 2073 goto bail;
2055 } 2074 }
2056 2075
2057 ret = 0; 2076 return 0;
2058bail: 2077bail:
2078 debugfs_remove_recursive(reg->hr_debug_dir);
2059 return ret; 2079 return ret;
2060} 2080}
2061 2081
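
The heartbeat hunks all switch to the same check because the debugfs creation helpers can report failure either as NULL or as an ERR_PTR (for example when debugfs is not configured), so both cases get folded into one errno. The pattern reduced to a single call; the directory name is arbitrary:

    #include <linux/debugfs.h>
    #include <linux/err.h>

    static int create_debug_dir(struct dentry **out)
    {
            struct dentry *dir = debugfs_create_dir("example", NULL);

            if (IS_ERR_OR_NULL(dir))
                    return dir ? PTR_ERR(dir) : -ENOMEM;
            *out = dir;
            return 0;
    }
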
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 2260fb9e6508..7fdc25a4d8c0 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -196,13 +196,14 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits;
196 } \ 196 } \
197} while (0) 197} while (0)
198 198
199#define mlog_errno(st) do { \ 199#define mlog_errno(st) ({ \
200 int _st = (st); \ 200 int _st = (st); \
201 if (_st != -ERESTARTSYS && _st != -EINTR && \ 201 if (_st != -ERESTARTSYS && _st != -EINTR && \
202 _st != AOP_TRUNCATED_PAGE && _st != -ENOSPC && \ 202 _st != AOP_TRUNCATED_PAGE && _st != -ENOSPC && \
203 _st != -EDQUOT) \ 203 _st != -EDQUOT) \
204 mlog(ML_ERROR, "status = %lld\n", (long long)_st); \ 204 mlog(ML_ERROR, "status = %lld\n", (long long)_st); \
205} while (0) 205 _st; \
206})
206 207
207#define mlog_bug_on_msg(cond, fmt, args...) do { \ 208#define mlog_bug_on_msg(cond, fmt, args...) do { \
208 if (cond) { \ 209 if (cond) { \
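
Turning mlog_errno() into a GNU statement expression that yields _st lets callers log and propagate in one step, which several of the ocfs2 hunks below rely on. A hypothetical call site (assuming cluster/masklog.h is included):

    static int check_status(int status)
    {
            if (status)
                    return mlog_errno(status);  /* logs (subject to the filter) and evaluates to status */
            return 0;
    }
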
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index b08050bd3f2e..ccd4dcfc3645 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -18,7 +18,7 @@
18 * 18 *
19 * linux/fs/minix/dir.c 19 * linux/fs/minix/dir.c
20 * 20 *
21 * Copyright (C) 1991, 1992 Linux Torvalds 21 * Copyright (C) 1991, 1992 Linus Torvalds
22 * 22 *
23 * This program is free software; you can redistribute it and/or 23 * This program is free software; you can redistribute it and/or
24 * modify it under the terms of the GNU General Public 24 * modify it under the terms of the GNU General Public
@@ -2047,22 +2047,19 @@ int ocfs2_check_dir_for_entry(struct inode *dir,
2047 const char *name, 2047 const char *name,
2048 int namelen) 2048 int namelen)
2049{ 2049{
2050 int ret; 2050 int ret = 0;
2051 struct ocfs2_dir_lookup_result lookup = { NULL, }; 2051 struct ocfs2_dir_lookup_result lookup = { NULL, };
2052 2052
2053 trace_ocfs2_check_dir_for_entry( 2053 trace_ocfs2_check_dir_for_entry(
2054 (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name); 2054 (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name);
2055 2055
2056 ret = -EEXIST; 2056 if (ocfs2_find_entry(name, namelen, dir, &lookup) == 0) {
2057 if (ocfs2_find_entry(name, namelen, dir, &lookup) == 0) 2057 ret = -EEXIST;
2058 goto bail; 2058 mlog_errno(ret);
2059 }
2059 2060
2060 ret = 0;
2061bail:
2062 ocfs2_free_dir_lookup_result(&lookup); 2061 ocfs2_free_dir_lookup_result(&lookup);
2063 2062
2064 if (ret)
2065 mlog_errno(ret);
2066 return ret; 2063 return ret;
2067} 2064}
2068 2065
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 11849a44dc5a..956edf67be20 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1391,6 +1391,11 @@ static int __ocfs2_cluster_lock(struct ocfs2_super *osb,
1391 int noqueue_attempted = 0; 1391 int noqueue_attempted = 0;
1392 int dlm_locked = 0; 1392 int dlm_locked = 0;
1393 1393
1394 if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED)) {
1395 mlog_errno(-EINVAL);
1396 return -EINVAL;
1397 }
1398
1394 ocfs2_init_mask_waiter(&mw); 1399 ocfs2_init_mask_waiter(&mw);
1395 1400
1396 if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) 1401 if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB)
@@ -2954,7 +2959,7 @@ static int ocfs2_dlm_init_debug(struct ocfs2_super *osb)
2954 osb->osb_debug_root, 2959 osb->osb_debug_root,
2955 osb, 2960 osb,
2956 &ocfs2_dlm_debug_fops); 2961 &ocfs2_dlm_debug_fops);
2957 if (!dlm_debug->d_locking_state) { 2962 if (IS_ERR_OR_NULL(dlm_debug->d_locking_state)) {
2958 ret = -EINVAL; 2963 ret = -EINVAL;
2959 mlog(ML_ERROR, 2964 mlog(ML_ERROR,
2960 "Unable to create locking state debugfs file.\n"); 2965 "Unable to create locking state debugfs file.\n");
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 29651167190d..540dc4bdd042 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -82,7 +82,6 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb,
82 } 82 }
83 83
84 status = ocfs2_test_inode_bit(osb, blkno, &set); 84 status = ocfs2_test_inode_bit(osb, blkno, &set);
85 trace_ocfs2_get_dentry_test_bit(status, set);
86 if (status < 0) { 85 if (status < 0) {
87 if (status == -EINVAL) { 86 if (status == -EINVAL) {
88 /* 87 /*
@@ -96,6 +95,7 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb,
96 goto unlock_nfs_sync; 95 goto unlock_nfs_sync;
97 } 96 }
98 97
98 trace_ocfs2_get_dentry_test_bit(status, set);
99 /* If the inode allocator bit is clear, this inode must be stale */ 99 /* If the inode allocator bit is clear, this inode must be stale */
100 if (!set) { 100 if (!set) {
101 status = -ESTALE; 101 status = -ESTALE;
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 3025c0da6b8a..be71ca0937f7 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -624,7 +624,7 @@ static int ocfs2_remove_inode(struct inode *inode,
624 ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE, 624 ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE,
625 le16_to_cpu(di->i_suballoc_slot)); 625 le16_to_cpu(di->i_suballoc_slot));
626 if (!inode_alloc_inode) { 626 if (!inode_alloc_inode) {
627 status = -EEXIST; 627 status = -ENOENT;
628 mlog_errno(status); 628 mlog_errno(status);
629 goto bail; 629 goto bail;
630 } 630 }
@@ -742,7 +742,7 @@ static int ocfs2_wipe_inode(struct inode *inode,
742 ORPHAN_DIR_SYSTEM_INODE, 742 ORPHAN_DIR_SYSTEM_INODE,
743 orphaned_slot); 743 orphaned_slot);
744 if (!orphan_dir_inode) { 744 if (!orphan_dir_inode) {
745 status = -EEXIST; 745 status = -ENOENT;
746 mlog_errno(status); 746 mlog_errno(status);
747 goto bail; 747 goto bail;
748 } 748 }
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 044013455621..857bbbcd39f3 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -666,7 +666,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
666 if (le32_to_cpu(alloc->id1.bitmap1.i_used) != 666 if (le32_to_cpu(alloc->id1.bitmap1.i_used) !=
667 ocfs2_local_alloc_count_bits(alloc)) { 667 ocfs2_local_alloc_count_bits(alloc)) {
668 ocfs2_error(osb->sb, "local alloc inode %llu says it has " 668 ocfs2_error(osb->sb, "local alloc inode %llu says it has "
669 "%u free bits, but a count shows %u", 669 "%u used bits, but a count shows %u",
670 (unsigned long long)le64_to_cpu(alloc->i_blkno), 670 (unsigned long long)le64_to_cpu(alloc->i_blkno),
671 le32_to_cpu(alloc->id1.bitmap1.i_used), 671 le32_to_cpu(alloc->id1.bitmap1.i_used),
672 ocfs2_local_alloc_count_bits(alloc)); 672 ocfs2_local_alloc_count_bits(alloc));
@@ -839,7 +839,7 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
839 u32 *numbits, 839 u32 *numbits,
840 struct ocfs2_alloc_reservation *resv) 840 struct ocfs2_alloc_reservation *resv)
841{ 841{
842 int numfound, bitoff, left, startoff, lastzero; 842 int numfound = 0, bitoff, left, startoff, lastzero;
843 int local_resv = 0; 843 int local_resv = 0;
844 struct ocfs2_alloc_reservation r; 844 struct ocfs2_alloc_reservation r;
845 void *bitmap = NULL; 845 void *bitmap = NULL;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index b5c3a5ea3ee6..09f90cbf0e24 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -2322,10 +2322,10 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
2322 2322
2323 trace_ocfs2_orphan_del( 2323 trace_ocfs2_orphan_del(
2324 (unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno, 2324 (unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno,
2325 name, namelen); 2325 name, strlen(name));
2326 2326
2327 /* find it's spot in the orphan directory */ 2327 /* find it's spot in the orphan directory */
2328 status = ocfs2_find_entry(name, namelen, orphan_dir_inode, 2328 status = ocfs2_find_entry(name, strlen(name), orphan_dir_inode,
2329 &lookup); 2329 &lookup);
2330 if (status) { 2330 if (status) {
2331 mlog_errno(status); 2331 mlog_errno(status);
@@ -2808,7 +2808,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
2808 ORPHAN_DIR_SYSTEM_INODE, 2808 ORPHAN_DIR_SYSTEM_INODE,
2809 osb->slot_num); 2809 osb->slot_num);
2810 if (!orphan_dir_inode) { 2810 if (!orphan_dir_inode) {
2811 status = -EEXIST; 2811 status = -ENOENT;
2812 mlog_errno(status); 2812 mlog_errno(status);
2813 goto leave; 2813 goto leave;
2814 } 2814 }
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index ee541f92dab4..df3a500789c7 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4276,7 +4276,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
4276 error = posix_acl_create(dir, &mode, &default_acl, &acl); 4276 error = posix_acl_create(dir, &mode, &default_acl, &acl);
4277 if (error) { 4277 if (error) {
4278 mlog_errno(error); 4278 mlog_errno(error);
4279 goto out; 4279 return error;
4280 } 4280 }
4281 4281
4282 error = ocfs2_create_inode_in_orphan(dir, mode, 4282 error = ocfs2_create_inode_in_orphan(dir, mode,
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index d5493e361a38..e78a203d44c8 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -427,7 +427,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb)
427 if (!si) { 427 if (!si) {
428 status = -ENOMEM; 428 status = -ENOMEM;
429 mlog_errno(status); 429 mlog_errno(status);
430 goto bail; 430 return status;
431 } 431 }
432 432
433 si->si_extended = ocfs2_uses_extended_slot_map(osb); 433 si->si_extended = ocfs2_uses_extended_slot_map(osb);
@@ -452,7 +452,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb)
452 452
453 osb->slot_info = (struct ocfs2_slot_info *)si; 453 osb->slot_info = (struct ocfs2_slot_info *)si;
454bail: 454bail:
455 if (status < 0 && si) 455 if (status < 0)
456 __ocfs2_free_slot_info(si); 456 __ocfs2_free_slot_info(si);
457 457
458 return status; 458 return status;
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index 1724d43d3da1..220cae7bbdbc 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -295,7 +295,7 @@ static int o2cb_cluster_check(void)
295 set_bit(node_num, netmap); 295 set_bit(node_num, netmap);
296 if (!memcmp(hbmap, netmap, sizeof(hbmap))) 296 if (!memcmp(hbmap, netmap, sizeof(hbmap)))
297 return 0; 297 return 0;
298 if (i < O2CB_MAP_STABILIZE_COUNT) 298 if (i < O2CB_MAP_STABILIZE_COUNT - 1)
299 msleep(1000); 299 msleep(1000);
300 } 300 }
301 301
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 720aa389e0ea..2768eb1da2b8 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -1004,10 +1004,8 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
1004 BUG_ON(conn == NULL); 1004 BUG_ON(conn == NULL);
1005 1005
1006 lc = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL); 1006 lc = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL);
1007 if (!lc) { 1007 if (!lc)
1008 rc = -ENOMEM; 1008 return -ENOMEM;
1009 goto out;
1010 }
1011 1009
1012 init_waitqueue_head(&lc->oc_wait); 1010 init_waitqueue_head(&lc->oc_wait);
1013 init_completion(&lc->oc_sync_wait); 1011 init_completion(&lc->oc_sync_wait);
@@ -1063,7 +1061,7 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
1063 } 1061 }
1064 1062
1065out: 1063out:
1066 if (rc && lc) 1064 if (rc)
1067 kfree(lc); 1065 kfree(lc);
1068 return rc; 1066 return rc;
1069} 1067}
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 0cb889a17ae1..4479029630bb 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -2499,6 +2499,8 @@ static int _ocfs2_free_suballoc_bits(handle_t *handle,
2499 alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE); 2499 alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE);
2500 if (status < 0) { 2500 if (status < 0) {
2501 mlog_errno(status); 2501 mlog_errno(status);
2502 ocfs2_block_group_set_bits(handle, alloc_inode, group, group_bh,
2503 start_bit, count);
2502 goto bail; 2504 goto bail;
2503 } 2505 }
2504 2506
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 26675185b886..837ddce4b659 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1112,7 +1112,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
1112 1112
1113 osb->osb_debug_root = debugfs_create_dir(osb->uuid_str, 1113 osb->osb_debug_root = debugfs_create_dir(osb->uuid_str,
1114 ocfs2_debugfs_root); 1114 ocfs2_debugfs_root);
1115 if (!osb->osb_debug_root) { 1115 if (IS_ERR_OR_NULL(osb->osb_debug_root)) {
1116 status = -EINVAL; 1116 status = -EINVAL;
1117 mlog(ML_ERROR, "Unable to create per-mount debugfs root.\n"); 1117 mlog(ML_ERROR, "Unable to create per-mount debugfs root.\n");
1118 goto read_super_error; 1118 goto read_super_error;
@@ -1122,7 +1122,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
1122 osb->osb_debug_root, 1122 osb->osb_debug_root,
1123 osb, 1123 osb,
1124 &ocfs2_osb_debug_fops); 1124 &ocfs2_osb_debug_fops);
1125 if (!osb->osb_ctxt) { 1125 if (IS_ERR_OR_NULL(osb->osb_ctxt)) {
1126 status = -EINVAL; 1126 status = -EINVAL;
1127 mlog_errno(status); 1127 mlog_errno(status);
1128 goto read_super_error; 1128 goto read_super_error;
@@ -1606,8 +1606,9 @@ static int __init ocfs2_init(void)
1606 } 1606 }
1607 1607
1608 ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL); 1608 ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL);
1609 if (!ocfs2_debugfs_root) { 1609 if (IS_ERR_OR_NULL(ocfs2_debugfs_root)) {
1610 status = -ENOMEM; 1610 status = ocfs2_debugfs_root ?
1611 PTR_ERR(ocfs2_debugfs_root) : -ENOMEM;
1611 mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); 1612 mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
1612 goto out4; 1613 goto out4;
1613 } 1614 }
@@ -2069,6 +2070,8 @@ static int ocfs2_initialize_super(struct super_block *sb,
2069 cbits = le32_to_cpu(di->id2.i_super.s_clustersize_bits); 2070 cbits = le32_to_cpu(di->id2.i_super.s_clustersize_bits);
2070 bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits); 2071 bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits);
2071 sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits); 2072 sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits);
2073 memcpy(sb->s_uuid, di->id2.i_super.s_uuid,
2074 sizeof(di->id2.i_super.s_uuid));
2072 2075
2073 osb->osb_dx_mask = (1 << (cbits - bbits)) - 1; 2076 osb->osb_dx_mask = (1 << (cbits - bbits)) - 1;
2074 2077
@@ -2333,7 +2336,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
2333 mlog_errno(status); 2336 mlog_errno(status);
2334 goto bail; 2337 goto bail;
2335 } 2338 }
2336 cleancache_init_shared_fs((char *)&di->id2.i_super.s_uuid, sb); 2339 cleancache_init_shared_fs(sb);
2337 2340
2338bail: 2341bail:
2339 return status; 2342 return status;
@@ -2563,22 +2566,22 @@ static void ocfs2_handle_error(struct super_block *sb)
2563 ocfs2_set_ro_flag(osb, 0); 2566 ocfs2_set_ro_flag(osb, 0);
2564} 2567}
2565 2568
2566static char error_buf[1024]; 2569void __ocfs2_error(struct super_block *sb, const char *function,
2567 2570 const char *fmt, ...)
2568void __ocfs2_error(struct super_block *sb,
2569 const char *function,
2570 const char *fmt, ...)
2571{ 2571{
2572 struct va_format vaf;
2572 va_list args; 2573 va_list args;
2573 2574
2574 va_start(args, fmt); 2575 va_start(args, fmt);
2575 vsnprintf(error_buf, sizeof(error_buf), fmt, args); 2576 vaf.fmt = fmt;
2576 va_end(args); 2577 vaf.va = &args;
2577 2578
2578 /* Not using mlog here because we want to show the actual 2579 /* Not using mlog here because we want to show the actual
2579 * function the error came from. */ 2580 * function the error came from. */
2580 printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %s\n", 2581 printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %pV\n",
2581 sb->s_id, function, error_buf); 2582 sb->s_id, function, &vaf);
2583
2584 va_end(args);
2582 2585
2583 ocfs2_handle_error(sb); 2586 ocfs2_handle_error(sb);
2584} 2587}
@@ -2586,18 +2589,21 @@ void __ocfs2_error(struct super_block *sb,
2586/* Handle critical errors. This is intentionally more drastic than 2589/* Handle critical errors. This is intentionally more drastic than
2587 * ocfs2_handle_error, so we only use for things like journal errors, 2590 * ocfs2_handle_error, so we only use for things like journal errors,
2588 * etc. */ 2591 * etc. */
2589void __ocfs2_abort(struct super_block* sb, 2592void __ocfs2_abort(struct super_block *sb, const char *function,
2590 const char *function,
2591 const char *fmt, ...) 2593 const char *fmt, ...)
2592{ 2594{
2595 struct va_format vaf;
2593 va_list args; 2596 va_list args;
2594 2597
2595 va_start(args, fmt); 2598 va_start(args, fmt);
2596 vsnprintf(error_buf, sizeof(error_buf), fmt, args);
2597 va_end(args);
2598 2599
2599 printk(KERN_CRIT "OCFS2: abort (device %s): %s: %s\n", 2600 vaf.fmt = fmt;
2600 sb->s_id, function, error_buf); 2601 vaf.va = &args;
2602
2603 printk(KERN_CRIT "OCFS2: abort (device %s): %s: %pV\n",
2604 sb->s_id, function, &vaf);
2605
2606 va_end(args);
2601 2607
2602 /* We don't have the cluster support yet to go straight to 2608 /* We don't have the cluster support yet to go straight to
2603 * hard readonly in here. Until then, we want to keep 2609 * hard readonly in here. Until then, we want to keep
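
The rewritten __ocfs2_error()/__ocfs2_abort() drop the shared static error_buf in favour of printk's %pV extension: the caller's format string and va_list are wrapped in a struct va_format and expanded inside a single printk call, so there is no shared buffer and no truncation. The same pattern in isolation; the function name and prefix are placeholders:

    #include <linux/kernel.h>
    #include <linux/printk.h>

    static void report_error(const char *prefix, const char *fmt, ...)
    {
            struct va_format vaf;
            va_list args;

            va_start(args, fmt);
            vaf.fmt = fmt;
            vaf.va = &args;
            printk(KERN_CRIT "%s: %pV\n", prefix, &vaf);
            va_end(args);
    }
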
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 85b190dc132f..4ca7533be479 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -1238,6 +1238,10 @@ static int ocfs2_xattr_block_get(struct inode *inode,
1238 i, 1238 i,
1239 &block_off, 1239 &block_off,
1240 &name_offset); 1240 &name_offset);
1241 if (ret) {
1242 mlog_errno(ret);
1243 goto cleanup;
1244 }
1241 xs->base = bucket_block(xs->bucket, block_off); 1245 xs->base = bucket_block(xs->bucket, block_off);
1242 } 1246 }
1243 if (ocfs2_xattr_is_local(xs->here)) { 1247 if (ocfs2_xattr_is_local(xs->here)) {
@@ -5665,6 +5669,10 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
5665 5669
5666 ret = ocfs2_get_xattr_tree_value_root(inode->i_sb, bucket, 5670 ret = ocfs2_get_xattr_tree_value_root(inode->i_sb, bucket,
5667 i, &xv, NULL); 5671 i, &xv, NULL);
5672 if (ret) {
5673 mlog_errno(ret);
5674 break;
5675 }
5668 5676
5669 ret = ocfs2_lock_xattr_remove_allocators(inode, xv, 5677 ret = ocfs2_lock_xattr_remove_allocators(inode, xv,
5670 args->ref_ci, 5678 args->ref_ci,
diff --git a/fs/super.c b/fs/super.c
index 2b7dc90ccdbb..928c20f47af9 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -224,7 +224,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
224 s->s_maxbytes = MAX_NON_LFS; 224 s->s_maxbytes = MAX_NON_LFS;
225 s->s_op = &default_op; 225 s->s_op = &default_op;
226 s->s_time_gran = 1000000000; 226 s->s_time_gran = 1000000000;
227 s->cleancache_poolid = -1; 227 s->cleancache_poolid = CLEANCACHE_NO_POOL;
228 228
229 s->s_shrink.seeks = DEFAULT_SEEKS; 229 s->s_shrink.seeks = DEFAULT_SEEKS;
230 s->s_shrink.scan_objects = super_cache_scan; 230 s->s_shrink.scan_objects = super_cache_scan;
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 4d46085c1b90..39f1d6a2b04d 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -6,6 +6,12 @@
6 6
7#include <linux/mm_types.h> 7#include <linux/mm_types.h>
8#include <linux/bug.h> 8#include <linux/bug.h>
9#include <linux/errno.h>
10
11#if 4 - defined(__PAGETABLE_PUD_FOLDED) - defined(__PAGETABLE_PMD_FOLDED) != \
12 CONFIG_PGTABLE_LEVELS
13#error CONFIG_PGTABLE_LEVELS is not consistent with __PAGETABLE_{PUD,PMD}_FOLDED
14#endif
9 15
10/* 16/*
11 * On almost all architectures and configurations, 0 can be used as the 17 * On almost all architectures and configurations, 0 can be used as the
@@ -691,6 +697,30 @@ static inline int pmd_protnone(pmd_t pmd)
691 697
692#endif /* CONFIG_MMU */ 698#endif /* CONFIG_MMU */
693 699
700#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
701int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot);
702int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot);
703int pud_clear_huge(pud_t *pud);
704int pmd_clear_huge(pmd_t *pmd);
705#else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
706static inline int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
707{
708 return 0;
709}
710static inline int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
711{
712 return 0;
713}
714static inline int pud_clear_huge(pud_t *pud)
715{
716 return 0;
717}
718static inline int pmd_clear_huge(pmd_t *pmd)
719{
720 return 0;
721}
722#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
723
694#endif /* !__ASSEMBLY__ */ 724#endif /* !__ASSEMBLY__ */
695 725
696#ifndef io_remap_pfn_range 726#ifndef io_remap_pfn_range
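
The new preprocessor check above works because defined() evaluates to 1 or 0 inside #if arithmetic: each folded level subtracts one from the maximum of four, and the result has to match what Kconfig selected. A worked example for a hypothetical two-level configuration (in a real build both symbols come from the arch headers and Kconfig, not literal defines):

    #define __PAGETABLE_PUD_FOLDED 1        /* pud folded away */
    #define __PAGETABLE_PMD_FOLDED 1        /* pmd folded away */
    #define CONFIG_PGTABLE_LEVELS  2        /* 4 - 1 - 1 */

    #if 4 - defined(__PAGETABLE_PUD_FOLDED) - defined(__PAGETABLE_PMD_FOLDED) != \
            CONFIG_PGTABLE_LEVELS
    #error CONFIG_PGTABLE_LEVELS is not consistent with __PAGETABLE_{PUD,PMD}_FOLDED
    #endif
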
diff --git a/include/linux/cleancache.h b/include/linux/cleancache.h
index 4ce9056b31a8..bda5ec0b4b4d 100644
--- a/include/linux/cleancache.h
+++ b/include/linux/cleancache.h
@@ -5,6 +5,10 @@
5#include <linux/exportfs.h> 5#include <linux/exportfs.h>
6#include <linux/mm.h> 6#include <linux/mm.h>
7 7
8#define CLEANCACHE_NO_POOL -1
9#define CLEANCACHE_NO_BACKEND -2
10#define CLEANCACHE_NO_BACKEND_SHARED -3
11
8#define CLEANCACHE_KEY_MAX 6 12#define CLEANCACHE_KEY_MAX 6
9 13
10/* 14/*
@@ -33,10 +37,9 @@ struct cleancache_ops {
33 void (*invalidate_fs)(int); 37 void (*invalidate_fs)(int);
34}; 38};
35 39
36extern struct cleancache_ops * 40extern int cleancache_register_ops(struct cleancache_ops *ops);
37 cleancache_register_ops(struct cleancache_ops *ops);
38extern void __cleancache_init_fs(struct super_block *); 41extern void __cleancache_init_fs(struct super_block *);
39extern void __cleancache_init_shared_fs(char *, struct super_block *); 42extern void __cleancache_init_shared_fs(struct super_block *);
40extern int __cleancache_get_page(struct page *); 43extern int __cleancache_get_page(struct page *);
41extern void __cleancache_put_page(struct page *); 44extern void __cleancache_put_page(struct page *);
42extern void __cleancache_invalidate_page(struct address_space *, struct page *); 45extern void __cleancache_invalidate_page(struct address_space *, struct page *);
@@ -78,10 +81,10 @@ static inline void cleancache_init_fs(struct super_block *sb)
78 __cleancache_init_fs(sb); 81 __cleancache_init_fs(sb);
79} 82}
80 83
81static inline void cleancache_init_shared_fs(char *uuid, struct super_block *sb) 84static inline void cleancache_init_shared_fs(struct super_block *sb)
82{ 85{
83 if (cleancache_enabled) 86 if (cleancache_enabled)
84 __cleancache_init_shared_fs(uuid, sb); 87 __cleancache_init_shared_fs(sb);
85} 88}
86 89
87static inline int cleancache_get_page(struct page *page) 90static inline int cleancache_get_page(struct page *page)
diff --git a/include/linux/cma.h b/include/linux/cma.h
index 9384ba66e975..f7ef093ec49a 100644
--- a/include/linux/cma.h
+++ b/include/linux/cma.h
@@ -16,16 +16,16 @@
16struct cma; 16struct cma;
17 17
18extern unsigned long totalcma_pages; 18extern unsigned long totalcma_pages;
19extern phys_addr_t cma_get_base(struct cma *cma); 19extern phys_addr_t cma_get_base(const struct cma *cma);
20extern unsigned long cma_get_size(struct cma *cma); 20extern unsigned long cma_get_size(const struct cma *cma);
21 21
22extern int __init cma_declare_contiguous(phys_addr_t base, 22extern int __init cma_declare_contiguous(phys_addr_t base,
23 phys_addr_t size, phys_addr_t limit, 23 phys_addr_t size, phys_addr_t limit,
24 phys_addr_t alignment, unsigned int order_per_bit, 24 phys_addr_t alignment, unsigned int order_per_bit,
25 bool fixed, struct cma **res_cma); 25 bool fixed, struct cma **res_cma);
26extern int cma_init_reserved_mem(phys_addr_t base, 26extern int cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
27 phys_addr_t size, int order_per_bit, 27 unsigned int order_per_bit,
28 struct cma **res_cma); 28 struct cma **res_cma);
29extern struct page *cma_alloc(struct cma *cma, int count, unsigned int align); 29extern struct page *cma_alloc(struct cma *cma, unsigned int count, unsigned int align);
30extern bool cma_release(struct cma *cma, struct page *pages, int count); 30extern bool cma_release(struct cma *cma, const struct page *pages, unsigned int count);
31#endif 31#endif
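
The cma.h prototypes above are only tightened (unsigned counts, const pointers); caller usage is unchanged. A minimal sketch, with the area pointer and sizes as placeholders:

    #include <linux/cma.h>
    #include <linux/printk.h>

    static struct page *grab(struct cma *area, unsigned int nr_pages)
    {
            /* the third argument is an alignment expressed as a page order */
            return cma_alloc(area, nr_pages, 0);
    }

    static void drop(struct cma *area, struct page *pages, unsigned int nr_pages)
    {
            if (!cma_release(area, pages, nr_pages))
                    pr_warn("pages are not part of this CMA area\n");
    }
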
diff --git a/include/linux/elf-randomize.h b/include/linux/elf-randomize.h
new file mode 100644
index 000000000000..b5f0bda9472e
--- /dev/null
+++ b/include/linux/elf-randomize.h
@@ -0,0 +1,22 @@
1#ifndef _ELF_RANDOMIZE_H
2#define _ELF_RANDOMIZE_H
3
4struct mm_struct;
5
6#ifndef CONFIG_ARCH_HAS_ELF_RANDOMIZE
7static inline unsigned long arch_mmap_rnd(void) { return 0; }
8# if defined(arch_randomize_brk) && defined(CONFIG_COMPAT_BRK)
9# define compat_brk_randomized
10# endif
11# ifndef arch_randomize_brk
12# define arch_randomize_brk(mm) (mm->brk)
13# endif
14#else
15extern unsigned long arch_mmap_rnd(void);
16extern unsigned long arch_randomize_brk(struct mm_struct *mm);
17# ifdef CONFIG_COMPAT_BRK
18# define compat_brk_randomized
19# endif
20#endif
21
22#endif
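
With the header above, an architecture that selects ARCH_HAS_ELF_RANDOMIZE is expected to supply arch_mmap_rnd() (and arch_randomize_brk()). A purely illustrative implementation; the 28-bit entropy width is an arbitrary example, not any particular port's value:

    #include <linux/elf-randomize.h>
    #include <linux/random.h>
    #include <linux/mm.h>

    unsigned long arch_mmap_rnd(void)
    {
            /* example only: mask get_random_int() down to 28 bits of entropy */
            unsigned long rnd = (unsigned long)get_random_int() & ((1UL << 28) - 1);

            return rnd << PAGE_SHIFT;
    }
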
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 51bd1e72a917..97a9373e61e8 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -57,8 +57,10 @@ struct vm_area_struct;
57 * _might_ fail. This depends upon the particular VM implementation. 57 * _might_ fail. This depends upon the particular VM implementation.
58 * 58 *
59 * __GFP_NOFAIL: The VM implementation _must_ retry infinitely: the caller 59 * __GFP_NOFAIL: The VM implementation _must_ retry infinitely: the caller
60 * cannot handle allocation failures. This modifier is deprecated and no new 60 * cannot handle allocation failures. New users should be evaluated carefully
61 * users should be added. 61 * (and the flag should be used only when there is no reasonable failure policy)
 62 * but it is definitely preferable to use the flag rather than open-code an 
 63 * endless loop around the allocator. 
62 * 64 *
63 * __GFP_NORETRY: The VM implementation must not retry indefinitely. 65 * __GFP_NORETRY: The VM implementation must not retry indefinitely.
64 * 66 *
@@ -117,16 +119,6 @@ struct vm_area_struct;
117 __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \ 119 __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \
118 __GFP_NO_KSWAPD) 120 __GFP_NO_KSWAPD)
119 121
120/*
121 * GFP_THISNODE does not perform any reclaim, you most likely want to
122 * use __GFP_THISNODE to allocate from a given node without fallback!
123 */
124#ifdef CONFIG_NUMA
125#define GFP_THISNODE (__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY)
126#else
127#define GFP_THISNODE ((__force gfp_t)0)
128#endif
129
130/* This mask makes up all the page movable related flags */ 122/* This mask makes up all the page movable related flags */
131#define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE) 123#define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE)
132 124
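
With the GFP_THISNODE shorthand gone, a caller that really wants a node-exact allocation composes the flags itself; a hedged sketch (alloc_page_exact_node_nofallback() is a hypothetical helper, not part of the patch):

#include <linux/gfp.h>

/* Hypothetical helper: allocate one page strictly from 'nid' and fail
 * quietly instead of falling back to other nodes. */
static struct page *alloc_page_exact_node_nofallback(int nid)
{
        return alloc_pages_node(nid, GFP_KERNEL | __GFP_THISNODE |
                                     __GFP_NOWARN | __GFP_NORETRY, 0);
}
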
diff --git a/include/linux/io.h b/include/linux/io.h
index fa02e55e5a2e..4cc299c598e0 100644
--- a/include/linux/io.h
+++ b/include/linux/io.h
@@ -38,6 +38,14 @@ static inline int ioremap_page_range(unsigned long addr, unsigned long end,
38} 38}
39#endif 39#endif
40 40
41#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
42void __init ioremap_huge_init(void);
43int arch_ioremap_pud_supported(void);
44int arch_ioremap_pmd_supported(void);
45#else
46static inline void ioremap_huge_init(void) { }
47#endif
48
41/* 49/*
42 * Managed iomap interface 50 * Managed iomap interface
43 */ 51 */
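
The arch side of this interface is deliberately small; a minimal sketch of what an architecture enabling CONFIG_HAVE_ARCH_HUGE_VMAP might provide (the capability answers below are assumptions, not any real architecture's logic):

#include <linux/io.h>

/* Illustrative stubs: report which huge ioremap page sizes the MMU can use. */
int arch_ioremap_pud_supported(void)
{
        return 0;       /* assume no PUD-sized (e.g. 1GB) I/O mappings */
}

int arch_ioremap_pmd_supported(void)
{
        return 1;       /* assume PMD-sized (e.g. 2MB) I/O mappings work */
}
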
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index e8cc45307f8f..9497ec7c77ea 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -365,6 +365,14 @@ static inline unsigned long memblock_region_reserved_end_pfn(const struct memblo
365#define __initdata_memblock 365#define __initdata_memblock
366#endif 366#endif
367 367
368#ifdef CONFIG_MEMTEST
369extern void early_memtest(phys_addr_t start, phys_addr_t end);
370#else
371static inline void early_memtest(phys_addr_t start, phys_addr_t end)
372{
373}
374#endif
375
368#else 376#else
369static inline phys_addr_t memblock_alloc(phys_addr_t size, phys_addr_t align) 377static inline phys_addr_t memblock_alloc(phys_addr_t size, phys_addr_t align)
370{ 378{
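
Architectures wire the test up once the memblock layout is final, much as arm and arm64 do elsewhere in this series; a hedged sketch (my_arch_bootmem_init() is a placeholder for the arch's init path):

#include <linux/memblock.h>

static void __init my_arch_bootmem_init(void)
{
        /* ... memblock regions are fully set up by this point ... */

        /* Scan all of DRAM with the patterns selected by memtest= */
        early_memtest(memblock_start_of_DRAM(), memblock_end_of_DRAM());
}
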
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 8f1a41951df9..6ffa0ac7f7d6 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -192,6 +192,9 @@ extern void get_page_bootmem(unsigned long ingo, struct page *page,
192void get_online_mems(void); 192void get_online_mems(void);
193void put_online_mems(void); 193void put_online_mems(void);
194 194
195void mem_hotplug_begin(void);
196void mem_hotplug_done(void);
197
195#else /* ! CONFIG_MEMORY_HOTPLUG */ 198#else /* ! CONFIG_MEMORY_HOTPLUG */
196/* 199/*
197 * Stub functions for when hotplug is off 200 * Stub functions for when hotplug is off
@@ -231,6 +234,9 @@ static inline int try_online_node(int nid)
231static inline void get_online_mems(void) {} 234static inline void get_online_mems(void) {}
232static inline void put_online_mems(void) {} 235static inline void put_online_mems(void) {}
233 236
237static inline void mem_hotplug_begin(void) {}
238static inline void mem_hotplug_done(void) {}
239
234#endif /* ! CONFIG_MEMORY_HOTPLUG */ 240#endif /* ! CONFIG_MEMORY_HOTPLUG */
235 241
236#ifdef CONFIG_MEMORY_HOTREMOVE 242#ifdef CONFIG_MEMORY_HOTREMOVE
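
A minimal sketch of the intended usage of the newly exported pair, with do_layout_change() standing in for whatever work actually alters the zone/node layout:

#include <linux/memory_hotplug.h>

static int do_layout_change(void)       /* placeholder for the real work */
{
        return 0;
}

static int hotplug_example(void)
{
        int ret;

        mem_hotplug_begin();    /* serialize against other hotplug paths */
        ret = do_layout_change();
        mem_hotplug_done();
        return ret;
}
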
diff --git a/include/linux/mempool.h b/include/linux/mempool.h
index 39ed62ab5b8a..b19b3023c880 100644
--- a/include/linux/mempool.h
+++ b/include/linux/mempool.h
@@ -29,7 +29,7 @@ extern mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
29 mempool_free_t *free_fn, void *pool_data, 29 mempool_free_t *free_fn, void *pool_data,
30 gfp_t gfp_mask, int nid); 30 gfp_t gfp_mask, int nid);
31 31
32extern int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask); 32extern int mempool_resize(mempool_t *pool, int new_min_nr);
33extern void mempool_destroy(mempool_t *pool); 33extern void mempool_destroy(mempool_t *pool);
34extern void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask); 34extern void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask);
35extern void mempool_free(void *element, mempool_t *pool); 35extern void mempool_free(void *element, mempool_t *pool);
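
A caller-side sketch of the new prototype; since the gfp_mask argument is gone and the resize path allocates with GFP_KERNEL internally, this must not be called from atomic context (grow_reserve() is a hypothetical wrapper):

#include <linux/mempool.h>

static int grow_reserve(mempool_t *pool, int new_min_nr)
{
        /* May sleep: mempool_resize() now always allocates with GFP_KERNEL. */
        return mempool_resize(pool, new_min_nr);
}
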
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 78baed5f2952..cac1c0904d5f 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -69,7 +69,6 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
69extern bool pmd_trans_migrating(pmd_t pmd); 69extern bool pmd_trans_migrating(pmd_t pmd);
70extern int migrate_misplaced_page(struct page *page, 70extern int migrate_misplaced_page(struct page *page,
71 struct vm_area_struct *vma, int node); 71 struct vm_area_struct *vma, int node);
72extern bool migrate_ratelimited(int node);
73#else 72#else
74static inline bool pmd_trans_migrating(pmd_t pmd) 73static inline bool pmd_trans_migrating(pmd_t pmd)
75{ 74{
@@ -80,10 +79,6 @@ static inline int migrate_misplaced_page(struct page *page,
80{ 79{
81 return -EAGAIN; /* can't migrate now */ 80 return -EAGAIN; /* can't migrate now */
82} 81}
83static inline bool migrate_ratelimited(int node)
84{
85 return false;
86}
87#endif /* CONFIG_NUMA_BALANCING */ 82#endif /* CONFIG_NUMA_BALANCING */
88 83
89#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE) 84#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 47a93928b90f..6571dd78e984 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1294,9 +1294,11 @@ int __set_page_dirty_no_writeback(struct page *page);
1294int redirty_page_for_writepage(struct writeback_control *wbc, 1294int redirty_page_for_writepage(struct writeback_control *wbc,
1295 struct page *page); 1295 struct page *page);
1296void account_page_dirtied(struct page *page, struct address_space *mapping); 1296void account_page_dirtied(struct page *page, struct address_space *mapping);
1297void account_page_cleaned(struct page *page, struct address_space *mapping);
1297int set_page_dirty(struct page *page); 1298int set_page_dirty(struct page *page);
1298int set_page_dirty_lock(struct page *page); 1299int set_page_dirty_lock(struct page *page);
1299int clear_page_dirty_for_io(struct page *page); 1300int clear_page_dirty_for_io(struct page *page);
1301
1300int get_cmdline(struct task_struct *task, char *buffer, int buflen); 1302int get_cmdline(struct task_struct *task, char *buffer, int buflen);
1301 1303
1302/* Is the vma a continuation of the stack vma above it? */ 1304/* Is the vma a continuation of the stack vma above it? */
@@ -2109,7 +2111,7 @@ static inline struct page *follow_page(struct vm_area_struct *vma,
2109#define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */ 2111#define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */
2110#define FOLL_NOWAIT 0x20 /* if a disk transfer is needed, start the IO 2112#define FOLL_NOWAIT 0x20 /* if a disk transfer is needed, start the IO
2111 * and return without waiting upon it */ 2113 * and return without waiting upon it */
2112#define FOLL_MLOCK 0x40 /* mark page as mlocked */ 2114#define FOLL_POPULATE 0x40 /* fault in page */
2113#define FOLL_SPLIT 0x80 /* don't return transhuge pages, split them */ 2115#define FOLL_SPLIT 0x80 /* don't return transhuge pages, split them */
2114#define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ 2116#define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */
2115#define FOLL_NUMA 0x200 /* force NUMA hinting page fault */ 2117#define FOLL_NUMA 0x200 /* force NUMA hinting page fault */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 199a03aab8dc..590630eb59ba 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -364,7 +364,9 @@ struct mm_struct {
364 atomic_t mm_users; /* How many users with user space? */ 364 atomic_t mm_users; /* How many users with user space? */
365 atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */ 365 atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */
366 atomic_long_t nr_ptes; /* PTE page table pages */ 366 atomic_long_t nr_ptes; /* PTE page table pages */
367#if CONFIG_PGTABLE_LEVELS > 2
367 atomic_long_t nr_pmds; /* PMD page table pages */ 368 atomic_long_t nr_pmds; /* PMD page table pages */
369#endif
368 int map_count; /* number of VMAs */ 370 int map_count; /* number of VMAs */
369 371
370 spinlock_t page_table_lock; /* Protects page tables and some counters */ 372 spinlock_t page_table_lock; /* Protects page tables and some counters */
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 9b2022ab4d85..3d46fb4708e0 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -25,16 +25,11 @@ static inline void touch_nmi_watchdog(void)
25#endif 25#endif
26 26
27#if defined(CONFIG_HARDLOCKUP_DETECTOR) 27#if defined(CONFIG_HARDLOCKUP_DETECTOR)
28extern void watchdog_enable_hardlockup_detector(bool val); 28extern void hardlockup_detector_disable(void);
29extern bool watchdog_hardlockup_detector_is_enabled(void);
30#else 29#else
31static inline void watchdog_enable_hardlockup_detector(bool val) 30static inline void hardlockup_detector_disable(void)
32{ 31{
33} 32}
34static inline bool watchdog_hardlockup_detector_is_enabled(void)
35{
36 return true;
37}
38#endif 33#endif
39 34
40/* 35/*
@@ -68,12 +63,20 @@ static inline bool trigger_allbutself_cpu_backtrace(void)
68#ifdef CONFIG_LOCKUP_DETECTOR 63#ifdef CONFIG_LOCKUP_DETECTOR
69int hw_nmi_is_cpu_stuck(struct pt_regs *); 64int hw_nmi_is_cpu_stuck(struct pt_regs *);
70u64 hw_nmi_get_sample_period(int watchdog_thresh); 65u64 hw_nmi_get_sample_period(int watchdog_thresh);
66extern int nmi_watchdog_enabled;
67extern int soft_watchdog_enabled;
71extern int watchdog_user_enabled; 68extern int watchdog_user_enabled;
72extern int watchdog_thresh; 69extern int watchdog_thresh;
73extern int sysctl_softlockup_all_cpu_backtrace; 70extern int sysctl_softlockup_all_cpu_backtrace;
74struct ctl_table; 71struct ctl_table;
75extern int proc_dowatchdog(struct ctl_table *, int , 72extern int proc_watchdog(struct ctl_table *, int ,
76 void __user *, size_t *, loff_t *); 73 void __user *, size_t *, loff_t *);
74extern int proc_nmi_watchdog(struct ctl_table *, int ,
75 void __user *, size_t *, loff_t *);
76extern int proc_soft_watchdog(struct ctl_table *, int ,
77 void __user *, size_t *, loff_t *);
78extern int proc_watchdog_thresh(struct ctl_table *, int ,
79 void __user *, size_t *, loff_t *);
77#endif 80#endif
78 81
79#ifdef CONFIG_HAVE_ACPI_APEI_NMI 82#ifdef CONFIG_HAVE_ACPI_APEI_NMI
diff --git a/include/linux/oom.h b/include/linux/oom.h
index d5771bed59c9..44b2f6f7bbd8 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -66,7 +66,8 @@ extern bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_flags);
66extern void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_flags); 66extern void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_flags);
67 67
68extern void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, 68extern void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
69 int order, const nodemask_t *nodemask); 69 int order, const nodemask_t *nodemask,
70 struct mem_cgroup *memcg);
70 71
71extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task, 72extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
72 unsigned long totalpages, const nodemask_t *nodemask, 73 unsigned long totalpages, const nodemask_t *nodemask,
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 5ed7bdaf22d5..c851ff92d5b3 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -328,8 +328,6 @@ static inline void SetPageUptodate(struct page *page)
328 328
329CLEARPAGEFLAG(Uptodate, uptodate) 329CLEARPAGEFLAG(Uptodate, uptodate)
330 330
331extern void cancel_dirty_page(struct page *page, unsigned int account_size);
332
333int test_clear_page_writeback(struct page *page); 331int test_clear_page_writeback(struct page *page);
334int __test_set_page_writeback(struct page *page, bool keep_write); 332int __test_set_page_writeback(struct page *page, bool keep_write);
335 333
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 76f1feeabd38..ffd24c830151 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -18,7 +18,7 @@
18 18
19/* 19/*
20 * Flags to pass to kmem_cache_create(). 20 * Flags to pass to kmem_cache_create().
21 * The ones marked DEBUG are only valid if CONFIG_SLAB_DEBUG is set. 21 * The ones marked DEBUG are only valid if CONFIG_DEBUG_SLAB is set.
22 */ 22 */
23#define SLAB_DEBUG_FREE 0x00000100UL /* DEBUG: Perform (expensive) checks on free */ 23#define SLAB_DEBUG_FREE 0x00000100UL /* DEBUG: Perform (expensive) checks on free */
24#define SLAB_RED_ZONE 0x00000400UL /* DEBUG: Red zone objs in a cache */ 24#define SLAB_RED_ZONE 0x00000400UL /* DEBUG: Red zone objs in a cache */
diff --git a/include/trace/events/xen.h b/include/trace/events/xen.h
index d06b6da5c1e3..bce990f5a35d 100644
--- a/include/trace/events/xen.h
+++ b/include/trace/events/xen.h
@@ -224,7 +224,7 @@ TRACE_EVENT(xen_mmu_pmd_clear,
224 TP_printk("pmdp %p", __entry->pmdp) 224 TP_printk("pmdp %p", __entry->pmdp)
225 ); 225 );
226 226
227#if PAGETABLE_LEVELS >= 4 227#if CONFIG_PGTABLE_LEVELS >= 4
228 228
229TRACE_EVENT(xen_mmu_set_pud, 229TRACE_EVENT(xen_mmu_set_pud,
230 TP_PROTO(pud_t *pudp, pud_t pudval), 230 TP_PROTO(pud_t *pudp, pud_t pudval),
diff --git a/init/main.c b/init/main.c
index e82171b99874..a7e969d12f51 100644
--- a/init/main.c
+++ b/init/main.c
@@ -80,6 +80,7 @@
80#include <linux/list.h> 80#include <linux/list.h>
81#include <linux/integrity.h> 81#include <linux/integrity.h>
82#include <linux/proc_ns.h> 82#include <linux/proc_ns.h>
83#include <linux/io.h>
83 84
84#include <asm/io.h> 85#include <asm/io.h>
85#include <asm/bugs.h> 86#include <asm/bugs.h>
@@ -485,6 +486,7 @@ static void __init mm_init(void)
485 percpu_init_late(); 486 percpu_init_late();
486 pgtable_init(); 487 pgtable_init();
487 vmalloc_init(); 488 vmalloc_init();
489 ioremap_huge_init();
488} 490}
489 491
490asmlinkage __visible void __init start_kernel(void) 492asmlinkage __visible void __init start_kernel(void)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index c68f0721df10..ee14e3a35a29 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2453,20 +2453,12 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
2453 * @node: is this an allowed node? 2453 * @node: is this an allowed node?
2454 * @gfp_mask: memory allocation flags 2454 * @gfp_mask: memory allocation flags
2455 * 2455 *
2456 * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is 2456 * If we're in interrupt, yes, we can always allocate. If @node is set in
2457 * set, yes, we can always allocate. If node is in our task's mems_allowed, 2457 * current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this
2458 * yes. If it's not a __GFP_HARDWALL request and this node is in the nearest 2458 * node is set in the nearest hardwalled cpuset ancestor to current's cpuset,
2459 * hardwalled cpuset ancestor to this task's cpuset, yes. If the task has been 2459 * yes. If current has access to memory reserves due to TIF_MEMDIE, yes.
2460 * OOM killed and has access to memory reserves as specified by the TIF_MEMDIE
2461 * flag, yes.
2462 * Otherwise, no. 2460 * Otherwise, no.
2463 * 2461 *
2464 * The __GFP_THISNODE placement logic is really handled elsewhere,
2465 * by forcibly using a zonelist starting at a specified node, and by
2466 * (in get_page_from_freelist()) refusing to consider the zones for
2467 * any node on the zonelist except the first. By the time any such
2468 * calls get to this routine, we should just shut up and say 'yes'.
2469 *
2470 * GFP_USER allocations are marked with the __GFP_HARDWALL bit, 2462 * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
2471 * and do not allow allocations outside the current tasks cpuset 2463 * and do not allow allocations outside the current tasks cpuset
2472 * unless the task has been OOM killed as is marked TIF_MEMDIE. 2464 * unless the task has been OOM killed as is marked TIF_MEMDIE.
@@ -2502,7 +2494,7 @@ int __cpuset_node_allowed(int node, gfp_t gfp_mask)
2502 int allowed; /* is allocation in zone z allowed? */ 2494 int allowed; /* is allocation in zone z allowed? */
2503 unsigned long flags; 2495 unsigned long flags;
2504 2496
2505 if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) 2497 if (in_interrupt())
2506 return 1; 2498 return 1;
2507 if (node_isset(node, current->mems_allowed)) 2499 if (node_isset(node, current->mems_allowed))
2508 return 1; 2500 return 1;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4012336de30f..8c0eabd41886 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -847,7 +847,7 @@ static struct ctl_table kern_table[] = {
847 .data = &watchdog_user_enabled, 847 .data = &watchdog_user_enabled,
848 .maxlen = sizeof (int), 848 .maxlen = sizeof (int),
849 .mode = 0644, 849 .mode = 0644,
850 .proc_handler = proc_dowatchdog, 850 .proc_handler = proc_watchdog,
851 .extra1 = &zero, 851 .extra1 = &zero,
852 .extra2 = &one, 852 .extra2 = &one,
853 }, 853 },
@@ -856,11 +856,33 @@ static struct ctl_table kern_table[] = {
856 .data = &watchdog_thresh, 856 .data = &watchdog_thresh,
857 .maxlen = sizeof(int), 857 .maxlen = sizeof(int),
858 .mode = 0644, 858 .mode = 0644,
859 .proc_handler = proc_dowatchdog, 859 .proc_handler = proc_watchdog_thresh,
860 .extra1 = &zero, 860 .extra1 = &zero,
861 .extra2 = &sixty, 861 .extra2 = &sixty,
862 }, 862 },
863 { 863 {
864 .procname = "nmi_watchdog",
865 .data = &nmi_watchdog_enabled,
866 .maxlen = sizeof (int),
867 .mode = 0644,
868 .proc_handler = proc_nmi_watchdog,
869 .extra1 = &zero,
870#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR)
871 .extra2 = &one,
872#else
873 .extra2 = &zero,
874#endif
875 },
876 {
877 .procname = "soft_watchdog",
878 .data = &soft_watchdog_enabled,
879 .maxlen = sizeof (int),
880 .mode = 0644,
881 .proc_handler = proc_soft_watchdog,
882 .extra1 = &zero,
883 .extra2 = &one,
884 },
885 {
864 .procname = "softlockup_panic", 886 .procname = "softlockup_panic",
865 .data = &softlockup_panic, 887 .data = &softlockup_panic,
866 .maxlen = sizeof(int), 888 .maxlen = sizeof(int),
@@ -880,15 +902,6 @@ static struct ctl_table kern_table[] = {
880 .extra2 = &one, 902 .extra2 = &one,
881 }, 903 },
882#endif /* CONFIG_SMP */ 904#endif /* CONFIG_SMP */
883 {
884 .procname = "nmi_watchdog",
885 .data = &watchdog_user_enabled,
886 .maxlen = sizeof (int),
887 .mode = 0644,
888 .proc_handler = proc_dowatchdog,
889 .extra1 = &zero,
890 .extra2 = &one,
891 },
892#endif 905#endif
893#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) 906#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
894 { 907 {
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 9a056f5bc02c..2316f50b07a4 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -24,8 +24,33 @@
24#include <linux/kvm_para.h> 24#include <linux/kvm_para.h>
25#include <linux/perf_event.h> 25#include <linux/perf_event.h>
26 26
27int watchdog_user_enabled = 1; 27/*
28 * The run state of the lockup detectors is controlled by the content of the
29 * 'watchdog_enabled' variable. Each lockup detector has its dedicated bit -
30 * bit 0 for the hard lockup detector and bit 1 for the soft lockup detector.
31 *
32 * 'watchdog_user_enabled', 'nmi_watchdog_enabled' and 'soft_watchdog_enabled'
33 * are variables that are only used as an 'interface' between the parameters
34 * in /proc/sys/kernel and the internal state bits in 'watchdog_enabled'. The
35 * 'watchdog_thresh' variable is handled differently because its value is not
36 * boolean, and the lockup detectors are 'suspended' while 'watchdog_thresh'
 37 * is equal to zero.
38 */
39#define NMI_WATCHDOG_ENABLED_BIT 0
40#define SOFT_WATCHDOG_ENABLED_BIT 1
41#define NMI_WATCHDOG_ENABLED (1 << NMI_WATCHDOG_ENABLED_BIT)
42#define SOFT_WATCHDOG_ENABLED (1 << SOFT_WATCHDOG_ENABLED_BIT)
43
44#ifdef CONFIG_HARDLOCKUP_DETECTOR
45static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED;
46#else
47static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED;
48#endif
49int __read_mostly nmi_watchdog_enabled;
50int __read_mostly soft_watchdog_enabled;
51int __read_mostly watchdog_user_enabled;
28int __read_mostly watchdog_thresh = 10; 52int __read_mostly watchdog_thresh = 10;
53
29#ifdef CONFIG_SMP 54#ifdef CONFIG_SMP
30int __read_mostly sysctl_softlockup_all_cpu_backtrace; 55int __read_mostly sysctl_softlockup_all_cpu_backtrace;
31#else 56#else
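
A compact restatement of the encoding described in the comment above; sync_proc_variables() is a hypothetical helper, the real code does the equivalent per handler in proc_watchdog_common():

/* Each detector owns one bit; the /proc-facing ints merely mirror them. */
static void sync_proc_variables(unsigned long enabled)
{
        nmi_watchdog_enabled  = !!(enabled & NMI_WATCHDOG_ENABLED);
        soft_watchdog_enabled = !!(enabled & SOFT_WATCHDOG_ENABLED);
        watchdog_user_enabled = !!(enabled & (NMI_WATCHDOG_ENABLED |
                                              SOFT_WATCHDOG_ENABLED));
}
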
@@ -58,8 +83,6 @@ static unsigned long soft_lockup_nmi_warn;
58#ifdef CONFIG_HARDLOCKUP_DETECTOR 83#ifdef CONFIG_HARDLOCKUP_DETECTOR
59static int hardlockup_panic = 84static int hardlockup_panic =
60 CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; 85 CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
61
62static bool hardlockup_detector_enabled = true;
63/* 86/*
64 * We may not want to enable hard lockup detection by default in all cases, 87 * We may not want to enable hard lockup detection by default in all cases,
65 * for example when running the kernel as a guest on a hypervisor. In these 88 * for example when running the kernel as a guest on a hypervisor. In these
@@ -68,14 +91,9 @@ static bool hardlockup_detector_enabled = true;
68 * kernel command line parameters are parsed, because otherwise it is not 91 * kernel command line parameters are parsed, because otherwise it is not
69 * possible to override this in hardlockup_panic_setup(). 92 * possible to override this in hardlockup_panic_setup().
70 */ 93 */
71void watchdog_enable_hardlockup_detector(bool val) 94void hardlockup_detector_disable(void)
72{
73 hardlockup_detector_enabled = val;
74}
75
76bool watchdog_hardlockup_detector_is_enabled(void)
77{ 95{
78 return hardlockup_detector_enabled; 96 watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
79} 97}
80 98
81static int __init hardlockup_panic_setup(char *str) 99static int __init hardlockup_panic_setup(char *str)
@@ -85,15 +103,9 @@ static int __init hardlockup_panic_setup(char *str)
85 else if (!strncmp(str, "nopanic", 7)) 103 else if (!strncmp(str, "nopanic", 7))
86 hardlockup_panic = 0; 104 hardlockup_panic = 0;
87 else if (!strncmp(str, "0", 1)) 105 else if (!strncmp(str, "0", 1))
88 watchdog_user_enabled = 0; 106 watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
89 else if (!strncmp(str, "1", 1) || !strncmp(str, "2", 1)) { 107 else if (!strncmp(str, "1", 1))
90 /* 108 watchdog_enabled |= NMI_WATCHDOG_ENABLED;
91 * Setting 'nmi_watchdog=1' or 'nmi_watchdog=2' (legacy option)
92 * has the same effect.
93 */
94 watchdog_user_enabled = 1;
95 watchdog_enable_hardlockup_detector(true);
96 }
97 return 1; 109 return 1;
98} 110}
99__setup("nmi_watchdog=", hardlockup_panic_setup); 111__setup("nmi_watchdog=", hardlockup_panic_setup);
@@ -112,19 +124,18 @@ __setup("softlockup_panic=", softlockup_panic_setup);
112 124
113static int __init nowatchdog_setup(char *str) 125static int __init nowatchdog_setup(char *str)
114{ 126{
115 watchdog_user_enabled = 0; 127 watchdog_enabled = 0;
116 return 1; 128 return 1;
117} 129}
118__setup("nowatchdog", nowatchdog_setup); 130__setup("nowatchdog", nowatchdog_setup);
119 131
120/* deprecated */
121static int __init nosoftlockup_setup(char *str) 132static int __init nosoftlockup_setup(char *str)
122{ 133{
123 watchdog_user_enabled = 0; 134 watchdog_enabled &= ~SOFT_WATCHDOG_ENABLED;
124 return 1; 135 return 1;
125} 136}
126__setup("nosoftlockup", nosoftlockup_setup); 137__setup("nosoftlockup", nosoftlockup_setup);
127/* */ 138
128#ifdef CONFIG_SMP 139#ifdef CONFIG_SMP
129static int __init softlockup_all_cpu_backtrace_setup(char *str) 140static int __init softlockup_all_cpu_backtrace_setup(char *str)
130{ 141{
@@ -239,10 +250,11 @@ static int is_softlockup(unsigned long touch_ts)
239{ 250{
240 unsigned long now = get_timestamp(); 251 unsigned long now = get_timestamp();
241 252
242 /* Warn about unreasonable delays: */ 253 if (watchdog_enabled & SOFT_WATCHDOG_ENABLED) {
243 if (time_after(now, touch_ts + get_softlockup_thresh())) 254 /* Warn about unreasonable delays. */
244 return now - touch_ts; 255 if (time_after(now, touch_ts + get_softlockup_thresh()))
245 256 return now - touch_ts;
257 }
246 return 0; 258 return 0;
247} 259}
248 260
@@ -477,6 +489,21 @@ static void watchdog(unsigned int cpu)
477 __this_cpu_write(soft_lockup_hrtimer_cnt, 489 __this_cpu_write(soft_lockup_hrtimer_cnt,
478 __this_cpu_read(hrtimer_interrupts)); 490 __this_cpu_read(hrtimer_interrupts));
479 __touch_watchdog(); 491 __touch_watchdog();
492
493 /*
494 * watchdog_nmi_enable() clears the NMI_WATCHDOG_ENABLED bit in the
495 * failure path. Check for failures that can occur asynchronously -
496 * for example, when CPUs are on-lined - and shut down the hardware
497 * perf event on each CPU accordingly.
498 *
499 * The only non-obvious place this bit can be cleared is through
500 * watchdog_nmi_enable(), so a pr_info() is placed there. Placing a
501 * pr_info here would be too noisy as it would result in a message
502 * every few seconds if the hardlockup was disabled but the softlockup
503 * enabled.
504 */
505 if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
506 watchdog_nmi_disable(cpu);
480} 507}
481 508
482#ifdef CONFIG_HARDLOCKUP_DETECTOR 509#ifdef CONFIG_HARDLOCKUP_DETECTOR
@@ -492,14 +519,9 @@ static int watchdog_nmi_enable(unsigned int cpu)
492 struct perf_event_attr *wd_attr; 519 struct perf_event_attr *wd_attr;
493 struct perf_event *event = per_cpu(watchdog_ev, cpu); 520 struct perf_event *event = per_cpu(watchdog_ev, cpu);
494 521
495 /* 522 /* nothing to do if the hard lockup detector is disabled */
496 * Some kernels need to default hard lockup detection to 523 if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
497 * 'disabled', for example a guest on a hypervisor. 524 goto out;
498 */
499 if (!watchdog_hardlockup_detector_is_enabled()) {
500 event = ERR_PTR(-ENOENT);
501 goto handle_err;
502 }
503 525
504 /* is it already setup and enabled? */ 526 /* is it already setup and enabled? */
505 if (event && event->state > PERF_EVENT_STATE_OFF) 527 if (event && event->state > PERF_EVENT_STATE_OFF)
@@ -515,7 +537,6 @@ static int watchdog_nmi_enable(unsigned int cpu)
515 /* Try to register using hardware perf events */ 537 /* Try to register using hardware perf events */
516 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); 538 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
517 539
518handle_err:
 519 /* save cpu0 error for future comparison */ 540 /* save cpu0 error for future comparison */
520 if (cpu == 0 && IS_ERR(event)) 541 if (cpu == 0 && IS_ERR(event))
521 cpu0_err = PTR_ERR(event); 542 cpu0_err = PTR_ERR(event);
@@ -527,6 +548,18 @@ handle_err:
527 goto out_save; 548 goto out_save;
528 } 549 }
529 550
551 /*
552 * Disable the hard lockup detector if _any_ CPU fails to set up
553 * set up the hardware perf event. The watchdog() function checks
554 * the NMI_WATCHDOG_ENABLED bit periodically.
555 *
556 * The barriers are for syncing up watchdog_enabled across all the
557 * cpus, as clear_bit() does not use barriers.
558 */
559 smp_mb__before_atomic();
560 clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled);
561 smp_mb__after_atomic();
562
530 /* skip displaying the same error again */ 563 /* skip displaying the same error again */
531 if (cpu > 0 && (PTR_ERR(event) == cpu0_err)) 564 if (cpu > 0 && (PTR_ERR(event) == cpu0_err))
532 return PTR_ERR(event); 565 return PTR_ERR(event);
@@ -540,6 +573,9 @@ handle_err:
540 else 573 else
541 pr_err("disabled (cpu%i): unable to create perf event: %ld\n", 574 pr_err("disabled (cpu%i): unable to create perf event: %ld\n",
542 cpu, PTR_ERR(event)); 575 cpu, PTR_ERR(event));
576
577 pr_info("Shutting down hard lockup detector on all cpus\n");
578
543 return PTR_ERR(event); 579 return PTR_ERR(event);
544 580
545 /* success path */ 581 /* success path */
@@ -628,7 +664,7 @@ static void restart_watchdog_hrtimer(void *info)
628 HRTIMER_MODE_REL_PINNED); 664 HRTIMER_MODE_REL_PINNED);
629} 665}
630 666
631static void update_timers(int cpu) 667static void update_watchdog(int cpu)
632{ 668{
633 /* 669 /*
634 * Make sure that perf event counter will adopt to a new 670 * Make sure that perf event counter will adopt to a new
@@ -643,17 +679,17 @@ static void update_timers(int cpu)
643 watchdog_nmi_enable(cpu); 679 watchdog_nmi_enable(cpu);
644} 680}
645 681
646static void update_timers_all_cpus(void) 682static void update_watchdog_all_cpus(void)
647{ 683{
648 int cpu; 684 int cpu;
649 685
650 get_online_cpus(); 686 get_online_cpus();
651 for_each_online_cpu(cpu) 687 for_each_online_cpu(cpu)
652 update_timers(cpu); 688 update_watchdog(cpu);
653 put_online_cpus(); 689 put_online_cpus();
654} 690}
655 691
656static int watchdog_enable_all_cpus(bool sample_period_changed) 692static int watchdog_enable_all_cpus(void)
657{ 693{
658 int err = 0; 694 int err = 0;
659 695
@@ -663,8 +699,12 @@ static int watchdog_enable_all_cpus(bool sample_period_changed)
663 pr_err("Failed to create watchdog threads, disabled\n"); 699 pr_err("Failed to create watchdog threads, disabled\n");
664 else 700 else
665 watchdog_running = 1; 701 watchdog_running = 1;
666 } else if (sample_period_changed) { 702 } else {
667 update_timers_all_cpus(); 703 /*
704 * Enable/disable the lockup detectors or
705 * change the sample period 'on the fly'.
706 */
707 update_watchdog_all_cpus();
668 } 708 }
669 709
670 return err; 710 return err;
@@ -682,48 +722,149 @@ static void watchdog_disable_all_cpus(void)
682} 722}
683 723
684/* 724/*
685 * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh 725 * Update the run state of the lockup detectors.
726 */
727static int proc_watchdog_update(void)
728{
729 int err = 0;
730
731 /*
732 * Watchdog threads won't be started if they are already active.
733 * The 'watchdog_running' variable in watchdog_*_all_cpus() takes
734 * care of this. If those threads are already active, the sample
735 * period will be updated and the lockup detectors will be enabled
736 * or disabled 'on the fly'.
737 */
738 if (watchdog_enabled && watchdog_thresh)
739 err = watchdog_enable_all_cpus();
740 else
741 watchdog_disable_all_cpus();
742
743 return err;
744
745}
746
747static DEFINE_MUTEX(watchdog_proc_mutex);
748
749/*
750 * common function for watchdog, nmi_watchdog and soft_watchdog parameter
751 *
752 * caller | table->data points to | 'which' contains the flag(s)
753 * -------------------|-----------------------|-----------------------------
754 * proc_watchdog | watchdog_user_enabled | NMI_WATCHDOG_ENABLED or'ed
755 * | | with SOFT_WATCHDOG_ENABLED
756 * -------------------|-----------------------|-----------------------------
757 * proc_nmi_watchdog | nmi_watchdog_enabled | NMI_WATCHDOG_ENABLED
758 * -------------------|-----------------------|-----------------------------
759 * proc_soft_watchdog | soft_watchdog_enabled | SOFT_WATCHDOG_ENABLED
760 */
761static int proc_watchdog_common(int which, struct ctl_table *table, int write,
762 void __user *buffer, size_t *lenp, loff_t *ppos)
763{
764 int err, old, new;
765 int *watchdog_param = (int *)table->data;
766
767 mutex_lock(&watchdog_proc_mutex);
768
769 /*
770 * If the parameter is being read return the state of the corresponding
771 * bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the
772 * run state of the lockup detectors.
773 */
774 if (!write) {
775 *watchdog_param = (watchdog_enabled & which) != 0;
776 err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
777 } else {
778 err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
779 if (err)
780 goto out;
781
782 /*
783 * There is a race window between fetching the current value
784 * from 'watchdog_enabled' and storing the new value. During
785 * this race window, watchdog_nmi_enable() can sneak in and
786 * clear the NMI_WATCHDOG_ENABLED bit in 'watchdog_enabled'.
787 * The 'cmpxchg' detects this race and the loop retries.
788 */
789 do {
790 old = watchdog_enabled;
791 /*
792 * If the parameter value is not zero set the
793 * corresponding bit(s), else clear it(them).
794 */
795 if (*watchdog_param)
796 new = old | which;
797 else
798 new = old & ~which;
799 } while (cmpxchg(&watchdog_enabled, old, new) != old);
800
801 /*
802 * Update the run state of the lockup detectors.
803 * Restore 'watchdog_enabled' on failure.
804 */
805 err = proc_watchdog_update();
806 if (err)
807 watchdog_enabled = old;
808 }
809out:
810 mutex_unlock(&watchdog_proc_mutex);
811 return err;
812}
813
814/*
815 * /proc/sys/kernel/watchdog
816 */
817int proc_watchdog(struct ctl_table *table, int write,
818 void __user *buffer, size_t *lenp, loff_t *ppos)
819{
820 return proc_watchdog_common(NMI_WATCHDOG_ENABLED|SOFT_WATCHDOG_ENABLED,
821 table, write, buffer, lenp, ppos);
822}
823
824/*
825 * /proc/sys/kernel/nmi_watchdog
686 */ 826 */
827int proc_nmi_watchdog(struct ctl_table *table, int write,
828 void __user *buffer, size_t *lenp, loff_t *ppos)
829{
830 return proc_watchdog_common(NMI_WATCHDOG_ENABLED,
831 table, write, buffer, lenp, ppos);
832}
833
834/*
835 * /proc/sys/kernel/soft_watchdog
836 */
837int proc_soft_watchdog(struct ctl_table *table, int write,
838 void __user *buffer, size_t *lenp, loff_t *ppos)
839{
840 return proc_watchdog_common(SOFT_WATCHDOG_ENABLED,
841 table, write, buffer, lenp, ppos);
842}
687 843
688int proc_dowatchdog(struct ctl_table *table, int write, 844/*
689 void __user *buffer, size_t *lenp, loff_t *ppos) 845 * /proc/sys/kernel/watchdog_thresh
846 */
847int proc_watchdog_thresh(struct ctl_table *table, int write,
848 void __user *buffer, size_t *lenp, loff_t *ppos)
690{ 849{
691 int err, old_thresh, old_enabled; 850 int err, old;
692 bool old_hardlockup;
693 static DEFINE_MUTEX(watchdog_proc_mutex);
694 851
695 mutex_lock(&watchdog_proc_mutex); 852 mutex_lock(&watchdog_proc_mutex);
696 old_thresh = ACCESS_ONCE(watchdog_thresh);
697 old_enabled = ACCESS_ONCE(watchdog_user_enabled);
698 old_hardlockup = watchdog_hardlockup_detector_is_enabled();
699 853
854 old = ACCESS_ONCE(watchdog_thresh);
700 err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 855 err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
856
701 if (err || !write) 857 if (err || !write)
702 goto out; 858 goto out;
703 859
704 set_sample_period();
705 /* 860 /*
706 * Watchdog threads shouldn't be enabled if they are 861 * Update the sample period.
707 * disabled. The 'watchdog_running' variable check in 862 * Restore 'watchdog_thresh' on failure.
708 * watchdog_*_all_cpus() function takes care of this.
709 */ 863 */
710 if (watchdog_user_enabled && watchdog_thresh) { 864 set_sample_period();
711 /* 865 err = proc_watchdog_update();
712 * Prevent a change in watchdog_thresh accidentally overriding 866 if (err)
713 * the enablement of the hardlockup detector. 867 watchdog_thresh = old;
714 */
715 if (watchdog_user_enabled != old_enabled)
716 watchdog_enable_hardlockup_detector(true);
717 err = watchdog_enable_all_cpus(old_thresh != watchdog_thresh);
718 } else
719 watchdog_disable_all_cpus();
720
721 /* Restore old values on failure */
722 if (err) {
723 watchdog_thresh = old_thresh;
724 watchdog_user_enabled = old_enabled;
725 watchdog_enable_hardlockup_detector(old_hardlockup);
726 }
727out: 868out:
728 mutex_unlock(&watchdog_proc_mutex); 869 mutex_unlock(&watchdog_proc_mutex);
729 return err; 870 return err;
@@ -734,6 +875,6 @@ void __init lockup_detector_init(void)
734{ 875{
735 set_sample_period(); 876 set_sample_period();
736 877
737 if (watchdog_user_enabled) 878 if (watchdog_enabled)
738 watchdog_enable_all_cpus(false); 879 watchdog_enable_all_cpus();
739} 880}
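
The lock-free update in proc_watchdog_common() boils down to a standard cmpxchg retry loop; a stand-alone sketch of the same pattern, with names local to this example:

#include <linux/atomic.h>
#include <linux/types.h>

static unsigned long example_flags;

/* Set or clear 'which' in example_flags without losing concurrent updates. */
static void example_update_flags(unsigned long which, bool set)
{
        unsigned long old, new;

        do {
                old = example_flags;
                new = set ? (old | which) : (old & ~which);
        } while (cmpxchg(&example_flags, old, new) != old);
}
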
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 93967e634a1e..17670573dda8 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1760,6 +1760,18 @@ config TEST_UDELAY
1760 1760
1761 If unsure, say N. 1761 If unsure, say N.
1762 1762
1763config MEMTEST
1764 bool "Memtest"
1765 depends on HAVE_MEMBLOCK
1766 ---help---
 1767	  This option adds the kernel boot parameter 'memtest', which selects
 1768	  how many memory test patterns are run at boot:
 1769	  memtest=0 means disabled (the default);
 1770	  memtest=1 means run 1 test pattern;
 1771	  ...
 1772	  memtest=17 means run 17 test patterns.
1773 If you are unsure how to answer this question, answer N.
1774
1763source "samples/Kconfig" 1775source "samples/Kconfig"
1764 1776
1765source "lib/Kconfig.kgdb" 1777source "lib/Kconfig.kgdb"
diff --git a/lib/ioremap.c b/lib/ioremap.c
index 0c9216c48762..86c8911b0e3a 100644
--- a/lib/ioremap.c
+++ b/lib/ioremap.c
@@ -13,6 +13,43 @@
13#include <asm/cacheflush.h> 13#include <asm/cacheflush.h>
14#include <asm/pgtable.h> 14#include <asm/pgtable.h>
15 15
16#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
17static int __read_mostly ioremap_pud_capable;
18static int __read_mostly ioremap_pmd_capable;
19static int __read_mostly ioremap_huge_disabled;
20
21static int __init set_nohugeiomap(char *str)
22{
23 ioremap_huge_disabled = 1;
24 return 0;
25}
26early_param("nohugeiomap", set_nohugeiomap);
27
28void __init ioremap_huge_init(void)
29{
30 if (!ioremap_huge_disabled) {
31 if (arch_ioremap_pud_supported())
32 ioremap_pud_capable = 1;
33 if (arch_ioremap_pmd_supported())
34 ioremap_pmd_capable = 1;
35 }
36}
37
38static inline int ioremap_pud_enabled(void)
39{
40 return ioremap_pud_capable;
41}
42
43static inline int ioremap_pmd_enabled(void)
44{
45 return ioremap_pmd_capable;
46}
47
48#else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
49static inline int ioremap_pud_enabled(void) { return 0; }
50static inline int ioremap_pmd_enabled(void) { return 0; }
51#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
52
16static int ioremap_pte_range(pmd_t *pmd, unsigned long addr, 53static int ioremap_pte_range(pmd_t *pmd, unsigned long addr,
17 unsigned long end, phys_addr_t phys_addr, pgprot_t prot) 54 unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
18{ 55{
@@ -43,6 +80,14 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
43 return -ENOMEM; 80 return -ENOMEM;
44 do { 81 do {
45 next = pmd_addr_end(addr, end); 82 next = pmd_addr_end(addr, end);
83
84 if (ioremap_pmd_enabled() &&
85 ((next - addr) == PMD_SIZE) &&
86 IS_ALIGNED(phys_addr + addr, PMD_SIZE)) {
87 if (pmd_set_huge(pmd, phys_addr + addr, prot))
88 continue;
89 }
90
46 if (ioremap_pte_range(pmd, addr, next, phys_addr + addr, prot)) 91 if (ioremap_pte_range(pmd, addr, next, phys_addr + addr, prot))
47 return -ENOMEM; 92 return -ENOMEM;
48 } while (pmd++, addr = next, addr != end); 93 } while (pmd++, addr = next, addr != end);
@@ -61,6 +106,14 @@ static inline int ioremap_pud_range(pgd_t *pgd, unsigned long addr,
61 return -ENOMEM; 106 return -ENOMEM;
62 do { 107 do {
63 next = pud_addr_end(addr, end); 108 next = pud_addr_end(addr, end);
109
110 if (ioremap_pud_enabled() &&
111 ((next - addr) == PUD_SIZE) &&
112 IS_ALIGNED(phys_addr + addr, PUD_SIZE)) {
113 if (pud_set_huge(pud, phys_addr + addr, prot))
114 continue;
115 }
116
64 if (ioremap_pmd_range(pud, addr, next, phys_addr + addr, prot)) 117 if (ioremap_pmd_range(pud, addr, next, phys_addr + addr, prot))
65 return -ENOMEM; 118 return -ENOMEM;
66 } while (pud++, addr = next, addr != end); 119 } while (pud++, addr = next, addr != end);
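
The condition guarding each huge-mapping attempt can be read as a single predicate; a sketch assuming it sits next to the helpers above in lib/ioremap.c (can_use_pmd_mapping() is not a function in the patch):

/* True when the current iteration spans exactly one PMD and the physical
 * address for it is PMD-aligned, so pmd_set_huge() may be attempted. */
static inline bool can_use_pmd_mapping(unsigned long addr, unsigned long next,
                                       phys_addr_t phys_addr)
{
        return ioremap_pmd_enabled() &&
               (next - addr) == PMD_SIZE &&
               IS_ALIGNED(phys_addr + addr, PMD_SIZE);
}
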
diff --git a/mm/Kconfig b/mm/Kconfig
index a03131b6ba8e..390214da4546 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -517,6 +517,12 @@ config CMA_DEBUG
517 processing calls such as dma_alloc_from_contiguous(). 517 processing calls such as dma_alloc_from_contiguous().
518 This option does not affect warning and error messages. 518 This option does not affect warning and error messages.
519 519
520config CMA_DEBUGFS
521 bool "CMA debugfs interface"
522 depends on CMA && DEBUG_FS
523 help
524 Turns on the DebugFS interface for CMA.
525
520config CMA_AREAS 526config CMA_AREAS
521 int "Maximum count of the CMA areas" 527 int "Maximum count of the CMA areas"
522 depends on CMA 528 depends on CMA
diff --git a/mm/Makefile b/mm/Makefile
index 15dbe9903c27..98c4eaeabdcb 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -55,6 +55,7 @@ obj-$(CONFIG_KMEMCHECK) += kmemcheck.o
55obj-$(CONFIG_KASAN) += kasan/ 55obj-$(CONFIG_KASAN) += kasan/
56obj-$(CONFIG_FAILSLAB) += failslab.o 56obj-$(CONFIG_FAILSLAB) += failslab.o
57obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o 57obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
58obj-$(CONFIG_MEMTEST) += memtest.o
58obj-$(CONFIG_MIGRATION) += migrate.o 59obj-$(CONFIG_MIGRATION) += migrate.o
59obj-$(CONFIG_QUICKLIST) += quicklist.o 60obj-$(CONFIG_QUICKLIST) += quicklist.o
60obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o 61obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
@@ -76,3 +77,4 @@ obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
76obj-$(CONFIG_CMA) += cma.o 77obj-$(CONFIG_CMA) += cma.o
77obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o 78obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
78obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o 79obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
80obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
diff --git a/mm/cleancache.c b/mm/cleancache.c
index 053bcd8f12fb..8fc50811119b 100644
--- a/mm/cleancache.c
+++ b/mm/cleancache.c
@@ -19,7 +19,7 @@
19#include <linux/cleancache.h> 19#include <linux/cleancache.h>
20 20
21/* 21/*
22 * cleancache_ops is set by cleancache_ops_register to contain the pointers 22 * cleancache_ops is set by cleancache_register_ops to contain the pointers
23 * to the cleancache "backend" implementation functions. 23 * to the cleancache "backend" implementation functions.
24 */ 24 */
25static struct cleancache_ops *cleancache_ops __read_mostly; 25static struct cleancache_ops *cleancache_ops __read_mostly;
@@ -34,145 +34,107 @@ static u64 cleancache_failed_gets;
34static u64 cleancache_puts; 34static u64 cleancache_puts;
35static u64 cleancache_invalidates; 35static u64 cleancache_invalidates;
36 36
37/* 37static void cleancache_register_ops_sb(struct super_block *sb, void *unused)
38 * When no backend is registered all calls to init_fs and init_shared_fs 38{
39 * are registered and fake poolids (FAKE_FS_POOLID_OFFSET or 39 switch (sb->cleancache_poolid) {
40 * FAKE_SHARED_FS_POOLID_OFFSET, plus offset in the respective array 40 case CLEANCACHE_NO_BACKEND:
41 * [shared_|]fs_poolid_map) are given to the respective super block 41 __cleancache_init_fs(sb);
42 * (sb->cleancache_poolid) and no tmem_pools are created. When a backend 42 break;
43 * registers with cleancache the previous calls to init_fs and init_shared_fs 43 case CLEANCACHE_NO_BACKEND_SHARED:
44 * are executed to create tmem_pools and set the respective poolids. While no 44 __cleancache_init_shared_fs(sb);
45 * backend is registered all "puts", "gets" and "flushes" are ignored or failed. 45 break;
46 */ 46 }
47#define MAX_INITIALIZABLE_FS 32 47}
48#define FAKE_FS_POOLID_OFFSET 1000
49#define FAKE_SHARED_FS_POOLID_OFFSET 2000
50
51#define FS_NO_BACKEND (-1)
52#define FS_UNKNOWN (-2)
53static int fs_poolid_map[MAX_INITIALIZABLE_FS];
54static int shared_fs_poolid_map[MAX_INITIALIZABLE_FS];
55static char *uuids[MAX_INITIALIZABLE_FS];
56/*
57 * Mutex for the [shared_|]fs_poolid_map to guard against multiple threads
58 * invoking umount (and ending in __cleancache_invalidate_fs) and also multiple
59 * threads calling mount (and ending up in __cleancache_init_[shared|]fs).
60 */
61static DEFINE_MUTEX(poolid_mutex);
62/*
63 * When set to false (default) all calls to the cleancache functions, except
64 * the __cleancache_invalidate_fs and __cleancache_init_[shared|]fs are guarded
65 * by the if (!cleancache_ops) return. This means multiple threads (from
66 * different filesystems) will be checking cleancache_ops. The usage of a
67 * bool instead of a atomic_t or a bool guarded by a spinlock is OK - we are
68 * OK if the time between the backend's have been initialized (and
69 * cleancache_ops has been set to not NULL) and when the filesystems start
70 * actually calling the backends. The inverse (when unloading) is obviously
71 * not good - but this shim does not do that (yet).
72 */
73
74/*
75 * The backends and filesystems work all asynchronously. This is b/c the
76 * backends can be built as modules.
77 * The usual sequence of events is:
78 * a) mount / -> __cleancache_init_fs is called. We set the
79 * [shared_|]fs_poolid_map and uuids for.
80 *
81 * b). user does I/Os -> we call the rest of __cleancache_* functions
82 * which return immediately as cleancache_ops is false.
83 *
84 * c). modprobe zcache -> cleancache_register_ops. We init the backend
85 * and set cleancache_ops to true, and for any fs_poolid_map
86 * (which is set by __cleancache_init_fs) we initialize the poolid.
87 *
88 * d). user does I/Os -> now that cleancache_ops is true all the
89 * __cleancache_* functions can call the backend. They all check
90 * that fs_poolid_map is valid and if so invoke the backend.
91 *
92 * e). umount / -> __cleancache_invalidate_fs, the fs_poolid_map is
93 * reset (which is the second check in the __cleancache_* ops
94 * to call the backend).
95 *
96 * The sequence of event could also be c), followed by a), and d). and e). The
97 * c) would not happen anymore. There is also the chance of c), and one thread
98 * doing a) + d), and another doing e). For that case we depend on the
99 * filesystem calling __cleancache_invalidate_fs in the proper sequence (so
100 * that it handles all I/Os before it invalidates the fs (which is last part
101 * of unmounting process).
102 *
103 * Note: The acute reader will notice that there is no "rmmod zcache" case.
104 * This is b/c the functionality for that is not yet implemented and when
105 * done, will require some extra locking not yet devised.
106 */
107 48
108/* 49/*
109 * Register operations for cleancache, returning previous thus allowing 50 * Register operations for cleancache. Returns 0 on success.
110 * detection of multiple backends and possible nesting.
111 */ 51 */
112struct cleancache_ops *cleancache_register_ops(struct cleancache_ops *ops) 52int cleancache_register_ops(struct cleancache_ops *ops)
113{ 53{
114 struct cleancache_ops *old = cleancache_ops; 54 if (cmpxchg(&cleancache_ops, NULL, ops))
115 int i; 55 return -EBUSY;
116 56
117 mutex_lock(&poolid_mutex);
118 for (i = 0; i < MAX_INITIALIZABLE_FS; i++) {
119 if (fs_poolid_map[i] == FS_NO_BACKEND)
120 fs_poolid_map[i] = ops->init_fs(PAGE_SIZE);
121 if (shared_fs_poolid_map[i] == FS_NO_BACKEND)
122 shared_fs_poolid_map[i] = ops->init_shared_fs
123 (uuids[i], PAGE_SIZE);
124 }
125 /* 57 /*
126 * We MUST set cleancache_ops _after_ we have called the backends 58 * A cleancache backend can be built as a module and hence loaded after
127 * init_fs or init_shared_fs functions. Otherwise the compiler might 59 * a cleancache enabled filesystem has called cleancache_init_fs. To
128 * re-order where cleancache_ops is set in this function. 60 * handle such a scenario, here we call ->init_fs or ->init_shared_fs
61 * for each active super block. To differentiate between local and
62 * shared filesystems, we temporarily initialize sb->cleancache_poolid
63 * to CLEANCACHE_NO_BACKEND or CLEANCACHE_NO_BACKEND_SHARED
64 * respectively in case there is no backend registered at the time
65 * cleancache_init_fs or cleancache_init_shared_fs is called.
66 *
67 * Since filesystems can be mounted concurrently with cleancache
68 * backend registration, we have to be careful to guarantee that all
 69 * cleancache enabled filesystems mounted by the time
 70 * cleancache_register_ops is called, as well as all mounted later,
 71 * get a cleancache_poolid. This is assured by the following statements
72 * tied together:
73 *
74 * a) iterate_supers skips only those super blocks that has started
75 * ->kill_sb
76 *
77 * b) if iterate_supers encounters a super block that has not finished
78 * ->mount yet, it waits until it is finished
79 *
80 * c) cleancache_init_fs is called from ->mount and
81 * cleancache_invalidate_fs is called from ->kill_sb
82 *
83 * d) we call iterate_supers after cleancache_ops has been set
84 *
85 * From a) it follows that if iterate_supers skips a super block, then
86 * either the super block is already dead, in which case we do not need
87 * to bother initializing cleancache for it, or it was mounted after we
88 * initiated iterate_supers. In the latter case, it must have seen
89 * cleancache_ops set according to d) and initialized cleancache from
90 * ->mount by itself according to c). This proves that we call
91 * ->init_fs at least once for each active super block.
92 *
93 * From b) and c) it follows that if iterate_supers encounters a super
94 * block that has already started ->init_fs, it will wait until ->mount
95 * and hence ->init_fs has finished, then check cleancache_poolid, see
96 * that it has already been set and therefore do nothing. This proves
97 * that we call ->init_fs no more than once for each super block.
98 *
99 * Combined together, the last two paragraphs prove the function
100 * correctness.
101 *
102 * Note that various cleancache callbacks may proceed before this
103 * function is called or even concurrently with it, but since
104 * CLEANCACHE_NO_BACKEND is negative, they will all result in a noop
105 * until the corresponding ->init_fs has been actually called and
106 * cleancache_ops has been set.
129 */ 107 */
130 barrier(); 108 iterate_supers(cleancache_register_ops_sb, NULL);
131 cleancache_ops = ops; 109 return 0;
132 mutex_unlock(&poolid_mutex);
133 return old;
134} 110}
135EXPORT_SYMBOL(cleancache_register_ops); 111EXPORT_SYMBOL(cleancache_register_ops);
136 112
137/* Called by a cleancache-enabled filesystem at time of mount */ 113/* Called by a cleancache-enabled filesystem at time of mount */
138void __cleancache_init_fs(struct super_block *sb) 114void __cleancache_init_fs(struct super_block *sb)
139{ 115{
140 int i; 116 int pool_id = CLEANCACHE_NO_BACKEND;
141 117
142 mutex_lock(&poolid_mutex); 118 if (cleancache_ops) {
143 for (i = 0; i < MAX_INITIALIZABLE_FS; i++) { 119 pool_id = cleancache_ops->init_fs(PAGE_SIZE);
144 if (fs_poolid_map[i] == FS_UNKNOWN) { 120 if (pool_id < 0)
145 sb->cleancache_poolid = i + FAKE_FS_POOLID_OFFSET; 121 pool_id = CLEANCACHE_NO_POOL;
146 if (cleancache_ops)
147 fs_poolid_map[i] = cleancache_ops->init_fs(PAGE_SIZE);
148 else
149 fs_poolid_map[i] = FS_NO_BACKEND;
150 break;
151 }
152 } 122 }
153 mutex_unlock(&poolid_mutex); 123 sb->cleancache_poolid = pool_id;
154} 124}
155EXPORT_SYMBOL(__cleancache_init_fs); 125EXPORT_SYMBOL(__cleancache_init_fs);
156 126
157/* Called by a cleancache-enabled clustered filesystem at time of mount */ 127/* Called by a cleancache-enabled clustered filesystem at time of mount */
158void __cleancache_init_shared_fs(char *uuid, struct super_block *sb) 128void __cleancache_init_shared_fs(struct super_block *sb)
159{ 129{
160 int i; 130 int pool_id = CLEANCACHE_NO_BACKEND_SHARED;
161 131
162 mutex_lock(&poolid_mutex); 132 if (cleancache_ops) {
163 for (i = 0; i < MAX_INITIALIZABLE_FS; i++) { 133 pool_id = cleancache_ops->init_shared_fs(sb->s_uuid, PAGE_SIZE);
164 if (shared_fs_poolid_map[i] == FS_UNKNOWN) { 134 if (pool_id < 0)
165 sb->cleancache_poolid = i + FAKE_SHARED_FS_POOLID_OFFSET; 135 pool_id = CLEANCACHE_NO_POOL;
166 uuids[i] = uuid;
167 if (cleancache_ops)
168 shared_fs_poolid_map[i] = cleancache_ops->init_shared_fs
169 (uuid, PAGE_SIZE);
170 else
171 shared_fs_poolid_map[i] = FS_NO_BACKEND;
172 break;
173 }
174 } 136 }
175 mutex_unlock(&poolid_mutex); 137 sb->cleancache_poolid = pool_id;
176} 138}
177EXPORT_SYMBOL(__cleancache_init_shared_fs); 139EXPORT_SYMBOL(__cleancache_init_shared_fs);
178 140
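
From a backend's point of view the new API is a single registration call that can now fail; a hedged module sketch (my_backend_ops is assumed to be filled in with the backend's callbacks elsewhere):

#include <linux/cleancache.h>
#include <linux/module.h>
#include <linux/printk.h>

static struct cleancache_ops my_backend_ops;    /* callbacks filled in elsewhere */

static int __init my_backend_init(void)
{
        int err = cleancache_register_ops(&my_backend_ops);

        if (err)        /* -EBUSY: another backend already won the cmpxchg */
                pr_warn("cleancache: backend already registered\n");
        return err;
}
module_init(my_backend_init);
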
@@ -202,19 +164,6 @@ static int cleancache_get_key(struct inode *inode,
202} 164}
203 165
204/* 166/*
205 * Returns a pool_id that is associated with a given fake poolid.
206 */
207static int get_poolid_from_fake(int fake_pool_id)
208{
209 if (fake_pool_id >= FAKE_SHARED_FS_POOLID_OFFSET)
210 return shared_fs_poolid_map[fake_pool_id -
211 FAKE_SHARED_FS_POOLID_OFFSET];
212 else if (fake_pool_id >= FAKE_FS_POOLID_OFFSET)
213 return fs_poolid_map[fake_pool_id - FAKE_FS_POOLID_OFFSET];
214 return FS_NO_BACKEND;
215}
216
217/*
218 * "Get" data from cleancache associated with the poolid/inode/index 167 * "Get" data from cleancache associated with the poolid/inode/index
219 * that were specified when the data was put to cleanache and, if 168 * that were specified when the data was put to cleanache and, if
220 * successful, use it to fill the specified page with data and return 0. 169 * successful, use it to fill the specified page with data and return 0.
@@ -229,7 +178,6 @@ int __cleancache_get_page(struct page *page)
229{ 178{
230 int ret = -1; 179 int ret = -1;
231 int pool_id; 180 int pool_id;
232 int fake_pool_id;
233 struct cleancache_filekey key = { .u.key = { 0 } }; 181 struct cleancache_filekey key = { .u.key = { 0 } };
234 182
235 if (!cleancache_ops) { 183 if (!cleancache_ops) {
@@ -238,17 +186,14 @@ int __cleancache_get_page(struct page *page)
238 } 186 }
239 187
240 VM_BUG_ON_PAGE(!PageLocked(page), page); 188 VM_BUG_ON_PAGE(!PageLocked(page), page);
241 fake_pool_id = page->mapping->host->i_sb->cleancache_poolid; 189 pool_id = page->mapping->host->i_sb->cleancache_poolid;
242 if (fake_pool_id < 0) 190 if (pool_id < 0)
243 goto out; 191 goto out;
244 pool_id = get_poolid_from_fake(fake_pool_id);
245 192
246 if (cleancache_get_key(page->mapping->host, &key) < 0) 193 if (cleancache_get_key(page->mapping->host, &key) < 0)
247 goto out; 194 goto out;
248 195
249 if (pool_id >= 0) 196 ret = cleancache_ops->get_page(pool_id, key, page->index, page);
250 ret = cleancache_ops->get_page(pool_id,
251 key, page->index, page);
252 if (ret == 0) 197 if (ret == 0)
253 cleancache_succ_gets++; 198 cleancache_succ_gets++;
254 else 199 else
@@ -271,7 +216,6 @@ EXPORT_SYMBOL(__cleancache_get_page);
271void __cleancache_put_page(struct page *page) 216void __cleancache_put_page(struct page *page)
272{ 217{
273 int pool_id; 218 int pool_id;
274 int fake_pool_id;
275 struct cleancache_filekey key = { .u.key = { 0 } }; 219 struct cleancache_filekey key = { .u.key = { 0 } };
276 220
277 if (!cleancache_ops) { 221 if (!cleancache_ops) {
@@ -280,12 +224,7 @@ void __cleancache_put_page(struct page *page)
280 } 224 }
281 225
282 VM_BUG_ON_PAGE(!PageLocked(page), page); 226 VM_BUG_ON_PAGE(!PageLocked(page), page);
283 fake_pool_id = page->mapping->host->i_sb->cleancache_poolid; 227 pool_id = page->mapping->host->i_sb->cleancache_poolid;
284 if (fake_pool_id < 0)
285 return;
286
287 pool_id = get_poolid_from_fake(fake_pool_id);
288
289 if (pool_id >= 0 && 228 if (pool_id >= 0 &&
290 cleancache_get_key(page->mapping->host, &key) >= 0) { 229 cleancache_get_key(page->mapping->host, &key) >= 0) {
291 cleancache_ops->put_page(pool_id, key, page->index, page); 230 cleancache_ops->put_page(pool_id, key, page->index, page);
@@ -306,18 +245,13 @@ void __cleancache_invalidate_page(struct address_space *mapping,
306 struct page *page) 245 struct page *page)
307{ 246{
308 /* careful... page->mapping is NULL sometimes when this is called */ 247 /* careful... page->mapping is NULL sometimes when this is called */
309 int pool_id; 248 int pool_id = mapping->host->i_sb->cleancache_poolid;
310 int fake_pool_id = mapping->host->i_sb->cleancache_poolid;
311 struct cleancache_filekey key = { .u.key = { 0 } }; 249 struct cleancache_filekey key = { .u.key = { 0 } };
312 250
313 if (!cleancache_ops) 251 if (!cleancache_ops)
314 return; 252 return;
315 253
316 if (fake_pool_id >= 0) { 254 if (pool_id >= 0) {
317 pool_id = get_poolid_from_fake(fake_pool_id);
318 if (pool_id < 0)
319 return;
320
321 VM_BUG_ON_PAGE(!PageLocked(page), page); 255 VM_BUG_ON_PAGE(!PageLocked(page), page);
322 if (cleancache_get_key(mapping->host, &key) >= 0) { 256 if (cleancache_get_key(mapping->host, &key) >= 0) {
323 cleancache_ops->invalidate_page(pool_id, 257 cleancache_ops->invalidate_page(pool_id,
@@ -339,18 +273,12 @@ EXPORT_SYMBOL(__cleancache_invalidate_page);
339 */ 273 */
340void __cleancache_invalidate_inode(struct address_space *mapping) 274void __cleancache_invalidate_inode(struct address_space *mapping)
341{ 275{
342 int pool_id; 276 int pool_id = mapping->host->i_sb->cleancache_poolid;
343 int fake_pool_id = mapping->host->i_sb->cleancache_poolid;
344 struct cleancache_filekey key = { .u.key = { 0 } }; 277 struct cleancache_filekey key = { .u.key = { 0 } };
345 278
346 if (!cleancache_ops) 279 if (!cleancache_ops)
347 return; 280 return;
348 281
349 if (fake_pool_id < 0)
350 return;
351
352 pool_id = get_poolid_from_fake(fake_pool_id);
353
354 if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0) 282 if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0)
355 cleancache_ops->invalidate_inode(pool_id, key); 283 cleancache_ops->invalidate_inode(pool_id, key);
356} 284}
@@ -363,32 +291,18 @@ EXPORT_SYMBOL(__cleancache_invalidate_inode);
363 */ 291 */
364void __cleancache_invalidate_fs(struct super_block *sb) 292void __cleancache_invalidate_fs(struct super_block *sb)
365{ 293{
366 int index; 294 int pool_id;
367 int fake_pool_id = sb->cleancache_poolid;
368 int old_poolid = fake_pool_id;
369 295
370 mutex_lock(&poolid_mutex); 296 pool_id = sb->cleancache_poolid;
371 if (fake_pool_id >= FAKE_SHARED_FS_POOLID_OFFSET) { 297 sb->cleancache_poolid = CLEANCACHE_NO_POOL;
372 index = fake_pool_id - FAKE_SHARED_FS_POOLID_OFFSET; 298
373 old_poolid = shared_fs_poolid_map[index]; 299 if (cleancache_ops && pool_id >= 0)
374 shared_fs_poolid_map[index] = FS_UNKNOWN; 300 cleancache_ops->invalidate_fs(pool_id);
375 uuids[index] = NULL;
376 } else if (fake_pool_id >= FAKE_FS_POOLID_OFFSET) {
377 index = fake_pool_id - FAKE_FS_POOLID_OFFSET;
378 old_poolid = fs_poolid_map[index];
379 fs_poolid_map[index] = FS_UNKNOWN;
380 }
381 sb->cleancache_poolid = -1;
382 if (cleancache_ops)
383 cleancache_ops->invalidate_fs(old_poolid);
384 mutex_unlock(&poolid_mutex);
385} 301}
386EXPORT_SYMBOL(__cleancache_invalidate_fs); 302EXPORT_SYMBOL(__cleancache_invalidate_fs);
387 303
388static int __init init_cleancache(void) 304static int __init init_cleancache(void)
389{ 305{
390 int i;
391
392#ifdef CONFIG_DEBUG_FS 306#ifdef CONFIG_DEBUG_FS
393 struct dentry *root = debugfs_create_dir("cleancache", NULL); 307 struct dentry *root = debugfs_create_dir("cleancache", NULL);
394 if (root == NULL) 308 if (root == NULL)
@@ -400,10 +314,6 @@ static int __init init_cleancache(void)
400 debugfs_create_u64("invalidates", S_IRUGO, 314 debugfs_create_u64("invalidates", S_IRUGO,
401 root, &cleancache_invalidates); 315 root, &cleancache_invalidates);
402#endif 316#endif
403 for (i = 0; i < MAX_INITIALIZABLE_FS; i++) {
404 fs_poolid_map[i] = FS_UNKNOWN;
405 shared_fs_poolid_map[i] = FS_UNKNOWN;
406 }
407 return 0; 317 return 0;
408} 318}
409module_init(init_cleancache) 319module_init(init_cleancache)
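The cleancache hunks above drop the fake-pool-id indirection: the superblock now carries the real backend pool id (or the negative CLEANCACHE_NO_POOL), so every hook reduces to one read of sb->cleancache_poolid, one sign check and one backend call. A minimal userspace sketch of that flow, with all names here (struct sb, backend_get_page, NO_POOL's value) purely illustrative rather than kernel API:

/* Minimal userspace analog of the simplified pool-id flow shown above.
 * Nothing here is kernel API; it only mirrors the read / check / call shape. */
#include <stdio.h>

#define NO_POOL (-1)

struct sb { int poolid; };      /* stands in for super_block->cleancache_poolid */

static int backend_get_page(int pool_id, long index)
{
        printf("get_page(pool=%d, index=%ld)\n", pool_id, index);
        return 0;               /* pretend the backend found the page */
}

static int cache_get(struct sb *sb, long index)
{
        int pool_id = sb->poolid;                       /* one direct read ... */

        if (pool_id < 0)                                /* ... one bail-out ... */
                return -1;
        return backend_get_page(pool_id, index);        /* ... one backend call */
}

int main(void)
{
        struct sb mounted = { .poolid = 3 }, unmounted = { .poolid = NO_POOL };

        cache_get(&mounted, 42);                        /* reaches the backend */
        return cache_get(&unmounted, 42) == -1 ? 0 : 1; /* short-circuits */
}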
diff --git a/mm/cma.c b/mm/cma.c
index 68ecb7a42983..47203faaf65e 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -35,29 +35,24 @@
35#include <linux/highmem.h> 35#include <linux/highmem.h>
36#include <linux/io.h> 36#include <linux/io.h>
37 37
38struct cma { 38#include "cma.h"
39 unsigned long base_pfn; 39
40 unsigned long count; 40struct cma cma_areas[MAX_CMA_AREAS];
41 unsigned long *bitmap; 41unsigned cma_area_count;
42 unsigned int order_per_bit; /* Order of pages represented by one bit */
43 struct mutex lock;
44};
45
46static struct cma cma_areas[MAX_CMA_AREAS];
47static unsigned cma_area_count;
48static DEFINE_MUTEX(cma_mutex); 42static DEFINE_MUTEX(cma_mutex);
49 43
50phys_addr_t cma_get_base(struct cma *cma) 44phys_addr_t cma_get_base(const struct cma *cma)
51{ 45{
52 return PFN_PHYS(cma->base_pfn); 46 return PFN_PHYS(cma->base_pfn);
53} 47}
54 48
55unsigned long cma_get_size(struct cma *cma) 49unsigned long cma_get_size(const struct cma *cma)
56{ 50{
57 return cma->count << PAGE_SHIFT; 51 return cma->count << PAGE_SHIFT;
58} 52}
59 53
60static unsigned long cma_bitmap_aligned_mask(struct cma *cma, int align_order) 54static unsigned long cma_bitmap_aligned_mask(const struct cma *cma,
55 int align_order)
61{ 56{
62 if (align_order <= cma->order_per_bit) 57 if (align_order <= cma->order_per_bit)
63 return 0; 58 return 0;
@@ -68,7 +63,8 @@ static unsigned long cma_bitmap_aligned_mask(struct cma *cma, int align_order)
68 * Find a PFN aligned to the specified order and return an offset represented in 63 * Find a PFN aligned to the specified order and return an offset represented in
69 * order_per_bits. 64 * order_per_bits.
70 */ 65 */
71static unsigned long cma_bitmap_aligned_offset(struct cma *cma, int align_order) 66static unsigned long cma_bitmap_aligned_offset(const struct cma *cma,
67 int align_order)
72{ 68{
73 if (align_order <= cma->order_per_bit) 69 if (align_order <= cma->order_per_bit)
74 return 0; 70 return 0;
@@ -77,18 +73,14 @@ static unsigned long cma_bitmap_aligned_offset(struct cma *cma, int align_order)
77 - cma->base_pfn) >> cma->order_per_bit; 73 - cma->base_pfn) >> cma->order_per_bit;
78} 74}
79 75
80static unsigned long cma_bitmap_maxno(struct cma *cma) 76static unsigned long cma_bitmap_pages_to_bits(const struct cma *cma,
81{ 77 unsigned long pages)
82 return cma->count >> cma->order_per_bit;
83}
84
85static unsigned long cma_bitmap_pages_to_bits(struct cma *cma,
86 unsigned long pages)
87{ 78{
88 return ALIGN(pages, 1UL << cma->order_per_bit) >> cma->order_per_bit; 79 return ALIGN(pages, 1UL << cma->order_per_bit) >> cma->order_per_bit;
89} 80}
90 81
91static void cma_clear_bitmap(struct cma *cma, unsigned long pfn, int count) 82static void cma_clear_bitmap(struct cma *cma, unsigned long pfn,
83 unsigned int count)
92{ 84{
93 unsigned long bitmap_no, bitmap_count; 85 unsigned long bitmap_no, bitmap_count;
94 86
@@ -134,6 +126,12 @@ static int __init cma_activate_area(struct cma *cma)
134 } while (--i); 126 } while (--i);
135 127
136 mutex_init(&cma->lock); 128 mutex_init(&cma->lock);
129
130#ifdef CONFIG_CMA_DEBUGFS
131 INIT_HLIST_HEAD(&cma->mem_head);
132 spin_lock_init(&cma->mem_head_lock);
133#endif
134
137 return 0; 135 return 0;
138 136
139err: 137err:
@@ -167,7 +165,8 @@ core_initcall(cma_init_reserved_areas);
167 * This function creates custom contiguous area from already reserved memory. 165 * This function creates custom contiguous area from already reserved memory.
168 */ 166 */
169int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, 167int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
170 int order_per_bit, struct cma **res_cma) 168 unsigned int order_per_bit,
169 struct cma **res_cma)
171{ 170{
172 struct cma *cma; 171 struct cma *cma;
173 phys_addr_t alignment; 172 phys_addr_t alignment;
@@ -358,7 +357,7 @@ err:
358 * This function allocates part of contiguous memory on specific 357 * This function allocates part of contiguous memory on specific
359 * contiguous memory area. 358 * contiguous memory area.
360 */ 359 */
361struct page *cma_alloc(struct cma *cma, int count, unsigned int align) 360struct page *cma_alloc(struct cma *cma, unsigned int count, unsigned int align)
362{ 361{
363 unsigned long mask, offset, pfn, start = 0; 362 unsigned long mask, offset, pfn, start = 0;
364 unsigned long bitmap_maxno, bitmap_no, bitmap_count; 363 unsigned long bitmap_maxno, bitmap_no, bitmap_count;
@@ -429,7 +428,7 @@ struct page *cma_alloc(struct cma *cma, int count, unsigned int align)
429 * It returns false when provided pages do not belong to contiguous area and 428 * It returns false when provided pages do not belong to contiguous area and
430 * true otherwise. 429 * true otherwise.
431 */ 430 */
432bool cma_release(struct cma *cma, struct page *pages, int count) 431bool cma_release(struct cma *cma, const struct page *pages, unsigned int count)
433{ 432{
434 unsigned long pfn; 433 unsigned long pfn;
435 434
diff --git a/mm/cma.h b/mm/cma.h
new file mode 100644
index 000000000000..1132d733556d
--- /dev/null
+++ b/mm/cma.h
@@ -0,0 +1,24 @@
1#ifndef __MM_CMA_H__
2#define __MM_CMA_H__
3
4struct cma {
5 unsigned long base_pfn;
6 unsigned long count;
7 unsigned long *bitmap;
8 unsigned int order_per_bit; /* Order of pages represented by one bit */
9 struct mutex lock;
10#ifdef CONFIG_CMA_DEBUGFS
11 struct hlist_head mem_head;
12 spinlock_t mem_head_lock;
13#endif
14};
15
16extern struct cma cma_areas[MAX_CMA_AREAS];
17extern unsigned cma_area_count;
18
19static unsigned long cma_bitmap_maxno(struct cma *cma)
20{
21 return cma->count >> cma->order_per_bit;
22}
23
24#endif
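The new mm/cma.h pins down the bookkeeping: one bitmap bit stands for 2^order_per_bit pages, so cma_bitmap_maxno() is simply count >> order_per_bit, and page counts are rounded up to that granularity before the bitmap is touched. A stand-alone userspace check of that arithmetic; ALIGN and DIV_ROUND_UP are re-derived locally here rather than taken from the kernel headers:

/* Sanity check of the cma.h/cma.c bitmap arithmetic in plain userspace C. */
#include <assert.h>
#include <stdio.h>

#define ALIGN(x, a)        (((x) + (a) - 1) & ~((a) - 1))
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

static unsigned long bitmap_maxno(unsigned long count, unsigned order_per_bit)
{
        return count >> order_per_bit;          /* one bit per 2^order_per_bit pages */
}

static unsigned long pages_to_bits(unsigned long pages, unsigned order_per_bit)
{
        return ALIGN(pages, 1UL << order_per_bit) >> order_per_bit;
}

int main(void)
{
        /* A 64 MiB area with 4 KiB pages and order_per_bit = 2 (16 KiB per bit). */
        unsigned long count = (64UL << 20) / 4096;
        unsigned order_per_bit = 2;
        unsigned long bits = bitmap_maxno(count, order_per_bit);

        printf("%lu pages -> %lu bitmap bits -> %lu u32 words\n",
               count, bits, (unsigned long)DIV_ROUND_UP(bits, 32));
        assert(pages_to_bits(5, order_per_bit) == 2);   /* 5 pages round up to 8 */
        return 0;
}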
diff --git a/mm/cma_debug.c b/mm/cma_debug.c
new file mode 100644
index 000000000000..0b377536ccde
--- /dev/null
+++ b/mm/cma_debug.c
@@ -0,0 +1,170 @@
1/*
2 * CMA DebugFS Interface
3 *
4 * Copyright (c) 2015 Sasha Levin <sasha.levin@oracle.com>
5 */
6
7
8#include <linux/debugfs.h>
9#include <linux/cma.h>
10#include <linux/list.h>
11#include <linux/kernel.h>
12#include <linux/slab.h>
13#include <linux/mm_types.h>
14
15#include "cma.h"
16
17struct cma_mem {
18 struct hlist_node node;
19 struct page *p;
20 unsigned long n;
21};
22
23static struct dentry *cma_debugfs_root;
24
25static int cma_debugfs_get(void *data, u64 *val)
26{
27 unsigned long *p = data;
28
29 *val = *p;
30
31 return 0;
32}
33
34DEFINE_SIMPLE_ATTRIBUTE(cma_debugfs_fops, cma_debugfs_get, NULL, "%llu\n");
35
36static void cma_add_to_cma_mem_list(struct cma *cma, struct cma_mem *mem)
37{
38 spin_lock(&cma->mem_head_lock);
39 hlist_add_head(&mem->node, &cma->mem_head);
40 spin_unlock(&cma->mem_head_lock);
41}
42
43static struct cma_mem *cma_get_entry_from_list(struct cma *cma)
44{
45 struct cma_mem *mem = NULL;
46
47 spin_lock(&cma->mem_head_lock);
48 if (!hlist_empty(&cma->mem_head)) {
49 mem = hlist_entry(cma->mem_head.first, struct cma_mem, node);
50 hlist_del_init(&mem->node);
51 }
52 spin_unlock(&cma->mem_head_lock);
53
54 return mem;
55}
56
57static int cma_free_mem(struct cma *cma, int count)
58{
59 struct cma_mem *mem = NULL;
60
61 while (count) {
62 mem = cma_get_entry_from_list(cma);
63 if (mem == NULL)
64 return 0;
65
66 if (mem->n <= count) {
67 cma_release(cma, mem->p, mem->n);
68 count -= mem->n;
69 kfree(mem);
70 } else if (cma->order_per_bit == 0) {
71 cma_release(cma, mem->p, count);
72 mem->p += count;
73 mem->n -= count;
74 count = 0;
75 cma_add_to_cma_mem_list(cma, mem);
76 } else {
77 pr_debug("cma: cannot release partial block when order_per_bit != 0\n");
78 cma_add_to_cma_mem_list(cma, mem);
79 break;
80 }
81 }
82
83 return 0;
84
85}
86
87static int cma_free_write(void *data, u64 val)
88{
89 int pages = val;
90 struct cma *cma = data;
91
92 return cma_free_mem(cma, pages);
93}
94
95DEFINE_SIMPLE_ATTRIBUTE(cma_free_fops, NULL, cma_free_write, "%llu\n");
96
97static int cma_alloc_mem(struct cma *cma, int count)
98{
99 struct cma_mem *mem;
100 struct page *p;
101
102 mem = kzalloc(sizeof(*mem), GFP_KERNEL);
103 if (!mem)
104 return -ENOMEM;
105
106 p = cma_alloc(cma, count, 0);
107 if (!p) {
108 kfree(mem);
109 return -ENOMEM;
110 }
111
112 mem->p = p;
113 mem->n = count;
114
115 cma_add_to_cma_mem_list(cma, mem);
116
117 return 0;
118}
119
120static int cma_alloc_write(void *data, u64 val)
121{
122 int pages = val;
123 struct cma *cma = data;
124
125 return cma_alloc_mem(cma, pages);
126}
127
128DEFINE_SIMPLE_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n");
129
130static void cma_debugfs_add_one(struct cma *cma, int idx)
131{
132 struct dentry *tmp;
133 char name[16];
134 int u32s;
135
136 sprintf(name, "cma-%d", idx);
137
138 tmp = debugfs_create_dir(name, cma_debugfs_root);
139
140 debugfs_create_file("alloc", S_IWUSR, cma_debugfs_root, cma,
141 &cma_alloc_fops);
142
143 debugfs_create_file("free", S_IWUSR, cma_debugfs_root, cma,
144 &cma_free_fops);
145
146 debugfs_create_file("base_pfn", S_IRUGO, tmp,
147 &cma->base_pfn, &cma_debugfs_fops);
148 debugfs_create_file("count", S_IRUGO, tmp,
149 &cma->count, &cma_debugfs_fops);
150 debugfs_create_file("order_per_bit", S_IRUGO, tmp,
151 &cma->order_per_bit, &cma_debugfs_fops);
152
153 u32s = DIV_ROUND_UP(cma_bitmap_maxno(cma), BITS_PER_BYTE * sizeof(u32));
154 debugfs_create_u32_array("bitmap", S_IRUGO, tmp, (u32*)cma->bitmap, u32s);
155}
156
157static int __init cma_debugfs_init(void)
158{
159 int i;
160
161 cma_debugfs_root = debugfs_create_dir("cma", NULL);
162 if (!cma_debugfs_root)
163 return -ENOMEM;
164
165 for (i = 0; i < cma_area_count; i++)
166 cma_debugfs_add_one(&cma_areas[i], i);
167
168 return 0;
169}
170late_initcall(cma_debugfs_init);
diff --git a/mm/compaction.c b/mm/compaction.c
index 8c0d9459b54a..a18201a8124e 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1174,13 +1174,24 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc,
1174 /* Direct compactor: Is a suitable page free? */ 1174 /* Direct compactor: Is a suitable page free? */
1175 for (order = cc->order; order < MAX_ORDER; order++) { 1175 for (order = cc->order; order < MAX_ORDER; order++) {
1176 struct free_area *area = &zone->free_area[order]; 1176 struct free_area *area = &zone->free_area[order];
1177 bool can_steal;
1177 1178
1178 /* Job done if page is free of the right migratetype */ 1179 /* Job done if page is free of the right migratetype */
1179 if (!list_empty(&area->free_list[migratetype])) 1180 if (!list_empty(&area->free_list[migratetype]))
1180 return COMPACT_PARTIAL; 1181 return COMPACT_PARTIAL;
1181 1182
1182 /* Job done if allocation would set block type */ 1183#ifdef CONFIG_CMA
1183 if (order >= pageblock_order && area->nr_free) 1184 /* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */
1185 if (migratetype == MIGRATE_MOVABLE &&
1186 !list_empty(&area->free_list[MIGRATE_CMA]))
1187 return COMPACT_PARTIAL;
1188#endif
1189 /*
1190 * Job done if allocation would steal freepages from
1191 * other migratetype buddy lists.
1192 */
1193 if (find_suitable_fallback(area, order, migratetype,
1194 true, &can_steal) != -1)
1184 return COMPACT_PARTIAL; 1195 return COMPACT_PARTIAL;
1185 } 1196 }
1186 1197
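The new __compact_finished() check also treats a MOVABLE request as satisfied when free pages sit on the CMA free list, and otherwise defers to find_suitable_fallback() to decide whether a whole pageblock may be stolen from another migratetype. Reduced to booleans, the per-order decision looks roughly like the sketch below; have_free() and can_fallback() are local stand-ins for the free-list and fallback checks, not kernel functions:

#include <stdbool.h>
#include <stdio.h>

enum { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_CMA, NR_TYPES };

/* Toy free-list state: does this order hold a free page of the given type? */
static bool have_free(const bool freelist[NR_TYPES], int migratetype)
{
        return freelist[migratetype];
}

/* Stand-in for find_suitable_fallback(area, order, mt, true, &can_steal) != -1 */
static bool can_fallback(bool stealable)
{
        return stealable;
}

static bool order_satisfied(const bool freelist[NR_TYPES], int migratetype,
                            bool cma_enabled, bool stealable)
{
        if (have_free(freelist, migratetype))           /* right migratetype is free */
                return true;
        if (cma_enabled && migratetype == MIGRATE_MOVABLE &&
            have_free(freelist, MIGRATE_CMA))           /* MOVABLE may use CMA */
                return true;
        return can_fallback(stealable);                 /* else try to steal a block */
}

int main(void)
{
        bool freelist[NR_TYPES] = { false, false, true };   /* only CMA has pages */

        printf("movable:   %d\n", order_satisfied(freelist, MIGRATE_MOVABLE, true, false));
        printf("unmovable: %d\n", order_satisfied(freelist, MIGRATE_UNMOVABLE, true, false));
        return 0;
}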
diff --git a/mm/filemap.c b/mm/filemap.c
index 876f4e6f3ed6..12548d03c11d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -202,16 +202,15 @@ void __delete_from_page_cache(struct page *page, void *shadow)
202 BUG_ON(page_mapped(page)); 202 BUG_ON(page_mapped(page));
203 203
204 /* 204 /*
205 * Some filesystems seem to re-dirty the page even after 205 * At this point page must be either written or cleaned by truncate.
206 * the VM has canceled the dirty bit (eg ext3 journaling). 206 * Dirty page here signals a bug and loss of unwritten data.
207 * 207 *
208 * Fix it up by doing a final dirty accounting check after 208 * This fixes dirty accounting after removing the page entirely but
209 * having removed the page entirely. 209 * leaves PageDirty set: it has no effect for truncated page and
210 * anyway will be cleared before returning page into buddy allocator.
210 */ 211 */
211 if (PageDirty(page) && mapping_cap_account_dirty(mapping)) { 212 if (WARN_ON_ONCE(PageDirty(page)))
212 dec_zone_page_state(page, NR_FILE_DIRTY); 213 account_page_cleaned(page, mapping);
213 dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE);
214 }
215} 214}
216 215
217/** 216/**
diff --git a/mm/gup.c b/mm/gup.c
index a6e24e246f86..ca7b607ab671 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -92,7 +92,7 @@ retry:
92 */ 92 */
93 mark_page_accessed(page); 93 mark_page_accessed(page);
94 } 94 }
95 if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { 95 if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) {
96 /* 96 /*
97 * The preliminary mapping check is mainly to avoid the 97 * The preliminary mapping check is mainly to avoid the
98 * pointless overhead of lock_page on the ZERO_PAGE 98 * pointless overhead of lock_page on the ZERO_PAGE
@@ -265,8 +265,8 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
265 unsigned int fault_flags = 0; 265 unsigned int fault_flags = 0;
266 int ret; 266 int ret;
267 267
268 /* For mlock, just skip the stack guard page. */ 268 /* For mm_populate(), just skip the stack guard page. */
269 if ((*flags & FOLL_MLOCK) && 269 if ((*flags & FOLL_POPULATE) &&
270 (stack_guard_page_start(vma, address) || 270 (stack_guard_page_start(vma, address) ||
271 stack_guard_page_end(vma, address + PAGE_SIZE))) 271 stack_guard_page_end(vma, address + PAGE_SIZE)))
272 return -ENOENT; 272 return -ENOENT;
@@ -819,6 +819,124 @@ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
819EXPORT_SYMBOL(get_user_pages); 819EXPORT_SYMBOL(get_user_pages);
820 820
821/** 821/**
822 * populate_vma_page_range() - populate a range of pages in the vma.
823 * @vma: target vma
824 * @start: start address
825 * @end: end address
826 * @nonblocking:
827 *
828 * This takes care of mlocking the pages too if VM_LOCKED is set.
829 *
830 * return 0 on success, negative error code on error.
831 *
832 * vma->vm_mm->mmap_sem must be held.
833 *
834 * If @nonblocking is NULL, it may be held for read or write and will
835 * be unperturbed.
836 *
 837 * If @nonblocking is non-NULL, it must be held for read only and may be
838 * released. If it's released, *@nonblocking will be set to 0.
839 */
840long populate_vma_page_range(struct vm_area_struct *vma,
841 unsigned long start, unsigned long end, int *nonblocking)
842{
843 struct mm_struct *mm = vma->vm_mm;
844 unsigned long nr_pages = (end - start) / PAGE_SIZE;
845 int gup_flags;
846
847 VM_BUG_ON(start & ~PAGE_MASK);
848 VM_BUG_ON(end & ~PAGE_MASK);
849 VM_BUG_ON_VMA(start < vma->vm_start, vma);
850 VM_BUG_ON_VMA(end > vma->vm_end, vma);
851 VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
852
853 gup_flags = FOLL_TOUCH | FOLL_POPULATE;
854 /*
855 * We want to touch writable mappings with a write fault in order
856 * to break COW, except for shared mappings because these don't COW
857 * and we would not want to dirty them for nothing.
858 */
859 if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
860 gup_flags |= FOLL_WRITE;
861
862 /*
863 * We want mlock to succeed for regions that have any permissions
864 * other than PROT_NONE.
865 */
866 if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
867 gup_flags |= FOLL_FORCE;
868
869 /*
870 * We made sure addr is within a VMA, so the following will
871 * not result in a stack expansion that recurses back here.
872 */
873 return __get_user_pages(current, mm, start, nr_pages, gup_flags,
874 NULL, NULL, nonblocking);
875}
876
877/*
878 * __mm_populate - populate and/or mlock pages within a range of address space.
879 *
880 * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
881 * flags. VMAs must be already marked with the desired vm_flags, and
882 * mmap_sem must not be held.
883 */
884int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
885{
886 struct mm_struct *mm = current->mm;
887 unsigned long end, nstart, nend;
888 struct vm_area_struct *vma = NULL;
889 int locked = 0;
890 long ret = 0;
891
892 VM_BUG_ON(start & ~PAGE_MASK);
893 VM_BUG_ON(len != PAGE_ALIGN(len));
894 end = start + len;
895
896 for (nstart = start; nstart < end; nstart = nend) {
897 /*
898 * We want to fault in pages for [nstart; end) address range.
899 * Find first corresponding VMA.
900 */
901 if (!locked) {
902 locked = 1;
903 down_read(&mm->mmap_sem);
904 vma = find_vma(mm, nstart);
905 } else if (nstart >= vma->vm_end)
906 vma = vma->vm_next;
907 if (!vma || vma->vm_start >= end)
908 break;
909 /*
910 * Set [nstart; nend) to intersection of desired address
911 * range with the first VMA. Also, skip undesirable VMA types.
912 */
913 nend = min(end, vma->vm_end);
914 if (vma->vm_flags & (VM_IO | VM_PFNMAP))
915 continue;
916 if (nstart < vma->vm_start)
917 nstart = vma->vm_start;
918 /*
919 * Now fault in a range of pages. populate_vma_page_range()
920 * double checks the vma flags, so that it won't mlock pages
921 * if the vma was already munlocked.
922 */
923 ret = populate_vma_page_range(vma, nstart, nend, &locked);
924 if (ret < 0) {
925 if (ignore_errors) {
926 ret = 0;
927 continue; /* continue at next VMA */
928 }
929 break;
930 }
931 nend = nstart + ret * PAGE_SIZE;
932 ret = 0;
933 }
934 if (locked)
935 up_read(&mm->mmap_sem);
936 return ret; /* 0 or negative error code */
937}
938
939/**
822 * get_dump_page() - pin user page in memory while writing it to core dump 940 * get_dump_page() - pin user page in memory while writing it to core dump
823 * @addr: user address 941 * @addr: user address
824 * 942 *
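populate_vma_page_range() builds its gup_flags from the vma flags: FOLL_TOUCH | FOLL_POPULATE always, FOLL_WRITE only for private writable mappings (so shared pages are not dirtied for nothing), and FOLL_FORCE whenever the vma has any of read/write/exec so mlock works on anything that is not PROT_NONE. The same derivation, restated as a small userspace program; the numeric flag values below are local illustrative constants, not the kernel's FOLL_*/VM_* definitions:

#include <stdio.h>

#define VM_READ       0x1
#define VM_WRITE      0x2
#define VM_EXEC       0x4
#define VM_SHARED     0x8

#define FOLL_TOUCH    0x01
#define FOLL_POPULATE 0x02
#define FOLL_WRITE    0x04
#define FOLL_FORCE    0x08

static unsigned populate_gup_flags(unsigned long vm_flags)
{
        unsigned gup_flags = FOLL_TOUCH | FOLL_POPULATE;

        /* Write-fault private writable mappings to break COW, but do not
         * dirty shared mappings for nothing. */
        if ((vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
                gup_flags |= FOLL_WRITE;

        /* mlock should succeed for anything that is not PROT_NONE. */
        if (vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
                gup_flags |= FOLL_FORCE;

        return gup_flags;
}

int main(void)
{
        printf("private rw : %#x\n", populate_gup_flags(VM_READ | VM_WRITE));
        printf("shared rw  : %#x\n", populate_gup_flags(VM_READ | VM_WRITE | VM_SHARED));
        printf("PROT_NONE  : %#x\n", populate_gup_flags(0));
        return 0;
}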
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 6817b0350c71..3afb5cbe1312 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1231,7 +1231,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
1231 pmd, _pmd, 1)) 1231 pmd, _pmd, 1))
1232 update_mmu_cache_pmd(vma, addr, pmd); 1232 update_mmu_cache_pmd(vma, addr, pmd);
1233 } 1233 }
1234 if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { 1234 if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) {
1235 if (page->mapping && trylock_page(page)) { 1235 if (page->mapping && trylock_page(page)) {
1236 lru_add_drain(); 1236 lru_add_drain();
1237 if (page->mapping) 1237 if (page->mapping)
@@ -2109,7 +2109,7 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte)
2109{ 2109{
2110 while (--_pte >= pte) { 2110 while (--_pte >= pte) {
2111 pte_t pteval = *_pte; 2111 pte_t pteval = *_pte;
2112 if (!pte_none(pteval)) 2112 if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval)))
2113 release_pte_page(pte_page(pteval)); 2113 release_pte_page(pte_page(pteval));
2114 } 2114 }
2115} 2115}
@@ -2120,13 +2120,13 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
2120{ 2120{
2121 struct page *page; 2121 struct page *page;
2122 pte_t *_pte; 2122 pte_t *_pte;
2123 int none = 0; 2123 int none_or_zero = 0;
2124 bool referenced = false, writable = false; 2124 bool referenced = false, writable = false;
2125 for (_pte = pte; _pte < pte+HPAGE_PMD_NR; 2125 for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
2126 _pte++, address += PAGE_SIZE) { 2126 _pte++, address += PAGE_SIZE) {
2127 pte_t pteval = *_pte; 2127 pte_t pteval = *_pte;
2128 if (pte_none(pteval)) { 2128 if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
2129 if (++none <= khugepaged_max_ptes_none) 2129 if (++none_or_zero <= khugepaged_max_ptes_none)
2130 continue; 2130 continue;
2131 else 2131 else
2132 goto out; 2132 goto out;
@@ -2207,9 +2207,21 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
2207 pte_t pteval = *_pte; 2207 pte_t pteval = *_pte;
2208 struct page *src_page; 2208 struct page *src_page;
2209 2209
2210 if (pte_none(pteval)) { 2210 if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
2211 clear_user_highpage(page, address); 2211 clear_user_highpage(page, address);
2212 add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); 2212 add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
2213 if (is_zero_pfn(pte_pfn(pteval))) {
2214 /*
2215 * ptl mostly unnecessary.
2216 */
2217 spin_lock(ptl);
2218 /*
2219 * paravirt calls inside pte_clear here are
2220 * superfluous.
2221 */
2222 pte_clear(vma->vm_mm, address, _pte);
2223 spin_unlock(ptl);
2224 }
2213 } else { 2225 } else {
2214 src_page = pte_page(pteval); 2226 src_page = pte_page(pteval);
2215 copy_user_highpage(page, src_page, address, vma); 2227 copy_user_highpage(page, src_page, address, vma);
@@ -2316,8 +2328,14 @@ static struct page
2316 struct vm_area_struct *vma, unsigned long address, 2328 struct vm_area_struct *vma, unsigned long address,
2317 int node) 2329 int node)
2318{ 2330{
2331 gfp_t flags;
2332
2319 VM_BUG_ON_PAGE(*hpage, *hpage); 2333 VM_BUG_ON_PAGE(*hpage, *hpage);
2320 2334
2335 /* Only allocate from the target node */
2336 flags = alloc_hugepage_gfpmask(khugepaged_defrag(), __GFP_OTHER_NODE) |
2337 __GFP_THISNODE;
2338
2321 /* 2339 /*
2322 * Before allocating the hugepage, release the mmap_sem read lock. 2340 * Before allocating the hugepage, release the mmap_sem read lock.
2323 * The allocation can take potentially a long time if it involves 2341 * The allocation can take potentially a long time if it involves
@@ -2326,8 +2344,7 @@ static struct page
2326 */ 2344 */
2327 up_read(&mm->mmap_sem); 2345 up_read(&mm->mmap_sem);
2328 2346
2329 *hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask( 2347 *hpage = alloc_pages_exact_node(node, flags, HPAGE_PMD_ORDER);
2330 khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER);
2331 if (unlikely(!*hpage)) { 2348 if (unlikely(!*hpage)) {
2332 count_vm_event(THP_COLLAPSE_ALLOC_FAILED); 2349 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
2333 *hpage = ERR_PTR(-ENOMEM); 2350 *hpage = ERR_PTR(-ENOMEM);
@@ -2543,7 +2560,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
2543{ 2560{
2544 pmd_t *pmd; 2561 pmd_t *pmd;
2545 pte_t *pte, *_pte; 2562 pte_t *pte, *_pte;
2546 int ret = 0, none = 0; 2563 int ret = 0, none_or_zero = 0;
2547 struct page *page; 2564 struct page *page;
2548 unsigned long _address; 2565 unsigned long _address;
2549 spinlock_t *ptl; 2566 spinlock_t *ptl;
@@ -2561,8 +2578,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
2561 for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; 2578 for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
2562 _pte++, _address += PAGE_SIZE) { 2579 _pte++, _address += PAGE_SIZE) {
2563 pte_t pteval = *_pte; 2580 pte_t pteval = *_pte;
2564 if (pte_none(pteval)) { 2581 if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
2565 if (++none <= khugepaged_max_ptes_none) 2582 if (++none_or_zero <= khugepaged_max_ptes_none)
2566 continue; 2583 continue;
2567 else 2584 else
2568 goto out_unmap; 2585 goto out_unmap;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c41b2a0ee273..8874c8ad55aa 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3278,6 +3278,15 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
3278 struct page *page; 3278 struct page *page;
3279 3279
3280 /* 3280 /*
3281 * If we have a pending SIGKILL, don't keep faulting pages and
3282 * potentially allocating memory.
3283 */
3284 if (unlikely(fatal_signal_pending(current))) {
3285 remainder = 0;
3286 break;
3287 }
3288
3289 /*
3281 * Some archs (sparc64, sh*) have multiple pte_ts to 3290 * Some archs (sparc64, sh*) have multiple pte_ts to
3282 * each hugepage. We have to make sure we get the 3291 * each hugepage. We have to make sure we get the
3283 * first, for the page indexing below to work. 3292 * first, for the page indexing below to work.
@@ -3735,8 +3744,7 @@ retry:
3735 if (!pmd_huge(*pmd)) 3744 if (!pmd_huge(*pmd))
3736 goto out; 3745 goto out;
3737 if (pmd_present(*pmd)) { 3746 if (pmd_present(*pmd)) {
3738 page = pte_page(*(pte_t *)pmd) + 3747 page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
3739 ((address & ~PMD_MASK) >> PAGE_SHIFT);
3740 if (flags & FOLL_GET) 3748 if (flags & FOLL_GET)
3741 get_page(page); 3749 get_page(page);
3742 } else { 3750 } else {
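Two hugetlb fixes sit in this hunk: follow_hugetlb_page() now bails out early when the caller already has a fatal signal pending, and follow_huge_pmd() indexes into the compound page with pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT) instead of casting the pmd to a pte. The offset arithmetic can be sanity-checked in isolation; the shift values below assume the common 4 KiB page / 2 MiB PMD layout and are only illustrative:

#include <assert.h>
#include <stdio.h>

#define PAGE_SHIFT 12
#define PMD_SHIFT  21
#define PMD_MASK   (~((1UL << PMD_SHIFT) - 1))

int main(void)
{
        unsigned long address = 0x7f0000200000UL + 0x5000; /* 5 pages into the PMD */
        unsigned long subpage = (address & ~PMD_MASK) >> PAGE_SHIFT;

        printf("address %#lx -> subpage index %lu\n", address, subpage);
        assert(subpage == 5);
        return 0;
}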
diff --git a/mm/internal.h b/mm/internal.h
index a96da5b0029d..edaab69a9c35 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -200,6 +200,8 @@ isolate_freepages_range(struct compact_control *cc,
200unsigned long 200unsigned long
201isolate_migratepages_range(struct compact_control *cc, 201isolate_migratepages_range(struct compact_control *cc,
202 unsigned long low_pfn, unsigned long end_pfn); 202 unsigned long low_pfn, unsigned long end_pfn);
203int find_suitable_fallback(struct free_area *area, unsigned int order,
204 int migratetype, bool only_stealable, bool *can_steal);
203 205
204#endif 206#endif
205 207
@@ -240,7 +242,7 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
240 struct vm_area_struct *prev, struct rb_node *rb_parent); 242 struct vm_area_struct *prev, struct rb_node *rb_parent);
241 243
242#ifdef CONFIG_MMU 244#ifdef CONFIG_MMU
243extern long __mlock_vma_pages_range(struct vm_area_struct *vma, 245extern long populate_vma_page_range(struct vm_area_struct *vma,
244 unsigned long start, unsigned long end, int *nonblocking); 246 unsigned long start, unsigned long end, int *nonblocking);
245extern void munlock_vma_pages_range(struct vm_area_struct *vma, 247extern void munlock_vma_pages_range(struct vm_area_struct *vma,
246 unsigned long start, unsigned long end); 248 unsigned long start, unsigned long end);
diff --git a/mm/memblock.c b/mm/memblock.c
index 252b77bdf65e..3f37a0bca5d5 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -699,14 +699,14 @@ static int __init_memblock memblock_reserve_region(phys_addr_t base,
699 int nid, 699 int nid,
700 unsigned long flags) 700 unsigned long flags)
701{ 701{
702 struct memblock_type *_rgn = &memblock.reserved; 702 struct memblock_type *type = &memblock.reserved;
703 703
704 memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n", 704 memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n",
705 (unsigned long long)base, 705 (unsigned long long)base,
706 (unsigned long long)base + size - 1, 706 (unsigned long long)base + size - 1,
707 flags, (void *)_RET_IP_); 707 flags, (void *)_RET_IP_);
708 708
709 return memblock_add_range(_rgn, base, size, nid, flags); 709 return memblock_add_range(type, base, size, nid, flags);
710} 710}
711 711
712int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) 712int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b34ef4a32a3b..c3f09b2dda5f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -14,6 +14,12 @@
14 * Copyright (C) 2012 Parallels Inc. and Google Inc. 14 * Copyright (C) 2012 Parallels Inc. and Google Inc.
15 * Authors: Glauber Costa and Suleiman Souhlal 15 * Authors: Glauber Costa and Suleiman Souhlal
16 * 16 *
17 * Native page reclaim
18 * Charge lifetime sanitation
19 * Lockless page tracking & accounting
20 * Unified hierarchy configuration model
21 * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
22 *
17 * This program is free software; you can redistribute it and/or modify 23 * This program is free software; you can redistribute it and/or modify
18 * it under the terms of the GNU General Public License as published by 24 * it under the terms of the GNU General Public License as published by
19 * the Free Software Foundation; either version 2 of the License, or 25 * the Free Software Foundation; either version 2 of the License, or
@@ -1436,15 +1442,17 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1436 struct mem_cgroup *iter; 1442 struct mem_cgroup *iter;
1437 unsigned int i; 1443 unsigned int i;
1438 1444
1439 if (!p)
1440 return;
1441
1442 mutex_lock(&oom_info_lock); 1445 mutex_lock(&oom_info_lock);
1443 rcu_read_lock(); 1446 rcu_read_lock();
1444 1447
1445 pr_info("Task in "); 1448 if (p) {
1446 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id)); 1449 pr_info("Task in ");
1447 pr_cont(" killed as a result of limit of "); 1450 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
1451 pr_cont(" killed as a result of limit of ");
1452 } else {
1453 pr_info("Memory limit reached of cgroup ");
1454 }
1455
1448 pr_cont_cgroup_path(memcg->css.cgroup); 1456 pr_cont_cgroup_path(memcg->css.cgroup);
1449 pr_cont("\n"); 1457 pr_cont("\n");
1450 1458
@@ -1531,7 +1539,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1531 return; 1539 return;
1532 } 1540 }
1533 1541
1534 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); 1542 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL, memcg);
1535 totalpages = mem_cgroup_get_limit(memcg) ? : 1; 1543 totalpages = mem_cgroup_get_limit(memcg) ? : 1;
1536 for_each_mem_cgroup_tree(iter, memcg) { 1544 for_each_mem_cgroup_tree(iter, memcg) {
1537 struct css_task_iter it; 1545 struct css_task_iter it;
@@ -2779,92 +2787,6 @@ void mem_cgroup_split_huge_fixup(struct page *head)
2779} 2787}
2780#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 2788#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
2781 2789
2782/**
2783 * mem_cgroup_move_account - move account of the page
2784 * @page: the page
2785 * @nr_pages: number of regular pages (>1 for huge pages)
2786 * @from: mem_cgroup which the page is moved from.
2787 * @to: mem_cgroup which the page is moved to. @from != @to.
2788 *
2789 * The caller must confirm following.
2790 * - page is not on LRU (isolate_page() is useful.)
2791 * - compound_lock is held when nr_pages > 1
2792 *
2793 * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
2794 * from old cgroup.
2795 */
2796static int mem_cgroup_move_account(struct page *page,
2797 unsigned int nr_pages,
2798 struct mem_cgroup *from,
2799 struct mem_cgroup *to)
2800{
2801 unsigned long flags;
2802 int ret;
2803
2804 VM_BUG_ON(from == to);
2805 VM_BUG_ON_PAGE(PageLRU(page), page);
2806 /*
2807 * The page is isolated from LRU. So, collapse function
2808 * will not handle this page. But page splitting can happen.
2809 * Do this check under compound_page_lock(). The caller should
2810 * hold it.
2811 */
2812 ret = -EBUSY;
2813 if (nr_pages > 1 && !PageTransHuge(page))
2814 goto out;
2815
2816 /*
2817 * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup
2818 * of its source page while we change it: page migration takes
2819 * both pages off the LRU, but page cache replacement doesn't.
2820 */
2821 if (!trylock_page(page))
2822 goto out;
2823
2824 ret = -EINVAL;
2825 if (page->mem_cgroup != from)
2826 goto out_unlock;
2827
2828 spin_lock_irqsave(&from->move_lock, flags);
2829
2830 if (!PageAnon(page) && page_mapped(page)) {
2831 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
2832 nr_pages);
2833 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
2834 nr_pages);
2835 }
2836
2837 if (PageWriteback(page)) {
2838 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
2839 nr_pages);
2840 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK],
2841 nr_pages);
2842 }
2843
2844 /*
2845 * It is safe to change page->mem_cgroup here because the page
2846 * is referenced, charged, and isolated - we can't race with
2847 * uncharging, charging, migration, or LRU putback.
2848 */
2849
2850 /* caller should have done css_get */
2851 page->mem_cgroup = to;
2852 spin_unlock_irqrestore(&from->move_lock, flags);
2853
2854 ret = 0;
2855
2856 local_irq_disable();
2857 mem_cgroup_charge_statistics(to, page, nr_pages);
2858 memcg_check_events(to, page);
2859 mem_cgroup_charge_statistics(from, page, -nr_pages);
2860 memcg_check_events(from, page);
2861 local_irq_enable();
2862out_unlock:
2863 unlock_page(page);
2864out:
2865 return ret;
2866}
2867
2868#ifdef CONFIG_MEMCG_SWAP 2790#ifdef CONFIG_MEMCG_SWAP
2869static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, 2791static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
2870 bool charge) 2792 bool charge)
@@ -4816,6 +4738,92 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
4816 return page; 4738 return page;
4817} 4739}
4818 4740
4741/**
4742 * mem_cgroup_move_account - move account of the page
4743 * @page: the page
4744 * @nr_pages: number of regular pages (>1 for huge pages)
4745 * @from: mem_cgroup which the page is moved from.
4746 * @to: mem_cgroup which the page is moved to. @from != @to.
4747 *
4748 * The caller must confirm following.
4749 * - page is not on LRU (isolate_page() is useful.)
4750 * - compound_lock is held when nr_pages > 1
4751 *
4752 * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
4753 * from old cgroup.
4754 */
4755static int mem_cgroup_move_account(struct page *page,
4756 unsigned int nr_pages,
4757 struct mem_cgroup *from,
4758 struct mem_cgroup *to)
4759{
4760 unsigned long flags;
4761 int ret;
4762
4763 VM_BUG_ON(from == to);
4764 VM_BUG_ON_PAGE(PageLRU(page), page);
4765 /*
4766 * The page is isolated from LRU. So, collapse function
4767 * will not handle this page. But page splitting can happen.
4768 * Do this check under compound_page_lock(). The caller should
4769 * hold it.
4770 */
4771 ret = -EBUSY;
4772 if (nr_pages > 1 && !PageTransHuge(page))
4773 goto out;
4774
4775 /*
4776 * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup
4777 * of its source page while we change it: page migration takes
4778 * both pages off the LRU, but page cache replacement doesn't.
4779 */
4780 if (!trylock_page(page))
4781 goto out;
4782
4783 ret = -EINVAL;
4784 if (page->mem_cgroup != from)
4785 goto out_unlock;
4786
4787 spin_lock_irqsave(&from->move_lock, flags);
4788
4789 if (!PageAnon(page) && page_mapped(page)) {
4790 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
4791 nr_pages);
4792 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
4793 nr_pages);
4794 }
4795
4796 if (PageWriteback(page)) {
4797 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
4798 nr_pages);
4799 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK],
4800 nr_pages);
4801 }
4802
4803 /*
4804 * It is safe to change page->mem_cgroup here because the page
4805 * is referenced, charged, and isolated - we can't race with
4806 * uncharging, charging, migration, or LRU putback.
4807 */
4808
4809 /* caller should have done css_get */
4810 page->mem_cgroup = to;
4811 spin_unlock_irqrestore(&from->move_lock, flags);
4812
4813 ret = 0;
4814
4815 local_irq_disable();
4816 mem_cgroup_charge_statistics(to, page, nr_pages);
4817 memcg_check_events(to, page);
4818 mem_cgroup_charge_statistics(from, page, -nr_pages);
4819 memcg_check_events(from, page);
4820 local_irq_enable();
4821out_unlock:
4822 unlock_page(page);
4823out:
4824 return ret;
4825}
4826
4819static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, 4827static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
4820 unsigned long addr, pte_t ptent, union mc_target *target) 4828 unsigned long addr, pte_t ptent, union mc_target *target)
4821{ 4829{
diff --git a/mm/memory.c b/mm/memory.c
index 97839f5c8c30..ac20b2a6a0c3 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1983,167 +1983,91 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
1983} 1983}
1984 1984
1985/* 1985/*
1986 * This routine handles present pages, when users try to write 1986 * Handle write page faults for pages that can be reused in the current vma
1987 * to a shared page. It is done by copying the page to a new address
1988 * and decrementing the shared-page counter for the old page.
1989 *
1990 * Note that this routine assumes that the protection checks have been
1991 * done by the caller (the low-level page fault routine in most cases).
1992 * Thus we can safely just mark it writable once we've done any necessary
1993 * COW.
1994 * 1987 *
1995 * We also mark the page dirty at this point even though the page will 1988 * This can happen either due to the mapping being with the VM_SHARED flag,
1996 * change only once the write actually happens. This avoids a few races, 1989 * or due to us being the last reference standing to the page. In either
1997 * and potentially makes it more efficient. 1990 * case, all we need to do here is to mark the page as writable and update
1998 * 1991 * any related book-keeping.
1999 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2000 * but allow concurrent faults), with pte both mapped and locked.
2001 * We return with mmap_sem still held, but pte unmapped and unlocked.
2002 */ 1992 */
2003static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, 1993static inline int wp_page_reuse(struct mm_struct *mm,
2004 unsigned long address, pte_t *page_table, pmd_t *pmd, 1994 struct vm_area_struct *vma, unsigned long address,
2005 spinlock_t *ptl, pte_t orig_pte) 1995 pte_t *page_table, spinlock_t *ptl, pte_t orig_pte,
1996 struct page *page, int page_mkwrite,
1997 int dirty_shared)
2006 __releases(ptl) 1998 __releases(ptl)
2007{ 1999{
2008 struct page *old_page, *new_page = NULL;
2009 pte_t entry; 2000 pte_t entry;
2010 int ret = 0;
2011 int page_mkwrite = 0;
2012 bool dirty_shared = false;
2013 unsigned long mmun_start = 0; /* For mmu_notifiers */
2014 unsigned long mmun_end = 0; /* For mmu_notifiers */
2015 struct mem_cgroup *memcg;
2016
2017 old_page = vm_normal_page(vma, address, orig_pte);
2018 if (!old_page) {
2019 /*
2020 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
2021 * VM_PFNMAP VMA.
2022 *
2023 * We should not cow pages in a shared writeable mapping.
2024 * Just mark the pages writable as we can't do any dirty
2025 * accounting on raw pfn maps.
2026 */
2027 if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2028 (VM_WRITE|VM_SHARED))
2029 goto reuse;
2030 goto gotten;
2031 }
2032
2033 /* 2001 /*
2034 * Take out anonymous pages first, anonymous shared vmas are 2002 * Clear the pages cpupid information as the existing
2035 * not dirty accountable. 2003 * information potentially belongs to a now completely
2004 * unrelated process.
2036 */ 2005 */
2037 if (PageAnon(old_page) && !PageKsm(old_page)) { 2006 if (page)
2038 if (!trylock_page(old_page)) { 2007 page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);
2039 page_cache_get(old_page);
2040 pte_unmap_unlock(page_table, ptl);
2041 lock_page(old_page);
2042 page_table = pte_offset_map_lock(mm, pmd, address,
2043 &ptl);
2044 if (!pte_same(*page_table, orig_pte)) {
2045 unlock_page(old_page);
2046 goto unlock;
2047 }
2048 page_cache_release(old_page);
2049 }
2050 if (reuse_swap_page(old_page)) {
2051 /*
2052 * The page is all ours. Move it to our anon_vma so
2053 * the rmap code will not search our parent or siblings.
2054 * Protected against the rmap code by the page lock.
2055 */
2056 page_move_anon_rmap(old_page, vma, address);
2057 unlock_page(old_page);
2058 goto reuse;
2059 }
2060 unlock_page(old_page);
2061 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2062 (VM_WRITE|VM_SHARED))) {
2063 page_cache_get(old_page);
2064 /*
2065 * Only catch write-faults on shared writable pages,
2066 * read-only shared pages can get COWed by
2067 * get_user_pages(.write=1, .force=1).
2068 */
2069 if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
2070 int tmp;
2071
2072 pte_unmap_unlock(page_table, ptl);
2073 tmp = do_page_mkwrite(vma, old_page, address);
2074 if (unlikely(!tmp || (tmp &
2075 (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
2076 page_cache_release(old_page);
2077 return tmp;
2078 }
2079 /*
2080 * Since we dropped the lock we need to revalidate
2081 * the PTE as someone else may have changed it. If
2082 * they did, we just return, as we can count on the
2083 * MMU to tell us if they didn't also make it writable.
2084 */
2085 page_table = pte_offset_map_lock(mm, pmd, address,
2086 &ptl);
2087 if (!pte_same(*page_table, orig_pte)) {
2088 unlock_page(old_page);
2089 goto unlock;
2090 }
2091 page_mkwrite = 1;
2092 }
2093
2094 dirty_shared = true;
2095
2096reuse:
2097 /*
2098 * Clear the pages cpupid information as the existing
2099 * information potentially belongs to a now completely
2100 * unrelated process.
2101 */
2102 if (old_page)
2103 page_cpupid_xchg_last(old_page, (1 << LAST_CPUPID_SHIFT) - 1);
2104
2105 flush_cache_page(vma, address, pte_pfn(orig_pte));
2106 entry = pte_mkyoung(orig_pte);
2107 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2108 if (ptep_set_access_flags(vma, address, page_table, entry,1))
2109 update_mmu_cache(vma, address, page_table);
2110 pte_unmap_unlock(page_table, ptl);
2111 ret |= VM_FAULT_WRITE;
2112 2008
2113 if (dirty_shared) { 2009 flush_cache_page(vma, address, pte_pfn(orig_pte));
2114 struct address_space *mapping; 2010 entry = pte_mkyoung(orig_pte);
2115 int dirtied; 2011 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2012 if (ptep_set_access_flags(vma, address, page_table, entry, 1))
2013 update_mmu_cache(vma, address, page_table);
2014 pte_unmap_unlock(page_table, ptl);
2116 2015
2117 if (!page_mkwrite) 2016 if (dirty_shared) {
2118 lock_page(old_page); 2017 struct address_space *mapping;
2018 int dirtied;
2119 2019
2120 dirtied = set_page_dirty(old_page); 2020 if (!page_mkwrite)
2121 VM_BUG_ON_PAGE(PageAnon(old_page), old_page); 2021 lock_page(page);
2122 mapping = old_page->mapping;
2123 unlock_page(old_page);
2124 page_cache_release(old_page);
2125 2022
2126 if ((dirtied || page_mkwrite) && mapping) { 2023 dirtied = set_page_dirty(page);
2127 /* 2024 VM_BUG_ON_PAGE(PageAnon(page), page);
2128 * Some device drivers do not set page.mapping 2025 mapping = page->mapping;
2129 * but still dirty their pages 2026 unlock_page(page);
2130 */ 2027 page_cache_release(page);
2131 balance_dirty_pages_ratelimited(mapping);
2132 }
2133 2028
2134 if (!page_mkwrite) 2029 if ((dirtied || page_mkwrite) && mapping) {
2135 file_update_time(vma->vm_file); 2030 /*
2031 * Some device drivers do not set page.mapping
2032 * but still dirty their pages
2033 */
2034 balance_dirty_pages_ratelimited(mapping);
2136 } 2035 }
2137 2036
2138 return ret; 2037 if (!page_mkwrite)
2038 file_update_time(vma->vm_file);
2139 } 2039 }
2140 2040
2141 /* 2041 return VM_FAULT_WRITE;
2142 * Ok, we need to copy. Oh, well.. 2042}
2143 */ 2043
2144 page_cache_get(old_page); 2044/*
2145gotten: 2045 * Handle the case of a page which we actually need to copy to a new page.
2146 pte_unmap_unlock(page_table, ptl); 2046 *
2047 * Called with mmap_sem locked and the old page referenced, but
2048 * without the ptl held.
2049 *
2050 * High level logic flow:
2051 *
2052 * - Allocate a page, copy the content of the old page to the new one.
2053 * - Handle book keeping and accounting - cgroups, mmu-notifiers, etc.
2054 * - Take the PTL. If the pte changed, bail out and release the allocated page
2055 * - If the pte is still the way we remember it, update the page table and all
2056 * relevant references. This includes dropping the reference the page-table
2057 * held to the old page, as well as updating the rmap.
2058 * - In any case, unlock the PTL and drop the reference we took to the old page.
2059 */
2060static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
2061 unsigned long address, pte_t *page_table, pmd_t *pmd,
2062 pte_t orig_pte, struct page *old_page)
2063{
2064 struct page *new_page = NULL;
2065 spinlock_t *ptl = NULL;
2066 pte_t entry;
2067 int page_copied = 0;
2068 const unsigned long mmun_start = address & PAGE_MASK; /* For mmu_notifiers */
2069 const unsigned long mmun_end = mmun_start + PAGE_SIZE; /* For mmu_notifiers */
2070 struct mem_cgroup *memcg;
2147 2071
2148 if (unlikely(anon_vma_prepare(vma))) 2072 if (unlikely(anon_vma_prepare(vma)))
2149 goto oom; 2073 goto oom;
@@ -2163,8 +2087,6 @@ gotten:
2163 if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) 2087 if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg))
2164 goto oom_free_new; 2088 goto oom_free_new;
2165 2089
2166 mmun_start = address & PAGE_MASK;
2167 mmun_end = mmun_start + PAGE_SIZE;
2168 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 2090 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2169 2091
2170 /* 2092 /*
@@ -2177,8 +2099,9 @@ gotten:
2177 dec_mm_counter_fast(mm, MM_FILEPAGES); 2099 dec_mm_counter_fast(mm, MM_FILEPAGES);
2178 inc_mm_counter_fast(mm, MM_ANONPAGES); 2100 inc_mm_counter_fast(mm, MM_ANONPAGES);
2179 } 2101 }
2180 } else 2102 } else {
2181 inc_mm_counter_fast(mm, MM_ANONPAGES); 2103 inc_mm_counter_fast(mm, MM_ANONPAGES);
2104 }
2182 flush_cache_page(vma, address, pte_pfn(orig_pte)); 2105 flush_cache_page(vma, address, pte_pfn(orig_pte));
2183 entry = mk_pte(new_page, vma->vm_page_prot); 2106 entry = mk_pte(new_page, vma->vm_page_prot);
2184 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2107 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -2227,29 +2150,29 @@ gotten:
2227 2150
2228 /* Free the old page.. */ 2151 /* Free the old page.. */
2229 new_page = old_page; 2152 new_page = old_page;
2230 ret |= VM_FAULT_WRITE; 2153 page_copied = 1;
2231 } else 2154 } else {
2232 mem_cgroup_cancel_charge(new_page, memcg); 2155 mem_cgroup_cancel_charge(new_page, memcg);
2156 }
2233 2157
2234 if (new_page) 2158 if (new_page)
2235 page_cache_release(new_page); 2159 page_cache_release(new_page);
2236unlock: 2160
2237 pte_unmap_unlock(page_table, ptl); 2161 pte_unmap_unlock(page_table, ptl);
2238 if (mmun_end > mmun_start) 2162 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2239 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2240 if (old_page) { 2163 if (old_page) {
2241 /* 2164 /*
2242 * Don't let another task, with possibly unlocked vma, 2165 * Don't let another task, with possibly unlocked vma,
2243 * keep the mlocked page. 2166 * keep the mlocked page.
2244 */ 2167 */
2245 if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) { 2168 if (page_copied && (vma->vm_flags & VM_LOCKED)) {
2246 lock_page(old_page); /* LRU manipulation */ 2169 lock_page(old_page); /* LRU manipulation */
2247 munlock_vma_page(old_page); 2170 munlock_vma_page(old_page);
2248 unlock_page(old_page); 2171 unlock_page(old_page);
2249 } 2172 }
2250 page_cache_release(old_page); 2173 page_cache_release(old_page);
2251 } 2174 }
2252 return ret; 2175 return page_copied ? VM_FAULT_WRITE : 0;
2253oom_free_new: 2176oom_free_new:
2254 page_cache_release(new_page); 2177 page_cache_release(new_page);
2255oom: 2178oom:
@@ -2258,6 +2181,144 @@ oom:
2258 return VM_FAULT_OOM; 2181 return VM_FAULT_OOM;
2259} 2182}
2260 2183
2184static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
2185 unsigned long address, pte_t *page_table,
2186 pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte,
2187 struct page *old_page)
2188 __releases(ptl)
2189{
2190 int page_mkwrite = 0;
2191
2192 page_cache_get(old_page);
2193
2194 /*
2195 * Only catch write-faults on shared writable pages,
2196 * read-only shared pages can get COWed by
2197 * get_user_pages(.write=1, .force=1).
2198 */
2199 if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
2200 int tmp;
2201
2202 pte_unmap_unlock(page_table, ptl);
2203 tmp = do_page_mkwrite(vma, old_page, address);
2204 if (unlikely(!tmp || (tmp &
2205 (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
2206 page_cache_release(old_page);
2207 return tmp;
2208 }
2209 /*
2210 * Since we dropped the lock we need to revalidate
2211 * the PTE as someone else may have changed it. If
2212 * they did, we just return, as we can count on the
2213 * MMU to tell us if they didn't also make it writable.
2214 */
2215 page_table = pte_offset_map_lock(mm, pmd, address,
2216 &ptl);
2217 if (!pte_same(*page_table, orig_pte)) {
2218 unlock_page(old_page);
2219 pte_unmap_unlock(page_table, ptl);
2220 page_cache_release(old_page);
2221 return 0;
2222 }
2223 page_mkwrite = 1;
2224 }
2225
2226 return wp_page_reuse(mm, vma, address, page_table, ptl,
2227 orig_pte, old_page, page_mkwrite, 1);
2228}
2229
2230/*
2231 * This routine handles present pages, when users try to write
2232 * to a shared page. It is done by copying the page to a new address
2233 * and decrementing the shared-page counter for the old page.
2234 *
2235 * Note that this routine assumes that the protection checks have been
2236 * done by the caller (the low-level page fault routine in most cases).
2237 * Thus we can safely just mark it writable once we've done any necessary
2238 * COW.
2239 *
2240 * We also mark the page dirty at this point even though the page will
2241 * change only once the write actually happens. This avoids a few races,
2242 * and potentially makes it more efficient.
2243 *
2244 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2245 * but allow concurrent faults), with pte both mapped and locked.
2246 * We return with mmap_sem still held, but pte unmapped and unlocked.
2247 */
2248static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2249 unsigned long address, pte_t *page_table, pmd_t *pmd,
2250 spinlock_t *ptl, pte_t orig_pte)
2251 __releases(ptl)
2252{
2253 struct page *old_page;
2254
2255 old_page = vm_normal_page(vma, address, orig_pte);
2256 if (!old_page) {
2257 /*
2258 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
2259 * VM_PFNMAP VMA.
2260 *
2261 * We should not cow pages in a shared writeable mapping.
2262 * Just mark the pages writable as we can't do any dirty
2263 * accounting on raw pfn maps.
2264 */
2265 if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2266 (VM_WRITE|VM_SHARED))
2267 return wp_page_reuse(mm, vma, address, page_table, ptl,
2268 orig_pte, old_page, 0, 0);
2269
2270 pte_unmap_unlock(page_table, ptl);
2271 return wp_page_copy(mm, vma, address, page_table, pmd,
2272 orig_pte, old_page);
2273 }
2274
2275 /*
2276 * Take out anonymous pages first, anonymous shared vmas are
2277 * not dirty accountable.
2278 */
2279 if (PageAnon(old_page) && !PageKsm(old_page)) {
2280 if (!trylock_page(old_page)) {
2281 page_cache_get(old_page);
2282 pte_unmap_unlock(page_table, ptl);
2283 lock_page(old_page);
2284 page_table = pte_offset_map_lock(mm, pmd, address,
2285 &ptl);
2286 if (!pte_same(*page_table, orig_pte)) {
2287 unlock_page(old_page);
2288 pte_unmap_unlock(page_table, ptl);
2289 page_cache_release(old_page);
2290 return 0;
2291 }
2292 page_cache_release(old_page);
2293 }
2294 if (reuse_swap_page(old_page)) {
2295 /*
2296 * The page is all ours. Move it to our anon_vma so
2297 * the rmap code will not search our parent or siblings.
2298 * Protected against the rmap code by the page lock.
2299 */
2300 page_move_anon_rmap(old_page, vma, address);
2301 unlock_page(old_page);
2302 return wp_page_reuse(mm, vma, address, page_table, ptl,
2303 orig_pte, old_page, 0, 0);
2304 }
2305 unlock_page(old_page);
2306 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2307 (VM_WRITE|VM_SHARED))) {
2308 return wp_page_shared(mm, vma, address, page_table, pmd,
2309 ptl, orig_pte, old_page);
2310 }
2311
2312 /*
2313 * Ok, we need to copy. Oh, well..
2314 */
2315 page_cache_get(old_page);
2316
2317 pte_unmap_unlock(page_table, ptl);
2318 return wp_page_copy(mm, vma, address, page_table, pmd,
2319 orig_pte, old_page);
2320}
2321
2261static void unmap_mapping_range_vma(struct vm_area_struct *vma, 2322static void unmap_mapping_range_vma(struct vm_area_struct *vma,
2262 unsigned long start_addr, unsigned long end_addr, 2323 unsigned long start_addr, unsigned long end_addr,
2263 struct zap_details *details) 2324 struct zap_details *details)
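After the refactor, do_wp_page() is only a dispatcher: raw pfn maps and reusable anonymous pages go to wp_page_reuse(), shared writable file pages go through wp_page_shared() (page_mkwrite plus reuse), and everything else is copied by wp_page_copy(). Boiled down to booleans, the dispatch reads roughly as the sketch below; none of these helpers are kernel functions, they only stand in for vm_normal_page(), PageAnon()/PageKsm(), reuse_swap_page() and the VM_WRITE|VM_SHARED test:

#include <stdio.h>

enum wp_action { WP_REUSE, WP_REUSE_SHARED, WP_COPY };

static enum wp_action classify_wp_fault(int have_page, int anon_not_ksm,
                                        int reusable_swap, int shared_writable)
{
        if (!have_page)                 /* pfn map: reuse if shared writable, else COW */
                return shared_writable ? WP_REUSE : WP_COPY;
        if (anon_not_ksm)               /* anonymous page owned only by us? */
                return reusable_swap ? WP_REUSE : WP_COPY;
        if (shared_writable)            /* shared file page: mkwrite, then reuse */
                return WP_REUSE_SHARED;
        return WP_COPY;                 /* private file page: copy-on-write */
}

int main(void)
{
        printf("anon, sole owner     -> %d\n", classify_wp_fault(1, 1, 1, 0));
        printf("shared writable file -> %d\n", classify_wp_fault(1, 0, 0, 1));
        printf("private file mapping -> %d\n", classify_wp_fault(1, 0, 0, 0));
        return 0;
}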
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 65842d688b7c..e2e8014fb755 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -104,7 +104,7 @@ void put_online_mems(void)
104 104
105} 105}
106 106
107static void mem_hotplug_begin(void) 107void mem_hotplug_begin(void)
108{ 108{
109 mem_hotplug.active_writer = current; 109 mem_hotplug.active_writer = current;
110 110
@@ -119,7 +119,7 @@ static void mem_hotplug_begin(void)
119 } 119 }
120} 120}
121 121
122static void mem_hotplug_done(void) 122void mem_hotplug_done(void)
123{ 123{
124 mem_hotplug.active_writer = NULL; 124 mem_hotplug.active_writer = NULL;
125 mutex_unlock(&mem_hotplug.lock); 125 mutex_unlock(&mem_hotplug.lock);
@@ -502,7 +502,7 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
502 end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); 502 end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
503 503
504 for (i = start_sec; i <= end_sec; i++) { 504 for (i = start_sec; i <= end_sec; i++) {
505 err = __add_section(nid, zone, i << PFN_SECTION_SHIFT); 505 err = __add_section(nid, zone, section_nr_to_pfn(i));
506 506
507 /* 507 /*
508 * EEXIST is finally dealt with by ioresource collision 508 * EEXIST is finally dealt with by ioresource collision
@@ -959,6 +959,7 @@ static void node_states_set_node(int node, struct memory_notify *arg)
959} 959}
960 960
961 961
962/* Must be protected by mem_hotplug_begin() */
962int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) 963int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
963{ 964{
964 unsigned long flags; 965 unsigned long flags;
@@ -969,7 +970,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
969 int ret; 970 int ret;
970 struct memory_notify arg; 971 struct memory_notify arg;
971 972
972 mem_hotplug_begin();
973 /* 973 /*
974 * This doesn't need a lock to do pfn_to_page(). 974 * This doesn't need a lock to do pfn_to_page().
975 * The section can't be removed here because of the 975 * The section can't be removed here because of the
@@ -977,21 +977,20 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
977 */ 977 */
978 zone = page_zone(pfn_to_page(pfn)); 978 zone = page_zone(pfn_to_page(pfn));
979 979
980 ret = -EINVAL;
981 if ((zone_idx(zone) > ZONE_NORMAL || 980 if ((zone_idx(zone) > ZONE_NORMAL ||
982 online_type == MMOP_ONLINE_MOVABLE) && 981 online_type == MMOP_ONLINE_MOVABLE) &&
983 !can_online_high_movable(zone)) 982 !can_online_high_movable(zone))
984 goto out; 983 return -EINVAL;
985 984
986 if (online_type == MMOP_ONLINE_KERNEL && 985 if (online_type == MMOP_ONLINE_KERNEL &&
987 zone_idx(zone) == ZONE_MOVABLE) { 986 zone_idx(zone) == ZONE_MOVABLE) {
988 if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) 987 if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages))
989 goto out; 988 return -EINVAL;
990 } 989 }
991 if (online_type == MMOP_ONLINE_MOVABLE && 990 if (online_type == MMOP_ONLINE_MOVABLE &&
992 zone_idx(zone) == ZONE_MOVABLE - 1) { 991 zone_idx(zone) == ZONE_MOVABLE - 1) {
993 if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) 992 if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages))
994 goto out; 993 return -EINVAL;
995 } 994 }
996 995
997 /* Previous code may changed the zone of the pfn range */ 996 /* Previous code may changed the zone of the pfn range */
@@ -1007,7 +1006,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
1007 ret = notifier_to_errno(ret); 1006 ret = notifier_to_errno(ret);
1008 if (ret) { 1007 if (ret) {
1009 memory_notify(MEM_CANCEL_ONLINE, &arg); 1008 memory_notify(MEM_CANCEL_ONLINE, &arg);
1010 goto out; 1009 return ret;
1011 } 1010 }
1012 /* 1011 /*
1013 * If this zone is not populated, then it is not in zonelist. 1012 * If this zone is not populated, then it is not in zonelist.
@@ -1031,7 +1030,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
1031 (((unsigned long long) pfn + nr_pages) 1030 (((unsigned long long) pfn + nr_pages)
1032 << PAGE_SHIFT) - 1); 1031 << PAGE_SHIFT) - 1);
1033 memory_notify(MEM_CANCEL_ONLINE, &arg); 1032 memory_notify(MEM_CANCEL_ONLINE, &arg);
1034 goto out; 1033 return ret;
1035 } 1034 }
1036 1035
1037 zone->present_pages += onlined_pages; 1036 zone->present_pages += onlined_pages;
@@ -1061,9 +1060,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
1061 1060
1062 if (onlined_pages) 1061 if (onlined_pages)
1063 memory_notify(MEM_ONLINE, &arg); 1062 memory_notify(MEM_ONLINE, &arg);
1064out: 1063 return 0;
1065 mem_hotplug_done();
1066 return ret;
1067} 1064}
1068#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ 1065#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
1069 1066
@@ -1688,21 +1685,18 @@ static int __ref __offline_pages(unsigned long start_pfn,
1688 if (!test_pages_in_a_zone(start_pfn, end_pfn)) 1685 if (!test_pages_in_a_zone(start_pfn, end_pfn))
1689 return -EINVAL; 1686 return -EINVAL;
1690 1687
1691 mem_hotplug_begin();
1692
1693 zone = page_zone(pfn_to_page(start_pfn)); 1688 zone = page_zone(pfn_to_page(start_pfn));
1694 node = zone_to_nid(zone); 1689 node = zone_to_nid(zone);
1695 nr_pages = end_pfn - start_pfn; 1690 nr_pages = end_pfn - start_pfn;
1696 1691
1697 ret = -EINVAL;
1698 if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages)) 1692 if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages))
1699 goto out; 1693 return -EINVAL;
1700 1694
1701 /* set above range as isolated */ 1695 /* set above range as isolated */
1702 ret = start_isolate_page_range(start_pfn, end_pfn, 1696 ret = start_isolate_page_range(start_pfn, end_pfn,
1703 MIGRATE_MOVABLE, true); 1697 MIGRATE_MOVABLE, true);
1704 if (ret) 1698 if (ret)
1705 goto out; 1699 return ret;
1706 1700
1707 arg.start_pfn = start_pfn; 1701 arg.start_pfn = start_pfn;
1708 arg.nr_pages = nr_pages; 1702 arg.nr_pages = nr_pages;
@@ -1795,7 +1789,6 @@ repeat:
1795 writeback_set_ratelimit(); 1789 writeback_set_ratelimit();
1796 1790
1797 memory_notify(MEM_OFFLINE, &arg); 1791 memory_notify(MEM_OFFLINE, &arg);
1798 mem_hotplug_done();
1799 return 0; 1792 return 0;
1800 1793
1801failed_removal: 1794failed_removal:
@@ -1805,12 +1798,10 @@ failed_removal:
1805 memory_notify(MEM_CANCEL_OFFLINE, &arg); 1798 memory_notify(MEM_CANCEL_OFFLINE, &arg);
1806 /* pushback to free area */ 1799 /* pushback to free area */
1807 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); 1800 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
1808
1809out:
1810 mem_hotplug_done();
1811 return ret; 1801 return ret;
1812} 1802}
1813 1803
1804/* Must be protected by mem_hotplug_begin() */
1814int offline_pages(unsigned long start_pfn, unsigned long nr_pages) 1805int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
1815{ 1806{
1816 return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); 1807 return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
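The locking for memory online/offline moves out of the helpers and into their callers here, which is why mem_hotplug_begin()/mem_hotplug_done() lose their static and both entry points gain a "Must be protected by mem_hotplug_begin()" comment. A minimal sketch of the caller-side pattern the new contract expects; the wrapper name is illustrative only, and the declarations are assumed to be visible via linux/memory_hotplug.h:

#include <linux/memory_hotplug.h>

/* Hypothetical caller: online a pfn range under the hotplug lock, which
 * online_pages() itself no longer takes. */
static int example_online_range(unsigned long start_pfn, unsigned long nr_pages)
{
	int ret;

	mem_hotplug_begin();	/* serialize against other hotplug operations */
	ret = online_pages(start_pfn, nr_pages, MMOP_ONLINE_KERNEL);
	mem_hotplug_done();

	return ret;
}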
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4721046a134a..ede26291d4aa 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -945,7 +945,8 @@ static struct page *new_node_page(struct page *page, unsigned long node, int **x
945 return alloc_huge_page_node(page_hstate(compound_head(page)), 945 return alloc_huge_page_node(page_hstate(compound_head(page)),
946 node); 946 node);
947 else 947 else
948 return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0); 948 return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE |
949 __GFP_THISNODE, 0);
949} 950}
950 951
951/* 952/*
@@ -1985,7 +1986,8 @@ retry_cpuset:
1985 nmask = policy_nodemask(gfp, pol); 1986 nmask = policy_nodemask(gfp, pol);
1986 if (!nmask || node_isset(node, *nmask)) { 1987 if (!nmask || node_isset(node, *nmask)) {
1987 mpol_cond_put(pol); 1988 mpol_cond_put(pol);
1988 page = alloc_pages_exact_node(node, gfp, order); 1989 page = alloc_pages_exact_node(node,
1990 gfp | __GFP_THISNODE, order);
1989 goto out; 1991 goto out;
1990 } 1992 }
1991 } 1993 }
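Both mempolicy call sites now pass __GFP_THISNODE, so the page really comes from the node the policy picked instead of silently falling back through the zonelist. A hedged illustration of the caller-visible difference; the wrapper function is not from the patch:

#include <linux/gfp.h>

/* With __GFP_THISNODE the allocation either succeeds on 'nid' or fails,
 * so the caller sees the miss rather than a page from another node. */
static struct page *example_alloc_on_node(int nid)
{
	return alloc_pages_exact_node(nid,
			GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0);
}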
diff --git a/mm/mempool.c b/mm/mempool.c
index e209c98c7203..949970db2874 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -113,23 +113,24 @@ EXPORT_SYMBOL(mempool_create_node);
113 * mempool_create(). 113 * mempool_create().
114 * @new_min_nr: the new minimum number of elements guaranteed to be 114 * @new_min_nr: the new minimum number of elements guaranteed to be
115 * allocated for this pool. 115 * allocated for this pool.
116 * @gfp_mask: the usual allocation bitmask.
117 * 116 *
118 * This function shrinks/grows the pool. In the case of growing, 117 * This function shrinks/grows the pool. In the case of growing,
119 * it cannot be guaranteed that the pool will be grown to the new 118 * it cannot be guaranteed that the pool will be grown to the new
120 * size immediately, but new mempool_free() calls will refill it. 119 * size immediately, but new mempool_free() calls will refill it.
120 * This function may sleep.
121 * 121 *
122 * Note, the caller must guarantee that no mempool_destroy is called 122 * Note, the caller must guarantee that no mempool_destroy is called
123 * while this function is running. mempool_alloc() & mempool_free() 123 * while this function is running. mempool_alloc() & mempool_free()
124 * might be called (eg. from IRQ contexts) while this function executes. 124 * might be called (eg. from IRQ contexts) while this function executes.
125 */ 125 */
126int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask) 126int mempool_resize(mempool_t *pool, int new_min_nr)
127{ 127{
128 void *element; 128 void *element;
129 void **new_elements; 129 void **new_elements;
130 unsigned long flags; 130 unsigned long flags;
131 131
132 BUG_ON(new_min_nr <= 0); 132 BUG_ON(new_min_nr <= 0);
133 might_sleep();
133 134
134 spin_lock_irqsave(&pool->lock, flags); 135 spin_lock_irqsave(&pool->lock, flags);
135 if (new_min_nr <= pool->min_nr) { 136 if (new_min_nr <= pool->min_nr) {
@@ -145,7 +146,8 @@ int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask)
145 spin_unlock_irqrestore(&pool->lock, flags); 146 spin_unlock_irqrestore(&pool->lock, flags);
146 147
147 /* Grow the pool */ 148 /* Grow the pool */
148 new_elements = kmalloc(new_min_nr * sizeof(*new_elements), gfp_mask); 149 new_elements = kmalloc_array(new_min_nr, sizeof(*new_elements),
150 GFP_KERNEL);
149 if (!new_elements) 151 if (!new_elements)
150 return -ENOMEM; 152 return -ENOMEM;
151 153
@@ -164,7 +166,7 @@ int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask)
164 166
165 while (pool->curr_nr < pool->min_nr) { 167 while (pool->curr_nr < pool->min_nr) {
166 spin_unlock_irqrestore(&pool->lock, flags); 168 spin_unlock_irqrestore(&pool->lock, flags);
167 element = pool->alloc(gfp_mask, pool->pool_data); 169 element = pool->alloc(GFP_KERNEL, pool->pool_data);
168 if (!element) 170 if (!element)
169 goto out; 171 goto out;
170 spin_lock_irqsave(&pool->lock, flags); 172 spin_lock_irqsave(&pool->lock, flags);
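mempool_resize() drops its gfp_mask argument: the kernel-doc now states that it may sleep, might_sleep() asserts it, and the internal allocations use GFP_KERNEL unconditionally. A minimal caller-side sketch with the new two-argument signature; the wrapper name and the target size of 32 elements are illustrative:

#include <linux/mempool.h>

/* Grow an existing pool to at least 32 reserved elements.  Must be called
 * from sleepable (process) context now that GFP_KERNEL is implied. */
static int example_grow_pool(mempool_t *pool)
{
	return mempool_resize(pool, 32);
}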
diff --git a/arch/x86/mm/memtest.c b/mm/memtest.c
index 1e9da795767a..1997d934b13b 100644
--- a/arch/x86/mm/memtest.c
+++ b/mm/memtest.c
@@ -29,7 +29,7 @@ static u64 patterns[] __initdata = {
29 0x7a6c7258554e494cULL, /* yeah ;-) */ 29 0x7a6c7258554e494cULL, /* yeah ;-) */
30}; 30};
31 31
32static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad) 32static void __init reserve_bad_mem(u64 pattern, phys_addr_t start_bad, phys_addr_t end_bad)
33{ 33{
34 printk(KERN_INFO " %016llx bad mem addr %010llx - %010llx reserved\n", 34 printk(KERN_INFO " %016llx bad mem addr %010llx - %010llx reserved\n",
35 (unsigned long long) pattern, 35 (unsigned long long) pattern,
@@ -38,11 +38,11 @@ static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad)
38 memblock_reserve(start_bad, end_bad - start_bad); 38 memblock_reserve(start_bad, end_bad - start_bad);
39} 39}
40 40
41static void __init memtest(u64 pattern, u64 start_phys, u64 size) 41static void __init memtest(u64 pattern, phys_addr_t start_phys, phys_addr_t size)
42{ 42{
43 u64 *p, *start, *end; 43 u64 *p, *start, *end;
44 u64 start_bad, last_bad; 44 phys_addr_t start_bad, last_bad;
45 u64 start_phys_aligned; 45 phys_addr_t start_phys_aligned;
46 const size_t incr = sizeof(pattern); 46 const size_t incr = sizeof(pattern);
47 47
48 start_phys_aligned = ALIGN(start_phys, incr); 48 start_phys_aligned = ALIGN(start_phys, incr);
@@ -69,14 +69,14 @@ static void __init memtest(u64 pattern, u64 start_phys, u64 size)
69 reserve_bad_mem(pattern, start_bad, last_bad + incr); 69 reserve_bad_mem(pattern, start_bad, last_bad + incr);
70} 70}
71 71
72static void __init do_one_pass(u64 pattern, u64 start, u64 end) 72static void __init do_one_pass(u64 pattern, phys_addr_t start, phys_addr_t end)
73{ 73{
74 u64 i; 74 u64 i;
75 phys_addr_t this_start, this_end; 75 phys_addr_t this_start, this_end;
76 76
77 for_each_free_mem_range(i, NUMA_NO_NODE, &this_start, &this_end, NULL) { 77 for_each_free_mem_range(i, NUMA_NO_NODE, &this_start, &this_end, NULL) {
78 this_start = clamp_t(phys_addr_t, this_start, start, end); 78 this_start = clamp(this_start, start, end);
79 this_end = clamp_t(phys_addr_t, this_end, start, end); 79 this_end = clamp(this_end, start, end);
80 if (this_start < this_end) { 80 if (this_start < this_end) {
81 printk(KERN_INFO " %010llx - %010llx pattern %016llx\n", 81 printk(KERN_INFO " %010llx - %010llx pattern %016llx\n",
82 (unsigned long long)this_start, 82 (unsigned long long)this_start,
@@ -102,7 +102,7 @@ static int __init parse_memtest(char *arg)
102 102
103early_param("memtest", parse_memtest); 103early_param("memtest", parse_memtest);
104 104
105void __init early_memtest(unsigned long start, unsigned long end) 105void __init early_memtest(phys_addr_t start, phys_addr_t end)
106{ 106{
107 unsigned int i; 107 unsigned int i;
108 unsigned int idx = 0; 108 unsigned int idx = 0;
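With memtest moved under mm/ and switched to phys_addr_t, an architecture only has to hand early_memtest() its physical DRAM bounds. A hedged sketch of such a hook; the function name and call site are illustrative, and it assumes the early_memtest() prototype is visible to arch code:

#include <linux/init.h>
#include <linux/memblock.h>

/* Illustrative arch-side hook: cover all of DRAM.  early_memtest() only
 * does work if the user booted with "memtest=N". */
static void __init example_arch_memtest(void)
{
	early_memtest(memblock_start_of_DRAM(), memblock_end_of_DRAM());
}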
diff --git a/mm/migrate.c b/mm/migrate.c
index 85e042686031..a65ff72ab739 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -901,12 +901,23 @@ out:
901} 901}
902 902
903/* 903/*
 904 * gcc 4.7 and 4.8 on arm get ICEs when inlining unmap_and_move(). Work
 904 * gcc 4.7 and 4.8 on arm get ICEs when inlining unmap_and_move(). Work
905 * around it.
906 */
907#if (GCC_VERSION >= 40700 && GCC_VERSION < 40900) && defined(CONFIG_ARM)
908#define ICE_noinline noinline
909#else
910#define ICE_noinline
911#endif
912
913/*
904 * Obtain the lock on page, remove all ptes and migrate the page 914 * Obtain the lock on page, remove all ptes and migrate the page
905 * to the newly allocated page in newpage. 915 * to the newly allocated page in newpage.
906 */ 916 */
907static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page, 917static ICE_noinline int unmap_and_move(new_page_t get_new_page,
908 unsigned long private, struct page *page, int force, 918 free_page_t put_new_page,
909 enum migrate_mode mode) 919 unsigned long private, struct page *page,
920 int force, enum migrate_mode mode)
910{ 921{
911 int rc = 0; 922 int rc = 0;
912 int *result = NULL; 923 int *result = NULL;
@@ -1554,30 +1565,10 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
1554 * page migration rate limiting control. 1565 * page migration rate limiting control.
1555 * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs 1566 * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs
1556 * window of time. Default here says do not migrate more than 1280M per second. 1567 * window of time. Default here says do not migrate more than 1280M per second.
1557 * If a node is rate-limited then PTE NUMA updates are also rate-limited. However
1558 * as it is faults that reset the window, pte updates will happen unconditionally
1559 * if there has not been a fault since @pteupdate_interval_millisecs after the
1560 * throttle window closed.
1561 */ 1568 */
1562static unsigned int migrate_interval_millisecs __read_mostly = 100; 1569static unsigned int migrate_interval_millisecs __read_mostly = 100;
1563static unsigned int pteupdate_interval_millisecs __read_mostly = 1000;
1564static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT); 1570static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT);
1565 1571
1566/* Returns true if NUMA migration is currently rate limited */
1567bool migrate_ratelimited(int node)
1568{
1569 pg_data_t *pgdat = NODE_DATA(node);
1570
1571 if (time_after(jiffies, pgdat->numabalancing_migrate_next_window +
1572 msecs_to_jiffies(pteupdate_interval_millisecs)))
1573 return false;
1574
1575 if (pgdat->numabalancing_migrate_nr_pages < ratelimit_pages)
1576 return false;
1577
1578 return true;
1579}
1580
1581/* Returns true if the node is migrate rate-limited after the update */ 1572/* Returns true if the node is migrate rate-limited after the update */
1582static bool numamigrate_update_ratelimit(pg_data_t *pgdat, 1573static bool numamigrate_update_ratelimit(pg_data_t *pgdat,
1583 unsigned long nr_pages) 1574 unsigned long nr_pages)
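The ICE_noinline bounds (40700/40900) key off GCC_VERSION, which the kernel derives from the compiler's own version macros; roughly as defined in include/linux/compiler-gcc.h (quoted from memory, formatting approximate):

/* GCC_VERSION encodes major/minor/patchlevel as MMmmpp, so gcc 4.7.x is
 * 407xx and "GCC_VERSION >= 40700 && GCC_VERSION < 40900" covers exactly
 * the 4.7 and 4.8 series that miscompile the inlined unmap_and_move(). */
#define GCC_VERSION (__GNUC__ * 10000		\
		     + __GNUC_MINOR__ * 100	\
		     + __GNUC_PATCHLEVEL__)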
diff --git a/mm/mlock.c b/mm/mlock.c
index 8a54cd214925..6fd2cf15e868 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -205,62 +205,6 @@ out:
205 return nr_pages - 1; 205 return nr_pages - 1;
206} 206}
207 207
208/**
209 * __mlock_vma_pages_range() - mlock a range of pages in the vma.
210 * @vma: target vma
211 * @start: start address
212 * @end: end address
213 * @nonblocking:
214 *
215 * This takes care of making the pages present too.
216 *
217 * return 0 on success, negative error code on error.
218 *
219 * vma->vm_mm->mmap_sem must be held.
220 *
221 * If @nonblocking is NULL, it may be held for read or write and will
222 * be unperturbed.
223 *
224 * If @nonblocking is non-NULL, it must held for read only and may be
225 * released. If it's released, *@nonblocking will be set to 0.
226 */
227long __mlock_vma_pages_range(struct vm_area_struct *vma,
228 unsigned long start, unsigned long end, int *nonblocking)
229{
230 struct mm_struct *mm = vma->vm_mm;
231 unsigned long nr_pages = (end - start) / PAGE_SIZE;
232 int gup_flags;
233
234 VM_BUG_ON(start & ~PAGE_MASK);
235 VM_BUG_ON(end & ~PAGE_MASK);
236 VM_BUG_ON_VMA(start < vma->vm_start, vma);
237 VM_BUG_ON_VMA(end > vma->vm_end, vma);
238 VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
239
240 gup_flags = FOLL_TOUCH | FOLL_MLOCK;
241 /*
242 * We want to touch writable mappings with a write fault in order
243 * to break COW, except for shared mappings because these don't COW
244 * and we would not want to dirty them for nothing.
245 */
246 if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
247 gup_flags |= FOLL_WRITE;
248
249 /*
250 * We want mlock to succeed for regions that have any permissions
251 * other than PROT_NONE.
252 */
253 if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
254 gup_flags |= FOLL_FORCE;
255
256 /*
257 * We made sure addr is within a VMA, so the following will
258 * not result in a stack expansion that recurses back here.
259 */
260 return __get_user_pages(current, mm, start, nr_pages, gup_flags,
261 NULL, NULL, nonblocking);
262}
263
264/* 208/*
265 * convert get_user_pages() return value to posix mlock() error 209 * convert get_user_pages() return value to posix mlock() error
266 */ 210 */
@@ -596,7 +540,7 @@ success:
596 /* 540 /*
597 * vm_flags is protected by the mmap_sem held in write mode. 541 * vm_flags is protected by the mmap_sem held in write mode.
598 * It's okay if try_to_unmap_one unmaps a page just after we 542 * It's okay if try_to_unmap_one unmaps a page just after we
599 * set VM_LOCKED, __mlock_vma_pages_range will bring it back. 543 * set VM_LOCKED, populate_vma_page_range will bring it back.
600 */ 544 */
601 545
602 if (lock) 546 if (lock)
@@ -660,69 +604,6 @@ static int do_mlock(unsigned long start, size_t len, int on)
660 return error; 604 return error;
661} 605}
662 606
663/*
664 * __mm_populate - populate and/or mlock pages within a range of address space.
665 *
666 * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
667 * flags. VMAs must be already marked with the desired vm_flags, and
668 * mmap_sem must not be held.
669 */
670int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
671{
672 struct mm_struct *mm = current->mm;
673 unsigned long end, nstart, nend;
674 struct vm_area_struct *vma = NULL;
675 int locked = 0;
676 long ret = 0;
677
678 VM_BUG_ON(start & ~PAGE_MASK);
679 VM_BUG_ON(len != PAGE_ALIGN(len));
680 end = start + len;
681
682 for (nstart = start; nstart < end; nstart = nend) {
683 /*
684 * We want to fault in pages for [nstart; end) address range.
685 * Find first corresponding VMA.
686 */
687 if (!locked) {
688 locked = 1;
689 down_read(&mm->mmap_sem);
690 vma = find_vma(mm, nstart);
691 } else if (nstart >= vma->vm_end)
692 vma = vma->vm_next;
693 if (!vma || vma->vm_start >= end)
694 break;
695 /*
696 * Set [nstart; nend) to intersection of desired address
697 * range with the first VMA. Also, skip undesirable VMA types.
698 */
699 nend = min(end, vma->vm_end);
700 if (vma->vm_flags & (VM_IO | VM_PFNMAP))
701 continue;
702 if (nstart < vma->vm_start)
703 nstart = vma->vm_start;
704 /*
705 * Now fault in a range of pages. __mlock_vma_pages_range()
706 * double checks the vma flags, so that it won't mlock pages
707 * if the vma was already munlocked.
708 */
709 ret = __mlock_vma_pages_range(vma, nstart, nend, &locked);
710 if (ret < 0) {
711 if (ignore_errors) {
712 ret = 0;
713 continue; /* continue at next VMA */
714 }
715 ret = __mlock_posix_error_return(ret);
716 break;
717 }
718 nend = nstart + ret * PAGE_SIZE;
719 ret = 0;
720 }
721 if (locked)
722 up_read(&mm->mmap_sem);
723 return ret; /* 0 or negative error code */
724}
725
726SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) 607SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
727{ 608{
728 unsigned long locked; 609 unsigned long locked;
@@ -750,9 +631,13 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
750 error = do_mlock(start, len, 1); 631 error = do_mlock(start, len, 1);
751 632
752 up_write(&current->mm->mmap_sem); 633 up_write(&current->mm->mmap_sem);
753 if (!error) 634 if (error)
754 error = __mm_populate(start, len, 0); 635 return error;
755 return error; 636
637 error = __mm_populate(start, len, 0);
638 if (error)
639 return __mlock_posix_error_return(error);
640 return 0;
756} 641}
757 642
758SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) 643SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
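The fault-in helpers leave mm/mlock.c (the mm/mmap.c hunk below shows __mlock_vma_pages_range() living on under the name populate_vma_page_range()), and the mlock() syscall now runs the population result through __mlock_posix_error_return() itself. The user-visible contract is unchanged: failures still come back as the POSIX errno values. A small userspace sketch of that contract, with a hypothetical helper name:

#include <errno.h>
#include <stdio.h>
#include <sys/mman.h>

/* Userspace view (hedged): mlock() reports inability to fault in or lock
 * the range with errno values such as ENOMEM, EAGAIN or EPERM. */
static int lock_buffer(void *buf, size_t len)
{
	if (mlock(buf, len) == -1) {
		perror("mlock");
		return -1;
	}
	return 0;
}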
diff --git a/mm/mmap.c b/mm/mmap.c
index 9ec50a368634..06a6076c92e5 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2316,7 +2316,7 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
2316 if (!prev || expand_stack(prev, addr)) 2316 if (!prev || expand_stack(prev, addr))
2317 return NULL; 2317 return NULL;
2318 if (prev->vm_flags & VM_LOCKED) 2318 if (prev->vm_flags & VM_LOCKED)
2319 __mlock_vma_pages_range(prev, addr, prev->vm_end, NULL); 2319 populate_vma_page_range(prev, addr, prev->vm_end, NULL);
2320 return prev; 2320 return prev;
2321} 2321}
2322#else 2322#else
@@ -2351,7 +2351,7 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
2351 if (expand_stack(vma, addr)) 2351 if (expand_stack(vma, addr))
2352 return NULL; 2352 return NULL;
2353 if (vma->vm_flags & VM_LOCKED) 2353 if (vma->vm_flags & VM_LOCKED)
2354 __mlock_vma_pages_range(vma, addr, start, NULL); 2354 populate_vma_page_range(vma, addr, start, NULL);
2355 return vma; 2355 return vma;
2356} 2356}
2357#endif 2357#endif
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 642f38cb175a..52628c819bf7 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -612,7 +612,8 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
612 * Determines whether the kernel must panic because of the panic_on_oom sysctl. 612 * Determines whether the kernel must panic because of the panic_on_oom sysctl.
613 */ 613 */
614void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, 614void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
615 int order, const nodemask_t *nodemask) 615 int order, const nodemask_t *nodemask,
616 struct mem_cgroup *memcg)
616{ 617{
617 if (likely(!sysctl_panic_on_oom)) 618 if (likely(!sysctl_panic_on_oom))
618 return; 619 return;
@@ -625,7 +626,7 @@ void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
625 if (constraint != CONSTRAINT_NONE) 626 if (constraint != CONSTRAINT_NONE)
626 return; 627 return;
627 } 628 }
628 dump_header(NULL, gfp_mask, order, NULL, nodemask); 629 dump_header(NULL, gfp_mask, order, memcg, nodemask);
629 panic("Out of memory: %s panic_on_oom is enabled\n", 630 panic("Out of memory: %s panic_on_oom is enabled\n",
630 sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide"); 631 sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
631} 632}
@@ -740,7 +741,7 @@ static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
740 constraint = constrained_alloc(zonelist, gfp_mask, nodemask, 741 constraint = constrained_alloc(zonelist, gfp_mask, nodemask,
741 &totalpages); 742 &totalpages);
742 mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL; 743 mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL;
743 check_panic_on_oom(constraint, gfp_mask, order, mpol_mask); 744 check_panic_on_oom(constraint, gfp_mask, order, mpol_mask, NULL);
744 745
745 if (sysctl_oom_kill_allocating_task && current->mm && 746 if (sysctl_oom_kill_allocating_task && current->mm &&
746 !oom_unkillable_task(current, NULL, nodemask) && 747 !oom_unkillable_task(current, NULL, nodemask) &&
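check_panic_on_oom() now carries a mem_cgroup pointer through to dump_header(), so a panic_on_oom panic triggered from a memcg OOM can print that cgroup's state; the global path above simply passes NULL. A hedged sketch of what the memcg-side call looks like (this is not the literal mm/memcontrol.c hunk, and the wrapper is hypothetical):

#include <linux/memcontrol.h>
#include <linux/oom.h>

static void example_memcg_oom_check(struct mem_cgroup *memcg,
				    gfp_t gfp_mask, int order)
{
	/* A memcg-constrained OOM has no nodemask; hand the memcg through
	 * so dump_header() can include its state in the panic output. */
	check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL, memcg);
}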
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 644bcb665773..0372411f38fc 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2111,6 +2111,25 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
2111EXPORT_SYMBOL(account_page_dirtied); 2111EXPORT_SYMBOL(account_page_dirtied);
2112 2112
2113/* 2113/*
2114 * Helper function for deaccounting dirty page without writeback.
2115 *
2116 * Doing this should *normally* only ever be done when a page
2117 * is truncated, and is not actually mapped anywhere at all. However,
2118 * fs/buffer.c does this when it notices that somebody has cleaned
2119 * out all the buffers on a page without actually doing it through
2120 * the VM. Can you say "ext3 is horribly ugly"? Thought you could.
2121 */
2122void account_page_cleaned(struct page *page, struct address_space *mapping)
2123{
2124 if (mapping_cap_account_dirty(mapping)) {
2125 dec_zone_page_state(page, NR_FILE_DIRTY);
2126 dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE);
2127 task_io_account_cancelled_write(PAGE_CACHE_SIZE);
2128 }
2129}
2130EXPORT_SYMBOL(account_page_cleaned);
2131
2132/*
2114 * For address_spaces which do not use buffers. Just tag the page as dirty in 2133 * For address_spaces which do not use buffers. Just tag the page as dirty in
2115 * its radix tree. 2134 * its radix tree.
2116 * 2135 *
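account_page_cleaned() only reverses the dirty accounting; clearing PG_dirty stays with the caller. The intended pairing is visible in the mm/truncate.c hunk later in this patch and boils down to the following sketch (the wrapper name is illustrative, and account_page_cleaned() is assumed to be declared in linux/mm.h by this series):

#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/pagemap.h>

/* Clear the dirty bit ourselves, then undo the dirty accounting for
 * mapping-backed pages, mirroring truncate_complete_page() below. */
static void example_cancel_dirty(struct page *page,
				 struct address_space *mapping)
{
	if (TestClearPageDirty(page))
		account_page_cleaned(page, mapping);
}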
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 40e29429e7b0..1b849500640c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1032,11 +1032,9 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
1032static int fallbacks[MIGRATE_TYPES][4] = { 1032static int fallbacks[MIGRATE_TYPES][4] = {
1033 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, 1033 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
1034 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, 1034 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
1035 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
1035#ifdef CONFIG_CMA 1036#ifdef CONFIG_CMA
1036 [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
1037 [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */ 1037 [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */
1038#else
1039 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
1040#endif 1038#endif
1041 [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ 1039 [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */
1042#ifdef CONFIG_MEMORY_ISOLATION 1040#ifdef CONFIG_MEMORY_ISOLATION
@@ -1044,6 +1042,17 @@ static int fallbacks[MIGRATE_TYPES][4] = {
1044#endif 1042#endif
1045}; 1043};
1046 1044
1045#ifdef CONFIG_CMA
1046static struct page *__rmqueue_cma_fallback(struct zone *zone,
1047 unsigned int order)
1048{
1049 return __rmqueue_smallest(zone, order, MIGRATE_CMA);
1050}
1051#else
1052static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
1053 unsigned int order) { return NULL; }
1054#endif
1055
1047/* 1056/*
1048 * Move the free pages in a range to the free lists of the requested type. 1057 * Move the free pages in a range to the free lists of the requested type.
1049 * Note that start_page and end_pages are not aligned on a pageblock 1058 * Note that start_page and end_pages are not aligned on a pageblock
@@ -1136,14 +1145,40 @@ static void change_pageblock_range(struct page *pageblock_page,
1136 * as fragmentation caused by those allocations polluting movable pageblocks 1145 * as fragmentation caused by those allocations polluting movable pageblocks
1137 * is worse than movable allocations stealing from unmovable and reclaimable 1146 * is worse than movable allocations stealing from unmovable and reclaimable
1138 * pageblocks. 1147 * pageblocks.
1139 *
1140 * If we claim more than half of the pageblock, change pageblock's migratetype
1141 * as well.
1142 */ 1148 */
1143static void try_to_steal_freepages(struct zone *zone, struct page *page, 1149static bool can_steal_fallback(unsigned int order, int start_mt)
1144 int start_type, int fallback_type) 1150{
1151 /*
1152 * Leaving this order check is intended, although there is
1153 * relaxed order check in next check. The reason is that
1154 * we can actually steal whole pageblock if this condition met,
1155 * but, below check doesn't guarantee it and that is just heuristic
1156 * so could be changed anytime.
1157 */
1158 if (order >= pageblock_order)
1159 return true;
1160
1161 if (order >= pageblock_order / 2 ||
1162 start_mt == MIGRATE_RECLAIMABLE ||
1163 start_mt == MIGRATE_UNMOVABLE ||
1164 page_group_by_mobility_disabled)
1165 return true;
1166
1167 return false;
1168}
1169
1170/*
1171 * This function implements actual steal behaviour. If order is large enough,
1172 * we can steal whole pageblock. If not, we first move freepages in this
1173 * pageblock and check whether half of pages are moved or not. If half of
1174 * pages are moved, we can change migratetype of pageblock and permanently
 1175 * use its pages as requested migratetype in the future.
1176 */
1177static void steal_suitable_fallback(struct zone *zone, struct page *page,
1178 int start_type)
1145{ 1179{
1146 int current_order = page_order(page); 1180 int current_order = page_order(page);
1181 int pages;
1147 1182
1148 /* Take ownership for orders >= pageblock_order */ 1183 /* Take ownership for orders >= pageblock_order */
1149 if (current_order >= pageblock_order) { 1184 if (current_order >= pageblock_order) {
@@ -1151,19 +1186,49 @@ static void try_to_steal_freepages(struct zone *zone, struct page *page,
1151 return; 1186 return;
1152 } 1187 }
1153 1188
1154 if (current_order >= pageblock_order / 2 || 1189 pages = move_freepages_block(zone, page, start_type);
1155 start_type == MIGRATE_RECLAIMABLE || 1190
1156 start_type == MIGRATE_UNMOVABLE || 1191 /* Claim the whole block if over half of it is free */
1157 page_group_by_mobility_disabled) { 1192 if (pages >= (1 << (pageblock_order-1)) ||
1158 int pages; 1193 page_group_by_mobility_disabled)
1194 set_pageblock_migratetype(page, start_type);
1195}
1196
1197/*
1198 * Check whether there is a suitable fallback freepage with requested order.
1199 * If only_stealable is true, this function returns fallback_mt only if
1200 * we can steal other freepages all together. This would help to reduce
1201 * fragmentation due to mixed migratetype pages in one pageblock.
1202 */
1203int find_suitable_fallback(struct free_area *area, unsigned int order,
1204 int migratetype, bool only_stealable, bool *can_steal)
1205{
1206 int i;
1207 int fallback_mt;
1208
1209 if (area->nr_free == 0)
1210 return -1;
1211
1212 *can_steal = false;
1213 for (i = 0;; i++) {
1214 fallback_mt = fallbacks[migratetype][i];
1215 if (fallback_mt == MIGRATE_RESERVE)
1216 break;
1217
1218 if (list_empty(&area->free_list[fallback_mt]))
1219 continue;
1159 1220
1160 pages = move_freepages_block(zone, page, start_type); 1221 if (can_steal_fallback(order, migratetype))
1222 *can_steal = true;
1161 1223
1162 /* Claim the whole block if over half of it is free */ 1224 if (!only_stealable)
1163 if (pages >= (1 << (pageblock_order-1)) || 1225 return fallback_mt;
1164 page_group_by_mobility_disabled) 1226
1165 set_pageblock_migratetype(page, start_type); 1227 if (*can_steal)
1228 return fallback_mt;
1166 } 1229 }
1230
1231 return -1;
1167} 1232}
1168 1233
1169/* Remove an element from the buddy allocator from the fallback list */ 1234/* Remove an element from the buddy allocator from the fallback list */
@@ -1173,64 +1238,45 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
1173 struct free_area *area; 1238 struct free_area *area;
1174 unsigned int current_order; 1239 unsigned int current_order;
1175 struct page *page; 1240 struct page *page;
1241 int fallback_mt;
1242 bool can_steal;
1176 1243
1177 /* Find the largest possible block of pages in the other list */ 1244 /* Find the largest possible block of pages in the other list */
1178 for (current_order = MAX_ORDER-1; 1245 for (current_order = MAX_ORDER-1;
1179 current_order >= order && current_order <= MAX_ORDER-1; 1246 current_order >= order && current_order <= MAX_ORDER-1;
1180 --current_order) { 1247 --current_order) {
1181 int i; 1248 area = &(zone->free_area[current_order]);
1182 for (i = 0;; i++) { 1249 fallback_mt = find_suitable_fallback(area, current_order,
1183 int migratetype = fallbacks[start_migratetype][i]; 1250 start_migratetype, false, &can_steal);
1184 int buddy_type = start_migratetype; 1251 if (fallback_mt == -1)
1185 1252 continue;
1186 /* MIGRATE_RESERVE handled later if necessary */
1187 if (migratetype == MIGRATE_RESERVE)
1188 break;
1189
1190 area = &(zone->free_area[current_order]);
1191 if (list_empty(&area->free_list[migratetype]))
1192 continue;
1193
1194 page = list_entry(area->free_list[migratetype].next,
1195 struct page, lru);
1196 area->nr_free--;
1197
1198 if (!is_migrate_cma(migratetype)) {
1199 try_to_steal_freepages(zone, page,
1200 start_migratetype,
1201 migratetype);
1202 } else {
1203 /*
1204 * When borrowing from MIGRATE_CMA, we need to
1205 * release the excess buddy pages to CMA
1206 * itself, and we do not try to steal extra
1207 * free pages.
1208 */
1209 buddy_type = migratetype;
1210 }
1211 1253
1212 /* Remove the page from the freelists */ 1254 page = list_entry(area->free_list[fallback_mt].next,
1213 list_del(&page->lru); 1255 struct page, lru);
1214 rmv_page_order(page); 1256 if (can_steal)
1257 steal_suitable_fallback(zone, page, start_migratetype);
1215 1258
1216 expand(zone, page, order, current_order, area, 1259 /* Remove the page from the freelists */
1217 buddy_type); 1260 area->nr_free--;
1261 list_del(&page->lru);
1262 rmv_page_order(page);
1218 1263
1219 /* 1264 expand(zone, page, order, current_order, area,
1220 * The freepage_migratetype may differ from pageblock's 1265 start_migratetype);
1221 * migratetype depending on the decisions in 1266 /*
1222 * try_to_steal_freepages(). This is OK as long as it 1267 * The freepage_migratetype may differ from pageblock's
1223 * does not differ for MIGRATE_CMA pageblocks. For CMA 1268 * migratetype depending on the decisions in
1224 * we need to make sure unallocated pages flushed from 1269 * try_to_steal_freepages(). This is OK as long as it
1225 * pcp lists are returned to the correct freelist. 1270 * does not differ for MIGRATE_CMA pageblocks. For CMA
1226 */ 1271 * we need to make sure unallocated pages flushed from
1227 set_freepage_migratetype(page, buddy_type); 1272 * pcp lists are returned to the correct freelist.
1273 */
1274 set_freepage_migratetype(page, start_migratetype);
1228 1275
1229 trace_mm_page_alloc_extfrag(page, order, current_order, 1276 trace_mm_page_alloc_extfrag(page, order, current_order,
1230 start_migratetype, migratetype); 1277 start_migratetype, fallback_mt);
1231 1278
1232 return page; 1279 return page;
1233 }
1234 } 1280 }
1235 1281
1236 return NULL; 1282 return NULL;
@@ -1249,7 +1295,11 @@ retry_reserve:
1249 page = __rmqueue_smallest(zone, order, migratetype); 1295 page = __rmqueue_smallest(zone, order, migratetype);
1250 1296
1251 if (unlikely(!page) && migratetype != MIGRATE_RESERVE) { 1297 if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
1252 page = __rmqueue_fallback(zone, order, migratetype); 1298 if (migratetype == MIGRATE_MOVABLE)
1299 page = __rmqueue_cma_fallback(zone, order);
1300
1301 if (!page)
1302 page = __rmqueue_fallback(zone, order, migratetype);
1253 1303
1254 /* 1304 /*
1255 * Use MIGRATE_RESERVE rather than fail an allocation. goto 1305 * Use MIGRATE_RESERVE rather than fail an allocation. goto
@@ -2362,13 +2412,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2362 *did_some_progress = 1; 2412 *did_some_progress = 1;
2363 goto out; 2413 goto out;
2364 } 2414 }
2365 /* 2415 /* The OOM killer may not free memory on a specific node */
2366 * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
2367 * Sanity check for bare calls of __GFP_THISNODE, not real OOM.
2368 * The caller should handle page allocation failure by itself if
2369 * it specifies __GFP_THISNODE.
2370 * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER.
2371 */
2372 if (gfp_mask & __GFP_THISNODE) 2416 if (gfp_mask & __GFP_THISNODE)
2373 goto out; 2417 goto out;
2374 } 2418 }
@@ -2623,15 +2667,11 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2623 } 2667 }
2624 2668
2625 /* 2669 /*
2626 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and 2670 * If this allocation cannot block and it is for a specific node, then
2627 * __GFP_NOWARN set) should not cause reclaim since the subsystem 2671 * fail early. There's no need to wakeup kswapd or retry for a
2628 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim 2672 * speculative node-specific allocation.
2629 * using a larger set of nodes after it has established that the
2630 * allowed per node queues are empty and that nodes are
2631 * over allocated.
2632 */ 2673 */
2633 if (IS_ENABLED(CONFIG_NUMA) && 2674 if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !wait)
2634 (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
2635 goto nopage; 2675 goto nopage;
2636 2676
2637retry: 2677retry:
@@ -2824,7 +2864,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2824 /* 2864 /*
2825 * Check the zones suitable for the gfp_mask contain at least one 2865 * Check the zones suitable for the gfp_mask contain at least one
2826 * valid zone. It's possible to have an empty zonelist as a result 2866 * valid zone. It's possible to have an empty zonelist as a result
2827 * of GFP_THISNODE and a memoryless node 2867 * of __GFP_THISNODE and a memoryless node
2828 */ 2868 */
2829 if (unlikely(!zonelist->_zonerefs->zone)) 2869 if (unlikely(!zonelist->_zonerefs->zone))
2830 return NULL; 2870 return NULL;
@@ -3201,38 +3241,31 @@ static void show_migration_types(unsigned char type)
3201 * Show free area list (used inside shift_scroll-lock stuff) 3241 * Show free area list (used inside shift_scroll-lock stuff)
3202 * We also calculate the percentage fragmentation. We do this by counting the 3242 * We also calculate the percentage fragmentation. We do this by counting the
3203 * memory on each free list with the exception of the first item on the list. 3243 * memory on each free list with the exception of the first item on the list.
3204 * Suppresses nodes that are not allowed by current's cpuset if 3244 *
3205 * SHOW_MEM_FILTER_NODES is passed. 3245 * Bits in @filter:
3246 * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's
3247 * cpuset.
3206 */ 3248 */
3207void show_free_areas(unsigned int filter) 3249void show_free_areas(unsigned int filter)
3208{ 3250{
3251 unsigned long free_pcp = 0;
3209 int cpu; 3252 int cpu;
3210 struct zone *zone; 3253 struct zone *zone;
3211 3254
3212 for_each_populated_zone(zone) { 3255 for_each_populated_zone(zone) {
3213 if (skip_free_areas_node(filter, zone_to_nid(zone))) 3256 if (skip_free_areas_node(filter, zone_to_nid(zone)))
3214 continue; 3257 continue;
3215 show_node(zone);
3216 printk("%s per-cpu:\n", zone->name);
3217 3258
3218 for_each_online_cpu(cpu) { 3259 for_each_online_cpu(cpu)
3219 struct per_cpu_pageset *pageset; 3260 free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
3220
3221 pageset = per_cpu_ptr(zone->pageset, cpu);
3222
3223 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
3224 cpu, pageset->pcp.high,
3225 pageset->pcp.batch, pageset->pcp.count);
3226 }
3227 } 3261 }
3228 3262
3229 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" 3263 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
3230 " active_file:%lu inactive_file:%lu isolated_file:%lu\n" 3264 " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
3231 " unevictable:%lu" 3265 " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n"
3232 " dirty:%lu writeback:%lu unstable:%lu\n" 3266 " slab_reclaimable:%lu slab_unreclaimable:%lu\n"
3233 " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
3234 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" 3267 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
3235 " free_cma:%lu\n", 3268 " free:%lu free_pcp:%lu free_cma:%lu\n",
3236 global_page_state(NR_ACTIVE_ANON), 3269 global_page_state(NR_ACTIVE_ANON),
3237 global_page_state(NR_INACTIVE_ANON), 3270 global_page_state(NR_INACTIVE_ANON),
3238 global_page_state(NR_ISOLATED_ANON), 3271 global_page_state(NR_ISOLATED_ANON),
@@ -3243,13 +3276,14 @@ void show_free_areas(unsigned int filter)
3243 global_page_state(NR_FILE_DIRTY), 3276 global_page_state(NR_FILE_DIRTY),
3244 global_page_state(NR_WRITEBACK), 3277 global_page_state(NR_WRITEBACK),
3245 global_page_state(NR_UNSTABLE_NFS), 3278 global_page_state(NR_UNSTABLE_NFS),
3246 global_page_state(NR_FREE_PAGES),
3247 global_page_state(NR_SLAB_RECLAIMABLE), 3279 global_page_state(NR_SLAB_RECLAIMABLE),
3248 global_page_state(NR_SLAB_UNRECLAIMABLE), 3280 global_page_state(NR_SLAB_UNRECLAIMABLE),
3249 global_page_state(NR_FILE_MAPPED), 3281 global_page_state(NR_FILE_MAPPED),
3250 global_page_state(NR_SHMEM), 3282 global_page_state(NR_SHMEM),
3251 global_page_state(NR_PAGETABLE), 3283 global_page_state(NR_PAGETABLE),
3252 global_page_state(NR_BOUNCE), 3284 global_page_state(NR_BOUNCE),
3285 global_page_state(NR_FREE_PAGES),
3286 free_pcp,
3253 global_page_state(NR_FREE_CMA_PAGES)); 3287 global_page_state(NR_FREE_CMA_PAGES));
3254 3288
3255 for_each_populated_zone(zone) { 3289 for_each_populated_zone(zone) {
@@ -3257,6 +3291,11 @@ void show_free_areas(unsigned int filter)
3257 3291
3258 if (skip_free_areas_node(filter, zone_to_nid(zone))) 3292 if (skip_free_areas_node(filter, zone_to_nid(zone)))
3259 continue; 3293 continue;
3294
3295 free_pcp = 0;
3296 for_each_online_cpu(cpu)
3297 free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
3298
3260 show_node(zone); 3299 show_node(zone);
3261 printk("%s" 3300 printk("%s"
3262 " free:%lukB" 3301 " free:%lukB"
@@ -3283,6 +3322,8 @@ void show_free_areas(unsigned int filter)
3283 " pagetables:%lukB" 3322 " pagetables:%lukB"
3284 " unstable:%lukB" 3323 " unstable:%lukB"
3285 " bounce:%lukB" 3324 " bounce:%lukB"
3325 " free_pcp:%lukB"
3326 " local_pcp:%ukB"
3286 " free_cma:%lukB" 3327 " free_cma:%lukB"
3287 " writeback_tmp:%lukB" 3328 " writeback_tmp:%lukB"
3288 " pages_scanned:%lu" 3329 " pages_scanned:%lu"
@@ -3314,6 +3355,8 @@ void show_free_areas(unsigned int filter)
3314 K(zone_page_state(zone, NR_PAGETABLE)), 3355 K(zone_page_state(zone, NR_PAGETABLE)),
3315 K(zone_page_state(zone, NR_UNSTABLE_NFS)), 3356 K(zone_page_state(zone, NR_UNSTABLE_NFS)),
3316 K(zone_page_state(zone, NR_BOUNCE)), 3357 K(zone_page_state(zone, NR_BOUNCE)),
3358 K(free_pcp),
3359 K(this_cpu_read(zone->pageset->pcp.count)),
3317 K(zone_page_state(zone, NR_FREE_CMA_PAGES)), 3360 K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
3318 K(zone_page_state(zone, NR_WRITEBACK_TEMP)), 3361 K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
3319 K(zone_page_state(zone, NR_PAGES_SCANNED)), 3362 K(zone_page_state(zone, NR_PAGES_SCANNED)),
@@ -5717,7 +5760,7 @@ static void __setup_per_zone_wmarks(void)
5717 * value here. 5760 * value here.
5718 * 5761 *
5719 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) 5762 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
5720 * deltas controls asynch page reclaim, and so should 5763 * deltas control asynch page reclaim, and so should
5721 * not be capped for highmem. 5764 * not be capped for highmem.
5722 */ 5765 */
5723 unsigned long min_pages; 5766 unsigned long min_pages;
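find_suitable_fallback() is written so it can have more than one user: with only_stealable set it only reports a fallback whose pageblock could be claimed outright. A hedged sketch of such a caller; the wrapper is hypothetical, it assumes the declaration this series adds to mm/internal.h, and elsewhere in the series the same mode appears to be used by compaction's finish check:

#include <linux/mmzone.h>
#include "internal.h"	/* mm-internal declaration of find_suitable_fallback() */

static int example_stealable_fallback(struct zone *zone, unsigned int order,
				      int migratetype)
{
	struct free_area *area = &zone->free_area[order];
	bool can_steal = false;

	/* Returns a fallback migratetype only if its whole pageblock could
	 * be stolen for 'migratetype', else -1. */
	return find_suitable_fallback(area, order, migratetype,
				      true, &can_steal);
}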
diff --git a/mm/slab.c b/mm/slab.c
index c4b89eaf4c96..7eb38dd1cefa 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -857,6 +857,11 @@ static inline void *____cache_alloc_node(struct kmem_cache *cachep,
857 return NULL; 857 return NULL;
858} 858}
859 859
860static inline gfp_t gfp_exact_node(gfp_t flags)
861{
862 return flags;
863}
864
860#else /* CONFIG_NUMA */ 865#else /* CONFIG_NUMA */
861 866
862static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); 867static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
@@ -1023,6 +1028,15 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1023 1028
1024 return __cache_free_alien(cachep, objp, node, page_node); 1029 return __cache_free_alien(cachep, objp, node, page_node);
1025} 1030}
1031
1032/*
1033 * Construct gfp mask to allocate from a specific node but do not invoke reclaim
1034 * or warn about failures.
1035 */
1036static inline gfp_t gfp_exact_node(gfp_t flags)
1037{
1038 return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~__GFP_WAIT;
1039}
1026#endif 1040#endif
1027 1041
1028/* 1042/*
@@ -2825,7 +2839,7 @@ alloc_done:
2825 if (unlikely(!ac->avail)) { 2839 if (unlikely(!ac->avail)) {
2826 int x; 2840 int x;
2827force_grow: 2841force_grow:
2828 x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL); 2842 x = cache_grow(cachep, gfp_exact_node(flags), node, NULL);
2829 2843
2830 /* cache_grow can reenable interrupts, then ac could change. */ 2844 /* cache_grow can reenable interrupts, then ac could change. */
2831 ac = cpu_cache_get(cachep); 2845 ac = cpu_cache_get(cachep);
@@ -3019,7 +3033,7 @@ retry:
3019 get_node(cache, nid) && 3033 get_node(cache, nid) &&
3020 get_node(cache, nid)->free_objects) { 3034 get_node(cache, nid)->free_objects) {
3021 obj = ____cache_alloc_node(cache, 3035 obj = ____cache_alloc_node(cache,
3022 flags | GFP_THISNODE, nid); 3036 gfp_exact_node(flags), nid);
3023 if (obj) 3037 if (obj)
3024 break; 3038 break;
3025 } 3039 }
@@ -3047,7 +3061,7 @@ retry:
3047 nid = page_to_nid(page); 3061 nid = page_to_nid(page);
3048 if (cache_grow(cache, flags, nid, page)) { 3062 if (cache_grow(cache, flags, nid, page)) {
3049 obj = ____cache_alloc_node(cache, 3063 obj = ____cache_alloc_node(cache,
3050 flags | GFP_THISNODE, nid); 3064 gfp_exact_node(flags), nid);
3051 if (!obj) 3065 if (!obj)
3052 /* 3066 /*
3053 * Another processor may allocate the 3067 * Another processor may allocate the
@@ -3118,7 +3132,7 @@ retry:
3118 3132
3119must_grow: 3133must_grow:
3120 spin_unlock(&n->list_lock); 3134 spin_unlock(&n->list_lock);
3121 x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL); 3135 x = cache_grow(cachep, gfp_exact_node(flags), nodeid, NULL);
3122 if (x) 3136 if (x)
3123 goto retry; 3137 goto retry;
3124 3138
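A worked example of what the new NUMA gfp_exact_node() helper produces, assuming the usual GFP_KERNEL = __GFP_WAIT | __GFP_IO | __GFP_FS of this kernel:

/*
 *   gfp_exact_node(GFP_KERNEL)
 *       = (GFP_KERNEL | __GFP_THISNODE | __GFP_NOWARN) & ~__GFP_WAIT
 *       = __GFP_IO | __GFP_FS | __GFP_THISNODE | __GFP_NOWARN
 *
 * so cache_grow() stays pinned to the requested node and, with __GFP_WAIT
 * cleared, will neither sleep nor enter direct reclaim, and will not warn
 * when that node is out of pages.
 */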
diff --git a/mm/slob.c b/mm/slob.c
index 94a7fede6d48..4765f65019c7 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -532,7 +532,7 @@ int __kmem_cache_create(struct kmem_cache *c, unsigned long flags)
532 return 0; 532 return 0;
533} 533}
534 534
535void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node) 535static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
536{ 536{
537 void *b; 537 void *b;
538 538
@@ -558,7 +558,6 @@ void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
558 kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags); 558 kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags);
559 return b; 559 return b;
560} 560}
561EXPORT_SYMBOL(slob_alloc_node);
562 561
563void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) 562void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
564{ 563{
diff --git a/mm/slub.c b/mm/slub.c
index 82c473780c91..0fdd6c1e1f82 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -374,7 +374,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page
374 if (cmpxchg_double(&page->freelist, &page->counters, 374 if (cmpxchg_double(&page->freelist, &page->counters,
375 freelist_old, counters_old, 375 freelist_old, counters_old,
376 freelist_new, counters_new)) 376 freelist_new, counters_new))
377 return 1; 377 return true;
378 } else 378 } else
379#endif 379#endif
380 { 380 {
@@ -384,7 +384,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page
384 page->freelist = freelist_new; 384 page->freelist = freelist_new;
385 set_page_slub_counters(page, counters_new); 385 set_page_slub_counters(page, counters_new);
386 slab_unlock(page); 386 slab_unlock(page);
387 return 1; 387 return true;
388 } 388 }
389 slab_unlock(page); 389 slab_unlock(page);
390 } 390 }
@@ -396,7 +396,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page
396 pr_info("%s %s: cmpxchg double redo ", n, s->name); 396 pr_info("%s %s: cmpxchg double redo ", n, s->name);
397#endif 397#endif
398 398
399 return 0; 399 return false;
400} 400}
401 401
402static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, 402static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
@@ -410,7 +410,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
410 if (cmpxchg_double(&page->freelist, &page->counters, 410 if (cmpxchg_double(&page->freelist, &page->counters,
411 freelist_old, counters_old, 411 freelist_old, counters_old,
412 freelist_new, counters_new)) 412 freelist_new, counters_new))
413 return 1; 413 return true;
414 } else 414 } else
415#endif 415#endif
416 { 416 {
@@ -424,7 +424,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
424 set_page_slub_counters(page, counters_new); 424 set_page_slub_counters(page, counters_new);
425 slab_unlock(page); 425 slab_unlock(page);
426 local_irq_restore(flags); 426 local_irq_restore(flags);
427 return 1; 427 return true;
428 } 428 }
429 slab_unlock(page); 429 slab_unlock(page);
430 local_irq_restore(flags); 430 local_irq_restore(flags);
@@ -437,7 +437,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
437 pr_info("%s %s: cmpxchg double redo ", n, s->name); 437 pr_info("%s %s: cmpxchg double redo ", n, s->name);
438#endif 438#endif
439 439
440 return 0; 440 return false;
441} 441}
442 442
443#ifdef CONFIG_SLUB_DEBUG 443#ifdef CONFIG_SLUB_DEBUG
@@ -1137,15 +1137,6 @@ static int __init setup_slub_debug(char *str)
1137 */ 1137 */
1138 goto check_slabs; 1138 goto check_slabs;
1139 1139
1140 if (tolower(*str) == 'o') {
1141 /*
1142 * Avoid enabling debugging on caches if its minimum order
1143 * would increase as a result.
1144 */
1145 disable_higher_order_debug = 1;
1146 goto out;
1147 }
1148
1149 slub_debug = 0; 1140 slub_debug = 0;
1150 if (*str == '-') 1141 if (*str == '-')
1151 /* 1142 /*
@@ -1176,6 +1167,13 @@ static int __init setup_slub_debug(char *str)
1176 case 'a': 1167 case 'a':
1177 slub_debug |= SLAB_FAILSLAB; 1168 slub_debug |= SLAB_FAILSLAB;
1178 break; 1169 break;
1170 case 'o':
1171 /*
1172 * Avoid enabling debugging on caches if its minimum
1173 * order would increase as a result.
1174 */
1175 disable_higher_order_debug = 1;
1176 break;
1179 default: 1177 default:
1180 pr_err("slub_debug option '%c' unknown. skipped\n", 1178 pr_err("slub_debug option '%c' unknown. skipped\n",
1181 *str); 1179 *str);
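With 'o' handled inside the per-character switch it no longer terminates option parsing, so it should now be combinable with the other debug letters on the kernel command line, for example (hedged, semantics per Documentation/vm/slub.txt):

	slub_debug=FZO

which would request sanity checks (F) and red zoning (Z) while still skipping caches whose minimum slab order would have to grow (O).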
diff --git a/mm/truncate.c b/mm/truncate.c
index ddec5a5966d7..7a9d8a3cb143 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -93,35 +93,6 @@ void do_invalidatepage(struct page *page, unsigned int offset,
93} 93}
94 94
95/* 95/*
96 * This cancels just the dirty bit on the kernel page itself, it
97 * does NOT actually remove dirty bits on any mmap's that may be
98 * around. It also leaves the page tagged dirty, so any sync
99 * activity will still find it on the dirty lists, and in particular,
100 * clear_page_dirty_for_io() will still look at the dirty bits in
101 * the VM.
102 *
103 * Doing this should *normally* only ever be done when a page
104 * is truncated, and is not actually mapped anywhere at all. However,
105 * fs/buffer.c does this when it notices that somebody has cleaned
106 * out all the buffers on a page without actually doing it through
107 * the VM. Can you say "ext3 is horribly ugly"? Tought you could.
108 */
109void cancel_dirty_page(struct page *page, unsigned int account_size)
110{
111 if (TestClearPageDirty(page)) {
112 struct address_space *mapping = page->mapping;
113 if (mapping && mapping_cap_account_dirty(mapping)) {
114 dec_zone_page_state(page, NR_FILE_DIRTY);
115 dec_bdi_stat(inode_to_bdi(mapping->host),
116 BDI_RECLAIMABLE);
117 if (account_size)
118 task_io_account_cancelled_write(account_size);
119 }
120 }
121}
122EXPORT_SYMBOL(cancel_dirty_page);
123
124/*
125 * If truncate cannot remove the fs-private metadata from the page, the page 96 * If truncate cannot remove the fs-private metadata from the page, the page
126 * becomes orphaned. It will be left on the LRU and may even be mapped into 97 * becomes orphaned. It will be left on the LRU and may even be mapped into
127 * user pagetables if we're racing with filemap_fault(). 98 * user pagetables if we're racing with filemap_fault().
@@ -140,7 +111,13 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
140 if (page_has_private(page)) 111 if (page_has_private(page))
141 do_invalidatepage(page, 0, PAGE_CACHE_SIZE); 112 do_invalidatepage(page, 0, PAGE_CACHE_SIZE);
142 113
143 cancel_dirty_page(page, PAGE_CACHE_SIZE); 114 /*
115 * Some filesystems seem to re-dirty the page even after
116 * the VM has canceled the dirty bit (eg ext3 journaling).
117 * Hence dirty accounting check is placed after invalidation.
118 */
119 if (TestClearPageDirty(page))
120 account_page_cleaned(page, mapping);
144 121
145 ClearPageMappedToDisk(page); 122 ClearPageMappedToDisk(page);
146 delete_from_page_cache(page); 123 delete_from_page_cache(page);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 49abccf29a29..a5bbdd3b5d67 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -29,6 +29,7 @@
29#include <linux/atomic.h> 29#include <linux/atomic.h>
30#include <linux/compiler.h> 30#include <linux/compiler.h>
31#include <linux/llist.h> 31#include <linux/llist.h>
32#include <linux/bitops.h>
32 33
33#include <asm/uaccess.h> 34#include <asm/uaccess.h>
34#include <asm/tlbflush.h> 35#include <asm/tlbflush.h>
@@ -74,6 +75,8 @@ static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end)
74 pmd = pmd_offset(pud, addr); 75 pmd = pmd_offset(pud, addr);
75 do { 76 do {
76 next = pmd_addr_end(addr, end); 77 next = pmd_addr_end(addr, end);
78 if (pmd_clear_huge(pmd))
79 continue;
77 if (pmd_none_or_clear_bad(pmd)) 80 if (pmd_none_or_clear_bad(pmd))
78 continue; 81 continue;
79 vunmap_pte_range(pmd, addr, next); 82 vunmap_pte_range(pmd, addr, next);
@@ -88,6 +91,8 @@ static void vunmap_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end)
88 pud = pud_offset(pgd, addr); 91 pud = pud_offset(pgd, addr);
89 do { 92 do {
90 next = pud_addr_end(addr, end); 93 next = pud_addr_end(addr, end);
94 if (pud_clear_huge(pud))
95 continue;
91 if (pud_none_or_clear_bad(pud)) 96 if (pud_none_or_clear_bad(pud))
92 continue; 97 continue;
93 vunmap_pmd_range(pud, addr, next); 98 vunmap_pmd_range(pud, addr, next);
@@ -1314,7 +1319,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
1314 1319
1315 BUG_ON(in_interrupt()); 1320 BUG_ON(in_interrupt());
1316 if (flags & VM_IOREMAP) 1321 if (flags & VM_IOREMAP)
1317 align = 1ul << clamp(fls(size), PAGE_SHIFT, IOREMAP_MAX_ORDER); 1322 align = 1ul << clamp_t(int, fls_long(size),
1323 PAGE_SHIFT, IOREMAP_MAX_ORDER);
1318 1324
1319 size = PAGE_ALIGN(size); 1325 size = PAGE_ALIGN(size);
1320 if (unlikely(!size)) 1326 if (unlikely(!size))
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 50ec42f170a0..2dacc7b5af23 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -100,7 +100,9 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags,
100 100
101 new_stats = 101 new_stats =
102 kmem_cache_alloc_node(flow_stats_cache, 102 kmem_cache_alloc_node(flow_stats_cache,
103 GFP_THISNODE | 103 GFP_NOWAIT |
104 __GFP_THISNODE |
105 __GFP_NOWARN |
104 __GFP_NOMEMALLOC, 106 __GFP_NOMEMALLOC,
105 node); 107 node);
106 if (likely(new_stats)) { 108 if (likely(new_stats)) {
diff --git a/scripts/coccinelle/misc/bugon.cocci b/scripts/coccinelle/misc/bugon.cocci
index 3b7eec24fb5a..27c97f1f2767 100644
--- a/scripts/coccinelle/misc/bugon.cocci
+++ b/scripts/coccinelle/misc/bugon.cocci
@@ -57,6 +57,6 @@ coccilib.org.print_todo(p[0], "WARNING use BUG_ON")
57p << r.p; 57p << r.p;
58@@ 58@@
59 59
60msg="WARNING: Use BUG_ON" 60msg="WARNING: Use BUG_ON instead of if condition followed by BUG.\nPlease make sure the condition has no side effects (see conditional BUG_ON definition in include/asm-generic/bug.h)"
61coccilib.report.print_report(p[0], msg) 61coccilib.report.print_report(p[0], msg)
62 62