author		Linus Torvalds <torvalds@linux-foundation.org>	2014-06-04 19:55:13 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2014-06-04 19:55:13 -0400
commit		00170fdd0846df7cdb5ad421d3a340440f930b8f (patch)
tree		1883cfbda846cd65faed011bda54a52c1d40ecdd
parent		d09cc3659db494aca4b3bb2393c533fb4946b794 (diff)
parent		3ff6db3287e8a5e8f5bb9529b8e1259ca6b10def (diff)
Merge branch 'akpm' (patchbomb from Andrew) into next
Merge misc updates from Andrew Morton:

 - a few fixes for 3.16. Cc'ed to stable so they'll get there somehow.
 - various misc fixes and cleanups
 - most of the ocfs2 queue. Review is slow...
 - most of MM. The MM queue is pretty huge this time, but not much in
   the way of feature work.
 - some tweaks under kernel/
 - printk maintenance work
 - updates to lib/
 - checkpatch updates
 - tweaks to init/

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (276 commits)
  fs/autofs4/dev-ioctl.c: add __init to autofs_dev_ioctl_init
  fs/ncpfs/getopt.c: replace simple_strtoul by kstrtoul
  init/main.c: remove an ifdef
  kthreads: kill CLONE_KERNEL, change kernel_thread(kernel_init) to avoid CLONE_SIGHAND
  init/main.c: add initcall_blacklist kernel parameter
  init/main.c: don't use pr_debug()
  fs/binfmt_flat.c: make old_reloc() static
  fs/binfmt_elf.c: fix bool assignements
  fs/efs: convert printk(KERN_DEBUG to pr_debug
  fs/efs: add pr_fmt / use __func__
  fs/efs: convert printk to pr_foo()
  scripts/checkpatch.pl: device_initcall is not the only __initcall substitute
  checkpatch: check stable email address
  checkpatch: warn on unnecessary void function return statements
  checkpatch: prefer kstrto<foo> to sscanf(buf, "%<lhuidx>", &bar);
  checkpatch: add warning for kmalloc/kzalloc with multiply
  checkpatch: warn on #defines ending in semicolon
  checkpatch: make --strict a default for files in drivers/net and net/
  checkpatch: always warn on missing blank line after variable declaration block
  checkpatch: fix wildcard DT compatible string checking
  ...
-rw-r--r--  Documentation/CodingStyle | 22
-rw-r--r--  Documentation/cgroups/memory.txt | 16
-rw-r--r--  Documentation/kernel-parameters.txt | 11
-rw-r--r--  Documentation/memory-hotplug.txt | 125
-rw-r--r--  Documentation/sysctl/vm.txt | 26
-rw-r--r--  Documentation/vm/hwpoison.txt | 5
-rw-r--r--  MAINTAINERS | 8
-rw-r--r--  arch/arc/kernel/troubleshoot.c | 10
-rw-r--r--  arch/arm/mm/hugetlbpage.c | 5
-rw-r--r--  arch/arm64/mm/hugetlbpage.c | 5
-rw-r--r--  arch/blackfin/include/asm/unistd.h | 1
-rw-r--r--  arch/cris/include/asm/unistd.h | 1
-rw-r--r--  arch/frv/include/asm/unistd.h | 1
-rw-r--r--  arch/ia64/include/asm/topology.h | 3
-rw-r--r--  arch/ia64/mm/hugetlbpage.c | 5
-rw-r--r--  arch/m68k/include/asm/unistd.h | 1
-rw-r--r--  arch/metag/mm/hugetlbpage.c | 5
-rw-r--r--  arch/microblaze/include/asm/unistd.h | 1
-rw-r--r--  arch/mips/include/asm/unistd.h | 1
-rw-r--r--  arch/mips/mm/hugetlbpage.c | 5
-rw-r--r--  arch/mn10300/include/asm/unistd.h | 1
-rw-r--r--  arch/parisc/include/asm/unistd.h | 1
-rw-r--r--  arch/powerpc/include/asm/pgtable.h | 6
-rw-r--r--  arch/powerpc/include/asm/topology.h | 8
-rw-r--r--  arch/powerpc/include/asm/unistd.h | 1
-rw-r--r--  arch/powerpc/mm/hugetlbpage.c | 10
-rw-r--r--  arch/s390/mm/hugetlbpage.c | 5
-rw-r--r--  arch/sh/include/asm/unistd.h | 1
-rw-r--r--  arch/sh/kernel/hw_breakpoint.c | 4
-rw-r--r--  arch/sh/kernel/kprobes.c | 30
-rw-r--r--  arch/sh/kernel/localtimer.c | 2
-rw-r--r--  arch/sh/kernel/perf_event.c | 8
-rw-r--r--  arch/sh/kernel/smp.c | 2
-rw-r--r--  arch/sh/mm/hugetlbpage.c | 5
-rw-r--r--  arch/sparc/include/asm/unistd.h | 1
-rw-r--r--  arch/sparc/mm/hugetlbpage.c | 5
-rw-r--r--  arch/tile/mm/homecache.c | 2
-rw-r--r--  arch/tile/mm/hugetlbpage.c | 5
-rw-r--r--  arch/unicore32/mm/ioremap.c | 4
-rw-r--r--  arch/x86/Kconfig | 10
-rw-r--r--  arch/x86/include/asm/mce.h | 2
-rw-r--r--  arch/x86/include/asm/pgtable-2level.h | 59
-rw-r--r--  arch/x86/include/asm/pgtable.h | 20
-rw-r--r--  arch/x86/include/asm/pgtable_64.h | 8
-rw-r--r--  arch/x86/include/asm/pgtable_types.h | 66
-rw-r--r--  arch/x86/include/asm/swiotlb.h | 7
-rw-r--r--  arch/x86/include/asm/unistd.h | 1
-rw-r--r--  arch/x86/kernel/amd_gart_64.c | 2
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c | 5
-rw-r--r--  arch/x86/kernel/head64.c | 2
-rw-r--r--  arch/x86/kernel/pci-dma.c | 11
-rw-r--r--  arch/x86/kernel/pci-swiotlb.c | 9
-rw-r--r--  arch/x86/kernel/setup.c | 2
-rw-r--r--  arch/x86/mm/hugetlbpage.c | 10
-rw-r--r--  arch/x86/mm/init_64.c | 34
-rw-r--r--  arch/x86/mm/numa.c | 6
-rw-r--r--  arch/x86/mm/pageattr-test.c | 2
-rw-r--r--  arch/x86/pci/sta2x11-fixup.c | 6
-rw-r--r--  arch/x86/platform/uv/uv_nmi.c | 2
-rw-r--r--  drivers/base/Kconfig | 2
-rw-r--r--  drivers/base/dma-contiguous.c | 42
-rw-r--r--  drivers/base/memory.c | 12
-rw-r--r--  drivers/block/brd.c | 16
-rw-r--r--  drivers/block/zram/zram_drv.c | 4
-rw-r--r--  drivers/gpu/drm/exynos/exynos_drm_g2d.c | 6
-rw-r--r--  drivers/iommu/intel-iommu.c | 33
-rw-r--r--  drivers/nubus/nubus.c | 18
-rw-r--r--  drivers/tty/sysrq.c | 8
-rw-r--r--  fs/9p/v9fs.c | 2
-rw-r--r--  fs/9p/vfs_dir.c | 1
-rw-r--r--  fs/9p/vfs_file.c | 4
-rw-r--r--  fs/9p/vfs_inode.c | 6
-rw-r--r--  fs/9p/vfs_inode_dotl.c | 7
-rw-r--r--  fs/autofs4/dev-ioctl.c | 2
-rw-r--r--  fs/binfmt_elf.c | 4
-rw-r--r--  fs/binfmt_flat.c | 2
-rw-r--r--  fs/block_dev.c | 63
-rw-r--r--  fs/btrfs/extent_io.c | 11
-rw-r--r--  fs/btrfs/file.c | 5
-rw-r--r--  fs/buffer.c | 49
-rw-r--r--  fs/configfs/configfs_internal.h | 6
-rw-r--r--  fs/configfs/dir.c | 8
-rw-r--r--  fs/configfs/inode.c | 5
-rw-r--r--  fs/configfs/item.c | 58
-rw-r--r--  fs/configfs/mount.c | 4
-rw-r--r--  fs/efivarfs/super.c | 2
-rw-r--r--  fs/efs/dir.c | 18
-rw-r--r--  fs/efs/efs.h | 6
-rw-r--r--  fs/efs/file.c | 14
-rw-r--r--  fs/efs/inode.c | 42
-rw-r--r--  fs/efs/namei.c | 8
-rw-r--r--  fs/efs/super.c | 42
-rw-r--r--  fs/exportfs/expfs.c | 4
-rw-r--r--  fs/ext4/mballoc.c | 14
-rw-r--r--  fs/ext4/page-io.c | 2
-rw-r--r--  fs/f2fs/checkpoint.c | 3
-rw-r--r--  fs/f2fs/node.c | 2
-rw-r--r--  fs/fscache/cache.c | 13
-rw-r--r--  fs/fscache/cookie.c | 2
-rw-r--r--  fs/fscache/histogram.c | 6
-rw-r--r--  fs/fscache/internal.h | 26
-rw-r--r--  fs/fscache/main.c | 7
-rw-r--r--  fs/fscache/netfs.c | 7
-rw-r--r--  fs/fscache/object-list.c | 8
-rw-r--r--  fs/fscache/operation.c | 3
-rw-r--r--  fs/fscache/page.c | 6
-rw-r--r--  fs/fuse/dev.c | 2
-rw-r--r--  fs/fuse/file.c | 2
-rw-r--r--  fs/gfs2/aops.c | 1
-rw-r--r--  fs/gfs2/meta_io.c | 4
-rw-r--r--  fs/hugetlbfs/inode.c | 25
-rw-r--r--  fs/libfs.c | 34
-rw-r--r--  fs/mpage.c | 84
-rw-r--r--  fs/ncpfs/getopt.c | 13
-rw-r--r--  fs/notify/fanotify/fanotify_user.c | 45
-rw-r--r--  fs/notify/mark.c | 2
-rw-r--r--  fs/ntfs/attrib.c | 1
-rw-r--r--  fs/ntfs/compress.c | 2
-rw-r--r--  fs/ntfs/file.c | 1
-rw-r--r--  fs/ntfs/super.c | 4
-rw-r--r--  fs/ntfs/sysctl.c | 2
-rw-r--r--  fs/ocfs2/alloc.c | 6
-rw-r--r--  fs/ocfs2/cluster/tcp.c | 2
-rw-r--r--  fs/ocfs2/dlm/dlmcommon.h | 1
-rw-r--r--  fs/ocfs2/dlm/dlmdebug.c | 2
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.c | 14
-rw-r--r--  fs/ocfs2/dlm/dlmlock.c | 2
-rw-r--r--  fs/ocfs2/dlm/dlmmaster.c | 6
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c | 10
-rw-r--r--  fs/ocfs2/dlmglue.c | 5
-rw-r--r--  fs/ocfs2/file.c | 2
-rw-r--r--  fs/ocfs2/ioctl.c | 81
-rw-r--r--  fs/ocfs2/journal.c | 17
-rw-r--r--  fs/ocfs2/ocfs2.h | 1
-rw-r--r--  fs/ocfs2/refcounttree.c | 7
-rw-r--r--  fs/ocfs2/resize.c | 10
-rw-r--r--  fs/ocfs2/stackglue.c | 2
-rw-r--r--  fs/ocfs2/super.c | 8
-rw-r--r--  fs/ocfs2/uptodate.c | 2
-rw-r--r--  fs/proc/task_mmu.c | 14
-rw-r--r--  fs/readdir.c | 2
-rw-r--r--  fs/squashfs/squashfs.h | 2
-rw-r--r--  fs/super.c | 16
-rw-r--r--  include/asm-generic/pgtable.h | 8
-rw-r--r--  include/linux/blkdev.h | 13
-rw-r--r--  include/linux/bootmem.h | 6
-rw-r--r--  include/linux/buffer_head.h | 2
-rw-r--r--  include/linux/compaction.h | 4
-rw-r--r--  include/linux/compiler.h | 13
-rw-r--r--  include/linux/cpuset.h | 29
-rw-r--r--  include/linux/dma-contiguous.h | 9
-rw-r--r--  include/linux/fs.h | 1
-rw-r--r--  include/linux/gfp.h | 15
-rw-r--r--  include/linux/hugetlb.h | 22
-rw-r--r--  include/linux/jump_label.h | 20
-rw-r--r--  include/linux/memblock.h | 2
-rw-r--r--  include/linux/memcontrol.h | 32
-rw-r--r--  include/linux/memory_hotplug.h | 14
-rw-r--r--  include/linux/mempolicy.h | 6
-rw-r--r--  include/linux/migrate.h | 11
-rw-r--r--  include/linux/mm.h | 29
-rw-r--r--  include/linux/mm_types.h | 2
-rw-r--r--  include/linux/mmdebug.h | 15
-rw-r--r--  include/linux/mmzone.h | 35
-rw-r--r--  include/linux/page-flags.h | 2
-rw-r--r--  include/linux/pageblock-flags.h | 30
-rw-r--r--  include/linux/pagemap.h | 111
-rw-r--r--  include/linux/plist.h | 45
-rw-r--r--  include/linux/printk.h | 38
-rw-r--r--  include/linux/proc_fs.h | 4
-rw-r--r--  include/linux/rmap.h | 11
-rw-r--r--  include/linux/sched.h | 11
-rw-r--r--  include/linux/sched/sysctl.h | 4
-rw-r--r--  include/linux/slab.h | 20
-rw-r--r--  include/linux/swap.h | 38
-rw-r--r--  include/linux/swapfile.h | 2
-rw-r--r--  include/linux/swapops.h | 2
-rw-r--r--  include/linux/swiotlb.h | 2
-rw-r--r--  include/linux/syscalls.h | 4
-rw-r--r--  include/linux/thread_info.h | 2
-rw-r--r--  include/linux/topology.h | 3
-rw-r--r--  include/linux/vm_event_item.h | 4
-rw-r--r--  include/linux/vmstat.h | 6
-rw-r--r--  include/linux/zbud.h | 2
-rw-r--r--  include/trace/events/compaction.h | 25
-rw-r--r--  include/trace/events/gfpflags.h | 1
-rw-r--r--  include/trace/events/vmscan.h | 19
-rw-r--r--  init/Kconfig | 33
-rw-r--r--  init/main.c | 81
-rw-r--r--  kernel/backtracetest.c | 18
-rw-r--r--  kernel/capability.c | 6
-rw-r--r--  kernel/compat.c | 8
-rw-r--r--  kernel/cpu.c | 31
-rw-r--r--  kernel/cpuset.c | 14
-rw-r--r--  kernel/debug/kdb/kdb_bt.c | 2
-rw-r--r--  kernel/debug/kdb/kdb_io.c | 2
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 2
-rw-r--r--  kernel/exec_domain.c | 14
-rw-r--r--  kernel/exit.c | 22
-rw-r--r--  kernel/fork.c | 10
-rw-r--r--  kernel/hung_task.c | 4
-rw-r--r--  kernel/kthread.c | 4
-rw-r--r--  kernel/latencytop.c | 5
-rw-r--r--  kernel/printk/printk.c | 343
-rw-r--r--  kernel/reboot.c | 21
-rw-r--r--  kernel/res_counter.c | 7
-rw-r--r--  kernel/sched/core.c | 2
-rw-r--r--  kernel/sched/deadline.c | 7
-rw-r--r--  kernel/sched/rt.c | 8
-rw-r--r--  kernel/signal.c | 4
-rw-r--r--  kernel/stop_machine.c | 1
-rw-r--r--  kernel/sys_ni.c | 2
-rw-r--r--  kernel/time/ntp.c | 15
-rw-r--r--  kernel/time/timekeeping.c | 7
-rw-r--r--  kernel/tracepoint.c | 2
-rw-r--r--  kernel/user.c | 1
-rw-r--r--  kernel/utsname_sysctl.c | 4
-rw-r--r--  lib/Kconfig.debug | 25
-rw-r--r--  lib/asn1_decoder.c | 2
-rw-r--r--  lib/atomic64_test.c | 13
-rw-r--r--  lib/btree.c | 1
-rw-r--r--  lib/bug.c | 21
-rw-r--r--  lib/crc32.c | 4
-rw-r--r--  lib/debugobjects.c | 19
-rw-r--r--  lib/digsig.c | 5
-rw-r--r--  lib/libcrc32c.c | 5
-rw-r--r--  lib/nlattr.c | 17
-rw-r--r--  lib/plist.c | 56
-rw-r--r--  lib/radix-tree.c | 7
-rw-r--r--  lib/string.c | 8
-rw-r--r--  lib/swiotlb.c | 2
-rw-r--r--  lib/textsearch.c | 9
-rw-r--r--  lib/vsprintf.c | 4
-rw-r--r--  lib/xz/Kconfig | 24
-rw-r--r--  lib/xz/xz_dec_lzma2.c | 4
-rw-r--r--  mm/Kconfig | 15
-rw-r--r--  mm/Makefile | 2
-rw-r--r--  mm/compaction.c | 249
-rw-r--r--  mm/dmapool.c | 27
-rw-r--r--  mm/filemap.c | 238
-rw-r--r--  mm/fremap.c | 7
-rw-r--r--  mm/frontswap.c | 13
-rw-r--r--  mm/gup.c | 662
-rw-r--r--  mm/huge_memory.c | 32
-rw-r--r--  mm/hugetlb.c | 363
-rw-r--r--  mm/internal.h | 36
-rw-r--r--  mm/kmemleak.c | 4
-rw-r--r--  mm/memblock.c | 26
-rw-r--r--  mm/memcontrol.c | 383
-rw-r--r--  mm/memory-failure.c | 96
-rw-r--r--  mm/memory.c | 743
-rw-r--r--  mm/memory_hotplug.c | 148
-rw-r--r--  mm/mempolicy.c | 30
-rw-r--r--  mm/mempool.c | 2
-rw-r--r--  mm/migrate.c | 63
-rw-r--r--  mm/mmap.c | 9
-rw-r--r--  mm/msync.c | 8
-rw-r--r--  mm/page-writeback.c | 22
-rw-r--r--  mm/page_alloc.c | 394
-rw-r--r--  mm/page_io.c | 21
-rw-r--r--  mm/rmap.c | 55
-rw-r--r--  mm/shmem.c | 8
-rw-r--r--  mm/slab.c | 45
-rw-r--r--  mm/slab.h | 48
-rw-r--r--  mm/slab_common.c | 95
-rw-r--r--  mm/slob.c | 3
-rw-r--r--  mm/slub.c | 225
-rw-r--r--  mm/swap.c | 238
-rw-r--r--  mm/swap_state.c | 2
-rw-r--r--  mm/swapfile.c | 253
-rw-r--r--  mm/vmacache.c | 22
-rw-r--r--  mm/vmalloc.c | 13
-rw-r--r--  mm/vmscan.c | 184
-rw-r--r--  mm/vmstat.c | 12
-rw-r--r--  mm/zbud.c | 4
-rw-r--r--  mm/zsmalloc.c | 4
-rw-r--r--  mm/zswap.c | 2
-rwxr-xr-x  scripts/checkpatch.pl | 136
-rw-r--r--  tools/vm/page-types.c | 35
279 files changed, 4712 insertions, 3514 deletions
diff --git a/Documentation/CodingStyle b/Documentation/CodingStyle
index 7fe0546c504a..6b6bef31e956 100644
--- a/Documentation/CodingStyle
+++ b/Documentation/CodingStyle
@@ -660,15 +660,23 @@ There are a number of driver model diagnostic macros in <linux/device.h>
 which you should use to make sure messages are matched to the right device
 and driver, and are tagged with the right level: dev_err(), dev_warn(),
 dev_info(), and so forth. For messages that aren't associated with a
-particular device, <linux/printk.h> defines pr_debug() and pr_info().
+particular device, <linux/printk.h> defines pr_notice(), pr_info(),
+pr_warn(), pr_err(), etc.
 
 Coming up with good debugging messages can be quite a challenge; and once
-you have them, they can be a huge help for remote troubleshooting. Such
-messages should be compiled out when the DEBUG symbol is not defined (that
-is, by default they are not included). When you use dev_dbg() or pr_debug(),
-that's automatic. Many subsystems have Kconfig options to turn on -DDEBUG.
-A related convention uses VERBOSE_DEBUG to add dev_vdbg() messages to the
-ones already enabled by DEBUG.
+you have them, they can be a huge help for remote troubleshooting. However
+debug message printing is handled differently than printing other non-debug
+messages. While the other pr_XXX() functions print unconditionally,
+pr_debug() does not; it is compiled out by default, unless either DEBUG is
+defined or CONFIG_DYNAMIC_DEBUG is set. That is true for dev_dbg() also,
+and a related convention uses VERBOSE_DEBUG to add dev_vdbg() messages to
+the ones already enabled by DEBUG.
+
+Many subsystems have Kconfig debug options to turn on -DDEBUG in the
+corresponding Makefile; in other cases specific files #define DEBUG. And
+when a debug message should be unconditionally printed, such as if it is
+already inside a debug-related #ifdef secton, printk(KERN_DEBUG ...) can be
+used.
 
 
 		Chapter 14: Allocating memory
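[Editor's sketch, not part of the patch above: a minimal C illustration of the printing conventions the updated CodingStyle text describes. The function name and message strings are invented for illustration.]

#include <linux/module.h>
#include <linux/printk.h>

/* Hypothetical init function, used only to show the message levels. */
static int __init example_init(void)
{
	pr_info("example: loaded\n");		/* always compiled in */
	pr_warn("example: using defaults\n");	/* always compiled in */

	/* Compiled out unless DEBUG or CONFIG_DYNAMIC_DEBUG is in effect. */
	pr_debug("example: debug details\n");

#ifdef DEBUG
	/* Unconditional print inside an explicitly debug-only region. */
	printk(KERN_DEBUG "example: extra state dump\n");
#endif
	return 0;
}
module_init(example_init);
MODULE_LICENSE("GPL");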
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index 2622115276aa..4937e6fff9b4 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -270,6 +270,11 @@ When oom event notifier is registered, event will be delivered.
 
 2.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM)
 
+WARNING: Current implementation lacks reclaim support. That means allocation
+	attempts will fail when close to the limit even if there are plenty of
+	kmem available for reclaim. That makes this option unusable in real
+	life so DO NOT SELECT IT unless for development purposes.
+
 With the Kernel memory extension, the Memory Controller is able to limit
 the amount of kernel memory used by the system. Kernel memory is fundamentally
 different than user memory, since it can't be swapped out, which makes it
@@ -535,17 +540,15 @@ Note:
 
 5.3 swappiness
 
-Similar to /proc/sys/vm/swappiness, but affecting a hierarchy of groups only.
+Similar to /proc/sys/vm/swappiness, but only affecting reclaim that is
+triggered by this cgroup's hard limit. The tunable in the root cgroup
+corresponds to the global swappiness setting.
+
 Please note that unlike the global swappiness, memcg knob set to 0
 really prevents from any swapping even if there is a swap storage
 available. This might lead to memcg OOM killer if there are no file
 pages to reclaim.
 
-Following cgroups' swappiness can't be changed.
-- root cgroup (uses /proc/sys/vm/swappiness).
-- a cgroup which uses hierarchy and it has other cgroup(s) below it.
-- a cgroup which uses hierarchy and not the root of hierarchy.
-
 5.4 failcnt
 
 A memory cgroup provides memory.failcnt and memory.memsw.failcnt files.
@@ -754,7 +757,6 @@ You can disable the OOM-killer by writing "1" to memory.oom_control file, as:
 
 	#echo 1 > memory.oom_control
 
-This operation is only allowed to the top cgroup of a sub-hierarchy.
 If OOM-killer is disabled, tasks under cgroup will hang/sleep
 in memory cgroup's OOM-waitqueue when they request accountable memory.
 
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index af55e13ace8f..9973a7e2e0ac 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -630,8 +630,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			Also note the kernel might malfunction if you disable
 			some critical bits.
 
-	cma=nn[MG]	[ARM,KNL]
-			Sets the size of kernel global memory area for contiguous
+	cma=nn[MG]@[start[MG][-end[MG]]]
+			[ARM,X86,KNL]
+			Sets the size of kernel global memory area for
+			contiguous memory allocations and optionally the
+			placement constraint by the physical address range of
 			memory allocations. For more information, see
 			include/linux/dma-contiguous.h
 
@@ -1309,6 +1312,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			for working out where the kernel is dying during
 			startup.
 
+	initcall_blacklist=	[KNL] Do not execute a comma-separated list of
+			initcall functions. Useful for debugging built-in
+			modules and initcalls.
+
 	initrd=		[BOOT] Specify the location of the initial ramdisk
 
 	inport.irq=	[HW] Inport (ATI XL and Microsoft) busmouse driver
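[Editor's note, not part of the patch above: for illustration only, the two options documented here could appear together on a boot command line as something like "cma=64M@512M initcall_blacklist=foo_driver_init,bar_driver_init"; the initcall names are made up, real ones come from the kernel's built-in initcalls.]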
diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt
index 58340d50f8a6..f304edb8fbe7 100644
--- a/Documentation/memory-hotplug.txt
+++ b/Documentation/memory-hotplug.txt
@@ -88,16 +88,21 @@ phase by hand.
 
 1.3. Unit of Memory online/offline operation
 ------------
-Memory hotplug uses SPARSEMEM memory model. SPARSEMEM divides the whole memory
-into chunks of the same size. The chunk is called a "section". The size of
-a section is architecture dependent. For example, power uses 16MiB, ia64 uses
-1GiB. The unit of online/offline operation is "one section". (see Section 3.)
+Memory hotplug uses SPARSEMEM memory model which allows memory to be divided
+into chunks of the same size. These chunks are called "sections". The size of
+a memory section is architecture dependent. For example, power uses 16MiB, ia64
+uses 1GiB.
 
-To determine the size of sections, please read this file:
+Memory sections are combined into chunks referred to as "memory blocks". The
+size of a memory block is architecture dependent and represents the logical
+unit upon which memory online/offline operations are to be performed. The
+default size of a memory block is the same as memory section size unless an
+architecture specifies otherwise. (see Section 3.)
+
+To determine the size (in bytes) of a memory block please read this file:
 
 /sys/devices/system/memory/block_size_bytes
 
-This file shows the size of sections in byte.
 
 -----------------------
 2. Kernel Configuration
@@ -123,42 +128,35 @@ config options.
     (CONFIG_ACPI_CONTAINER).
     This option can be kernel module too.
 
+
 --------------------------------
-4 sysfs files for memory hotplug
+3 sysfs files for memory hotplug
 --------------------------------
-All sections have their device information in sysfs. Each section is part of
-a memory block under /sys/devices/system/memory as
+All memory blocks have their device information in sysfs. Each memory block
+is described under /sys/devices/system/memory as
 
 /sys/devices/system/memory/memoryXXX
-(XXX is the section id.)
+(XXX is the memory block id.)
 
-Now, XXX is defined as (start_address_of_section / section_size) of the first
-section contained in the memory block. The files 'phys_index' and
-'end_phys_index' under each directory report the beginning and end section id's
-for the memory block covered by the sysfs directory. It is expected that all
+For the memory block covered by the sysfs directory. It is expected that all
 memory sections in this range are present and no memory holes exist in the
 range. Currently there is no way to determine if there is a memory hole, but
 the existence of one should not affect the hotplug capabilities of the memory
 block.
 
-For example, assume 1GiB section size. A device for a memory starting at
+For example, assume 1GiB memory block size. A device for a memory starting at
 0x100000000 is /sys/device/system/memory/memory4
 (0x100000000 / 1Gib = 4)
 This device covers address range [0x100000000 ... 0x140000000)
 
-Under each section, you can see 4 or 5 files, the end_phys_index file being
-a recent addition and not present on older kernels.
+Under each memory block, you can see 4 files:
 
-/sys/devices/system/memory/memoryXXX/start_phys_index
-/sys/devices/system/memory/memoryXXX/end_phys_index
+/sys/devices/system/memory/memoryXXX/phys_index
 /sys/devices/system/memory/memoryXXX/phys_device
 /sys/devices/system/memory/memoryXXX/state
 /sys/devices/system/memory/memoryXXX/removable
 
-'phys_index' : read-only and contains section id of the first section
-	in the memory block, same as XXX.
-'end_phys_index' : read-only and contains section id of the last section
-	in the memory block.
+'phys_index' : read-only and contains memory block id, same as XXX.
 'state' : read-write
 	at read: contains online/offline state of memory.
 	at write: user can specify "online_kernel",
@@ -185,6 +183,7 @@ For example:
 A backlink will also be created:
 /sys/devices/system/memory/memory9/node0 -> ../../node/node0
 
+
 --------------------------------
 4. Physical memory hot-add phase
 --------------------------------
@@ -227,11 +226,10 @@ You can tell the physical address of new memory to the kernel by
 
 % echo start_address_of_new_memory > /sys/devices/system/memory/probe
 
-Then, [start_address_of_new_memory, start_address_of_new_memory + section_size)
-memory range is hot-added. In this case, hotplug script is not called (in
-current implementation). You'll have to online memory by yourself.
-Please see "How to online memory" in this text.
-
+Then, [start_address_of_new_memory, start_address_of_new_memory +
+memory_block_size] memory range is hot-added. In this case, hotplug script is
+not called (in current implementation). You'll have to online memory by
+yourself. Please see "How to online memory" in this text.
 
 
 ------------------------------
@@ -240,36 +238,36 @@ Please see "How to online memory" in this text.
 
 5.1. State of memory
 ------------
-To see (online/offline) state of memory section, read 'state' file.
+To see (online/offline) state of a memory block, read 'state' file.
 
 % cat /sys/device/system/memory/memoryXXX/state
 
 
-If the memory section is online, you'll read "online".
-If the memory section is offline, you'll read "offline".
+If the memory block is online, you'll read "online".
+If the memory block is offline, you'll read "offline".
 
 
 5.2. How to online memory
 ------------
 Even if the memory is hot-added, it is not at ready-to-use state.
-For using newly added memory, you have to "online" the memory section.
+For using newly added memory, you have to "online" the memory block.
 
-For onlining, you have to write "online" to the section's state file as:
+For onlining, you have to write "online" to the memory block's state file as:
 
 % echo online > /sys/devices/system/memory/memoryXXX/state
 
-This onlining will not change the ZONE type of the target memory section,
-If the memory section is in ZONE_NORMAL, you can change it to ZONE_MOVABLE:
+This onlining will not change the ZONE type of the target memory block,
+If the memory block is in ZONE_NORMAL, you can change it to ZONE_MOVABLE:
 
 % echo online_movable > /sys/devices/system/memory/memoryXXX/state
-(NOTE: current limit: this memory section must be adjacent to ZONE_MOVABLE)
+(NOTE: current limit: this memory block must be adjacent to ZONE_MOVABLE)
 
-And if the memory section is in ZONE_MOVABLE, you can change it to ZONE_NORMAL:
+And if the memory block is in ZONE_MOVABLE, you can change it to ZONE_NORMAL:
 
 % echo online_kernel > /sys/devices/system/memory/memoryXXX/state
-(NOTE: current limit: this memory section must be adjacent to ZONE_NORMAL)
+(NOTE: current limit: this memory block must be adjacent to ZONE_NORMAL)
 
-After this, section memoryXXX's state will be 'online' and the amount of
+After this, memory block XXX's state will be 'online' and the amount of
 available memory will be increased.
 
 Currently, newly added memory is added as ZONE_NORMAL (for powerpc, ZONE_DMA).
@@ -284,22 +282,22 @@ This may be changed in future.
 6.1 Memory offline and ZONE_MOVABLE
 ------------
 Memory offlining is more complicated than memory online. Because memory offline
-has to make the whole memory section be unused, memory offline can fail if
-the section includes memory which cannot be freed.
+has to make the whole memory block be unused, memory offline can fail if
+the memory block includes memory which cannot be freed.
 
 In general, memory offline can use 2 techniques.
 
-(1) reclaim and free all memory in the section.
-(2) migrate all pages in the section.
+(1) reclaim and free all memory in the memory block.
+(2) migrate all pages in the memory block.
 
 In the current implementation, Linux's memory offline uses method (2), freeing
-all pages in the section by page migration. But not all pages are
+all pages in the memory block by page migration. But not all pages are
 migratable. Under current Linux, migratable pages are anonymous pages and
-page caches. For offlining a section by migration, the kernel has to guarantee
-that the section contains only migratable pages.
+page caches. For offlining a memory block by migration, the kernel has to
+guarantee that the memory block contains only migratable pages.
 
-Now, a boot option for making a section which consists of migratable pages is
-supported. By specifying "kernelcore=" or "movablecore=" boot option, you can
+Now, a boot option for making a memory block which consists of migratable pages
+is supported. By specifying "kernelcore=" or "movablecore=" boot option, you can
 create ZONE_MOVABLE...a zone which is just used for movable pages.
 (See also Documentation/kernel-parameters.txt)
 
@@ -315,28 +313,27 @@ creates ZONE_MOVABLE as following.
   Size of memory for movable pages (for offline) is ZZZZ.
 
 
-Note) Unfortunately, there is no information to show which section belongs
+Note: Unfortunately, there is no information to show which memory block belongs
 to ZONE_MOVABLE. This is TBD.
 
 
 6.2. How to offline memory
 ------------
-You can offline a section by using the same sysfs interface that was used in
-memory onlining.
+You can offline a memory block by using the same sysfs interface that was used
+in memory onlining.
 
 % echo offline > /sys/devices/system/memory/memoryXXX/state
 
-If offline succeeds, the state of the memory section is changed to be "offline".
+If offline succeeds, the state of the memory block is changed to be "offline".
 If it fails, some error core (like -EBUSY) will be returned by the kernel.
-Even if a section does not belong to ZONE_MOVABLE, you can try to offline it.
-If it doesn't contain 'unmovable' memory, you'll get success.
+Even if a memory block does not belong to ZONE_MOVABLE, you can try to offline
+it. If it doesn't contain 'unmovable' memory, you'll get success.
 
-A section under ZONE_MOVABLE is considered to be able to be offlined easily.
-But under some busy state, it may return -EBUSY. Even if a memory section
-cannot be offlined due to -EBUSY, you can retry offlining it and may be able to
-offline it (or not).
-(For example, a page is referred to by some kernel internal call and released
- soon.)
+A memory block under ZONE_MOVABLE is considered to be able to be offlined
+easily. But under some busy state, it may return -EBUSY. Even if a memory
+block cannot be offlined due to -EBUSY, you can retry offlining it and may be
+able to offline it (or not). (For example, a page is referred to by some kernel
+internal call and released soon.)
 
 Consideration:
 Memory hotplug's design direction is to make the possibility of memory offlining
@@ -373,11 +370,11 @@ MEMORY_GOING_OFFLINE
   Generated to begin the process of offlining memory. Allocations are no
   longer possible from the memory but some of the memory to be offlined
   is still in use. The callback can be used to free memory known to a
-  subsystem from the indicated memory section.
+  subsystem from the indicated memory block.
 
 MEMORY_CANCEL_OFFLINE
   Generated if MEMORY_GOING_OFFLINE fails. Memory is available again from
-  the section that we attempted to offline.
+  the memory block that we attempted to offline.
 
 MEMORY_OFFLINE
   Generated after offlining memory is complete.
@@ -413,8 +410,8 @@ node if necessary.
 --------------
   - allowing memory hot-add to ZONE_MOVABLE. maybe we need some switch like
     sysctl or new control file.
-  - showing memory section and physical device relationship.
-  - showing memory section is under ZONE_MOVABLE or not
+  - showing memory block and physical device relationship.
+  - showing memory block is under ZONE_MOVABLE or not
   - test and make it better memory offlining.
  - support HugeTLB page migration and offlining.
  - memmap removing at memory offline.
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index dd9d0e33b443..bd4b34c03738 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -746,8 +746,8 @@ Changing this takes effect whenever an application requests memory.
 vfs_cache_pressure
 ------------------
 
-Controls the tendency of the kernel to reclaim the memory which is used for
-caching of directory and inode objects.
+This percentage value controls the tendency of the kernel to reclaim
+the memory which is used for caching of directory and inode objects.
 
 At the default value of vfs_cache_pressure=100 the kernel will attempt to
 reclaim dentries and inodes at a "fair" rate with respect to pagecache and
@@ -757,6 +757,11 @@ never reclaim dentries and inodes due to memory pressure and this can easily
 lead to out-of-memory conditions. Increasing vfs_cache_pressure beyond 100
 causes the kernel to prefer to reclaim dentries and inodes.
 
+Increasing vfs_cache_pressure significantly beyond 100 may have negative
+performance impact. Reclaim code needs to take various locks to find freeable
+directory and inode objects. With vfs_cache_pressure=1000, it will look for
+ten times more freeable objects than there are.
+
 ==============================================================
 
 zone_reclaim_mode:
@@ -772,16 +777,17 @@ This is value ORed together of
 2	= Zone reclaim writes dirty pages out
 4	= Zone reclaim swaps pages
 
-zone_reclaim_mode is set during bootup to 1 if it is determined that pages
-from remote zones will cause a measurable performance reduction. The
-page allocator will then reclaim easily reusable pages (those page
-cache pages that are currently not used) before allocating off node pages.
-
-It may be beneficial to switch off zone reclaim if the system is
-used for a file server and all of memory should be used for caching files
-from disk. In that case the caching effect is more important than
+zone_reclaim_mode is disabled by default. For file servers or workloads
+that benefit from having their data cached, zone_reclaim_mode should be
+left disabled as the caching effect is likely to be more important than
 data locality.
 
+zone_reclaim may be enabled if it's known that the workload is partitioned
+such that each partition fits within a NUMA node and that accessing remote
+memory would cause a measurable performance reduction. The page allocator
+will then reclaim easily reusable pages (those page cache pages that are
+currently not used) before allocating off node pages.
+
 Allowing zone reclaim to write out pages stops processes that are
 writing large amounts of data from dirtying pages on other nodes. Zone
 reclaim will write out dirty pages if a zone fills up and so effectively
diff --git a/Documentation/vm/hwpoison.txt b/Documentation/vm/hwpoison.txt
index 550068466605..6ae89a9edf2a 100644
--- a/Documentation/vm/hwpoison.txt
+++ b/Documentation/vm/hwpoison.txt
@@ -84,6 +84,11 @@ PR_MCE_KILL
 	PR_MCE_KILL_EARLY: Early kill
 	PR_MCE_KILL_LATE: Late kill
 	PR_MCE_KILL_DEFAULT: Use system global default
+	Note that if you want to have a dedicated thread which handles
+	the SIGBUS(BUS_MCEERR_AO) on behalf of the process, you should
+	call prctl(PR_MCE_KILL_EARLY) on the designated thread. Otherwise,
+	the SIGBUS is sent to the main thread.
+
 PR_MCE_KILL_GET
 	return current mode
 
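[Editor's sketch, not part of the patch above: a minimal userspace illustration of the dedicated-thread convention described in the added text; thread setup and error handling are omitted.]

#include <sys/prctl.h>

/* Call this from the thread that should receive SIGBUS(BUS_MCEERR_AO). */
static int opt_in_to_early_kill(void)
{
	/* The early-kill policy is per thread; other threads keep the default. */
	return prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
}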
diff --git a/MAINTAINERS b/MAINTAINERS
index e433e45814af..7d101d5ba953 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3882,6 +3882,11 @@ L: kvm@vger.kernel.org
 S:	Supported
 F:	drivers/uio/uio_pci_generic.c
 
+GET_MAINTAINER SCRIPT
+M:	Joe Perches <joe@perches.com>
+S:	Maintained
+F:	scripts/get_maintainer.pl
+
 GFS2 FILE SYSTEM
 M:	Steven Whitehouse <swhiteho@redhat.com>
 L:	cluster-devel@redhat.com
@@ -4006,9 +4011,8 @@ S: Odd Fixes
 F:	drivers/media/usb/hdpvr/
 
 HWPOISON MEMORY FAILURE HANDLING
-M:	Andi Kleen <andi@firstfloor.org>
+M:	Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
 L:	linux-mm@kvack.org
-T:	git git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-mce-2.6.git hwpoison
 S:	Maintained
 F:	mm/memory-failure.c
 F:	mm/hwpoison-inject.c
diff --git a/arch/arc/kernel/troubleshoot.c b/arch/arc/kernel/troubleshoot.c
index 73a7450ee622..1badf9b84b51 100644
--- a/arch/arc/kernel/troubleshoot.c
+++ b/arch/arc/kernel/troubleshoot.c
@@ -86,12 +86,13 @@ static void show_faulting_vma(unsigned long address, char *buf)
 	unsigned long ino = 0;
 	dev_t dev = 0;
 	char *nm = buf;
+	struct mm_struct *active_mm = current->active_mm;
 
 	/* can't use print_vma_addr() yet as it doesn't check for
 	 * non-inclusive vma
 	 */
-
-	vma = find_vma(current->active_mm, address);
+	down_read(&active_mm->mmap_sem);
+	vma = find_vma(active_mm, address);
 
 	/* check against the find_vma( ) behaviour which returns the next VMA
 	 * if the container VMA is not found
@@ -110,9 +111,10 @@ static void show_faulting_vma(unsigned long address, char *buf)
 			vma->vm_start < TASK_UNMAPPED_BASE ?
 				address : address - vma->vm_start,
 			nm, vma->vm_start, vma->vm_end);
-	} else {
+	} else
 		pr_info("    @No matching VMA found\n");
-	}
+
+	up_read(&active_mm->mmap_sem);
 }
 
 static void show_ecr_verbose(struct pt_regs *regs)
diff --git a/arch/arm/mm/hugetlbpage.c b/arch/arm/mm/hugetlbpage.c
index 54ee6163c181..66781bf34077 100644
--- a/arch/arm/mm/hugetlbpage.c
+++ b/arch/arm/mm/hugetlbpage.c
@@ -56,8 +56,3 @@ int pmd_huge(pmd_t pmd)
 {
 	return pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT);
 }
-
-int pmd_huge_support(void)
-{
-	return 1;
-}
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 31eb959e9aa8..023747bf4dd7 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -58,11 +58,6 @@ int pud_huge(pud_t pud)
 #endif
 }
 
-int pmd_huge_support(void)
-{
-	return 1;
-}
-
 static __init int setup_hugepagesz(char *opt)
 {
 	unsigned long ps = memparse(opt, &opt);
diff --git a/arch/blackfin/include/asm/unistd.h b/arch/blackfin/include/asm/unistd.h
index c35414bdf7bd..c8c8ff9eff61 100644
--- a/arch/blackfin/include/asm/unistd.h
+++ b/arch/blackfin/include/asm/unistd.h
@@ -12,7 +12,6 @@
 #define __ARCH_WANT_SYS_ALARM
 #define __ARCH_WANT_SYS_GETHOSTNAME
 #define __ARCH_WANT_SYS_PAUSE
-#define __ARCH_WANT_SYS_SGETMASK
 #define __ARCH_WANT_SYS_TIME
 #define __ARCH_WANT_SYS_FADVISE64
 #define __ARCH_WANT_SYS_GETPGRP
diff --git a/arch/cris/include/asm/unistd.h b/arch/cris/include/asm/unistd.h
index 5cc7d1991e48..0f40fed1ba25 100644
--- a/arch/cris/include/asm/unistd.h
+++ b/arch/cris/include/asm/unistd.h
@@ -15,7 +15,6 @@
 #define __ARCH_WANT_SYS_GETHOSTNAME
 #define __ARCH_WANT_SYS_IPC
 #define __ARCH_WANT_SYS_PAUSE
-#define __ARCH_WANT_SYS_SGETMASK
 #define __ARCH_WANT_SYS_SIGNAL
 #define __ARCH_WANT_SYS_TIME
 #define __ARCH_WANT_SYS_UTIME
diff --git a/arch/frv/include/asm/unistd.h b/arch/frv/include/asm/unistd.h
index 70ec7293dce7..17b5df8fc28a 100644
--- a/arch/frv/include/asm/unistd.h
+++ b/arch/frv/include/asm/unistd.h
@@ -13,7 +13,6 @@
 /* #define __ARCH_WANT_SYS_GETHOSTNAME */
 #define __ARCH_WANT_SYS_IPC
 #define __ARCH_WANT_SYS_PAUSE
-/* #define __ARCH_WANT_SYS_SGETMASK */
 /* #define __ARCH_WANT_SYS_SIGNAL */
 #define __ARCH_WANT_SYS_TIME
 #define __ARCH_WANT_SYS_UTIME
diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h
index 3202aa74e0d6..6437ca21f61b 100644
--- a/arch/ia64/include/asm/topology.h
+++ b/arch/ia64/include/asm/topology.h
@@ -21,7 +21,8 @@
 #define PENALTY_FOR_NODE_WITH_CPUS	255
 
 /*
- * Distance above which we begin to use zone reclaim
+ * Nodes within this distance are eligible for reclaim by zone_reclaim() when
+ * zone_reclaim_mode is enabled.
  */
 #define RECLAIM_DISTANCE 15
 
diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c
index 68232db98baa..76069c18ee42 100644
--- a/arch/ia64/mm/hugetlbpage.c
+++ b/arch/ia64/mm/hugetlbpage.c
@@ -114,11 +114,6 @@ int pud_huge(pud_t pud)
 	return 0;
 }
 
-int pmd_huge_support(void)
-{
-	return 0;
-}
-
 struct page *
 follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write)
 {
diff --git a/arch/m68k/include/asm/unistd.h b/arch/m68k/include/asm/unistd.h
index 33afa56ad47a..1fcdd344c7ad 100644
--- a/arch/m68k/include/asm/unistd.h
+++ b/arch/m68k/include/asm/unistd.h
@@ -13,7 +13,6 @@
 #define __ARCH_WANT_SYS_GETHOSTNAME
 #define __ARCH_WANT_SYS_IPC
 #define __ARCH_WANT_SYS_PAUSE
-#define __ARCH_WANT_SYS_SGETMASK
 #define __ARCH_WANT_SYS_SIGNAL
 #define __ARCH_WANT_SYS_TIME
 #define __ARCH_WANT_SYS_UTIME
diff --git a/arch/metag/mm/hugetlbpage.c b/arch/metag/mm/hugetlbpage.c
index 042431509b56..3c52fa6d0f8e 100644
--- a/arch/metag/mm/hugetlbpage.c
+++ b/arch/metag/mm/hugetlbpage.c
@@ -110,11 +110,6 @@ int pud_huge(pud_t pud)
 	return 0;
 }
 
-int pmd_huge_support(void)
-{
-	return 1;
-}
-
 struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 			     pmd_t *pmd, int write)
 {
diff --git a/arch/microblaze/include/asm/unistd.h b/arch/microblaze/include/asm/unistd.h
index b14232b6878f..fd56a8f66489 100644
--- a/arch/microblaze/include/asm/unistd.h
+++ b/arch/microblaze/include/asm/unistd.h
@@ -19,7 +19,6 @@
 #define __ARCH_WANT_SYS_ALARM
 #define __ARCH_WANT_SYS_GETHOSTNAME
 #define __ARCH_WANT_SYS_PAUSE
-#define __ARCH_WANT_SYS_SGETMASK
 #define __ARCH_WANT_SYS_SIGNAL
 #define __ARCH_WANT_SYS_TIME
 #define __ARCH_WANT_SYS_UTIME
diff --git a/arch/mips/include/asm/unistd.h b/arch/mips/include/asm/unistd.h
index 413d6c612bec..e55813029d5a 100644
--- a/arch/mips/include/asm/unistd.h
+++ b/arch/mips/include/asm/unistd.h
@@ -29,7 +29,6 @@
 #define __ARCH_WANT_SYS_GETHOSTNAME
 #define __ARCH_WANT_SYS_IPC
 #define __ARCH_WANT_SYS_PAUSE
-#define __ARCH_WANT_SYS_SGETMASK
 #define __ARCH_WANT_SYS_UTIME
 #define __ARCH_WANT_SYS_WAITPID
 #define __ARCH_WANT_SYS_SOCKETCALL
diff --git a/arch/mips/mm/hugetlbpage.c b/arch/mips/mm/hugetlbpage.c
index 77e0ae036e7c..4ec8ee10d371 100644
--- a/arch/mips/mm/hugetlbpage.c
+++ b/arch/mips/mm/hugetlbpage.c
@@ -84,11 +84,6 @@ int pud_huge(pud_t pud)
 	return (pud_val(pud) & _PAGE_HUGE) != 0;
 }
 
-int pmd_huge_support(void)
-{
-	return 1;
-}
-
 struct page *
 follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 		pmd_t *pmd, int write)
diff --git a/arch/mn10300/include/asm/unistd.h b/arch/mn10300/include/asm/unistd.h
index 9d4e2d1ef90e..0522468f488b 100644
--- a/arch/mn10300/include/asm/unistd.h
+++ b/arch/mn10300/include/asm/unistd.h
@@ -26,7 +26,6 @@
 #define __ARCH_WANT_SYS_GETHOSTNAME
 #define __ARCH_WANT_SYS_IPC
 #define __ARCH_WANT_SYS_PAUSE
-#define __ARCH_WANT_SYS_SGETMASK
 #define __ARCH_WANT_SYS_SIGNAL
 #define __ARCH_WANT_SYS_TIME
 #define __ARCH_WANT_SYS_UTIME
diff --git a/arch/parisc/include/asm/unistd.h b/arch/parisc/include/asm/unistd.h
index 74d835820ee7..5f4c68daa261 100644
--- a/arch/parisc/include/asm/unistd.h
+++ b/arch/parisc/include/asm/unistd.h
@@ -145,7 +145,6 @@ type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5) \
 #define __ARCH_WANT_SYS_ALARM
 #define __ARCH_WANT_SYS_GETHOSTNAME
 #define __ARCH_WANT_SYS_PAUSE
-#define __ARCH_WANT_SYS_SGETMASK
 #define __ARCH_WANT_SYS_SIGNAL
 #define __ARCH_WANT_SYS_TIME
 #define __ARCH_WANT_COMPAT_SYS_TIME
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index 3ebb188c3ff5..d98c1ecc3266 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -44,6 +44,12 @@ static inline int pte_present(pte_t pte)
 	return pte_val(pte) & (_PAGE_PRESENT | _PAGE_NUMA);
 }
 
+#define pte_present_nonuma pte_present_nonuma
+static inline int pte_present_nonuma(pte_t pte)
+{
+	return pte_val(pte) & (_PAGE_PRESENT);
+}
+
 #define pte_numa pte_numa
 static inline int pte_numa(pte_t pte)
 {
diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
index c9202151079f..6c8a8c5a37a1 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -9,12 +9,8 @@ struct device_node;
 #ifdef CONFIG_NUMA
 
 /*
- * Before going off node we want the VM to try and reclaim from the local
- * node. It does this if the remote distance is larger than RECLAIM_DISTANCE.
- * With the default REMOTE_DISTANCE of 20 and the default RECLAIM_DISTANCE of
- * 20, we never reclaim and go off node straight away.
- *
- * To fix this we choose a smaller value of RECLAIM_DISTANCE.
+ * If zone_reclaim_mode is enabled, a RECLAIM_DISTANCE of 10 will mean that
+ * all zones on all nodes will be eligible for zone_reclaim().
  */
 #define RECLAIM_DISTANCE 10
 
diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h
index 9b892bbd9d84..5ce5552ab9f5 100644
--- a/arch/powerpc/include/asm/unistd.h
+++ b/arch/powerpc/include/asm/unistd.h
@@ -29,7 +29,6 @@
 #define __ARCH_WANT_SYS_GETHOSTNAME
 #define __ARCH_WANT_SYS_IPC
 #define __ARCH_WANT_SYS_PAUSE
-#define __ARCH_WANT_SYS_SGETMASK
 #define __ARCH_WANT_SYS_SIGNAL
 #define __ARCH_WANT_SYS_TIME
 #define __ARCH_WANT_SYS_UTIME
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index eb923654ba80..7e70ae968e5f 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -86,11 +86,6 @@ int pgd_huge(pgd_t pgd)
 	 */
 	return ((pgd_val(pgd) & 0x3) != 0x0);
 }
-
-int pmd_huge_support(void)
-{
-	return 1;
-}
 #else
 int pmd_huge(pmd_t pmd)
 {
@@ -106,11 +101,6 @@ int pgd_huge(pgd_t pgd)
 {
 	return 0;
 }
-
-int pmd_huge_support(void)
-{
-	return 0;
-}
 #endif
 
 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c
index 0727a55d87d9..0ff66a7e29bb 100644
--- a/arch/s390/mm/hugetlbpage.c
+++ b/arch/s390/mm/hugetlbpage.c
@@ -220,11 +220,6 @@ int pud_huge(pud_t pud)
 	return 0;
 }
 
-int pmd_huge_support(void)
-{
-	return 1;
-}
-
 struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 			     pmd_t *pmdp, int write)
 {
diff --git a/arch/sh/include/asm/unistd.h b/arch/sh/include/asm/unistd.h
index e77816c4b9bc..126fe8340b22 100644
--- a/arch/sh/include/asm/unistd.h
+++ b/arch/sh/include/asm/unistd.h
@@ -11,7 +11,6 @@
 # define __ARCH_WANT_SYS_GETHOSTNAME
 # define __ARCH_WANT_SYS_IPC
 # define __ARCH_WANT_SYS_PAUSE
-# define __ARCH_WANT_SYS_SGETMASK
 # define __ARCH_WANT_SYS_SIGNAL
 # define __ARCH_WANT_SYS_TIME
 # define __ARCH_WANT_SYS_UTIME
diff --git a/arch/sh/kernel/hw_breakpoint.c b/arch/sh/kernel/hw_breakpoint.c
index f9173766ec4b..2197fc584186 100644
--- a/arch/sh/kernel/hw_breakpoint.c
+++ b/arch/sh/kernel/hw_breakpoint.c
@@ -52,7 +52,7 @@ int arch_install_hw_breakpoint(struct perf_event *bp)
 	int i;
 
 	for (i = 0; i < sh_ubc->num_events; i++) {
-		struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]);
+		struct perf_event **slot = this_cpu_ptr(&bp_per_reg[i]);
 
 		if (!*slot) {
 			*slot = bp;
@@ -84,7 +84,7 @@ void arch_uninstall_hw_breakpoint(struct perf_event *bp)
 	int i;
 
 	for (i = 0; i < sh_ubc->num_events; i++) {
-		struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]);
+		struct perf_event **slot = this_cpu_ptr(&bp_per_reg[i]);
 
 		if (*slot == bp) {
 			*slot = NULL;
diff --git a/arch/sh/kernel/kprobes.c b/arch/sh/kernel/kprobes.c
index 42b46e61a2d5..83acbf3f6de8 100644
--- a/arch/sh/kernel/kprobes.c
+++ b/arch/sh/kernel/kprobes.c
@@ -102,7 +102,7 @@ int __kprobes kprobe_handle_illslot(unsigned long pc)
 
 void __kprobes arch_remove_kprobe(struct kprobe *p)
 {
-	struct kprobe *saved = &__get_cpu_var(saved_next_opcode);
+	struct kprobe *saved = this_cpu_ptr(&saved_next_opcode);
 
 	if (saved->addr) {
 		arch_disarm_kprobe(p);
@@ -111,7 +111,7 @@ void __kprobes arch_remove_kprobe(struct kprobe *p)
 		saved->addr = NULL;
 		saved->opcode = 0;
 
-		saved = &__get_cpu_var(saved_next_opcode2);
+		saved = this_cpu_ptr(&saved_next_opcode2);
 		if (saved->addr) {
 			arch_disarm_kprobe(saved);
 
@@ -129,14 +129,14 @@ static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
 
 static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
 {
-	__get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp;
+	__this_cpu_write(current_kprobe, kcb->prev_kprobe.kp);
 	kcb->kprobe_status = kcb->prev_kprobe.status;
 }
 
 static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
 					 struct kprobe_ctlblk *kcb)
 {
-	__get_cpu_var(current_kprobe) = p;
+	__this_cpu_write(current_kprobe, p);
 }
 
 /*
@@ -146,15 +146,15 @@ static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
  */
 static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
 {
-	__get_cpu_var(saved_current_opcode).addr = (kprobe_opcode_t *)regs->pc;
+	__this_cpu_write(saved_current_opcode.addr, (kprobe_opcode_t *)regs->pc);
 
 	if (p != NULL) {
 		struct kprobe *op1, *op2;
 
 		arch_disarm_kprobe(p);
 
-		op1 = &__get_cpu_var(saved_next_opcode);
-		op2 = &__get_cpu_var(saved_next_opcode2);
+		op1 = this_cpu_ptr(&saved_next_opcode);
+		op2 = this_cpu_ptr(&saved_next_opcode2);
 
 		if (OPCODE_JSR(p->opcode) || OPCODE_JMP(p->opcode)) {
 			unsigned int reg_nr = ((p->opcode >> 8) & 0x000F);
@@ -249,7 +249,7 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
 			kcb->kprobe_status = KPROBE_REENTER;
 			return 1;
 		} else {
-			p = __get_cpu_var(current_kprobe);
+			p = __this_cpu_read(current_kprobe);
 			if (p->break_handler && p->break_handler(p, regs)) {
 				goto ss_probe;
 			}
@@ -336,9 +336,9 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
 			continue;
 
 		if (ri->rp && ri->rp->handler) {
-			__get_cpu_var(current_kprobe) = &ri->rp->kp;
+			__this_cpu_write(current_kprobe, &ri->rp->kp);
 			ri->rp->handler(ri, regs);
-			__get_cpu_var(current_kprobe) = NULL;
+			__this_cpu_write(current_kprobe, NULL);
 		}
 
 		orig_ret_address = (unsigned long)ri->ret_addr;
@@ -383,19 +383,19 @@ static int __kprobes post_kprobe_handler(struct pt_regs *regs)
 		cur->post_handler(cur, regs, 0);
 	}
 
-	p = &__get_cpu_var(saved_next_opcode);
+	p = this_cpu_ptr(&saved_next_opcode);
 	if (p->addr) {
 		arch_disarm_kprobe(p);
 		p->addr = NULL;
 		p->opcode = 0;
 
-		addr = __get_cpu_var(saved_current_opcode).addr;
-		__get_cpu_var(saved_current_opcode).addr = NULL;
+		addr = __this_cpu_read(saved_current_opcode.addr);
+		__this_cpu_write(saved_current_opcode.addr, NULL);
 
 		p = get_kprobe(addr);
 		arch_arm_kprobe(p);
 
-		p = &__get_cpu_var(saved_next_opcode2);
+		p = this_cpu_ptr(&saved_next_opcode2);
 		if (p->addr) {
 			arch_disarm_kprobe(p);
 			p->addr = NULL;
@@ -511,7 +511,7 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
 		if (kprobe_handler(args->regs)) {
 			ret = NOTIFY_STOP;
 		} else {
-			p = __get_cpu_var(current_kprobe);
+			p = __this_cpu_read(current_kprobe);
 			if (p->break_handler &&
 			    p->break_handler(p, args->regs))
 				ret = NOTIFY_STOP;
diff --git a/arch/sh/kernel/localtimer.c b/arch/sh/kernel/localtimer.c
index 8bfc6dfa8b94..b880a7e2ace7 100644
--- a/arch/sh/kernel/localtimer.c
+++ b/arch/sh/kernel/localtimer.c
@@ -32,7 +32,7 @@ static DEFINE_PER_CPU(struct clock_event_device, local_clockevent);
32 */ 32 */
33void local_timer_interrupt(void) 33void local_timer_interrupt(void)
34{ 34{
35 struct clock_event_device *clk = &__get_cpu_var(local_clockevent); 35 struct clock_event_device *clk = this_cpu_ptr(&local_clockevent);
36 36
37 irq_enter(); 37 irq_enter();
38 clk->event_handler(clk); 38 clk->event_handler(clk);
diff --git a/arch/sh/kernel/perf_event.c b/arch/sh/kernel/perf_event.c
index b9cefebda55c..02331672b6db 100644
--- a/arch/sh/kernel/perf_event.c
+++ b/arch/sh/kernel/perf_event.c
@@ -227,7 +227,7 @@ again:
227 227
228static void sh_pmu_stop(struct perf_event *event, int flags) 228static void sh_pmu_stop(struct perf_event *event, int flags)
229{ 229{
230 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 230 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
231 struct hw_perf_event *hwc = &event->hw; 231 struct hw_perf_event *hwc = &event->hw;
232 int idx = hwc->idx; 232 int idx = hwc->idx;
233 233
@@ -245,7 +245,7 @@ static void sh_pmu_stop(struct perf_event *event, int flags)
245 245
246static void sh_pmu_start(struct perf_event *event, int flags) 246static void sh_pmu_start(struct perf_event *event, int flags)
247{ 247{
248 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 248 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
249 struct hw_perf_event *hwc = &event->hw; 249 struct hw_perf_event *hwc = &event->hw;
250 int idx = hwc->idx; 250 int idx = hwc->idx;
251 251
@@ -262,7 +262,7 @@ static void sh_pmu_start(struct perf_event *event, int flags)
262 262
263static void sh_pmu_del(struct perf_event *event, int flags) 263static void sh_pmu_del(struct perf_event *event, int flags)
264{ 264{
265 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 265 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
266 266
267 sh_pmu_stop(event, PERF_EF_UPDATE); 267 sh_pmu_stop(event, PERF_EF_UPDATE);
268 __clear_bit(event->hw.idx, cpuc->used_mask); 268 __clear_bit(event->hw.idx, cpuc->used_mask);
@@ -272,7 +272,7 @@ static void sh_pmu_del(struct perf_event *event, int flags)
272 272
273static int sh_pmu_add(struct perf_event *event, int flags) 273static int sh_pmu_add(struct perf_event *event, int flags)
274{ 274{
275 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 275 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
276 struct hw_perf_event *hwc = &event->hw; 276 struct hw_perf_event *hwc = &event->hw;
277 int idx = hwc->idx; 277 int idx = hwc->idx;
278 int ret = -EAGAIN; 278 int ret = -EAGAIN;
diff --git a/arch/sh/kernel/smp.c b/arch/sh/kernel/smp.c
index 86a7936a980b..fc5acfc93c92 100644
--- a/arch/sh/kernel/smp.c
+++ b/arch/sh/kernel/smp.c
@@ -111,7 +111,7 @@ void play_dead_common(void)
111 irq_ctx_exit(raw_smp_processor_id()); 111 irq_ctx_exit(raw_smp_processor_id());
112 mb(); 112 mb();
113 113
114 __get_cpu_var(cpu_state) = CPU_DEAD; 114 __this_cpu_write(cpu_state, CPU_DEAD);
115 local_irq_disable(); 115 local_irq_disable();
116} 116}
117 117
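The arch/sh hunks above are a mechanical conversion from the deprecated __get_cpu_var() accessor to the this_cpu family: &__get_cpu_var(x) becomes this_cpu_ptr(&x), scalar reads become __this_cpu_read(x), and scalar writes become __this_cpu_write(x, v). Below is a minimal userspace sketch of those three shapes only; the per-CPU machinery is mocked with single-slot stand-ins and is not the kernel implementation.

#include <stdio.h>

struct kprobe { void *addr; unsigned int opcode; };

/* Single-slot stand-ins for DEFINE_PER_CPU storage: one instance plays
 * the role of "this CPU's" copy. */
static struct kprobe saved_next_opcode;
static struct kprobe *current_kprobe;

/* Mock accessors with the same shapes as the replacements above:
 *   &__get_cpu_var(v)     ->  this_cpu_ptr(&v)         take the address
 *   x = __get_cpu_var(v)  ->  x = __this_cpu_read(v)   scalar load
 *   __get_cpu_var(v) = x  ->  __this_cpu_write(v, x)   scalar store    */
#define this_cpu_ptr(ptr)        (ptr)
#define __this_cpu_read(var)     (var)
#define __this_cpu_write(var, x) ((var) = (x))

int main(void)
{
	static struct kprobe probe = { .addr = (void *)0x1000, .opcode = 0x9 };
	struct kprobe *slot = this_cpu_ptr(&saved_next_opcode);

	__this_cpu_write(current_kprobe, &probe);
	slot->addr = __this_cpu_read(current_kprobe)->addr;
	printf("saved opcode slot now points at %p\n", slot->addr);
	return 0;
}

In the kernel the real macros resolve the address of the calling CPU's instance of the variable, so the conversion does not change behaviour; it only drops the deprecated interface.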
diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c
index 0d676a41081e..d7762349ea48 100644
--- a/arch/sh/mm/hugetlbpage.c
+++ b/arch/sh/mm/hugetlbpage.c
@@ -83,11 +83,6 @@ int pud_huge(pud_t pud)
83 return 0; 83 return 0;
84} 84}
85 85
86int pmd_huge_support(void)
87{
88 return 0;
89}
90
91struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, 86struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
92 pmd_t *pmd, int write) 87 pmd_t *pmd, int write)
93{ 88{
diff --git a/arch/sparc/include/asm/unistd.h b/arch/sparc/include/asm/unistd.h
index dfa53fdd5cbc..0aac1e8f2968 100644
--- a/arch/sparc/include/asm/unistd.h
+++ b/arch/sparc/include/asm/unistd.h
@@ -25,7 +25,6 @@
25#define __ARCH_WANT_SYS_ALARM 25#define __ARCH_WANT_SYS_ALARM
26#define __ARCH_WANT_SYS_GETHOSTNAME 26#define __ARCH_WANT_SYS_GETHOSTNAME
27#define __ARCH_WANT_SYS_PAUSE 27#define __ARCH_WANT_SYS_PAUSE
28#define __ARCH_WANT_SYS_SGETMASK
29#define __ARCH_WANT_SYS_SIGNAL 28#define __ARCH_WANT_SYS_SIGNAL
30#define __ARCH_WANT_SYS_TIME 29#define __ARCH_WANT_SYS_TIME
31#define __ARCH_WANT_SYS_UTIME 30#define __ARCH_WANT_SYS_UTIME
diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c
index 9bd9ce80bf77..d329537739c6 100644
--- a/arch/sparc/mm/hugetlbpage.c
+++ b/arch/sparc/mm/hugetlbpage.c
@@ -231,11 +231,6 @@ int pud_huge(pud_t pud)
231 return 0; 231 return 0;
232} 232}
233 233
234int pmd_huge_support(void)
235{
236 return 0;
237}
238
239struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, 234struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
240 pmd_t *pmd, int write) 235 pmd_t *pmd, int write)
241{ 236{
diff --git a/arch/tile/mm/homecache.c b/arch/tile/mm/homecache.c
index 004ba568d93f..33294fdc402e 100644
--- a/arch/tile/mm/homecache.c
+++ b/arch/tile/mm/homecache.c
@@ -417,7 +417,7 @@ void __homecache_free_pages(struct page *page, unsigned int order)
417 if (put_page_testzero(page)) { 417 if (put_page_testzero(page)) {
418 homecache_change_page_home(page, order, PAGE_HOME_HASH); 418 homecache_change_page_home(page, order, PAGE_HOME_HASH);
419 if (order == 0) { 419 if (order == 0) {
420 free_hot_cold_page(page, 0); 420 free_hot_cold_page(page, false);
421 } else { 421 } else {
422 init_page_count(page); 422 init_page_count(page);
423 __free_pages(page, order); 423 __free_pages(page, order);
diff --git a/arch/tile/mm/hugetlbpage.c b/arch/tile/mm/hugetlbpage.c
index 0cb3bbaa580c..e514899e1100 100644
--- a/arch/tile/mm/hugetlbpage.c
+++ b/arch/tile/mm/hugetlbpage.c
@@ -166,11 +166,6 @@ int pud_huge(pud_t pud)
166 return !!(pud_val(pud) & _PAGE_HUGE_PAGE); 166 return !!(pud_val(pud) & _PAGE_HUGE_PAGE);
167} 167}
168 168
169int pmd_huge_support(void)
170{
171 return 1;
172}
173
174struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, 169struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
175 pmd_t *pmd, int write) 170 pmd_t *pmd, int write)
176{ 171{
diff --git a/arch/unicore32/mm/ioremap.c b/arch/unicore32/mm/ioremap.c
index 13068ee22f33..bf012b2b71a9 100644
--- a/arch/unicore32/mm/ioremap.c
+++ b/arch/unicore32/mm/ioremap.c
@@ -144,11 +144,11 @@ void __iomem *__uc32_ioremap_pfn_caller(unsigned long pfn,
144 * Don't allow RAM to be mapped 144 * Don't allow RAM to be mapped
145 */ 145 */
146 if (pfn_valid(pfn)) { 146 if (pfn_valid(pfn)) {
147 printk(KERN_WARNING "BUG: Your driver calls ioremap() on\n" 147 WARN(1, "BUG: Your driver calls ioremap() on\n"
148 "system memory. This leads to architecturally\n" 148 "system memory. This leads to architecturally\n"
149 "unpredictable behaviour, and ioremap() will fail in\n" 149 "unpredictable behaviour, and ioremap() will fail in\n"
150 "the next kernel release. Please fix your driver.\n"); 150 "the next kernel release. Please fix your driver.\n");
151 WARN_ON(1); 151 return NULL;
152 } 152 }
153 153
154 type = get_mem_type(mtype); 154 type = get_mem_type(mtype);
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 7a01d4335029..272b493ea1bf 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -26,7 +26,7 @@ config X86
26 select ARCH_MIGHT_HAVE_PC_SERIO 26 select ARCH_MIGHT_HAVE_PC_SERIO
27 select HAVE_AOUT if X86_32 27 select HAVE_AOUT if X86_32
28 select HAVE_UNSTABLE_SCHED_CLOCK 28 select HAVE_UNSTABLE_SCHED_CLOCK
29 select ARCH_SUPPORTS_NUMA_BALANCING 29 select ARCH_SUPPORTS_NUMA_BALANCING if X86_64
30 select ARCH_SUPPORTS_INT128 if X86_64 30 select ARCH_SUPPORTS_INT128 if X86_64
31 select ARCH_WANTS_PROT_NUMA_PROT_NONE 31 select ARCH_WANTS_PROT_NUMA_PROT_NONE
32 select HAVE_IDE 32 select HAVE_IDE
@@ -41,7 +41,7 @@ config X86
41 select ARCH_WANT_OPTIONAL_GPIOLIB 41 select ARCH_WANT_OPTIONAL_GPIOLIB
42 select ARCH_WANT_FRAME_POINTERS 42 select ARCH_WANT_FRAME_POINTERS
43 select HAVE_DMA_ATTRS 43 select HAVE_DMA_ATTRS
44 select HAVE_DMA_CONTIGUOUS if !SWIOTLB 44 select HAVE_DMA_CONTIGUOUS
45 select HAVE_KRETPROBES 45 select HAVE_KRETPROBES
46 select GENERIC_EARLY_IOREMAP 46 select GENERIC_EARLY_IOREMAP
47 select HAVE_OPTPROBES 47 select HAVE_OPTPROBES
@@ -105,7 +105,7 @@ config X86
105 select HAVE_ARCH_SECCOMP_FILTER 105 select HAVE_ARCH_SECCOMP_FILTER
106 select BUILDTIME_EXTABLE_SORT 106 select BUILDTIME_EXTABLE_SORT
107 select GENERIC_CMOS_UPDATE 107 select GENERIC_CMOS_UPDATE
108 select HAVE_ARCH_SOFT_DIRTY 108 select HAVE_ARCH_SOFT_DIRTY if X86_64
109 select CLOCKSOURCE_WATCHDOG 109 select CLOCKSOURCE_WATCHDOG
110 select GENERIC_CLOCKEVENTS 110 select GENERIC_CLOCKEVENTS
111 select ARCH_CLOCKSOURCE_DATA 111 select ARCH_CLOCKSOURCE_DATA
@@ -1874,6 +1874,10 @@ config ARCH_ENABLE_SPLIT_PMD_PTLOCK
1874 def_bool y 1874 def_bool y
1875 depends on X86_64 || X86_PAE 1875 depends on X86_64 || X86_PAE
1876 1876
1877config ARCH_ENABLE_HUGEPAGE_MIGRATION
1878 def_bool y
1879 depends on X86_64 && HUGETLB_PAGE && MIGRATION
1880
1877menu "Power management and ACPI options" 1881menu "Power management and ACPI options"
1878 1882
1879config ARCH_HIBERNATION_HEADER 1883config ARCH_HIBERNATION_HEADER
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 6e4ce2df87cf..958b90f761e5 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -176,8 +176,6 @@ int mce_available(struct cpuinfo_x86 *c);
176DECLARE_PER_CPU(unsigned, mce_exception_count); 176DECLARE_PER_CPU(unsigned, mce_exception_count);
177DECLARE_PER_CPU(unsigned, mce_poll_count); 177DECLARE_PER_CPU(unsigned, mce_poll_count);
178 178
179extern atomic_t mce_entry;
180
181typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS); 179typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS);
182DECLARE_PER_CPU(mce_banks_t, mce_poll_banks); 180DECLARE_PER_CPU(mce_banks_t, mce_poll_banks);
183 181
diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h
index 0d193e234647..206a87fdd22d 100644
--- a/arch/x86/include/asm/pgtable-2level.h
+++ b/arch/x86/include/asm/pgtable-2level.h
@@ -62,66 +62,14 @@ static inline unsigned long pte_bitop(unsigned long value, unsigned int rightshi
62 return ((value >> rightshift) & mask) << leftshift; 62 return ((value >> rightshift) & mask) << leftshift;
63} 63}
64 64
65#ifdef CONFIG_MEM_SOFT_DIRTY
66
67/*
68 * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE, _PAGE_BIT_SOFT_DIRTY and
69 * _PAGE_BIT_PROTNONE are taken, split up the 28 bits of offset
70 * into this range.
71 */
72#define PTE_FILE_MAX_BITS 28
73#define PTE_FILE_SHIFT1 (_PAGE_BIT_PRESENT + 1)
74#define PTE_FILE_SHIFT2 (_PAGE_BIT_FILE + 1)
75#define PTE_FILE_SHIFT3 (_PAGE_BIT_PROTNONE + 1)
76#define PTE_FILE_SHIFT4 (_PAGE_BIT_SOFT_DIRTY + 1)
77#define PTE_FILE_BITS1 (PTE_FILE_SHIFT2 - PTE_FILE_SHIFT1 - 1)
78#define PTE_FILE_BITS2 (PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1)
79#define PTE_FILE_BITS3 (PTE_FILE_SHIFT4 - PTE_FILE_SHIFT3 - 1)
80
81#define PTE_FILE_MASK1 ((1U << PTE_FILE_BITS1) - 1)
82#define PTE_FILE_MASK2 ((1U << PTE_FILE_BITS2) - 1)
83#define PTE_FILE_MASK3 ((1U << PTE_FILE_BITS3) - 1)
84
85#define PTE_FILE_LSHIFT2 (PTE_FILE_BITS1)
86#define PTE_FILE_LSHIFT3 (PTE_FILE_BITS1 + PTE_FILE_BITS2)
87#define PTE_FILE_LSHIFT4 (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3)
88
89static __always_inline pgoff_t pte_to_pgoff(pte_t pte)
90{
91 return (pgoff_t)
92 (pte_bitop(pte.pte_low, PTE_FILE_SHIFT1, PTE_FILE_MASK1, 0) +
93 pte_bitop(pte.pte_low, PTE_FILE_SHIFT2, PTE_FILE_MASK2, PTE_FILE_LSHIFT2) +
94 pte_bitop(pte.pte_low, PTE_FILE_SHIFT3, PTE_FILE_MASK3, PTE_FILE_LSHIFT3) +
95 pte_bitop(pte.pte_low, PTE_FILE_SHIFT4, -1UL, PTE_FILE_LSHIFT4));
96}
97
98static __always_inline pte_t pgoff_to_pte(pgoff_t off)
99{
100 return (pte_t){
101 .pte_low =
102 pte_bitop(off, 0, PTE_FILE_MASK1, PTE_FILE_SHIFT1) +
103 pte_bitop(off, PTE_FILE_LSHIFT2, PTE_FILE_MASK2, PTE_FILE_SHIFT2) +
104 pte_bitop(off, PTE_FILE_LSHIFT3, PTE_FILE_MASK3, PTE_FILE_SHIFT3) +
105 pte_bitop(off, PTE_FILE_LSHIFT4, -1UL, PTE_FILE_SHIFT4) +
106 _PAGE_FILE,
107 };
108}
109
110#else /* CONFIG_MEM_SOFT_DIRTY */
111
112/* 65/*
113 * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken, 66 * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken,
114 * split up the 29 bits of offset into this range. 67 * split up the 29 bits of offset into this range.
115 */ 68 */
116#define PTE_FILE_MAX_BITS 29 69#define PTE_FILE_MAX_BITS 29
117#define PTE_FILE_SHIFT1 (_PAGE_BIT_PRESENT + 1) 70#define PTE_FILE_SHIFT1 (_PAGE_BIT_PRESENT + 1)
118#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
119#define PTE_FILE_SHIFT2 (_PAGE_BIT_FILE + 1) 71#define PTE_FILE_SHIFT2 (_PAGE_BIT_FILE + 1)
120#define PTE_FILE_SHIFT3 (_PAGE_BIT_PROTNONE + 1) 72#define PTE_FILE_SHIFT3 (_PAGE_BIT_PROTNONE + 1)
121#else
122#define PTE_FILE_SHIFT2 (_PAGE_BIT_PROTNONE + 1)
123#define PTE_FILE_SHIFT3 (_PAGE_BIT_FILE + 1)
124#endif
125#define PTE_FILE_BITS1 (PTE_FILE_SHIFT2 - PTE_FILE_SHIFT1 - 1) 73#define PTE_FILE_BITS1 (PTE_FILE_SHIFT2 - PTE_FILE_SHIFT1 - 1)
126#define PTE_FILE_BITS2 (PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1) 74#define PTE_FILE_BITS2 (PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1)
127 75
@@ -150,16 +98,9 @@ static __always_inline pte_t pgoff_to_pte(pgoff_t off)
150 }; 98 };
151} 99}
152 100
153#endif /* CONFIG_MEM_SOFT_DIRTY */
154
155/* Encode and de-code a swap entry */ 101/* Encode and de-code a swap entry */
156#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
157#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) 102#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)
158#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1) 103#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1)
159#else
160#define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1)
161#define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1)
162#endif
163 104
164#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) 105#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
165 106
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index b459ddf27d64..0ec056012618 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -131,7 +131,8 @@ static inline int pte_exec(pte_t pte)
131 131
132static inline int pte_special(pte_t pte) 132static inline int pte_special(pte_t pte)
133{ 133{
134 return pte_flags(pte) & _PAGE_SPECIAL; 134 return (pte_flags(pte) & (_PAGE_PRESENT|_PAGE_SPECIAL)) ==
135 (_PAGE_PRESENT|_PAGE_SPECIAL);
135} 136}
136 137
137static inline unsigned long pte_pfn(pte_t pte) 138static inline unsigned long pte_pfn(pte_t pte)
@@ -296,6 +297,7 @@ static inline pmd_t pmd_mknotpresent(pmd_t pmd)
296 return pmd_clear_flags(pmd, _PAGE_PRESENT); 297 return pmd_clear_flags(pmd, _PAGE_PRESENT);
297} 298}
298 299
300#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
299static inline int pte_soft_dirty(pte_t pte) 301static inline int pte_soft_dirty(pte_t pte)
300{ 302{
301 return pte_flags(pte) & _PAGE_SOFT_DIRTY; 303 return pte_flags(pte) & _PAGE_SOFT_DIRTY;
@@ -331,6 +333,8 @@ static inline int pte_file_soft_dirty(pte_t pte)
331 return pte_flags(pte) & _PAGE_SOFT_DIRTY; 333 return pte_flags(pte) & _PAGE_SOFT_DIRTY;
332} 334}
333 335
336#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */
337
334/* 338/*
335 * Mask out unsupported bits in a present pgprot. Non-present pgprots 339 * Mask out unsupported bits in a present pgprot. Non-present pgprots
336 * can use those bits for other purposes, so leave them be. 340 * can use those bits for other purposes, so leave them be.
@@ -452,6 +456,12 @@ static inline int pte_present(pte_t a)
452 _PAGE_NUMA); 456 _PAGE_NUMA);
453} 457}
454 458
459#define pte_present_nonuma pte_present_nonuma
460static inline int pte_present_nonuma(pte_t a)
461{
462 return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
463}
464
455#define pte_accessible pte_accessible 465#define pte_accessible pte_accessible
456static inline bool pte_accessible(struct mm_struct *mm, pte_t a) 466static inline bool pte_accessible(struct mm_struct *mm, pte_t a)
457{ 467{
@@ -858,23 +868,25 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
858{ 868{
859} 869}
860 870
871#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
861static inline pte_t pte_swp_mksoft_dirty(pte_t pte) 872static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
862{ 873{
863 VM_BUG_ON(pte_present(pte)); 874 VM_BUG_ON(pte_present_nonuma(pte));
864 return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY); 875 return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY);
865} 876}
866 877
867static inline int pte_swp_soft_dirty(pte_t pte) 878static inline int pte_swp_soft_dirty(pte_t pte)
868{ 879{
869 VM_BUG_ON(pte_present(pte)); 880 VM_BUG_ON(pte_present_nonuma(pte));
870 return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY; 881 return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY;
871} 882}
872 883
873static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) 884static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
874{ 885{
875 VM_BUG_ON(pte_present(pte)); 886 VM_BUG_ON(pte_present_nonuma(pte));
876 return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); 887 return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY);
877} 888}
889#endif
878 890
879#include <asm-generic/pgtable.h> 891#include <asm-generic/pgtable.h>
880#endif /* __ASSEMBLY__ */ 892#endif /* __ASSEMBLY__ */
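The pte_special() change above tightens the test from "the special bit is set" to "the page is present and the special bit is set". The reason is visible in the pgtable_types.h hunk further down: _PAGE_BIT_SPECIAL is _PAGE_BIT_SOFTW1 (bit 9), and with NUMA balancing enabled _PAGE_BIT_NUMA is _PAGE_BIT_GLOBAL + 1, which is the same bit 9, so a not-present NUMA hinting pte would otherwise look special. A small standalone sketch of the two predicates, with the bit positions copied from that hunk and everything else illustrative:

#include <stdio.h>
#include <stdint.h>

/* Bit layout taken from the pgtable_types.h hunk below: bit 0 is
 * _PAGE_BIT_PRESENT, bit 8 _PAGE_BIT_GLOBAL, bit 9 _PAGE_BIT_SOFTW1,
 * shared by _PAGE_SPECIAL and (under NUMA balancing) _PAGE_NUMA. */
#define _PAGE_PRESENT (1ULL << 0)
#define _PAGE_SPECIAL (1ULL << 9)

static int pte_special_old(uint64_t flags)
{
	return !!(flags & _PAGE_SPECIAL);
}

static int pte_special_new(uint64_t flags)
{
	return (flags & (_PAGE_PRESENT | _PAGE_SPECIAL)) ==
	       (_PAGE_PRESENT | _PAGE_SPECIAL);
}

int main(void)
{
	uint64_t numa_hint = _PAGE_SPECIAL;                /* not present, bit 9 set */
	uint64_t special   = _PAGE_PRESENT | _PAGE_SPECIAL;

	printf("numa-style pte: old=%d new=%d\n",
	       pte_special_old(numa_hint), pte_special_new(numa_hint));
	printf("special pte:    old=%d new=%d\n",
	       pte_special_old(special), pte_special_new(special));
	return 0;
}

Only entries that are both present and special pass the new test, which is exactly what the reuse of bit 9 for NUMA hinting requires.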
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index e22c1dbf7feb..5be9063545d2 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -143,12 +143,12 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
143#define pte_unmap(pte) ((void)(pte))/* NOP */ 143#define pte_unmap(pte) ((void)(pte))/* NOP */
144 144
145/* Encode and de-code a swap entry */ 145/* Encode and de-code a swap entry */
146#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
147#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) 146#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)
148#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1) 147#ifdef CONFIG_NUMA_BALANCING
148/* Automatic NUMA balancing needs to be distinguishable from swap entries */
149#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 2)
149#else 150#else
150#define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1) 151#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1)
151#define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1)
152#endif 152#endif
153 153
154#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) 154#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index eb3d44945133..f216963760e5 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -16,15 +16,26 @@
16#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */ 16#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
17#define _PAGE_BIT_PAT 7 /* on 4KB pages */ 17#define _PAGE_BIT_PAT 7 /* on 4KB pages */
18#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ 18#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
19#define _PAGE_BIT_UNUSED1 9 /* available for programmer */ 19#define _PAGE_BIT_SOFTW1 9 /* available for programmer */
20#define _PAGE_BIT_IOMAP 10 /* flag used to indicate IO mapping */ 20#define _PAGE_BIT_SOFTW2 10 /* " */
21#define _PAGE_BIT_HIDDEN 11 /* hidden by kmemcheck */ 21#define _PAGE_BIT_SOFTW3 11 /* " */
22#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ 22#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
23#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1 23#define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1
24#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1 24#define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1
25#define _PAGE_BIT_SPLITTING _PAGE_BIT_UNUSED1 /* only valid on a PSE pmd */ 25#define _PAGE_BIT_SPLITTING _PAGE_BIT_SOFTW2 /* only valid on a PSE pmd */
26#define _PAGE_BIT_IOMAP _PAGE_BIT_SOFTW2 /* flag used to indicate IO mapping */
27#define _PAGE_BIT_HIDDEN _PAGE_BIT_SOFTW3 /* hidden by kmemcheck */
28#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */
26#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ 29#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
27 30
31/*
32 * Swap offsets on configurations that allow automatic NUMA balancing use the
33 * bits after _PAGE_BIT_GLOBAL. To uniquely distinguish NUMA hinting PTEs from
34 * swap entries, we use the first bit after _PAGE_BIT_GLOBAL and shrink the
35 * maximum possible swap space from 16TB to 8TB.
36 */
37#define _PAGE_BIT_NUMA (_PAGE_BIT_GLOBAL+1)
38
28/* If _PAGE_BIT_PRESENT is clear, we use these: */ 39/* If _PAGE_BIT_PRESENT is clear, we use these: */
29/* - if the user mapped it with PROT_NONE; pte_present gives true */ 40/* - if the user mapped it with PROT_NONE; pte_present gives true */
30#define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL 41#define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL
@@ -40,7 +51,7 @@
40#define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY) 51#define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
41#define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE) 52#define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
42#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) 53#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
43#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1) 54#define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1)
44#define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP) 55#define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP)
45#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) 56#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
46#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) 57#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
@@ -61,8 +72,6 @@
61 * they do not conflict with each other. 72 * they do not conflict with each other.
62 */ 73 */
63 74
64#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_HIDDEN
65
66#ifdef CONFIG_MEM_SOFT_DIRTY 75#ifdef CONFIG_MEM_SOFT_DIRTY
67#define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SOFT_DIRTY) 76#define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SOFT_DIRTY)
68#else 77#else
@@ -70,6 +79,21 @@
70#endif 79#endif
71 80
72/* 81/*
82 * _PAGE_NUMA distinguishes between a numa hinting minor fault and a page
83 * that is not present. The hinting fault gathers numa placement statistics
84 * (see pte_numa()). The bit is always zero when the PTE is not present.
85 *
86 * The bit picked must be always zero when the pmd is present and not
87 * present, so that we don't lose information when we set it while
88 * atomically clearing the present bit.
89 */
90#ifdef CONFIG_NUMA_BALANCING
91#define _PAGE_NUMA (_AT(pteval_t, 1) << _PAGE_BIT_NUMA)
92#else
93#define _PAGE_NUMA (_AT(pteval_t, 0))
94#endif
95
96/*
73 * Tracking soft dirty bit when a page goes to a swap is tricky. 97 * Tracking soft dirty bit when a page goes to a swap is tricky.
74 * We need a bit which can be stored in pte _and_ not conflict 98 * We need a bit which can be stored in pte _and_ not conflict
75 * with swap entry format. On x86 bits 6 and 7 are *not* involved 99 * with swap entry format. On x86 bits 6 and 7 are *not* involved
@@ -94,26 +118,6 @@
94#define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE) 118#define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE)
95#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) 119#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
96 120
97/*
98 * _PAGE_NUMA indicates that this page will trigger a numa hinting
99 * minor page fault to gather numa placement statistics (see
100 * pte_numa()). The bit picked (8) is within the range between
101 * _PAGE_FILE (6) and _PAGE_PROTNONE (8) bits. Therefore, it doesn't
102 * require changes to the swp entry format because that bit is always
103 * zero when the pte is not present.
104 *
105 * The bit picked must be always zero when the pmd is present and not
106 * present, so that we don't lose information when we set it while
107 * atomically clearing the present bit.
108 *
109 * Because we shared the same bit (8) with _PAGE_PROTNONE this can be
110 * interpreted as _PAGE_NUMA only in places that _PAGE_PROTNONE
111 * couldn't reach, like handle_mm_fault() (see access_error in
112 * arch/x86/mm/fault.c, the vma protection must not be PROT_NONE for
113 * handle_mm_fault() to be invoked).
114 */
115#define _PAGE_NUMA _PAGE_PROTNONE
116
117#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ 121#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
118 _PAGE_ACCESSED | _PAGE_DIRTY) 122 _PAGE_ACCESSED | _PAGE_DIRTY)
119#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ 123#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \
@@ -122,8 +126,8 @@
122/* Set of bits not changed in pte_modify */ 126/* Set of bits not changed in pte_modify */
123#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ 127#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
124 _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \ 128 _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \
125 _PAGE_SOFT_DIRTY) 129 _PAGE_SOFT_DIRTY | _PAGE_NUMA)
126#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE) 130#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_NUMA)
127 131
128#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT) 132#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT)
129#define _PAGE_CACHE_WB (0) 133#define _PAGE_CACHE_WB (0)
diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h
index 977f1761a25d..ab05d73e2bb7 100644
--- a/arch/x86/include/asm/swiotlb.h
+++ b/arch/x86/include/asm/swiotlb.h
@@ -29,4 +29,11 @@ static inline void pci_swiotlb_late_init(void)
29 29
30static inline void dma_mark_clean(void *addr, size_t size) {} 30static inline void dma_mark_clean(void *addr, size_t size) {}
31 31
32extern void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
33 dma_addr_t *dma_handle, gfp_t flags,
34 struct dma_attrs *attrs);
35extern void x86_swiotlb_free_coherent(struct device *dev, size_t size,
36 void *vaddr, dma_addr_t dma_addr,
37 struct dma_attrs *attrs);
38
32#endif /* _ASM_X86_SWIOTLB_H */ 39#endif /* _ASM_X86_SWIOTLB_H */
diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h
index 3f556c6a0157..2b19caa4081c 100644
--- a/arch/x86/include/asm/unistd.h
+++ b/arch/x86/include/asm/unistd.h
@@ -41,7 +41,6 @@
41# define __ARCH_WANT_SYS_OLD_GETRLIMIT 41# define __ARCH_WANT_SYS_OLD_GETRLIMIT
42# define __ARCH_WANT_SYS_OLD_UNAME 42# define __ARCH_WANT_SYS_OLD_UNAME
43# define __ARCH_WANT_SYS_PAUSE 43# define __ARCH_WANT_SYS_PAUSE
44# define __ARCH_WANT_SYS_SGETMASK
45# define __ARCH_WANT_SYS_SIGNAL 44# define __ARCH_WANT_SYS_SIGNAL
46# define __ARCH_WANT_SYS_SIGPENDING 45# define __ARCH_WANT_SYS_SIGPENDING
47# define __ARCH_WANT_SYS_SIGPROCMASK 46# define __ARCH_WANT_SYS_SIGPROCMASK
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index b574b295a2f9..8e3842fc8bea 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -512,7 +512,7 @@ gart_free_coherent(struct device *dev, size_t size, void *vaddr,
512 dma_addr_t dma_addr, struct dma_attrs *attrs) 512 dma_addr_t dma_addr, struct dma_attrs *attrs)
513{ 513{
514 gart_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL, NULL); 514 gart_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL, NULL);
515 free_pages((unsigned long)vaddr, get_order(size)); 515 dma_generic_free_coherent(dev, size, vaddr, dma_addr, attrs);
516} 516}
517 517
518static int gart_mapping_error(struct device *dev, dma_addr_t dma_addr) 518static int gart_mapping_error(struct device *dev, dma_addr_t dma_addr)
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 6cc800381d14..bb92f38153b2 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -60,8 +60,6 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex);
60 60
61#define SPINUNIT 100 /* 100ns */ 61#define SPINUNIT 100 /* 100ns */
62 62
63atomic_t mce_entry;
64
65DEFINE_PER_CPU(unsigned, mce_exception_count); 63DEFINE_PER_CPU(unsigned, mce_exception_count);
66 64
67struct mce_bank *mce_banks __read_mostly; 65struct mce_bank *mce_banks __read_mostly;
@@ -1040,8 +1038,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
1040 DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); 1038 DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
1041 char *msg = "Unknown"; 1039 char *msg = "Unknown";
1042 1040
1043 atomic_inc(&mce_entry);
1044
1045 this_cpu_inc(mce_exception_count); 1041 this_cpu_inc(mce_exception_count);
1046 1042
1047 if (!cfg->banks) 1043 if (!cfg->banks)
@@ -1171,7 +1167,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
1171 mce_report_event(regs); 1167 mce_report_event(regs);
1172 mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); 1168 mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
1173out: 1169out:
1174 atomic_dec(&mce_entry);
1175 sync_core(); 1170 sync_core();
1176} 1171}
1177EXPORT_SYMBOL_GPL(do_machine_check); 1172EXPORT_SYMBOL_GPL(do_machine_check);
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 068054f4bf20..eda1a865641e 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -172,7 +172,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
172 */ 172 */
173 load_ucode_bsp(); 173 load_ucode_bsp();
174 174
175 if (console_loglevel == 10) 175 if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG)
176 early_printk("Kernel alive\n"); 176 early_printk("Kernel alive\n");
177 177
178 clear_page(init_level4_pgt); 178 clear_page(init_level4_pgt);
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index f7d0672481fd..a25e202bb319 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -97,12 +97,17 @@ void *dma_generic_alloc_coherent(struct device *dev, size_t size,
97 97
98 dma_mask = dma_alloc_coherent_mask(dev, flag); 98 dma_mask = dma_alloc_coherent_mask(dev, flag);
99 99
100 flag |= __GFP_ZERO; 100 flag &= ~__GFP_ZERO;
101again: 101again:
102 page = NULL; 102 page = NULL;
103 /* CMA can be used only in the context which permits sleeping */ 103 /* CMA can be used only in the context which permits sleeping */
104 if (flag & __GFP_WAIT) 104 if (flag & __GFP_WAIT) {
105 page = dma_alloc_from_contiguous(dev, count, get_order(size)); 105 page = dma_alloc_from_contiguous(dev, count, get_order(size));
106 if (page && page_to_phys(page) + size > dma_mask) {
107 dma_release_from_contiguous(dev, page, count);
108 page = NULL;
109 }
110 }
106 /* fallback */ 111 /* fallback */
107 if (!page) 112 if (!page)
108 page = alloc_pages_node(dev_to_node(dev), flag, get_order(size)); 113 page = alloc_pages_node(dev_to_node(dev), flag, get_order(size));
@@ -120,7 +125,7 @@ again:
120 125
121 return NULL; 126 return NULL;
122 } 127 }
123 128 memset(page_address(page), 0, size);
124 *dma_addr = addr; 129 *dma_addr = addr;
125 return page_address(page); 130 return page_address(page);
126} 131}
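The pci-dma.c hunk reworks dma_generic_alloc_coherent(): it tries the CMA pool only when __GFP_WAIT permits sleeping, drops a CMA allocation again if it lands above the device's coherent DMA mask, falls back to alloc_pages_node(), and zeroes the buffer with an explicit memset() now that __GFP_ZERO is stripped from the flags. The sketch below reproduces just that try/validate/fall-back shape with made-up userspace stand-ins; cma_alloc_phys() and friends are hypothetical helpers, not kernel APIs.

#include <stdio.h>
#include <stdint.h>

/* Hypothetical stand-in allocators that hand back fake physical addresses. */
static uint64_t cma_alloc_phys(void)    { return 5ULL << 32; }  /* above 4GB */
static uint64_t buddy_alloc_phys(void)  { return 1ULL << 20; }  /* below 4GB */
static void     cma_release(uint64_t p) { (void)p; }

static uint64_t alloc_coherent_phys(uint64_t dma_mask, int may_sleep,
				    size_t size)
{
	uint64_t phys = 0;

	if (may_sleep) {
		phys = cma_alloc_phys();
		/* Mirror of the new mask check: a CMA buffer the device
		 * cannot reach is released and ignored. */
		if (phys && phys + size > dma_mask) {
			cma_release(phys);
			phys = 0;
		}
	}
	if (!phys)
		phys = buddy_alloc_phys();   /* fallback path */
	/* The real code zeroes the page contents at this point (memset),
	 * since __GFP_ZERO is no longer passed down. */
	return phys;
}

int main(void)
{
	uint64_t mask32 = 0xffffffffULL;     /* 32-bit-only device */

	printf("32-bit device gets phys %#llx (CMA rejected, fallback used)\n",
	       (unsigned long long)alloc_coherent_phys(mask32, 1, 4096));
	return 0;
}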
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index 6c483ba98b9c..77dd0ad58be4 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -14,7 +14,7 @@
14#include <asm/iommu_table.h> 14#include <asm/iommu_table.h>
15int swiotlb __read_mostly; 15int swiotlb __read_mostly;
16 16
17static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, 17void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
18 dma_addr_t *dma_handle, gfp_t flags, 18 dma_addr_t *dma_handle, gfp_t flags,
19 struct dma_attrs *attrs) 19 struct dma_attrs *attrs)
20{ 20{
@@ -28,11 +28,14 @@ static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
28 return swiotlb_alloc_coherent(hwdev, size, dma_handle, flags); 28 return swiotlb_alloc_coherent(hwdev, size, dma_handle, flags);
29} 29}
30 30
31static void x86_swiotlb_free_coherent(struct device *dev, size_t size, 31void x86_swiotlb_free_coherent(struct device *dev, size_t size,
32 void *vaddr, dma_addr_t dma_addr, 32 void *vaddr, dma_addr_t dma_addr,
33 struct dma_attrs *attrs) 33 struct dma_attrs *attrs)
34{ 34{
35 swiotlb_free_coherent(dev, size, vaddr, dma_addr); 35 if (is_swiotlb_buffer(dma_to_phys(dev, dma_addr)))
36 swiotlb_free_coherent(dev, size, vaddr, dma_addr);
37 else
38 dma_generic_free_coherent(dev, size, vaddr, dma_addr, attrs);
36} 39}
37 40
38static struct dma_map_ops swiotlb_dma_ops = { 41static struct dma_map_ops swiotlb_dma_ops = {
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 09c76d265550..78a0e6298922 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1119,7 +1119,7 @@ void __init setup_arch(char **cmdline_p)
1119 setup_real_mode(); 1119 setup_real_mode();
1120 1120
1121 memblock_set_current_limit(get_max_mapped()); 1121 memblock_set_current_limit(get_max_mapped());
1122 dma_contiguous_reserve(0); 1122 dma_contiguous_reserve(max_pfn_mapped << PAGE_SHIFT);
1123 1123
1124 /* 1124 /*
1125 * NOTE: On x86-32, only from this point on, fixmaps are ready for use. 1125 * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 8c9f647ff9e1..8b977ebf9388 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -58,11 +58,6 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
58{ 58{
59 return NULL; 59 return NULL;
60} 60}
61
62int pmd_huge_support(void)
63{
64 return 0;
65}
66#else 61#else
67 62
68struct page * 63struct page *
@@ -80,11 +75,6 @@ int pud_huge(pud_t pud)
80{ 75{
81 return !!(pud_val(pud) & _PAGE_PSE); 76 return !!(pud_val(pud) & _PAGE_PSE);
82} 77}
83
84int pmd_huge_support(void)
85{
86 return 1;
87}
88#endif 78#endif
89 79
90#ifdef CONFIG_HUGETLB_PAGE 80#ifdef CONFIG_HUGETLB_PAGE
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index f35c66c5959a..b92591fa8970 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1230,17 +1230,43 @@ const char *arch_vma_name(struct vm_area_struct *vma)
1230 return NULL; 1230 return NULL;
1231} 1231}
1232 1232
1233#ifdef CONFIG_X86_UV 1233static unsigned long probe_memory_block_size(void)
1234unsigned long memory_block_size_bytes(void)
1235{ 1234{
1235 /* start from 2g */
1236 unsigned long bz = 1UL<<31;
1237
1238#ifdef CONFIG_X86_UV
1236 if (is_uv_system()) { 1239 if (is_uv_system()) {
1237 printk(KERN_INFO "UV: memory block size 2GB\n"); 1240 printk(KERN_INFO "UV: memory block size 2GB\n");
1238 return 2UL * 1024 * 1024 * 1024; 1241 return 2UL * 1024 * 1024 * 1024;
1239 } 1242 }
1240 return MIN_MEMORY_BLOCK_SIZE;
1241}
1242#endif 1243#endif
1243 1244
1245 /* less than 64g installed */
1246 if ((max_pfn << PAGE_SHIFT) < (16UL << 32))
1247 return MIN_MEMORY_BLOCK_SIZE;
1248
1249 /* get the tail size */
1250 while (bz > MIN_MEMORY_BLOCK_SIZE) {
1251 if (!((max_pfn << PAGE_SHIFT) & (bz - 1)))
1252 break;
1253 bz >>= 1;
1254 }
1255
1256 printk(KERN_DEBUG "memory block size : %ldMB\n", bz >> 20);
1257
1258 return bz;
1259}
1260
1261static unsigned long memory_block_size_probed;
1262unsigned long memory_block_size_bytes(void)
1263{
1264 if (!memory_block_size_probed)
1265 memory_block_size_probed = probe_memory_block_size();
1266
1267 return memory_block_size_probed;
1268}
1269
1244#ifdef CONFIG_SPARSEMEM_VMEMMAP 1270#ifdef CONFIG_SPARSEMEM_VMEMMAP
1245/* 1271/*
1246 * Initialise the sparsemem vmemmap using huge-pages at the PMD level. 1272 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
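probe_memory_block_size() above replaces the UV-only memory_block_size_bytes() override: systems below 64GB keep MIN_MEMORY_BLOCK_SIZE, larger systems get the biggest power-of-two block, starting at 2GB, that evenly divides the end of mapped memory, and the result is cached on first use. The standalone sketch below reruns the same search for a few memory sizes; it assumes a 64-bit unsigned long and takes MIN_MEMORY_BLOCK_SIZE to be the 128MB x86-64 section size.

#include <stdio.h>

#define PAGE_SHIFT            12
#define MIN_MEMORY_BLOCK_SIZE (1UL << 27)   /* 128MB section size, assumed */

static unsigned long probe_block_size(unsigned long max_pfn)
{
	unsigned long end = max_pfn << PAGE_SHIFT;
	unsigned long bz = 1UL << 31;           /* start from 2GB */

	if (end < (16UL << 32))                 /* less than 64GB installed */
		return MIN_MEMORY_BLOCK_SIZE;

	/* Walk down until the block size evenly divides the end of memory. */
	while (bz > MIN_MEMORY_BLOCK_SIZE) {
		if (!(end & (bz - 1)))
			break;
		bz >>= 1;
	}
	return bz;
}

int main(void)
{
	/* 96GB: the end address is 2GB-aligned, so the full 2GB block wins. */
	printf("96GB -> %luMB blocks\n", probe_block_size(96UL << 18) >> 20);
	/* 65GB: only 1GB-aligned, so the loop settles on 1GB blocks.        */
	printf("65GB -> %luMB blocks\n", probe_block_size(65UL << 18) >> 20);
	/* 32GB: below the 64GB cut-off, fall back to the section size.      */
	printf("32GB -> %luMB blocks\n", probe_block_size(32UL << 18) >> 20);
	return 0;
}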
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 1d045f9c390f..a32b706c401a 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -559,7 +559,7 @@ static void __init numa_clear_kernel_node_hotplug(void)
559 int i, nid; 559 int i, nid;
560 nodemask_t numa_kernel_nodes = NODE_MASK_NONE; 560 nodemask_t numa_kernel_nodes = NODE_MASK_NONE;
561 unsigned long start, end; 561 unsigned long start, end;
562 struct memblock_type *type = &memblock.reserved; 562 struct memblock_region *r;
563 563
564 /* 564 /*
565 * At this time, all memory regions reserved by memblock are 565 * At this time, all memory regions reserved by memblock are
@@ -573,8 +573,8 @@ static void __init numa_clear_kernel_node_hotplug(void)
573 } 573 }
574 574
575 /* Mark all kernel nodes. */ 575 /* Mark all kernel nodes. */
576 for (i = 0; i < type->cnt; i++) 576 for_each_memblock(reserved, r)
577 node_set(type->regions[i].nid, numa_kernel_nodes); 577 node_set(r->nid, numa_kernel_nodes);
578 578
579 /* Clear MEMBLOCK_HOTPLUG flag for memory in kernel nodes. */ 579 /* Clear MEMBLOCK_HOTPLUG flag for memory in kernel nodes. */
580 for (i = 0; i < numa_meminfo.nr_blks; i++) { 580 for (i = 0; i < numa_meminfo.nr_blks; i++) {
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index 461bc8289024..6629f397b467 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -35,7 +35,7 @@ enum {
35 35
36static int pte_testbit(pte_t pte) 36static int pte_testbit(pte_t pte)
37{ 37{
38 return pte_flags(pte) & _PAGE_UNUSED1; 38 return pte_flags(pte) & _PAGE_SOFTW1;
39} 39}
40 40
41struct split_state { 41struct split_state {
diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c
index 9d8a509c9730..5ceda85b8687 100644
--- a/arch/x86/pci/sta2x11-fixup.c
+++ b/arch/x86/pci/sta2x11-fixup.c
@@ -173,9 +173,7 @@ static void *sta2x11_swiotlb_alloc_coherent(struct device *dev,
173{ 173{
174 void *vaddr; 174 void *vaddr;
175 175
176 vaddr = dma_generic_alloc_coherent(dev, size, dma_handle, flags, attrs); 176 vaddr = x86_swiotlb_alloc_coherent(dev, size, dma_handle, flags, attrs);
177 if (!vaddr)
178 vaddr = swiotlb_alloc_coherent(dev, size, dma_handle, flags);
179 *dma_handle = p2a(*dma_handle, to_pci_dev(dev)); 177 *dma_handle = p2a(*dma_handle, to_pci_dev(dev));
180 return vaddr; 178 return vaddr;
181} 179}
@@ -183,7 +181,7 @@ static void *sta2x11_swiotlb_alloc_coherent(struct device *dev,
183/* We have our own dma_ops: the same as swiotlb but from alloc (above) */ 181/* We have our own dma_ops: the same as swiotlb but from alloc (above) */
184static struct dma_map_ops sta2x11_dma_ops = { 182static struct dma_map_ops sta2x11_dma_ops = {
185 .alloc = sta2x11_swiotlb_alloc_coherent, 183 .alloc = sta2x11_swiotlb_alloc_coherent,
186 .free = swiotlb_free_coherent, 184 .free = x86_swiotlb_free_coherent,
187 .map_page = swiotlb_map_page, 185 .map_page = swiotlb_map_page,
188 .unmap_page = swiotlb_unmap_page, 186 .unmap_page = swiotlb_unmap_page,
189 .map_sg = swiotlb_map_sg_attrs, 187 .map_sg = swiotlb_map_sg_attrs,
diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c
index be27da60dc8f..c89c93320c12 100644
--- a/arch/x86/platform/uv/uv_nmi.c
+++ b/arch/x86/platform/uv/uv_nmi.c
@@ -85,7 +85,7 @@ static cpumask_var_t uv_nmi_cpu_mask;
85 * Default is all stack dumps go to the console and buffer. 85 * Default is all stack dumps go to the console and buffer.
86 * Lower level to send to log buffer only. 86 * Lower level to send to log buffer only.
87 */ 87 */
88static int uv_nmi_loglevel = 7; 88static int uv_nmi_loglevel = CONSOLE_LOGLEVEL_DEFAULT;
89module_param_named(dump_loglevel, uv_nmi_loglevel, int, 0644); 89module_param_named(dump_loglevel, uv_nmi_loglevel, int, 0644);
90 90
91/* 91/*
diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig
index 4b7b4522b64f..23b8726962af 100644
--- a/drivers/base/Kconfig
+++ b/drivers/base/Kconfig
@@ -258,7 +258,7 @@ endchoice
258 258
259config CMA_ALIGNMENT 259config CMA_ALIGNMENT
260 int "Maximum PAGE_SIZE order of alignment for contiguous buffers" 260 int "Maximum PAGE_SIZE order of alignment for contiguous buffers"
261 range 4 9 261 range 4 12
262 default 8 262 default 8
263 help 263 help
264 DMA mapping framework by default aligns all buffers to the smallest 264 DMA mapping framework by default aligns all buffers to the smallest
diff --git a/drivers/base/dma-contiguous.c b/drivers/base/dma-contiguous.c
index c34ec3364243..83969f8c5727 100644
--- a/drivers/base/dma-contiguous.c
+++ b/drivers/base/dma-contiguous.c
@@ -60,11 +60,22 @@ struct cma *dma_contiguous_default_area;
60 */ 60 */
61static const phys_addr_t size_bytes = CMA_SIZE_MBYTES * SZ_1M; 61static const phys_addr_t size_bytes = CMA_SIZE_MBYTES * SZ_1M;
62static phys_addr_t size_cmdline = -1; 62static phys_addr_t size_cmdline = -1;
63static phys_addr_t base_cmdline;
64static phys_addr_t limit_cmdline;
63 65
64static int __init early_cma(char *p) 66static int __init early_cma(char *p)
65{ 67{
66 pr_debug("%s(%s)\n", __func__, p); 68 pr_debug("%s(%s)\n", __func__, p);
67 size_cmdline = memparse(p, &p); 69 size_cmdline = memparse(p, &p);
70 if (*p != '@')
71 return 0;
72 base_cmdline = memparse(p + 1, &p);
73 if (*p != '-') {
74 limit_cmdline = base_cmdline + size_cmdline;
75 return 0;
76 }
77 limit_cmdline = memparse(p + 1, &p);
78
68 return 0; 79 return 0;
69} 80}
70early_param("cma", early_cma); 81early_param("cma", early_cma);
@@ -108,11 +119,18 @@ static inline __maybe_unused phys_addr_t cma_early_percent_memory(void)
108void __init dma_contiguous_reserve(phys_addr_t limit) 119void __init dma_contiguous_reserve(phys_addr_t limit)
109{ 120{
110 phys_addr_t selected_size = 0; 121 phys_addr_t selected_size = 0;
122 phys_addr_t selected_base = 0;
123 phys_addr_t selected_limit = limit;
124 bool fixed = false;
111 125
112 pr_debug("%s(limit %08lx)\n", __func__, (unsigned long)limit); 126 pr_debug("%s(limit %08lx)\n", __func__, (unsigned long)limit);
113 127
114 if (size_cmdline != -1) { 128 if (size_cmdline != -1) {
115 selected_size = size_cmdline; 129 selected_size = size_cmdline;
130 selected_base = base_cmdline;
131 selected_limit = min_not_zero(limit_cmdline, limit);
132 if (base_cmdline + size_cmdline == limit_cmdline)
133 fixed = true;
116 } else { 134 } else {
117#ifdef CONFIG_CMA_SIZE_SEL_MBYTES 135#ifdef CONFIG_CMA_SIZE_SEL_MBYTES
118 selected_size = size_bytes; 136 selected_size = size_bytes;
@@ -129,10 +147,12 @@ void __init dma_contiguous_reserve(phys_addr_t limit)
129 pr_debug("%s: reserving %ld MiB for global area\n", __func__, 147 pr_debug("%s: reserving %ld MiB for global area\n", __func__,
130 (unsigned long)selected_size / SZ_1M); 148 (unsigned long)selected_size / SZ_1M);
131 149
132 dma_contiguous_reserve_area(selected_size, 0, limit, 150 dma_contiguous_reserve_area(selected_size, selected_base,
133 &dma_contiguous_default_area); 151 selected_limit,
152 &dma_contiguous_default_area,
153 fixed);
134 } 154 }
135}; 155}
136 156
137static DEFINE_MUTEX(cma_mutex); 157static DEFINE_MUTEX(cma_mutex);
138 158
@@ -189,15 +209,20 @@ core_initcall(cma_init_reserved_areas);
189 * @base: Base address of the reserved area optional, use 0 for any 209 * @base: Base address of the reserved area optional, use 0 for any
190 * @limit: End address of the reserved memory (optional, 0 for any). 210 * @limit: End address of the reserved memory (optional, 0 for any).
191 * @res_cma: Pointer to store the created cma region. 211 * @res_cma: Pointer to store the created cma region.
212 * @fixed: hint about where to place the reserved area
192 * 213 *
193 * This function reserves memory from early allocator. It should be 214 * This function reserves memory from early allocator. It should be
194 * called by arch specific code once the early allocator (memblock or bootmem) 215 * called by arch specific code once the early allocator (memblock or bootmem)
195 * has been activated and all other subsystems have already allocated/reserved 216 * has been activated and all other subsystems have already allocated/reserved
196 * memory. This function allows to create custom reserved areas for specific 217 * memory. This function allows to create custom reserved areas for specific
197 * devices. 218 * devices.
219 *
220 * If @fixed is true, reserve contiguous area at exactly @base. If false,
221 * reserve in range from @base to @limit.
198 */ 222 */
199int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base, 223int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base,
200 phys_addr_t limit, struct cma **res_cma) 224 phys_addr_t limit, struct cma **res_cma,
225 bool fixed)
201{ 226{
202 struct cma *cma = &cma_areas[cma_area_count]; 227 struct cma *cma = &cma_areas[cma_area_count];
203 phys_addr_t alignment; 228 phys_addr_t alignment;
@@ -223,18 +248,15 @@ int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base,
223 limit &= ~(alignment - 1); 248 limit &= ~(alignment - 1);
224 249
225 /* Reserve memory */ 250 /* Reserve memory */
226 if (base) { 251 if (base && fixed) {
227 if (memblock_is_region_reserved(base, size) || 252 if (memblock_is_region_reserved(base, size) ||
228 memblock_reserve(base, size) < 0) { 253 memblock_reserve(base, size) < 0) {
229 ret = -EBUSY; 254 ret = -EBUSY;
230 goto err; 255 goto err;
231 } 256 }
232 } else { 257 } else {
233 /* 258 phys_addr_t addr = memblock_alloc_range(size, alignment, base,
234 * Use __memblock_alloc_base() since 259 limit);
235 * memblock_alloc_base() panic()s.
236 */
237 phys_addr_t addr = __memblock_alloc_base(size, alignment, limit);
238 if (!addr) { 260 if (!addr) {
239 ret = -ENOMEM; 261 ret = -ENOMEM;
240 goto err; 262 goto err;
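The early_cma() change above extends the cma= kernel parameter from a bare size to size@base-limit; when the trailing -limit is left out, the limit defaults to base plus size, and dma_contiguous_reserve() treats base + size == limit as a request for a fixed placement at exactly that base. A small userspace sketch of the parsing, with a crude strtoull-based stand-in for the kernel's memparse() (K/M/G suffixes only, purely illustrative):

#include <stdio.h>
#include <stdlib.h>

/* Crude stand-in for the kernel's memparse(): number plus optional K/M/G. */
static unsigned long long memparse_ish(const char *s, char **end)
{
	unsigned long long v = strtoull(s, end, 0);

	switch (**end) {
	case 'G': case 'g': v <<= 10; /* fall through */
	case 'M': case 'm': v <<= 10; /* fall through */
	case 'K': case 'k': v <<= 10; (*end)++; break;
	}
	return v;
}

static void parse_cma(const char *p)
{
	char *e;
	unsigned long long size, base = 0, limit = 0;
	int fixed;

	size = memparse_ish(p, &e);
	if (*e == '@') {
		base = memparse_ish(e + 1, &e);
		if (*e == '-')
			limit = memparse_ish(e + 1, &e);
		else
			limit = base + size;
	}
	fixed = (base + size == limit);   /* same test as dma_contiguous_reserve() */
	printf("%-14s size=%lluM base=%lluM limit=%lluM fixed=%d\n",
	       p, size >> 20, base >> 20, limit >> 20, fixed);
}

int main(void)
{
	parse_cma("64M");          /* size only, placement left to the kernel */
	parse_cma("64M@512M");     /* limit defaults to base + size -> fixed  */
	parse_cma("64M@512M-1G");  /* free placement inside [512M, 1G)        */
	return 0;
}

For 64M@512M the reservation must sit exactly at 512MB, while for 64M@512M-1G the allocator may place the 64MB region anywhere in that window.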
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index bece691cb5d9..89f752dd8465 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -118,16 +118,6 @@ static ssize_t show_mem_start_phys_index(struct device *dev,
118 return sprintf(buf, "%08lx\n", phys_index); 118 return sprintf(buf, "%08lx\n", phys_index);
119} 119}
120 120
121static ssize_t show_mem_end_phys_index(struct device *dev,
122 struct device_attribute *attr, char *buf)
123{
124 struct memory_block *mem = to_memory_block(dev);
125 unsigned long phys_index;
126
127 phys_index = mem->end_section_nr / sections_per_block;
128 return sprintf(buf, "%08lx\n", phys_index);
129}
130
131/* 121/*
132 * Show whether the section of memory is likely to be hot-removable 122 * Show whether the section of memory is likely to be hot-removable
133 */ 123 */
@@ -384,7 +374,6 @@ static ssize_t show_phys_device(struct device *dev,
384} 374}
385 375
386static DEVICE_ATTR(phys_index, 0444, show_mem_start_phys_index, NULL); 376static DEVICE_ATTR(phys_index, 0444, show_mem_start_phys_index, NULL);
387static DEVICE_ATTR(end_phys_index, 0444, show_mem_end_phys_index, NULL);
388static DEVICE_ATTR(state, 0644, show_mem_state, store_mem_state); 377static DEVICE_ATTR(state, 0644, show_mem_state, store_mem_state);
389static DEVICE_ATTR(phys_device, 0444, show_phys_device, NULL); 378static DEVICE_ATTR(phys_device, 0444, show_phys_device, NULL);
390static DEVICE_ATTR(removable, 0444, show_mem_removable, NULL); 379static DEVICE_ATTR(removable, 0444, show_mem_removable, NULL);
@@ -529,7 +518,6 @@ struct memory_block *find_memory_block(struct mem_section *section)
529 518
530static struct attribute *memory_memblk_attrs[] = { 519static struct attribute *memory_memblk_attrs[] = {
531 &dev_attr_phys_index.attr, 520 &dev_attr_phys_index.attr,
532 &dev_attr_end_phys_index.attr,
533 &dev_attr_state.attr, 521 &dev_attr_state.attr,
534 &dev_attr_phys_device.attr, 522 &dev_attr_phys_device.attr,
535 &dev_attr_removable.attr, 523 &dev_attr_removable.attr,
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index e73b85cf0756..c7d138eca731 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -200,11 +200,11 @@ static int copy_to_brd_setup(struct brd_device *brd, sector_t sector, size_t n)
200 200
201 copy = min_t(size_t, n, PAGE_SIZE - offset); 201 copy = min_t(size_t, n, PAGE_SIZE - offset);
202 if (!brd_insert_page(brd, sector)) 202 if (!brd_insert_page(brd, sector))
203 return -ENOMEM; 203 return -ENOSPC;
204 if (copy < n) { 204 if (copy < n) {
205 sector += copy >> SECTOR_SHIFT; 205 sector += copy >> SECTOR_SHIFT;
206 if (!brd_insert_page(brd, sector)) 206 if (!brd_insert_page(brd, sector))
207 return -ENOMEM; 207 return -ENOSPC;
208 } 208 }
209 return 0; 209 return 0;
210} 210}
@@ -360,6 +360,15 @@ out:
360 bio_endio(bio, err); 360 bio_endio(bio, err);
361} 361}
362 362
363static int brd_rw_page(struct block_device *bdev, sector_t sector,
364 struct page *page, int rw)
365{
366 struct brd_device *brd = bdev->bd_disk->private_data;
367 int err = brd_do_bvec(brd, page, PAGE_CACHE_SIZE, 0, rw, sector);
368 page_endio(page, rw & WRITE, err);
369 return err;
370}
371
363#ifdef CONFIG_BLK_DEV_XIP 372#ifdef CONFIG_BLK_DEV_XIP
364static int brd_direct_access(struct block_device *bdev, sector_t sector, 373static int brd_direct_access(struct block_device *bdev, sector_t sector,
365 void **kaddr, unsigned long *pfn) 374 void **kaddr, unsigned long *pfn)
@@ -375,7 +384,7 @@ static int brd_direct_access(struct block_device *bdev, sector_t sector,
375 return -ERANGE; 384 return -ERANGE;
376 page = brd_insert_page(brd, sector); 385 page = brd_insert_page(brd, sector);
377 if (!page) 386 if (!page)
378 return -ENOMEM; 387 return -ENOSPC;
379 *kaddr = page_address(page); 388 *kaddr = page_address(page);
380 *pfn = page_to_pfn(page); 389 *pfn = page_to_pfn(page);
381 390
@@ -419,6 +428,7 @@ static int brd_ioctl(struct block_device *bdev, fmode_t mode,
419 428
420static const struct block_device_operations brd_fops = { 429static const struct block_device_operations brd_fops = {
421 .owner = THIS_MODULE, 430 .owner = THIS_MODULE,
431 .rw_page = brd_rw_page,
422 .ioctl = brd_ioctl, 432 .ioctl = brd_ioctl,
423#ifdef CONFIG_BLK_DEV_XIP 433#ifdef CONFIG_BLK_DEV_XIP
424 .direct_access = brd_direct_access, 434 .direct_access = brd_direct_access,
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 9849b5233bf4..48eccb350180 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -572,10 +572,10 @@ static void zram_bio_discard(struct zram *zram, u32 index,
572 * skipping this logical block is appropriate here. 572 * skipping this logical block is appropriate here.
573 */ 573 */
574 if (offset) { 574 if (offset) {
575 if (n < offset) 575 if (n <= (PAGE_SIZE - offset))
576 return; 576 return;
577 577
578 n -= offset; 578 n -= (PAGE_SIZE - offset);
579 index++; 579 index++;
580 } 580 }
581 581
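The zram_bio_discard() fix above corrects how a discard that starts part-way into a page is handled: zram can only drop whole pages, so the partial first page must be skipped, and doing so consumes PAGE_SIZE - offset bytes of the request, not offset bytes, and is only possible when the request is longer than that remainder. A worked example with 4KB pages; it assumes, as the code following this hunk does, that whole pages are then freed while at least PAGE_SIZE bytes of the request remain.

#include <stdio.h>

#define PAGE_SIZE 4096UL

int main(void)
{
	unsigned long offset = 512;   /* request starts 512 bytes into page 0   */
	unsigned long n = 4608;       /* covers rest of page 0 plus 1KB of page 1 */

	/* Old code: subtracted only 'offset', so 4096 bytes were counted and
	 * the loop after this hunk would wrongly discard all of page 1. */
	unsigned long n_old = n - offset;                 /* 4096 */

	/* New code: skipping the partial first page costs PAGE_SIZE - offset
	 * bytes; if the request is no longer than that, nothing is freed. */
	unsigned long skip = PAGE_SIZE - offset;          /* 3584 */
	unsigned long n_new = (n > skip) ? n - skip : 0;  /* 1024 */

	printf("old accounting: %lu bytes -> %lu whole page(s) wrongly freed\n",
	       n_old, n_old / PAGE_SIZE);
	printf("new accounting: %lu bytes -> %lu whole page(s) freed\n",
	       n_new, n_new / PAGE_SIZE);
	return 0;
}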
diff --git a/drivers/gpu/drm/exynos/exynos_drm_g2d.c b/drivers/gpu/drm/exynos/exynos_drm_g2d.c
index 6c1885eedfdf..800158714473 100644
--- a/drivers/gpu/drm/exynos/exynos_drm_g2d.c
+++ b/drivers/gpu/drm/exynos/exynos_drm_g2d.c
@@ -467,14 +467,17 @@ static dma_addr_t *g2d_userptr_get_dma_addr(struct drm_device *drm_dev,
467 goto err_free; 467 goto err_free;
468 } 468 }
469 469
470 down_read(&current->mm->mmap_sem);
470 vma = find_vma(current->mm, userptr); 471 vma = find_vma(current->mm, userptr);
471 if (!vma) { 472 if (!vma) {
473 up_read(&current->mm->mmap_sem);
472 DRM_ERROR("failed to get vm region.\n"); 474 DRM_ERROR("failed to get vm region.\n");
473 ret = -EFAULT; 475 ret = -EFAULT;
474 goto err_free_pages; 476 goto err_free_pages;
475 } 477 }
476 478
477 if (vma->vm_end < userptr + size) { 479 if (vma->vm_end < userptr + size) {
480 up_read(&current->mm->mmap_sem);
478 DRM_ERROR("vma is too small.\n"); 481 DRM_ERROR("vma is too small.\n");
479 ret = -EFAULT; 482 ret = -EFAULT;
480 goto err_free_pages; 483 goto err_free_pages;
@@ -482,6 +485,7 @@ static dma_addr_t *g2d_userptr_get_dma_addr(struct drm_device *drm_dev,
482 485
483 g2d_userptr->vma = exynos_gem_get_vma(vma); 486 g2d_userptr->vma = exynos_gem_get_vma(vma);
484 if (!g2d_userptr->vma) { 487 if (!g2d_userptr->vma) {
488 up_read(&current->mm->mmap_sem);
485 DRM_ERROR("failed to copy vma.\n"); 489 DRM_ERROR("failed to copy vma.\n");
486 ret = -ENOMEM; 490 ret = -ENOMEM;
487 goto err_free_pages; 491 goto err_free_pages;
@@ -492,10 +496,12 @@ static dma_addr_t *g2d_userptr_get_dma_addr(struct drm_device *drm_dev,
492 ret = exynos_gem_get_pages_from_userptr(start & PAGE_MASK, 496 ret = exynos_gem_get_pages_from_userptr(start & PAGE_MASK,
493 npages, pages, vma); 497 npages, pages, vma);
494 if (ret < 0) { 498 if (ret < 0) {
499 up_read(&current->mm->mmap_sem);
495 DRM_ERROR("failed to get user pages from userptr.\n"); 500 DRM_ERROR("failed to get user pages from userptr.\n");
496 goto err_put_vma; 501 goto err_put_vma;
497 } 502 }
498 503
504 up_read(&current->mm->mmap_sem);
499 g2d_userptr->pages = pages; 505 g2d_userptr->pages = pages;
500 506
501 sgt = kzalloc(sizeof(*sgt), GFP_KERNEL); 507 sgt = kzalloc(sizeof(*sgt), GFP_KERNEL);
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index f256ffc02e29..6bb32773c3ac 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -39,6 +39,7 @@
39#include <linux/dmi.h> 39#include <linux/dmi.h>
40#include <linux/pci-ats.h> 40#include <linux/pci-ats.h>
41#include <linux/memblock.h> 41#include <linux/memblock.h>
42#include <linux/dma-contiguous.h>
42#include <asm/irq_remapping.h> 43#include <asm/irq_remapping.h>
43#include <asm/cacheflush.h> 44#include <asm/cacheflush.h>
44#include <asm/iommu.h> 45#include <asm/iommu.h>
@@ -3193,7 +3194,7 @@ static void *intel_alloc_coherent(struct device *dev, size_t size,
3193 dma_addr_t *dma_handle, gfp_t flags, 3194 dma_addr_t *dma_handle, gfp_t flags,
3194 struct dma_attrs *attrs) 3195 struct dma_attrs *attrs)
3195{ 3196{
3196 void *vaddr; 3197 struct page *page = NULL;
3197 int order; 3198 int order;
3198 3199
3199 size = PAGE_ALIGN(size); 3200 size = PAGE_ALIGN(size);
@@ -3208,17 +3209,31 @@ static void *intel_alloc_coherent(struct device *dev, size_t size,
3208 flags |= GFP_DMA32; 3209 flags |= GFP_DMA32;
3209 } 3210 }
3210 3211
3211 vaddr = (void *)__get_free_pages(flags, order); 3212 if (flags & __GFP_WAIT) {
3212 if (!vaddr) 3213 unsigned int count = size >> PAGE_SHIFT;
3214
3215 page = dma_alloc_from_contiguous(dev, count, order);
3216 if (page && iommu_no_mapping(dev) &&
3217 page_to_phys(page) + size > dev->coherent_dma_mask) {
3218 dma_release_from_contiguous(dev, page, count);
3219 page = NULL;
3220 }
3221 }
3222
3223 if (!page)
3224 page = alloc_pages(flags, order);
3225 if (!page)
3213 return NULL; 3226 return NULL;
3214 memset(vaddr, 0, size); 3227 memset(page_address(page), 0, size);
3215 3228
3216 *dma_handle = __intel_map_single(dev, virt_to_bus(vaddr), size, 3229 *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3217 DMA_BIDIRECTIONAL, 3230 DMA_BIDIRECTIONAL,
3218 dev->coherent_dma_mask); 3231 dev->coherent_dma_mask);
3219 if (*dma_handle) 3232 if (*dma_handle)
3220 return vaddr; 3233 return page_address(page);
3221 free_pages((unsigned long)vaddr, order); 3234 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3235 __free_pages(page, order);
3236
3222 return NULL; 3237 return NULL;
3223} 3238}
3224 3239
@@ -3226,12 +3241,14 @@ static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3226 dma_addr_t dma_handle, struct dma_attrs *attrs) 3241 dma_addr_t dma_handle, struct dma_attrs *attrs)
3227{ 3242{
3228 int order; 3243 int order;
3244 struct page *page = virt_to_page(vaddr);
3229 3245
3230 size = PAGE_ALIGN(size); 3246 size = PAGE_ALIGN(size);
3231 order = get_order(size); 3247 order = get_order(size);
3232 3248
3233 intel_unmap_page(dev, dma_handle, size, DMA_BIDIRECTIONAL, NULL); 3249 intel_unmap_page(dev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
3234 free_pages((unsigned long)vaddr, order); 3250 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3251 __free_pages(page, order);
3235} 3252}
3236 3253
3237static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist, 3254static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
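The intel-iommu change teaches intel_alloc_coherent() to try the device's CMA area first (only when the allocation may sleep) and to fall back to the normal page allocator, with intel_free_coherent() mirroring the choice. A sketch of that allocate/free pairing under the same assumptions (hypothetical wrappers, not the driver's functions):

    #include <linux/dma-contiguous.h>
    #include <linux/gfp.h>

    /* size must already be PAGE_ALIGN()ed; order = get_order(size) */
    static struct page *coherent_alloc(struct device *dev, size_t size,
                                       gfp_t flags, int order)
    {
            struct page *page = NULL;

            if (flags & __GFP_WAIT)         /* CMA allocation may sleep */
                    page = dma_alloc_from_contiguous(dev, size >> PAGE_SHIFT,
                                                     order);
            if (!page)
                    page = alloc_pages(flags, order);
            return page;
    }

    static void coherent_free(struct device *dev, struct page *page,
                              size_t size, int order)
    {
            /* returns false if the pages did not come from the CMA area */
            if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
                    __free_pages(page, order);
    }
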
diff --git a/drivers/nubus/nubus.c b/drivers/nubus/nubus.c
index 43926cd25ae8..5066a7ef7b6c 100644
--- a/drivers/nubus/nubus.c
+++ b/drivers/nubus/nubus.c
@@ -473,7 +473,7 @@ static struct nubus_dev* __init
473 if (slot == 0 && (unsigned long)dir.base % 2) 473 if (slot == 0 && (unsigned long)dir.base % 2)
474 dir.base += 1; 474 dir.base += 1;
475 475
476 if (console_loglevel >= 10) 476 if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG)
477 printk(KERN_DEBUG "nubus_get_functional_resource: parent is 0x%p, dir is 0x%p\n", 477 printk(KERN_DEBUG "nubus_get_functional_resource: parent is 0x%p, dir is 0x%p\n",
478 parent->base, dir.base); 478 parent->base, dir.base);
479 479
@@ -568,7 +568,7 @@ static int __init nubus_get_vidnames(struct nubus_board* board,
568 568
569 printk(KERN_INFO " video modes supported:\n"); 569 printk(KERN_INFO " video modes supported:\n");
570 nubus_get_subdir(parent, &dir); 570 nubus_get_subdir(parent, &dir);
571 if (console_loglevel >= 10) 571 if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG)
572 printk(KERN_DEBUG "nubus_get_vidnames: parent is 0x%p, dir is 0x%p\n", 572 printk(KERN_DEBUG "nubus_get_vidnames: parent is 0x%p, dir is 0x%p\n",
573 parent->base, dir.base); 573 parent->base, dir.base);
574 574
@@ -629,7 +629,7 @@ static int __init nubus_get_vendorinfo(struct nubus_board* board,
629 629
630 printk(KERN_INFO " vendor info:\n"); 630 printk(KERN_INFO " vendor info:\n");
631 nubus_get_subdir(parent, &dir); 631 nubus_get_subdir(parent, &dir);
632 if (console_loglevel >= 10) 632 if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG)
633 printk(KERN_DEBUG "nubus_get_vendorinfo: parent is 0x%p, dir is 0x%p\n", 633 printk(KERN_DEBUG "nubus_get_vendorinfo: parent is 0x%p, dir is 0x%p\n",
634 parent->base, dir.base); 634 parent->base, dir.base);
635 635
@@ -654,7 +654,7 @@ static int __init nubus_get_board_resource(struct nubus_board* board, int slot,
654 struct nubus_dirent ent; 654 struct nubus_dirent ent;
655 655
656 nubus_get_subdir(parent, &dir); 656 nubus_get_subdir(parent, &dir);
657 if (console_loglevel >= 10) 657 if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG)
658 printk(KERN_DEBUG "nubus_get_board_resource: parent is 0x%p, dir is 0x%p\n", 658 printk(KERN_DEBUG "nubus_get_board_resource: parent is 0x%p, dir is 0x%p\n",
659 parent->base, dir.base); 659 parent->base, dir.base);
660 660
@@ -753,19 +753,19 @@ static void __init nubus_find_rom_dir(struct nubus_board* board)
753 if (nubus_readdir(&dir, &ent) == -1) 753 if (nubus_readdir(&dir, &ent) == -1)
754 goto badrom; 754 goto badrom;
755 755
756 if (console_loglevel >= 10) 756 if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG)
757 printk(KERN_INFO "nubus_get_rom_dir: entry %02x %06x\n", ent.type, ent.data); 757 printk(KERN_INFO "nubus_get_rom_dir: entry %02x %06x\n", ent.type, ent.data);
758 /* This one takes us to where we want to go. */ 758 /* This one takes us to where we want to go. */
759 if (nubus_readdir(&dir, &ent) == -1) 759 if (nubus_readdir(&dir, &ent) == -1)
760 goto badrom; 760 goto badrom;
761 if (console_loglevel >= 10) 761 if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG)
762 printk(KERN_DEBUG "nubus_get_rom_dir: entry %02x %06x\n", ent.type, ent.data); 762 printk(KERN_DEBUG "nubus_get_rom_dir: entry %02x %06x\n", ent.type, ent.data);
763 nubus_get_subdir(&ent, &dir); 763 nubus_get_subdir(&ent, &dir);
764 764
765 /* Resource ID 01, also an "Unknown Macintosh" */ 765 /* Resource ID 01, also an "Unknown Macintosh" */
766 if (nubus_readdir(&dir, &ent) == -1) 766 if (nubus_readdir(&dir, &ent) == -1)
767 goto badrom; 767 goto badrom;
768 if (console_loglevel >= 10) 768 if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG)
769 printk(KERN_DEBUG "nubus_get_rom_dir: entry %02x %06x\n", ent.type, ent.data); 769 printk(KERN_DEBUG "nubus_get_rom_dir: entry %02x %06x\n", ent.type, ent.data);
770 770
771 /* FIXME: the first one is *not* always the right one. We 771 /* FIXME: the first one is *not* always the right one. We
@@ -780,7 +780,7 @@ static void __init nubus_find_rom_dir(struct nubus_board* board)
780 path to that address... */ 780 path to that address... */
781 if (nubus_readdir(&dir, &ent) == -1) 781 if (nubus_readdir(&dir, &ent) == -1)
782 goto badrom; 782 goto badrom;
783 if (console_loglevel >= 10) 783 if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG)
784 printk(KERN_DEBUG "nubus_get_rom_dir: entry %02x %06x\n", ent.type, ent.data); 784 printk(KERN_DEBUG "nubus_get_rom_dir: entry %02x %06x\n", ent.type, ent.data);
785 785
786 /* Bwahahahaha... */ 786 /* Bwahahahaha... */
@@ -816,7 +816,7 @@ static struct nubus_board* __init nubus_add_board(int slot, int bytelanes)
816 board->fblock = rp; 816 board->fblock = rp;
817 817
818 /* Dump the format block for debugging purposes */ 818 /* Dump the format block for debugging purposes */
819 if (console_loglevel >= 10) { 819 if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG) {
820 int i; 820 int i;
821 printk(KERN_DEBUG "Slot %X, format block at 0x%p\n", 821 printk(KERN_DEBUG "Slot %X, format block at 0x%p\n",
822 slot, rp); 822 slot, rp);
diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
index ce396ecdf412..b767a64e49d9 100644
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -88,7 +88,7 @@ static void sysrq_handle_loglevel(int key)
88 int i; 88 int i;
89 89
90 i = key - '0'; 90 i = key - '0';
91 console_loglevel = 7; 91 console_loglevel = CONSOLE_LOGLEVEL_DEFAULT;
92 printk("Loglevel set to %d\n", i); 92 printk("Loglevel set to %d\n", i);
93 console_loglevel = i; 93 console_loglevel = i;
94} 94}
@@ -343,7 +343,7 @@ static void send_sig_all(int sig)
343static void sysrq_handle_term(int key) 343static void sysrq_handle_term(int key)
344{ 344{
345 send_sig_all(SIGTERM); 345 send_sig_all(SIGTERM);
346 console_loglevel = 8; 346 console_loglevel = CONSOLE_LOGLEVEL_DEBUG;
347} 347}
348static struct sysrq_key_op sysrq_term_op = { 348static struct sysrq_key_op sysrq_term_op = {
349 .handler = sysrq_handle_term, 349 .handler = sysrq_handle_term,
@@ -387,7 +387,7 @@ static struct sysrq_key_op sysrq_thaw_op = {
387static void sysrq_handle_kill(int key) 387static void sysrq_handle_kill(int key)
388{ 388{
389 send_sig_all(SIGKILL); 389 send_sig_all(SIGKILL);
390 console_loglevel = 8; 390 console_loglevel = CONSOLE_LOGLEVEL_DEBUG;
391} 391}
392static struct sysrq_key_op sysrq_kill_op = { 392static struct sysrq_key_op sysrq_kill_op = {
393 .handler = sysrq_handle_kill, 393 .handler = sysrq_handle_kill,
@@ -520,7 +520,7 @@ void __handle_sysrq(int key, bool check_mask)
520 * routing in the consumers of /proc/kmsg. 520 * routing in the consumers of /proc/kmsg.
521 */ 521 */
522 orig_log_level = console_loglevel; 522 orig_log_level = console_loglevel;
523 console_loglevel = 7; 523 console_loglevel = CONSOLE_LOGLEVEL_DEFAULT;
524 printk(KERN_INFO "SysRq : "); 524 printk(KERN_INFO "SysRq : ");
525 525
526 op_p = __sysrq_get_key_op(key); 526 op_p = __sysrq_get_key_op(key);
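The nubus and sysrq hunks above are mechanical substitutions of named console loglevel constants for the old magic numbers. For reference, the values assumed here match the definitions this series adds to include/linux/printk.h; note that sysrq's former level 8 is rounded up to the debug level:

    #define CONSOLE_LOGLEVEL_DEFAULT 7    /* what SysRq restores afterwards */
    #define CONSOLE_LOGLEVEL_DEBUG  10    /* lets KERN_DEBUG reach the console */
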
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 14da82564f4e..6894b085f0ee 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -537,7 +537,7 @@ static struct attribute_group v9fs_attr_group = {
537 * 537 *
538 */ 538 */
539 539
540static int v9fs_sysfs_init(void) 540static int __init v9fs_sysfs_init(void)
541{ 541{
542 v9fs_kobj = kobject_create_and_add("9p", fs_kobj); 542 v9fs_kobj = kobject_create_and_add("9p", fs_kobj);
543 if (!v9fs_kobj) 543 if (!v9fs_kobj)
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 4d0c2e0be7e5..0b3bfa303dda 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -42,7 +42,6 @@
42 42
43/** 43/**
44 * struct p9_rdir - readdir accounting 44 * struct p9_rdir - readdir accounting
45 * @mutex: mutex protecting readdir
46 * @head: start offset of current dirread buffer 45 * @head: start offset of current dirread buffer
47 * @tail: end offset of current dirread buffer 46 * @tail: end offset of current dirread buffer
48 * @buf: dirread buffer 47 * @buf: dirread buffer
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 59e3fe3d56c0..96e550760699 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -681,7 +681,7 @@ v9fs_direct_read(struct file *filp, char __user *udata, size_t count,
681/** 681/**
682 * v9fs_cached_file_read - read from a file 682 * v9fs_cached_file_read - read from a file
683 * @filp: file pointer to read 683 * @filp: file pointer to read
684 * @udata: user data buffer to read data into 684 * @data: user data buffer to read data into
685 * @count: size of buffer 685 * @count: size of buffer
686 * @offset: offset at which to read data 686 * @offset: offset at which to read data
687 * 687 *
@@ -698,7 +698,7 @@ v9fs_cached_file_read(struct file *filp, char __user *data, size_t count,
698/** 698/**
699 * v9fs_mmap_file_read - read from a file 699 * v9fs_mmap_file_read - read from a file
700 * @filp: file pointer to read 700 * @filp: file pointer to read
701 * @udata: user data buffer to read data into 701 * @data: user data buffer to read data into
702 * @count: size of buffer 702 * @count: size of buffer
703 * @offset: offset at which to read data 703 * @offset: offset at which to read data
704 * 704 *
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 53161ec058a7..00d140fb2263 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -580,7 +580,7 @@ static int v9fs_at_to_dotl_flags(int flags)
580 * v9fs_remove - helper function to remove files and directories 580 * v9fs_remove - helper function to remove files and directories
581 * @dir: directory inode that is being deleted 581 * @dir: directory inode that is being deleted
582 * @dentry: dentry that is being deleted 582 * @dentry: dentry that is being deleted
583 * @rmdir: removing a directory 583 * @flags: removing a directory
584 * 584 *
585 */ 585 */
586 586
@@ -778,7 +778,7 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
778 * v9fs_vfs_lookup - VFS lookup hook to "walk" to a new inode 778 * v9fs_vfs_lookup - VFS lookup hook to "walk" to a new inode
779 * @dir: inode that is being walked from 779 * @dir: inode that is being walked from
780 * @dentry: dentry that is being walked to? 780 * @dentry: dentry that is being walked to?
781 * @nameidata: path data 781 * @flags: lookup flags (unused)
782 * 782 *
783 */ 783 */
784 784
@@ -1324,7 +1324,7 @@ v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
1324 * v9fs_vfs_mkspecial - create a special file 1324 * v9fs_vfs_mkspecial - create a special file
1325 * @dir: inode to create special file in 1325 * @dir: inode to create special file in
1326 * @dentry: dentry to create 1326 * @dentry: dentry to create
1327 * @mode: mode to create special file 1327 * @perm: mode to create special file
1328 * @extension: 9p2000.u format extension string representing special file 1328 * @extension: 9p2000.u format extension string representing special file
1329 * 1329 *
1330 */ 1330 */
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 59dc8e87647f..1fa85aae24df 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -226,7 +226,7 @@ int v9fs_open_to_dotl_flags(int flags)
226 * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol. 226 * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol.
227 * @dir: directory inode that is being created 227 * @dir: directory inode that is being created
228 * @dentry: dentry that is being deleted 228 * @dentry: dentry that is being deleted
229 * @mode: create permissions 229 * @omode: create permissions
230 * 230 *
231 */ 231 */
232 232
@@ -375,7 +375,7 @@ err_clunk_old_fid:
375 * v9fs_vfs_mkdir_dotl - VFS mkdir hook to create a directory 375 * v9fs_vfs_mkdir_dotl - VFS mkdir hook to create a directory
376 * @dir: inode that is being unlinked 376 * @dir: inode that is being unlinked
377 * @dentry: dentry that is being unlinked 377 * @dentry: dentry that is being unlinked
378 * @mode: mode for new directory 378 * @omode: mode for new directory
379 * 379 *
380 */ 380 */
381 381
@@ -607,7 +607,6 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
607 * v9fs_stat2inode_dotl - populate an inode structure with stat info 607 * v9fs_stat2inode_dotl - populate an inode structure with stat info
608 * @stat: stat structure 608 * @stat: stat structure
609 * @inode: inode to populate 609 * @inode: inode to populate
610 * @sb: superblock of filesystem
611 * 610 *
612 */ 611 */
613 612
@@ -808,7 +807,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
808 * v9fs_vfs_mknod_dotl - create a special file 807 * v9fs_vfs_mknod_dotl - create a special file
809 * @dir: inode destination for new link 808 * @dir: inode destination for new link
810 * @dentry: dentry for file 809 * @dentry: dentry for file
811 * @mode: mode for creation 810 * @omode: mode for creation
812 * @rdev: device associated with special file 811 * @rdev: device associated with special file
813 * 812 *
814 */ 813 */
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 232e03d4780d..5b570b6efa28 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -737,7 +737,7 @@ MODULE_ALIAS_MISCDEV(AUTOFS_MINOR);
737MODULE_ALIAS("devname:autofs"); 737MODULE_ALIAS("devname:autofs");
738 738
739/* Register/deregister misc character device */ 739/* Register/deregister misc character device */
740int autofs_dev_ioctl_init(void) 740int __init autofs_dev_ioctl_init(void)
741{ 741{
742 int r; 742 int r;
743 743
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index aa3cb626671e..dabc73ab900f 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1686,7 +1686,7 @@ static size_t get_note_info_size(struct elf_note_info *info)
1686static int write_note_info(struct elf_note_info *info, 1686static int write_note_info(struct elf_note_info *info,
1687 struct coredump_params *cprm) 1687 struct coredump_params *cprm)
1688{ 1688{
1689 bool first = 1; 1689 bool first = true;
1690 struct elf_thread_core_info *t = info->thread; 1690 struct elf_thread_core_info *t = info->thread;
1691 1691
1692 do { 1692 do {
@@ -1710,7 +1710,7 @@ static int write_note_info(struct elf_note_info *info,
1710 !writenote(&t->notes[i], cprm)) 1710 !writenote(&t->notes[i], cprm))
1711 return 0; 1711 return 0;
1712 1712
1713 first = 0; 1713 first = false;
1714 t = t->next; 1714 t = t->next;
1715 } while (t); 1715 } while (t);
1716 1716
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index d50bbe59da1e..f723cd3a455c 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -380,7 +380,7 @@ failed:
380 380
381/****************************************************************************/ 381/****************************************************************************/
382 382
383void old_reloc(unsigned long rl) 383static void old_reloc(unsigned long rl)
384{ 384{
385#ifdef DEBUG 385#ifdef DEBUG
386 char *segment[] = { "TEXT", "DATA", "BSS", "*UNKNOWN*" }; 386 char *segment[] = { "TEXT", "DATA", "BSS", "*UNKNOWN*" };
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 552a8d13bc32..83fba15cc394 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -363,6 +363,69 @@ int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
363} 363}
364EXPORT_SYMBOL(blkdev_fsync); 364EXPORT_SYMBOL(blkdev_fsync);
365 365
366/**
367 * bdev_read_page() - Start reading a page from a block device
368 * @bdev: The device to read the page from
369 * @sector: The offset on the device to read the page to (need not be aligned)
370 * @page: The page to read
371 *
372 * On entry, the page should be locked. It will be unlocked when the page
373 * has been read. If the block driver implements rw_page synchronously,
374 * that will be true on exit from this function, but it need not be.
375 *
376 * Errors returned by this function are usually "soft", eg out of memory, or
377 * queue full; callers should try a different route to read this page rather
378 * than propagate an error back up the stack.
379 *
380 * Return: negative errno if an error occurs, 0 if submission was successful.
381 */
382int bdev_read_page(struct block_device *bdev, sector_t sector,
383 struct page *page)
384{
385 const struct block_device_operations *ops = bdev->bd_disk->fops;
386 if (!ops->rw_page)
387 return -EOPNOTSUPP;
388 return ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ);
389}
390EXPORT_SYMBOL_GPL(bdev_read_page);
391
392/**
393 * bdev_write_page() - Start writing a page to a block device
394 * @bdev: The device to write the page to
395 * @sector: The offset on the device to write the page to (need not be aligned)
396 * @page: The page to write
397 * @wbc: The writeback_control for the write
398 *
399 * On entry, the page should be locked and not currently under writeback.
400 * On exit, if the write started successfully, the page will be unlocked and
401 * under writeback. If the write failed already (eg the driver failed to
402 * queue the page to the device), the page will still be locked. If the
403 * caller is a ->writepage implementation, it will need to unlock the page.
404 *
405 * Errors returned by this function are usually "soft", eg out of memory, or
406 * queue full; callers should try a different route to write this page rather
407 * than propagate an error back up the stack.
408 *
409 * Return: negative errno if an error occurs, 0 if submission was successful.
410 */
411int bdev_write_page(struct block_device *bdev, sector_t sector,
412 struct page *page, struct writeback_control *wbc)
413{
414 int result;
415 int rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE;
416 const struct block_device_operations *ops = bdev->bd_disk->fops;
417 if (!ops->rw_page)
418 return -EOPNOTSUPP;
419 set_page_writeback(page);
420 result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, rw);
421 if (result)
422 end_page_writeback(page);
423 else
424 unlock_page(page);
425 return result;
426}
427EXPORT_SYMBOL_GPL(bdev_write_page);
428
366/* 429/*
367 * pseudo-fs 430 * pseudo-fs
368 */ 431 */
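bdev_read_page() and bdev_write_page(), added above, let a caller hand a single page straight to a driver that implements ->rw_page, bypassing bio allocation. A minimal sketch of a caller, assuming a linear page-index-to-sector mapping (illustration only, not taken from this series):

    #include <linux/blkdev.h>
    #include <linux/pagemap.h>

    /* Read one locked page from a block device at byte offset index * PAGE_SIZE. */
    static int example_read_one_page(struct block_device *bdev, struct page *page)
    {
            sector_t sector = (sector_t)page->index << (PAGE_CACHE_SHIFT - 9);
            int ret;

            ret = bdev_read_page(bdev, sector, page);
            if (ret == -EOPNOTSUPP)
                    return ret;     /* no ->rw_page: caller must submit a bio */

            /* 0 means the read was submitted; the page is unlocked when done */
            return ret;
    }
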
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index f29a54e454d4..4cd0ac983f91 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4510,7 +4510,8 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)
4510 spin_unlock(&eb->refs_lock); 4510 spin_unlock(&eb->refs_lock);
4511} 4511}
4512 4512
4513static void mark_extent_buffer_accessed(struct extent_buffer *eb) 4513static void mark_extent_buffer_accessed(struct extent_buffer *eb,
4514 struct page *accessed)
4514{ 4515{
4515 unsigned long num_pages, i; 4516 unsigned long num_pages, i;
4516 4517
@@ -4519,7 +4520,8 @@ static void mark_extent_buffer_accessed(struct extent_buffer *eb)
4519 num_pages = num_extent_pages(eb->start, eb->len); 4520 num_pages = num_extent_pages(eb->start, eb->len);
4520 for (i = 0; i < num_pages; i++) { 4521 for (i = 0; i < num_pages; i++) {
4521 struct page *p = extent_buffer_page(eb, i); 4522 struct page *p = extent_buffer_page(eb, i);
4522 mark_page_accessed(p); 4523 if (p != accessed)
4524 mark_page_accessed(p);
4523 } 4525 }
4524} 4526}
4525 4527
@@ -4533,7 +4535,7 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
4533 start >> PAGE_CACHE_SHIFT); 4535 start >> PAGE_CACHE_SHIFT);
4534 if (eb && atomic_inc_not_zero(&eb->refs)) { 4536 if (eb && atomic_inc_not_zero(&eb->refs)) {
4535 rcu_read_unlock(); 4537 rcu_read_unlock();
4536 mark_extent_buffer_accessed(eb); 4538 mark_extent_buffer_accessed(eb, NULL);
4537 return eb; 4539 return eb;
4538 } 4540 }
4539 rcu_read_unlock(); 4541 rcu_read_unlock();
@@ -4581,7 +4583,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
4581 spin_unlock(&mapping->private_lock); 4583 spin_unlock(&mapping->private_lock);
4582 unlock_page(p); 4584 unlock_page(p);
4583 page_cache_release(p); 4585 page_cache_release(p);
4584 mark_extent_buffer_accessed(exists); 4586 mark_extent_buffer_accessed(exists, p);
4585 goto free_eb; 4587 goto free_eb;
4586 } 4588 }
4587 4589
@@ -4596,7 +4598,6 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
4596 attach_extent_buffer_page(eb, p); 4598 attach_extent_buffer_page(eb, p);
4597 spin_unlock(&mapping->private_lock); 4599 spin_unlock(&mapping->private_lock);
4598 WARN_ON(PageDirty(p)); 4600 WARN_ON(PageDirty(p));
4599 mark_page_accessed(p);
4600 eb->pages[i] = p; 4601 eb->pages[i] = p;
4601 if (!PageUptodate(p)) 4602 if (!PageUptodate(p))
4602 uptodate = 0; 4603 uptodate = 0;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index ae6af072b635..74272a3f9d9b 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -470,11 +470,12 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages)
470 for (i = 0; i < num_pages; i++) { 470 for (i = 0; i < num_pages; i++) {
471 /* page checked is some magic around finding pages that 471 /* page checked is some magic around finding pages that
472 * have been modified without going through btrfs_set_page_dirty 472 * have been modified without going through btrfs_set_page_dirty
473 * clear it here 473 * clear it here. There should be no need to mark the pages
474 * accessed as prepare_pages should have marked them accessed
475 * in prepare_pages via find_or_create_page()
474 */ 476 */
475 ClearPageChecked(pages[i]); 477 ClearPageChecked(pages[i]);
476 unlock_page(pages[i]); 478 unlock_page(pages[i]);
477 mark_page_accessed(pages[i]);
478 page_cache_release(pages[i]); 479 page_cache_release(pages[i]);
479 } 480 }
480} 481}
diff --git a/fs/buffer.c b/fs/buffer.c
index 6a8110c03a47..eba6e4f621ce 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -227,7 +227,7 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
227 int all_mapped = 1; 227 int all_mapped = 1;
228 228
229 index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits); 229 index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
230 page = find_get_page(bd_mapping, index); 230 page = find_get_page_flags(bd_mapping, index, FGP_ACCESSED);
231 if (!page) 231 if (!page)
232 goto out; 232 goto out;
233 233
@@ -1366,12 +1366,13 @@ __find_get_block(struct block_device *bdev, sector_t block, unsigned size)
1366 struct buffer_head *bh = lookup_bh_lru(bdev, block, size); 1366 struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1367 1367
1368 if (bh == NULL) { 1368 if (bh == NULL) {
1369 /* __find_get_block_slow will mark the page accessed */
1369 bh = __find_get_block_slow(bdev, block); 1370 bh = __find_get_block_slow(bdev, block);
1370 if (bh) 1371 if (bh)
1371 bh_lru_install(bh); 1372 bh_lru_install(bh);
1372 } 1373 } else
1373 if (bh)
1374 touch_buffer(bh); 1374 touch_buffer(bh);
1375
1375 return bh; 1376 return bh;
1376} 1377}
1377EXPORT_SYMBOL(__find_get_block); 1378EXPORT_SYMBOL(__find_get_block);
@@ -1483,16 +1484,27 @@ EXPORT_SYMBOL(set_bh_page);
1483/* 1484/*
1484 * Called when truncating a buffer on a page completely. 1485 * Called when truncating a buffer on a page completely.
1485 */ 1486 */
1487
1488/* Bits that are cleared during an invalidate */
1489#define BUFFER_FLAGS_DISCARD \
1490 (1 << BH_Mapped | 1 << BH_New | 1 << BH_Req | \
1491 1 << BH_Delay | 1 << BH_Unwritten)
1492
1486static void discard_buffer(struct buffer_head * bh) 1493static void discard_buffer(struct buffer_head * bh)
1487{ 1494{
1495 unsigned long b_state, b_state_old;
1496
1488 lock_buffer(bh); 1497 lock_buffer(bh);
1489 clear_buffer_dirty(bh); 1498 clear_buffer_dirty(bh);
1490 bh->b_bdev = NULL; 1499 bh->b_bdev = NULL;
1491 clear_buffer_mapped(bh); 1500 b_state = bh->b_state;
1492 clear_buffer_req(bh); 1501 for (;;) {
1493 clear_buffer_new(bh); 1502 b_state_old = cmpxchg(&bh->b_state, b_state,
1494 clear_buffer_delay(bh); 1503 (b_state & ~BUFFER_FLAGS_DISCARD));
1495 clear_buffer_unwritten(bh); 1504 if (b_state_old == b_state)
1505 break;
1506 b_state = b_state_old;
1507 }
1496 unlock_buffer(bh); 1508 unlock_buffer(bh);
1497} 1509}
1498 1510
@@ -2879,10 +2891,9 @@ EXPORT_SYMBOL(block_truncate_page);
2879 2891
2880/* 2892/*
2881 * The generic ->writepage function for buffer-backed address_spaces 2893 * The generic ->writepage function for buffer-backed address_spaces
2882 * this form passes in the end_io handler used to finish the IO.
2883 */ 2894 */
2884int block_write_full_page_endio(struct page *page, get_block_t *get_block, 2895int block_write_full_page(struct page *page, get_block_t *get_block,
2885 struct writeback_control *wbc, bh_end_io_t *handler) 2896 struct writeback_control *wbc)
2886{ 2897{
2887 struct inode * const inode = page->mapping->host; 2898 struct inode * const inode = page->mapping->host;
2888 loff_t i_size = i_size_read(inode); 2899 loff_t i_size = i_size_read(inode);
@@ -2892,7 +2903,7 @@ int block_write_full_page_endio(struct page *page, get_block_t *get_block,
2892 /* Is the page fully inside i_size? */ 2903 /* Is the page fully inside i_size? */
2893 if (page->index < end_index) 2904 if (page->index < end_index)
2894 return __block_write_full_page(inode, page, get_block, wbc, 2905 return __block_write_full_page(inode, page, get_block, wbc,
2895 handler); 2906 end_buffer_async_write);
2896 2907
2897 /* Is the page fully outside i_size? (truncate in progress) */ 2908 /* Is the page fully outside i_size? (truncate in progress) */
2898 offset = i_size & (PAGE_CACHE_SIZE-1); 2909 offset = i_size & (PAGE_CACHE_SIZE-1);
@@ -2915,18 +2926,8 @@ int block_write_full_page_endio(struct page *page, get_block_t *get_block,
2915 * writes to that region are not written out to the file." 2926 * writes to that region are not written out to the file."
2916 */ 2927 */
2917 zero_user_segment(page, offset, PAGE_CACHE_SIZE); 2928 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2918 return __block_write_full_page(inode, page, get_block, wbc, handler); 2929 return __block_write_full_page(inode, page, get_block, wbc,
2919} 2930 end_buffer_async_write);
2920EXPORT_SYMBOL(block_write_full_page_endio);
2921
2922/*
2923 * The generic ->writepage function for buffer-backed address_spaces
2924 */
2925int block_write_full_page(struct page *page, get_block_t *get_block,
2926 struct writeback_control *wbc)
2927{
2928 return block_write_full_page_endio(page, get_block, wbc,
2929 end_buffer_async_write);
2930} 2931}
2931EXPORT_SYMBOL(block_write_full_page); 2932EXPORT_SYMBOL(block_write_full_page);
2932 2933
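The new discard_buffer() clears several BH_* bits with a single cmpxchg() retry loop instead of five separate clear_bit() calls, so concurrent updates to other bits in b_state are never lost. A generic sketch of that read-modify-write pattern (hypothetical helper, not the buffer-layer code):

    /* Atomically clear the bits in mask without disturbing concurrent setters. */
    static void clear_flags_atomically(unsigned long *flags, unsigned long mask)
    {
            unsigned long old = *flags, prev;

            for (;;) {
                    prev = cmpxchg(flags, old, old & ~mask);
                    if (prev == old)        /* nobody raced with us */
                            break;
                    old = prev;             /* retry against the fresh value */
            }
    }
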
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index b5f0a3b91f18..bd4a3c167091 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -24,6 +24,12 @@
24 * configfs Copyright (C) 2005 Oracle. All rights reserved. 24 * configfs Copyright (C) 2005 Oracle. All rights reserved.
25 */ 25 */
26 26
27#ifdef pr_fmt
28#undef pr_fmt
29#endif
30
31#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
32
27#include <linux/slab.h> 33#include <linux/slab.h>
28#include <linux/list.h> 34#include <linux/list.h>
29#include <linux/spinlock.h> 35#include <linux/spinlock.h>
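Defining pr_fmt() at the top of configfs_internal.h is what lets the conversions below drop their hand-written "configfs: " prefixes: every pr_err()/pr_info()/pr_debug() in a file that includes this header gets the module name prepended automatically. A small illustration of the effect (KBUILD_MODNAME is supplied by kbuild; "configfs" here is an assumption):

    #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt  /* must come before the includes */
    #include <linux/printk.h>

    static void example_log(void)
    {
            /* prints "configfs: Tried to unregister non-subsystem!" */
            pr_err("Tried to unregister non-subsystem!\n");
    }
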
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index e081acbac2e7..668dcabc5695 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -940,9 +940,9 @@ static void client_drop_item(struct config_item *parent_item,
940#ifdef DEBUG 940#ifdef DEBUG
941static void configfs_dump_one(struct configfs_dirent *sd, int level) 941static void configfs_dump_one(struct configfs_dirent *sd, int level)
942{ 942{
943 printk(KERN_INFO "%*s\"%s\":\n", level, " ", configfs_get_name(sd)); 943 pr_info("%*s\"%s\":\n", level, " ", configfs_get_name(sd));
944 944
945#define type_print(_type) if (sd->s_type & _type) printk(KERN_INFO "%*s %s\n", level, " ", #_type); 945#define type_print(_type) if (sd->s_type & _type) pr_info("%*s %s\n", level, " ", #_type);
946 type_print(CONFIGFS_ROOT); 946 type_print(CONFIGFS_ROOT);
947 type_print(CONFIGFS_DIR); 947 type_print(CONFIGFS_DIR);
948 type_print(CONFIGFS_ITEM_ATTR); 948 type_print(CONFIGFS_ITEM_ATTR);
@@ -1699,7 +1699,7 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys)
1699 struct dentry *root = dentry->d_sb->s_root; 1699 struct dentry *root = dentry->d_sb->s_root;
1700 1700
1701 if (dentry->d_parent != root) { 1701 if (dentry->d_parent != root) {
1702 printk(KERN_ERR "configfs: Tried to unregister non-subsystem!\n"); 1702 pr_err("Tried to unregister non-subsystem!\n");
1703 return; 1703 return;
1704 } 1704 }
1705 1705
@@ -1709,7 +1709,7 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys)
1709 mutex_lock(&configfs_symlink_mutex); 1709 mutex_lock(&configfs_symlink_mutex);
1710 spin_lock(&configfs_dirent_lock); 1710 spin_lock(&configfs_dirent_lock);
1711 if (configfs_detach_prep(dentry, NULL)) { 1711 if (configfs_detach_prep(dentry, NULL)) {
1712 printk(KERN_ERR "configfs: Tried to unregister non-empty subsystem!\n"); 1712 pr_err("Tried to unregister non-empty subsystem!\n");
1713 } 1713 }
1714 spin_unlock(&configfs_dirent_lock); 1714 spin_unlock(&configfs_dirent_lock);
1715 mutex_unlock(&configfs_symlink_mutex); 1715 mutex_unlock(&configfs_symlink_mutex);
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index a9d35b0e06cf..5946ad98053f 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -168,9 +168,8 @@ static void configfs_set_inode_lock_class(struct configfs_dirent *sd,
168 * In practice the maximum level of locking depth is 168 * In practice the maximum level of locking depth is
169 * already reached. Just inform about possible reasons. 169 * already reached. Just inform about possible reasons.
170 */ 170 */
171 printk(KERN_INFO "configfs: Too many levels of inodes" 171 pr_info("Too many levels of inodes for the locking correctness validator.\n");
172 " for the locking correctness validator.\n"); 172 pr_info("Spurious warnings may appear.\n");
173 printk(KERN_INFO "Spurious warnings may appear.\n");
174 } 173 }
175 } 174 }
176} 175}
diff --git a/fs/configfs/item.c b/fs/configfs/item.c
index 50cee7f9110b..e65f9ffbb999 100644
--- a/fs/configfs/item.c
+++ b/fs/configfs/item.c
@@ -19,7 +19,7 @@
19 * Boston, MA 021110-1307, USA. 19 * Boston, MA 021110-1307, USA.
20 * 20 *
21 * Based on kobject: 21 * Based on kobject:
22 * kobject is Copyright (c) 2002-2003 Patrick Mochel 22 * kobject is Copyright (c) 2002-2003 Patrick Mochel
23 * 23 *
24 * configfs Copyright (C) 2005 Oracle. All rights reserved. 24 * configfs Copyright (C) 2005 Oracle. All rights reserved.
25 * 25 *
@@ -35,9 +35,9 @@
35#include <linux/configfs.h> 35#include <linux/configfs.h>
36 36
37 37
38static inline struct config_item * to_item(struct list_head * entry) 38static inline struct config_item *to_item(struct list_head *entry)
39{ 39{
40 return container_of(entry,struct config_item,ci_entry); 40 return container_of(entry, struct config_item, ci_entry);
41} 41}
42 42
43/* Evil kernel */ 43/* Evil kernel */
@@ -47,34 +47,35 @@ static void config_item_release(struct kref *kref);
47 * config_item_init - initialize item. 47 * config_item_init - initialize item.
48 * @item: item in question. 48 * @item: item in question.
49 */ 49 */
50void config_item_init(struct config_item * item) 50void config_item_init(struct config_item *item)
51{ 51{
52 kref_init(&item->ci_kref); 52 kref_init(&item->ci_kref);
53 INIT_LIST_HEAD(&item->ci_entry); 53 INIT_LIST_HEAD(&item->ci_entry);
54} 54}
55EXPORT_SYMBOL(config_item_init);
55 56
56/** 57/**
57 * config_item_set_name - Set the name of an item 58 * config_item_set_name - Set the name of an item
58 * @item: item. 59 * @item: item.
59 * @name: name. 60 * @fmt: The vsnprintf()'s format string.
60 * 61 *
61 * If strlen(name) >= CONFIGFS_ITEM_NAME_LEN, then use a 62 * If strlen(name) >= CONFIGFS_ITEM_NAME_LEN, then use a
62 * dynamically allocated string that @item->ci_name points to. 63 * dynamically allocated string that @item->ci_name points to.
63 * Otherwise, use the static @item->ci_namebuf array. 64 * Otherwise, use the static @item->ci_namebuf array.
64 */ 65 */
65int config_item_set_name(struct config_item * item, const char * fmt, ...) 66int config_item_set_name(struct config_item *item, const char *fmt, ...)
66{ 67{
67 int error = 0; 68 int error = 0;
68 int limit = CONFIGFS_ITEM_NAME_LEN; 69 int limit = CONFIGFS_ITEM_NAME_LEN;
69 int need; 70 int need;
70 va_list args; 71 va_list args;
71 char * name; 72 char *name;
72 73
73 /* 74 /*
74 * First, try the static array 75 * First, try the static array
75 */ 76 */
76 va_start(args,fmt); 77 va_start(args, fmt);
77 need = vsnprintf(item->ci_namebuf,limit,fmt,args); 78 need = vsnprintf(item->ci_namebuf, limit, fmt, args);
78 va_end(args); 79 va_end(args);
79 if (need < limit) 80 if (need < limit)
80 name = item->ci_namebuf; 81 name = item->ci_namebuf;
@@ -83,13 +84,13 @@ int config_item_set_name(struct config_item * item, const char * fmt, ...)
83 * Need more space? Allocate it and try again 84 * Need more space? Allocate it and try again
84 */ 85 */
85 limit = need + 1; 86 limit = need + 1;
86 name = kmalloc(limit,GFP_KERNEL); 87 name = kmalloc(limit, GFP_KERNEL);
87 if (!name) { 88 if (!name) {
88 error = -ENOMEM; 89 error = -ENOMEM;
89 goto Done; 90 goto Done;
90 } 91 }
91 va_start(args,fmt); 92 va_start(args, fmt);
92 need = vsnprintf(name,limit,fmt,args); 93 need = vsnprintf(name, limit, fmt, args);
93 va_end(args); 94 va_end(args);
94 95
95 /* Still? Give up. */ 96 /* Still? Give up. */
@@ -109,7 +110,6 @@ int config_item_set_name(struct config_item * item, const char * fmt, ...)
109 Done: 110 Done:
110 return error; 111 return error;
111} 112}
112
113EXPORT_SYMBOL(config_item_set_name); 113EXPORT_SYMBOL(config_item_set_name);
114 114
115void config_item_init_type_name(struct config_item *item, 115void config_item_init_type_name(struct config_item *item,
@@ -131,20 +131,21 @@ void config_group_init_type_name(struct config_group *group, const char *name,
131} 131}
132EXPORT_SYMBOL(config_group_init_type_name); 132EXPORT_SYMBOL(config_group_init_type_name);
133 133
134struct config_item * config_item_get(struct config_item * item) 134struct config_item *config_item_get(struct config_item *item)
135{ 135{
136 if (item) 136 if (item)
137 kref_get(&item->ci_kref); 137 kref_get(&item->ci_kref);
138 return item; 138 return item;
139} 139}
140EXPORT_SYMBOL(config_item_get);
140 141
141static void config_item_cleanup(struct config_item * item) 142static void config_item_cleanup(struct config_item *item)
142{ 143{
143 struct config_item_type * t = item->ci_type; 144 struct config_item_type *t = item->ci_type;
144 struct config_group * s = item->ci_group; 145 struct config_group *s = item->ci_group;
145 struct config_item * parent = item->ci_parent; 146 struct config_item *parent = item->ci_parent;
146 147
147 pr_debug("config_item %s: cleaning up\n",config_item_name(item)); 148 pr_debug("config_item %s: cleaning up\n", config_item_name(item));
148 if (item->ci_name != item->ci_namebuf) 149 if (item->ci_name != item->ci_namebuf)
149 kfree(item->ci_name); 150 kfree(item->ci_name);
150 item->ci_name = NULL; 151 item->ci_name = NULL;
@@ -167,21 +168,23 @@ static void config_item_release(struct kref *kref)
167 * 168 *
168 * Decrement the refcount, and if 0, call config_item_cleanup(). 169 * Decrement the refcount, and if 0, call config_item_cleanup().
169 */ 170 */
170void config_item_put(struct config_item * item) 171void config_item_put(struct config_item *item)
171{ 172{
172 if (item) 173 if (item)
173 kref_put(&item->ci_kref, config_item_release); 174 kref_put(&item->ci_kref, config_item_release);
174} 175}
176EXPORT_SYMBOL(config_item_put);
175 177
176/** 178/**
177 * config_group_init - initialize a group for use 179 * config_group_init - initialize a group for use
178 * @k: group 180 * @group: config_group
179 */ 181 */
180void config_group_init(struct config_group *group) 182void config_group_init(struct config_group *group)
181{ 183{
182 config_item_init(&group->cg_item); 184 config_item_init(&group->cg_item);
183 INIT_LIST_HEAD(&group->cg_children); 185 INIT_LIST_HEAD(&group->cg_children);
184} 186}
187EXPORT_SYMBOL(config_group_init);
185 188
186/** 189/**
187 * config_group_find_item - search for item in group. 190 * config_group_find_item - search for item in group.
@@ -195,11 +198,11 @@ void config_group_init(struct config_group *group)
195struct config_item *config_group_find_item(struct config_group *group, 198struct config_item *config_group_find_item(struct config_group *group,
196 const char *name) 199 const char *name)
197{ 200{
198 struct list_head * entry; 201 struct list_head *entry;
199 struct config_item * ret = NULL; 202 struct config_item *ret = NULL;
200 203
201 list_for_each(entry,&group->cg_children) { 204 list_for_each(entry, &group->cg_children) {
202 struct config_item * item = to_item(entry); 205 struct config_item *item = to_item(entry);
203 if (config_item_name(item) && 206 if (config_item_name(item) &&
204 !strcmp(config_item_name(item), name)) { 207 !strcmp(config_item_name(item), name)) {
205 ret = config_item_get(item); 208 ret = config_item_get(item);
@@ -208,9 +211,4 @@ struct config_item *config_group_find_item(struct config_group *group,
208 } 211 }
209 return ret; 212 return ret;
210} 213}
211
212EXPORT_SYMBOL(config_item_init);
213EXPORT_SYMBOL(config_group_init);
214EXPORT_SYMBOL(config_item_get);
215EXPORT_SYMBOL(config_item_put);
216EXPORT_SYMBOL(config_group_find_item); 214EXPORT_SYMBOL(config_group_find_item);
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index 7f26c3cf75ae..f6c285833390 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -85,7 +85,7 @@ static int configfs_fill_super(struct super_block *sb, void *data, int silent)
85 /* directory inodes start off with i_nlink == 2 (for "." entry) */ 85 /* directory inodes start off with i_nlink == 2 (for "." entry) */
86 inc_nlink(inode); 86 inc_nlink(inode);
87 } else { 87 } else {
88 pr_debug("configfs: could not get root inode\n"); 88 pr_debug("could not get root inode\n");
89 return -ENOMEM; 89 return -ENOMEM;
90 } 90 }
91 91
@@ -155,7 +155,7 @@ static int __init configfs_init(void)
155 155
156 return 0; 156 return 0;
157out4: 157out4:
158 printk(KERN_ERR "configfs: Unable to register filesystem!\n"); 158 pr_err("Unable to register filesystem!\n");
159 configfs_inode_exit(); 159 configfs_inode_exit();
160out3: 160out3:
161 kobject_put(config_kobj); 161 kobject_put(config_kobj);
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index becc725a1953..0a48886e069c 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -83,7 +83,7 @@ static int efivarfs_d_hash(const struct dentry *dentry, struct qstr *qstr)
83 return 0; 83 return 0;
84} 84}
85 85
86static struct dentry_operations efivarfs_d_ops = { 86static const struct dentry_operations efivarfs_d_ops = {
87 .d_compare = efivarfs_d_compare, 87 .d_compare = efivarfs_d_compare,
88 .d_hash = efivarfs_d_hash, 88 .d_hash = efivarfs_d_hash,
89 .d_delete = always_delete_dentry, 89 .d_delete = always_delete_dentry,
diff --git a/fs/efs/dir.c b/fs/efs/dir.c
index b72307ccdf7a..ce63b24f7c3e 100644
--- a/fs/efs/dir.c
+++ b/fs/efs/dir.c
@@ -26,7 +26,8 @@ static int efs_readdir(struct file *file, struct dir_context *ctx)
26 int slot; 26 int slot;
27 27
28 if (inode->i_size & (EFS_DIRBSIZE-1)) 28 if (inode->i_size & (EFS_DIRBSIZE-1))
29 printk(KERN_WARNING "EFS: WARNING: readdir(): directory size not a multiple of EFS_DIRBSIZE\n"); 29 pr_warn("%s(): directory size not a multiple of EFS_DIRBSIZE\n",
30 __func__);
30 31
31 /* work out where this entry can be found */ 32 /* work out where this entry can be found */
32 block = ctx->pos >> EFS_DIRBSIZE_BITS; 33 block = ctx->pos >> EFS_DIRBSIZE_BITS;
@@ -43,14 +44,15 @@ static int efs_readdir(struct file *file, struct dir_context *ctx)
43 bh = sb_bread(inode->i_sb, efs_bmap(inode, block)); 44 bh = sb_bread(inode->i_sb, efs_bmap(inode, block));
44 45
45 if (!bh) { 46 if (!bh) {
46 printk(KERN_ERR "EFS: readdir(): failed to read dir block %d\n", block); 47 pr_err("%s(): failed to read dir block %d\n",
48 __func__, block);
47 break; 49 break;
48 } 50 }
49 51
50 dirblock = (struct efs_dir *) bh->b_data; 52 dirblock = (struct efs_dir *) bh->b_data;
51 53
52 if (be16_to_cpu(dirblock->magic) != EFS_DIRBLK_MAGIC) { 54 if (be16_to_cpu(dirblock->magic) != EFS_DIRBLK_MAGIC) {
53 printk(KERN_ERR "EFS: readdir(): invalid directory block\n"); 55 pr_err("%s(): invalid directory block\n", __func__);
54 brelse(bh); 56 brelse(bh);
55 break; 57 break;
56 } 58 }
@@ -69,10 +71,9 @@ static int efs_readdir(struct file *file, struct dir_context *ctx)
69 inodenum = be32_to_cpu(dirslot->inode); 71 inodenum = be32_to_cpu(dirslot->inode);
70 namelen = dirslot->namelen; 72 namelen = dirslot->namelen;
71 nameptr = dirslot->name; 73 nameptr = dirslot->name;
72 74 pr_debug("%s(): block %d slot %d/%d: inode %u, name \"%s\", namelen %u\n",
73#ifdef DEBUG 75 __func__, block, slot, dirblock->slots-1,
74 printk(KERN_DEBUG "EFS: readdir(): block %d slot %d/%d: inode %u, name \"%s\", namelen %u\n", block, slot, dirblock->slots-1, inodenum, nameptr, namelen); 76 inodenum, nameptr, namelen);
75#endif
76 if (!namelen) 77 if (!namelen)
77 continue; 78 continue;
78 /* found the next entry */ 79 /* found the next entry */
@@ -80,7 +81,8 @@ static int efs_readdir(struct file *file, struct dir_context *ctx)
80 81
81 /* sanity check */ 82 /* sanity check */
82 if (nameptr - (char *) dirblock + namelen > EFS_DIRBSIZE) { 83 if (nameptr - (char *) dirblock + namelen > EFS_DIRBSIZE) {
83 printk(KERN_WARNING "EFS: directory entry %d exceeds directory block\n", slot); 84 pr_warn("directory entry %d exceeds directory block\n",
85 slot);
84 continue; 86 continue;
85 } 87 }
86 88
diff --git a/fs/efs/efs.h b/fs/efs/efs.h
index 5528926ac7f6..5bbf9612140c 100644
--- a/fs/efs/efs.h
+++ b/fs/efs/efs.h
@@ -7,6 +7,12 @@
7#ifndef _EFS_EFS_H_ 7#ifndef _EFS_EFS_H_
8#define _EFS_EFS_H_ 8#define _EFS_EFS_H_
9 9
10#ifdef pr_fmt
11#undef pr_fmt
12#endif
13
14#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
15
10#include <linux/fs.h> 16#include <linux/fs.h>
11#include <asm/uaccess.h> 17#include <asm/uaccess.h>
12 18
diff --git a/fs/efs/file.c b/fs/efs/file.c
index 1ccb364ffa63..a37dcee46866 100644
--- a/fs/efs/file.c
+++ b/fs/efs/file.c
@@ -22,10 +22,8 @@ int efs_get_block(struct inode *inode, sector_t iblock,
22 /* 22 /*
23 * i have no idea why this happens as often as it does 23 * i have no idea why this happens as often as it does
24 */ 24 */
25 printk(KERN_WARNING "EFS: bmap(): block %d >= %ld (filesize %ld)\n", 25 pr_warn("%s(): block %d >= %ld (filesize %ld)\n",
26 block, 26 __func__, block, inode->i_blocks, inode->i_size);
27 inode->i_blocks,
28 inode->i_size);
29#endif 27#endif
30 return 0; 28 return 0;
31 } 29 }
@@ -38,7 +36,7 @@ int efs_get_block(struct inode *inode, sector_t iblock,
38int efs_bmap(struct inode *inode, efs_block_t block) { 36int efs_bmap(struct inode *inode, efs_block_t block) {
39 37
40 if (block < 0) { 38 if (block < 0) {
41 printk(KERN_WARNING "EFS: bmap(): block < 0\n"); 39 pr_warn("%s(): block < 0\n", __func__);
42 return 0; 40 return 0;
43 } 41 }
44 42
@@ -48,10 +46,8 @@ int efs_bmap(struct inode *inode, efs_block_t block) {
48 /* 46 /*
49 * i have no idea why this happens as often as it does 47 * i have no idea why this happens as often as it does
50 */ 48 */
51 printk(KERN_WARNING "EFS: bmap(): block %d >= %ld (filesize %ld)\n", 49 pr_warn("%s(): block %d >= %ld (filesize %ld)\n",
52 block, 50 __func__, block, inode->i_blocks, inode->i_size);
53 inode->i_blocks,
54 inode->i_size);
55#endif 51#endif
56 return 0; 52 return 0;
57 } 53 }
diff --git a/fs/efs/inode.c b/fs/efs/inode.c
index d15ccf20f1b3..079d20306ee1 100644
--- a/fs/efs/inode.c
+++ b/fs/efs/inode.c
@@ -89,7 +89,7 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino)
89 89
90 bh = sb_bread(inode->i_sb, block); 90 bh = sb_bread(inode->i_sb, block);
91 if (!bh) { 91 if (!bh) {
92 printk(KERN_WARNING "EFS: bread() failed at block %d\n", block); 92 pr_warn("%s() failed at block %d\n", __func__, block);
93 goto read_inode_error; 93 goto read_inode_error;
94 } 94 }
95 95
@@ -130,19 +130,16 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino)
130 for(i = 0; i < EFS_DIRECTEXTENTS; i++) { 130 for(i = 0; i < EFS_DIRECTEXTENTS; i++) {
131 extent_copy(&(efs_inode->di_u.di_extents[i]), &(in->extents[i])); 131 extent_copy(&(efs_inode->di_u.di_extents[i]), &(in->extents[i]));
132 if (i < in->numextents && in->extents[i].cooked.ex_magic != 0) { 132 if (i < in->numextents && in->extents[i].cooked.ex_magic != 0) {
133 printk(KERN_WARNING "EFS: extent %d has bad magic number in inode %lu\n", i, inode->i_ino); 133 pr_warn("extent %d has bad magic number in inode %lu\n",
134 i, inode->i_ino);
134 brelse(bh); 135 brelse(bh);
135 goto read_inode_error; 136 goto read_inode_error;
136 } 137 }
137 } 138 }
138 139
139 brelse(bh); 140 brelse(bh);
140 141 pr_debug("efs_iget(): inode %lu, extents %d, mode %o\n",
141#ifdef DEBUG 142 inode->i_ino, in->numextents, inode->i_mode);
142 printk(KERN_DEBUG "EFS: efs_iget(): inode %lu, extents %d, mode %o\n",
143 inode->i_ino, in->numextents, inode->i_mode);
144#endif
145
146 switch (inode->i_mode & S_IFMT) { 143 switch (inode->i_mode & S_IFMT) {
147 case S_IFDIR: 144 case S_IFDIR:
148 inode->i_op = &efs_dir_inode_operations; 145 inode->i_op = &efs_dir_inode_operations;
@@ -162,7 +159,7 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino)
162 init_special_inode(inode, inode->i_mode, device); 159 init_special_inode(inode, inode->i_mode, device);
163 break; 160 break;
164 default: 161 default:
165 printk(KERN_WARNING "EFS: unsupported inode mode %o\n", inode->i_mode); 162 pr_warn("unsupported inode mode %o\n", inode->i_mode);
166 goto read_inode_error; 163 goto read_inode_error;
167 break; 164 break;
168 } 165 }
@@ -171,7 +168,7 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino)
171 return inode; 168 return inode;
172 169
173read_inode_error: 170read_inode_error:
174 printk(KERN_WARNING "EFS: failed to read inode %lu\n", inode->i_ino); 171 pr_warn("failed to read inode %lu\n", inode->i_ino);
175 iget_failed(inode); 172 iget_failed(inode);
176 return ERR_PTR(-EIO); 173 return ERR_PTR(-EIO);
177} 174}
@@ -216,7 +213,7 @@ efs_block_t efs_map_block(struct inode *inode, efs_block_t block) {
216 213
217 /* if we only have one extent then nothing can be found */ 214 /* if we only have one extent then nothing can be found */
218 if (in->numextents == 1) { 215 if (in->numextents == 1) {
219 printk(KERN_ERR "EFS: map_block() failed to map (1 extent)\n"); 216 pr_err("%s() failed to map (1 extent)\n", __func__);
220 return 0; 217 return 0;
221 } 218 }
222 219
@@ -234,13 +231,12 @@ efs_block_t efs_map_block(struct inode *inode, efs_block_t block) {
234 } 231 }
235 } 232 }
236 233
237 printk(KERN_ERR "EFS: map_block() failed to map block %u (dir)\n", block); 234 pr_err("%s() failed to map block %u (dir)\n", __func__, block);
238 return 0; 235 return 0;
239 } 236 }
240 237
241#ifdef DEBUG 238 pr_debug("%s(): indirect search for logical block %u\n",
242 printk(KERN_DEBUG "EFS: map_block(): indirect search for logical block %u\n", block); 239 __func__, block);
243#endif
244 direxts = in->extents[0].cooked.ex_offset; 240 direxts = in->extents[0].cooked.ex_offset;
245 indexts = in->numextents; 241 indexts = in->numextents;
246 242
@@ -262,7 +258,8 @@ efs_block_t efs_map_block(struct inode *inode, efs_block_t block) {
262 258
263 if (dirext == direxts) { 259 if (dirext == direxts) {
264 /* should never happen */ 260 /* should never happen */
265 printk(KERN_ERR "EFS: couldn't find direct extent for indirect extent %d (block %u)\n", cur, block); 261 pr_err("couldn't find direct extent for indirect extent %d (block %u)\n",
262 cur, block);
266 if (bh) brelse(bh); 263 if (bh) brelse(bh);
267 return 0; 264 return 0;
268 } 265 }
@@ -279,12 +276,12 @@ efs_block_t efs_map_block(struct inode *inode, efs_block_t block) {
279 276
280 bh = sb_bread(inode->i_sb, iblock); 277 bh = sb_bread(inode->i_sb, iblock);
281 if (!bh) { 278 if (!bh) {
282 printk(KERN_ERR "EFS: bread() failed at block %d\n", iblock); 279 pr_err("%s() failed at block %d\n",
280 __func__, iblock);
283 return 0; 281 return 0;
284 } 282 }
285#ifdef DEBUG 283 pr_debug("%s(): read indirect extent block %d\n",
286 printk(KERN_DEBUG "EFS: map_block(): read indirect extent block %d\n", iblock); 284 __func__, iblock);
287#endif
288 first = 0; 285 first = 0;
289 lastblock = iblock; 286 lastblock = iblock;
290 } 287 }
@@ -294,7 +291,8 @@ efs_block_t efs_map_block(struct inode *inode, efs_block_t block) {
294 extent_copy(&(exts[ioffset]), &ext); 291 extent_copy(&(exts[ioffset]), &ext);
295 292
296 if (ext.cooked.ex_magic != 0) { 293 if (ext.cooked.ex_magic != 0) {
297 printk(KERN_ERR "EFS: extent %d has bad magic number in block %d\n", cur, iblock); 294 pr_err("extent %d has bad magic number in block %d\n",
295 cur, iblock);
298 if (bh) brelse(bh); 296 if (bh) brelse(bh);
299 return 0; 297 return 0;
300 } 298 }
@@ -306,7 +304,7 @@ efs_block_t efs_map_block(struct inode *inode, efs_block_t block) {
306 } 304 }
307 } 305 }
308 if (bh) brelse(bh); 306 if (bh) brelse(bh);
309 printk(KERN_ERR "EFS: map_block() failed to map block %u (indir)\n", block); 307 pr_err("%s() failed to map block %u (indir)\n", __func__, block);
310 return 0; 308 return 0;
311} 309}
312 310
diff --git a/fs/efs/namei.c b/fs/efs/namei.c
index 96f66d213a19..356c044e2cd3 100644
--- a/fs/efs/namei.c
+++ b/fs/efs/namei.c
@@ -23,20 +23,22 @@ static efs_ino_t efs_find_entry(struct inode *inode, const char *name, int len)
23 efs_block_t block; 23 efs_block_t block;
24 24
25 if (inode->i_size & (EFS_DIRBSIZE-1)) 25 if (inode->i_size & (EFS_DIRBSIZE-1))
26 printk(KERN_WARNING "EFS: WARNING: find_entry(): directory size not a multiple of EFS_DIRBSIZE\n"); 26 pr_warn("%s(): directory size not a multiple of EFS_DIRBSIZE\n",
27 __func__);
27 28
28 for(block = 0; block < inode->i_blocks; block++) { 29 for(block = 0; block < inode->i_blocks; block++) {
29 30
30 bh = sb_bread(inode->i_sb, efs_bmap(inode, block)); 31 bh = sb_bread(inode->i_sb, efs_bmap(inode, block));
31 if (!bh) { 32 if (!bh) {
32 printk(KERN_ERR "EFS: find_entry(): failed to read dir block %d\n", block); 33 pr_err("%s(): failed to read dir block %d\n",
34 __func__, block);
33 return 0; 35 return 0;
34 } 36 }
35 37
36 dirblock = (struct efs_dir *) bh->b_data; 38 dirblock = (struct efs_dir *) bh->b_data;
37 39
38 if (be16_to_cpu(dirblock->magic) != EFS_DIRBLK_MAGIC) { 40 if (be16_to_cpu(dirblock->magic) != EFS_DIRBLK_MAGIC) {
39 printk(KERN_ERR "EFS: find_entry(): invalid directory block\n"); 41 pr_err("%s(): invalid directory block\n", __func__);
40 brelse(bh); 42 brelse(bh);
41 return(0); 43 return(0);
42 } 44 }
diff --git a/fs/efs/super.c b/fs/efs/super.c
index 3befcc9f5d63..7fca462ea4e3 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -134,7 +134,7 @@ static const struct export_operations efs_export_ops = {
134 134
135static int __init init_efs_fs(void) { 135static int __init init_efs_fs(void) {
136 int err; 136 int err;
137 printk("EFS: "EFS_VERSION" - http://aeschi.ch.eu.org/efs/\n"); 137 pr_info(EFS_VERSION" - http://aeschi.ch.eu.org/efs/\n");
138 err = init_inodecache(); 138 err = init_inodecache();
139 if (err) 139 if (err)
140 goto out1; 140 goto out1;
@@ -179,12 +179,12 @@ static efs_block_t efs_validate_vh(struct volume_header *vh) {
179 csum += be32_to_cpu(cs); 179 csum += be32_to_cpu(cs);
180 } 180 }
181 if (csum) { 181 if (csum) {
182 printk(KERN_INFO "EFS: SGI disklabel: checksum bad, label corrupted\n"); 182 pr_warn("SGI disklabel: checksum bad, label corrupted\n");
183 return 0; 183 return 0;
184 } 184 }
185 185
186#ifdef DEBUG 186#ifdef DEBUG
187 printk(KERN_DEBUG "EFS: bf: \"%16s\"\n", vh->vh_bootfile); 187 pr_debug("bf: \"%16s\"\n", vh->vh_bootfile);
188 188
189 for(i = 0; i < NVDIR; i++) { 189 for(i = 0; i < NVDIR; i++) {
190 int j; 190 int j;
@@ -196,9 +196,8 @@ static efs_block_t efs_validate_vh(struct volume_header *vh) {
196 name[j] = (char) 0; 196 name[j] = (char) 0;
197 197
198 if (name[0]) { 198 if (name[0]) {
199 printk(KERN_DEBUG "EFS: vh: %8s block: 0x%08x size: 0x%08x\n", 199 pr_debug("vh: %8s block: 0x%08x size: 0x%08x\n",
200 name, 200 name, (int) be32_to_cpu(vh->vh_vd[i].vd_lbn),
201 (int) be32_to_cpu(vh->vh_vd[i].vd_lbn),
202 (int) be32_to_cpu(vh->vh_vd[i].vd_nbytes)); 201 (int) be32_to_cpu(vh->vh_vd[i].vd_nbytes));
203 } 202 }
204 } 203 }
@@ -211,12 +210,11 @@ static efs_block_t efs_validate_vh(struct volume_header *vh) {
211 } 210 }
212#ifdef DEBUG 211#ifdef DEBUG
213 if (be32_to_cpu(vh->vh_pt[i].pt_nblks)) { 212 if (be32_to_cpu(vh->vh_pt[i].pt_nblks)) {
214 printk(KERN_DEBUG "EFS: pt %2d: start: %08d size: %08d type: 0x%02x (%s)\n", 213 pr_debug("pt %2d: start: %08d size: %08d type: 0x%02x (%s)\n",
215 i, 214 i, (int)be32_to_cpu(vh->vh_pt[i].pt_firstlbn),
216 (int) be32_to_cpu(vh->vh_pt[i].pt_firstlbn), 215 (int)be32_to_cpu(vh->vh_pt[i].pt_nblks),
217 (int) be32_to_cpu(vh->vh_pt[i].pt_nblks), 216 pt_type, (pt_entry->pt_name) ?
218 pt_type, 217 pt_entry->pt_name : "unknown");
219 (pt_entry->pt_name) ? pt_entry->pt_name : "unknown");
220 } 218 }
221#endif 219#endif
222 if (IS_EFS(pt_type)) { 220 if (IS_EFS(pt_type)) {
@@ -226,11 +224,10 @@ static efs_block_t efs_validate_vh(struct volume_header *vh) {
226 } 224 }
227 225
228 if (slice == -1) { 226 if (slice == -1) {
229 printk(KERN_NOTICE "EFS: partition table contained no EFS partitions\n"); 227 pr_notice("partition table contained no EFS partitions\n");
230#ifdef DEBUG 228#ifdef DEBUG
231 } else { 229 } else {
232 printk(KERN_INFO "EFS: using slice %d (type %s, offset 0x%x)\n", 230 pr_info("using slice %d (type %s, offset 0x%x)\n", slice,
233 slice,
234 (pt_entry->pt_name) ? pt_entry->pt_name : "unknown", 231 (pt_entry->pt_name) ? pt_entry->pt_name : "unknown",
235 sblock); 232 sblock);
236#endif 233#endif
@@ -268,7 +265,7 @@ static int efs_fill_super(struct super_block *s, void *d, int silent)
268 265
269 s->s_magic = EFS_SUPER_MAGIC; 266 s->s_magic = EFS_SUPER_MAGIC;
270 if (!sb_set_blocksize(s, EFS_BLOCKSIZE)) { 267 if (!sb_set_blocksize(s, EFS_BLOCKSIZE)) {
271 printk(KERN_ERR "EFS: device does not support %d byte blocks\n", 268 pr_err("device does not support %d byte blocks\n",
272 EFS_BLOCKSIZE); 269 EFS_BLOCKSIZE);
273 return -EINVAL; 270 return -EINVAL;
274 } 271 }
@@ -277,7 +274,7 @@ static int efs_fill_super(struct super_block *s, void *d, int silent)
277 bh = sb_bread(s, 0); 274 bh = sb_bread(s, 0);
278 275
279 if (!bh) { 276 if (!bh) {
280 printk(KERN_ERR "EFS: cannot read volume header\n"); 277 pr_err("cannot read volume header\n");
281 return -EINVAL; 278 return -EINVAL;
282 } 279 }
283 280
@@ -295,13 +292,14 @@ static int efs_fill_super(struct super_block *s, void *d, int silent)
295 292
296 bh = sb_bread(s, sb->fs_start + EFS_SUPER); 293 bh = sb_bread(s, sb->fs_start + EFS_SUPER);
297 if (!bh) { 294 if (!bh) {
298 printk(KERN_ERR "EFS: cannot read superblock\n"); 295 pr_err("cannot read superblock\n");
299 return -EINVAL; 296 return -EINVAL;
300 } 297 }
301 298
302 if (efs_validate_super(sb, (struct efs_super *) bh->b_data)) { 299 if (efs_validate_super(sb, (struct efs_super *) bh->b_data)) {
303#ifdef DEBUG 300#ifdef DEBUG
304 printk(KERN_WARNING "EFS: invalid superblock at block %u\n", sb->fs_start + EFS_SUPER); 301 pr_warn("invalid superblock at block %u\n",
302 sb->fs_start + EFS_SUPER);
305#endif 303#endif
306 brelse(bh); 304 brelse(bh);
307 return -EINVAL; 305 return -EINVAL;
@@ -310,7 +308,7 @@ static int efs_fill_super(struct super_block *s, void *d, int silent)
310 308
311 if (!(s->s_flags & MS_RDONLY)) { 309 if (!(s->s_flags & MS_RDONLY)) {
312#ifdef DEBUG 310#ifdef DEBUG
313 printk(KERN_INFO "EFS: forcing read-only mode\n"); 311 pr_info("forcing read-only mode\n");
314#endif 312#endif
315 s->s_flags |= MS_RDONLY; 313 s->s_flags |= MS_RDONLY;
316 } 314 }
@@ -318,13 +316,13 @@ static int efs_fill_super(struct super_block *s, void *d, int silent)
318 s->s_export_op = &efs_export_ops; 316 s->s_export_op = &efs_export_ops;
319 root = efs_iget(s, EFS_ROOTINODE); 317 root = efs_iget(s, EFS_ROOTINODE);
320 if (IS_ERR(root)) { 318 if (IS_ERR(root)) {
321 printk(KERN_ERR "EFS: get root inode failed\n"); 319 pr_err("get root inode failed\n");
322 return PTR_ERR(root); 320 return PTR_ERR(root);
323 } 321 }
324 322
325 s->s_root = d_make_root(root); 323 s->s_root = d_make_root(root);
326 if (!(s->s_root)) { 324 if (!(s->s_root)) {
327 printk(KERN_ERR "EFS: get root dentry failed\n"); 325 pr_err("get root dentry failed\n");
328 return -ENOMEM; 326 return -ENOMEM;
329 } 327 }
330 328
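
A minimal sketch of the pr_fmt() conversion used in the EFS hunks above, assuming a module that defines its own prefix; "myfs" and report_bad_label() are placeholder names, not part of the patch. Defining pr_fmt before the printk helpers are included makes every pr_*() call carry the prefix, which is why the literal "EFS: " strings disappear from the individual messages.

#define pr_fmt(fmt) "myfs: " fmt

#include <linux/printk.h>

static void report_bad_label(void)
{
	/* expands to printk(KERN_WARNING "myfs: " "label corrupted\n") */
	pr_warn("label corrupted\n");
}
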
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 48a359dd286e..b01fbfb51f43 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -259,7 +259,7 @@ static int filldir_one(void * __buf, const char * name, int len,
259 259
260/** 260/**
261 * get_name - default export_operations->get_name function 261 * get_name - default export_operations->get_name function
262 * @dentry: the directory in which to find a name 262 * @path: the directory in which to find a name
263 * @name: a pointer to a %NAME_MAX+1 char buffer to store the name 263 * @name: a pointer to a %NAME_MAX+1 char buffer to store the name
264 * @child: the dentry for the child directory. 264 * @child: the dentry for the child directory.
265 * 265 *
@@ -337,7 +337,7 @@ out:
337/** 337/**
338 * export_encode_fh - default export_operations->encode_fh function 338 * export_encode_fh - default export_operations->encode_fh function
339 * @inode: the object to encode 339 * @inode: the object to encode
340 * @fh: where to store the file handle fragment 340 * @fid: where to store the file handle fragment
341 * @max_len: maximum length to store there 341 * @max_len: maximum length to store there
342 * @parent: parent directory inode, if wanted 342 * @parent: parent directory inode, if wanted
343 * 343 *
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index c8238a26818c..afe8a133e3d1 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1044,6 +1044,8 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
1044 * allocating. If we are looking at the buddy cache we would 1044 * allocating. If we are looking at the buddy cache we would
1045 * have taken a reference using ext4_mb_load_buddy and that 1045 * have taken a reference using ext4_mb_load_buddy and that
1046 * would have pinned buddy page to page cache. 1046 * would have pinned buddy page to page cache.
1047 * The call to ext4_mb_get_buddy_page_lock will mark the
1048 * page accessed.
1047 */ 1049 */
1048 ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b); 1050 ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b);
1049 if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) { 1051 if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
@@ -1062,7 +1064,6 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
1062 ret = -EIO; 1064 ret = -EIO;
1063 goto err; 1065 goto err;
1064 } 1066 }
1065 mark_page_accessed(page);
1066 1067
1067 if (e4b.bd_buddy_page == NULL) { 1068 if (e4b.bd_buddy_page == NULL) {
1068 /* 1069 /*
@@ -1082,7 +1083,6 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
1082 ret = -EIO; 1083 ret = -EIO;
1083 goto err; 1084 goto err;
1084 } 1085 }
1085 mark_page_accessed(page);
1086err: 1086err:
1087 ext4_mb_put_buddy_page_lock(&e4b); 1087 ext4_mb_put_buddy_page_lock(&e4b);
1088 return ret; 1088 return ret;
@@ -1141,7 +1141,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1141 1141
1142 /* we could use find_or_create_page(), but it locks page 1142 /* we could use find_or_create_page(), but it locks page
1143 * what we'd like to avoid in fast path ... */ 1143 * what we'd like to avoid in fast path ... */
1144 page = find_get_page(inode->i_mapping, pnum); 1144 page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
1145 if (page == NULL || !PageUptodate(page)) { 1145 if (page == NULL || !PageUptodate(page)) {
1146 if (page) 1146 if (page)
1147 /* 1147 /*
@@ -1176,15 +1176,16 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1176 ret = -EIO; 1176 ret = -EIO;
1177 goto err; 1177 goto err;
1178 } 1178 }
1179
1180 /* Pages marked accessed already */
1179 e4b->bd_bitmap_page = page; 1181 e4b->bd_bitmap_page = page;
1180 e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); 1182 e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
1181 mark_page_accessed(page);
1182 1183
1183 block++; 1184 block++;
1184 pnum = block / blocks_per_page; 1185 pnum = block / blocks_per_page;
1185 poff = block % blocks_per_page; 1186 poff = block % blocks_per_page;
1186 1187
1187 page = find_get_page(inode->i_mapping, pnum); 1188 page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
1188 if (page == NULL || !PageUptodate(page)) { 1189 if (page == NULL || !PageUptodate(page)) {
1189 if (page) 1190 if (page)
1190 page_cache_release(page); 1191 page_cache_release(page);
@@ -1209,9 +1210,10 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1209 ret = -EIO; 1210 ret = -EIO;
1210 goto err; 1211 goto err;
1211 } 1212 }
1213
1214 /* Pages marked accessed already */
1212 e4b->bd_buddy_page = page; 1215 e4b->bd_buddy_page = page;
1213 e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); 1216 e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
1214 mark_page_accessed(page);
1215 1217
1216 BUG_ON(e4b->bd_bitmap_page == NULL); 1218 BUG_ON(e4b->bd_bitmap_page == NULL);
1217 BUG_ON(e4b->bd_buddy_page == NULL); 1219 BUG_ON(e4b->bd_buddy_page == NULL);
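
The mballoc hunks above drop the explicit mark_page_accessed() calls because the page-cache lookup can now do the aging itself. A rough sketch of the pattern, with placeholder names (lookup_and_touch() is not a kernel function):

#include <linux/pagemap.h>

static struct page *lookup_and_touch(struct address_space *mapping,
				     pgoff_t index)
{
	/*
	 * Old pattern: find_get_page() followed by mark_page_accessed().
	 * FGP_ACCESSED folds the accessed-bit handling into the lookup,
	 * so the reference is taken and the page aged in one call.
	 */
	return find_get_page_flags(mapping, index, FGP_ACCESSED);
}
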
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index c18d95b50540..1a64e7a52b84 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -429,7 +429,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
429 block_start = bh_offset(bh); 429 block_start = bh_offset(bh);
430 if (block_start >= len) { 430 if (block_start >= len) {
431 /* 431 /*
432 * Comments copied from block_write_full_page_endio: 432 * Comments copied from block_write_full_page:
433 * 433 *
434 * The page straddles i_size. It must be zeroed out on 434 * The page straddles i_size. It must be zeroed out on
435 * each and every writepage invocation because it may 435 * each and every writepage invocation because it may
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 4aa521aa9bc3..c405b8f17054 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -69,7 +69,6 @@ repeat:
69 goto repeat; 69 goto repeat;
70 } 70 }
71out: 71out:
72 mark_page_accessed(page);
73 return page; 72 return page;
74} 73}
75 74
@@ -137,13 +136,11 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, int start, int nrpages, int type)
137 if (!page) 136 if (!page)
138 continue; 137 continue;
139 if (PageUptodate(page)) { 138 if (PageUptodate(page)) {
140 mark_page_accessed(page);
141 f2fs_put_page(page, 1); 139 f2fs_put_page(page, 1);
142 continue; 140 continue;
143 } 141 }
144 142
145 f2fs_submit_page_mbio(sbi, page, blk_addr, &fio); 143 f2fs_submit_page_mbio(sbi, page, blk_addr, &fio);
146 mark_page_accessed(page);
147 f2fs_put_page(page, 0); 144 f2fs_put_page(page, 0);
148 } 145 }
149out: 146out:
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index a161e955c4c8..57caa6eaf47b 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -967,7 +967,6 @@ repeat:
967 goto repeat; 967 goto repeat;
968 } 968 }
969got_it: 969got_it:
970 mark_page_accessed(page);
971 return page; 970 return page;
972} 971}
973 972
@@ -1022,7 +1021,6 @@ page_hit:
1022 f2fs_put_page(page, 1); 1021 f2fs_put_page(page, 1);
1023 return ERR_PTR(-EIO); 1022 return ERR_PTR(-EIO);
1024 } 1023 }
1025 mark_page_accessed(page);
1026 return page; 1024 return page;
1027} 1025}
1028 1026
diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c
index f7cff367db7f..56cce7fdd39e 100644
--- a/fs/fscache/cache.c
+++ b/fs/fscache/cache.c
@@ -280,15 +280,15 @@ int fscache_add_cache(struct fscache_cache *cache,
280 spin_unlock(&fscache_fsdef_index.lock); 280 spin_unlock(&fscache_fsdef_index.lock);
281 up_write(&fscache_addremove_sem); 281 up_write(&fscache_addremove_sem);
282 282
283 printk(KERN_NOTICE "FS-Cache: Cache \"%s\" added (type %s)\n", 283 pr_notice("Cache \"%s\" added (type %s)\n",
284 cache->tag->name, cache->ops->name); 284 cache->tag->name, cache->ops->name);
285 kobject_uevent(cache->kobj, KOBJ_ADD); 285 kobject_uevent(cache->kobj, KOBJ_ADD);
286 286
287 _leave(" = 0 [%s]", cache->identifier); 287 _leave(" = 0 [%s]", cache->identifier);
288 return 0; 288 return 0;
289 289
290tag_in_use: 290tag_in_use:
291 printk(KERN_ERR "FS-Cache: Cache tag '%s' already in use\n", tagname); 291 pr_err("Cache tag '%s' already in use\n", tagname);
292 __fscache_release_cache_tag(tag); 292 __fscache_release_cache_tag(tag);
293 _leave(" = -EXIST"); 293 _leave(" = -EXIST");
294 return -EEXIST; 294 return -EEXIST;
@@ -317,8 +317,7 @@ EXPORT_SYMBOL(fscache_add_cache);
317void fscache_io_error(struct fscache_cache *cache) 317void fscache_io_error(struct fscache_cache *cache)
318{ 318{
319 if (!test_and_set_bit(FSCACHE_IOERROR, &cache->flags)) 319 if (!test_and_set_bit(FSCACHE_IOERROR, &cache->flags))
320 printk(KERN_ERR "FS-Cache:" 320 pr_err("Cache '%s' stopped due to I/O error\n",
321 " Cache '%s' stopped due to I/O error\n",
322 cache->ops->name); 321 cache->ops->name);
323} 322}
324EXPORT_SYMBOL(fscache_io_error); 323EXPORT_SYMBOL(fscache_io_error);
@@ -369,8 +368,8 @@ void fscache_withdraw_cache(struct fscache_cache *cache)
369 368
370 _enter(""); 369 _enter("");
371 370
372 printk(KERN_NOTICE "FS-Cache: Withdrawing cache \"%s\"\n", 371 pr_notice("Withdrawing cache \"%s\"\n",
373 cache->tag->name); 372 cache->tag->name);
374 373
375 /* make the cache unavailable for cookie acquisition */ 374 /* make the cache unavailable for cookie acquisition */
376 if (test_and_set_bit(FSCACHE_CACHE_WITHDRAWN, &cache->flags)) 375 if (test_and_set_bit(FSCACHE_CACHE_WITHDRAWN, &cache->flags))
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index 29d7feb62cf7..aec01be91b0a 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -519,7 +519,7 @@ void __fscache_disable_cookie(struct fscache_cookie *cookie, bool invalidate)
519 ASSERTCMP(atomic_read(&cookie->n_active), >, 0); 519 ASSERTCMP(atomic_read(&cookie->n_active), >, 0);
520 520
521 if (atomic_read(&cookie->n_children) != 0) { 521 if (atomic_read(&cookie->n_children) != 0) {
522 printk(KERN_ERR "FS-Cache: Cookie '%s' still has children\n", 522 pr_err("Cookie '%s' still has children\n",
523 cookie->def->name); 523 cookie->def->name);
524 BUG(); 524 BUG();
525 } 525 }
diff --git a/fs/fscache/histogram.c b/fs/fscache/histogram.c
index bad496748a59..7d637e2335fd 100644
--- a/fs/fscache/histogram.c
+++ b/fs/fscache/histogram.c
@@ -31,12 +31,10 @@ static int fscache_histogram_show(struct seq_file *m, void *v)
31 31
32 switch ((unsigned long) v) { 32 switch ((unsigned long) v) {
33 case 1: 33 case 1:
34 seq_puts(m, "JIFS SECS OBJ INST OP RUNS OBJ RUNS " 34 seq_puts(m, "JIFS SECS OBJ INST OP RUNS OBJ RUNS RETRV DLY RETRIEVLS\n");
35 " RETRV DLY RETRIEVLS\n");
36 return 0; 35 return 0;
37 case 2: 36 case 2:
38 seq_puts(m, "===== ===== ========= ========= =========" 37 seq_puts(m, "===== ===== ========= ========= ========= ========= =========\n");
39 " ========= =========\n");
40 return 0; 38 return 0;
41 default: 39 default:
42 index = (unsigned long) v - 3; 40 index = (unsigned long) v - 3;
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index 4226f6680b06..bc6c08fcfddd 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -22,6 +22,12 @@
22 * 22 *
23 */ 23 */
24 24
25#ifdef pr_fmt
26#undef pr_fmt
27#endif
28
29#define pr_fmt(fmt) "FS-Cache: " fmt
30
25#include <linux/fscache-cache.h> 31#include <linux/fscache-cache.h>
26#include <linux/sched.h> 32#include <linux/sched.h>
27 33
@@ -413,8 +419,8 @@ do { \
413#define ASSERT(X) \ 419#define ASSERT(X) \
414do { \ 420do { \
415 if (unlikely(!(X))) { \ 421 if (unlikely(!(X))) { \
416 printk(KERN_ERR "\n"); \ 422 pr_err("\n"); \
417 printk(KERN_ERR "FS-Cache: Assertion failed\n"); \ 423 pr_err("Assertion failed\n"); \
418 BUG(); \ 424 BUG(); \
419 } \ 425 } \
420} while (0) 426} while (0)
@@ -422,9 +428,9 @@ do { \
422#define ASSERTCMP(X, OP, Y) \ 428#define ASSERTCMP(X, OP, Y) \
423do { \ 429do { \
424 if (unlikely(!((X) OP (Y)))) { \ 430 if (unlikely(!((X) OP (Y)))) { \
425 printk(KERN_ERR "\n"); \ 431 pr_err("\n"); \
426 printk(KERN_ERR "FS-Cache: Assertion failed\n"); \ 432 pr_err("Assertion failed\n"); \
427 printk(KERN_ERR "%lx " #OP " %lx is false\n", \ 433 pr_err("%lx " #OP " %lx is false\n", \
428 (unsigned long)(X), (unsigned long)(Y)); \ 434 (unsigned long)(X), (unsigned long)(Y)); \
429 BUG(); \ 435 BUG(); \
430 } \ 436 } \
@@ -433,8 +439,8 @@ do { \
433#define ASSERTIF(C, X) \ 439#define ASSERTIF(C, X) \
434do { \ 440do { \
435 if (unlikely((C) && !(X))) { \ 441 if (unlikely((C) && !(X))) { \
436 printk(KERN_ERR "\n"); \ 442 pr_err("\n"); \
437 printk(KERN_ERR "FS-Cache: Assertion failed\n"); \ 443 pr_err("Assertion failed\n"); \
438 BUG(); \ 444 BUG(); \
439 } \ 445 } \
440} while (0) 446} while (0)
@@ -442,9 +448,9 @@ do { \
442#define ASSERTIFCMP(C, X, OP, Y) \ 448#define ASSERTIFCMP(C, X, OP, Y) \
443do { \ 449do { \
444 if (unlikely((C) && !((X) OP (Y)))) { \ 450 if (unlikely((C) && !((X) OP (Y)))) { \
445 printk(KERN_ERR "\n"); \ 451 pr_err("\n"); \
446 printk(KERN_ERR "FS-Cache: Assertion failed\n"); \ 452 pr_err("Assertion failed\n"); \
447 printk(KERN_ERR "%lx " #OP " %lx is false\n", \ 453 pr_err("%lx " #OP " %lx is false\n", \
448 (unsigned long)(X), (unsigned long)(Y)); \ 454 (unsigned long)(X), (unsigned long)(Y)); \
449 BUG(); \ 455 BUG(); \
450 } \ 456 } \
diff --git a/fs/fscache/main.c b/fs/fscache/main.c
index 7c27907e650c..acd4bf1fc277 100644
--- a/fs/fscache/main.c
+++ b/fs/fscache/main.c
@@ -146,8 +146,7 @@ static int __init fscache_init(void)
146 0, 146 0,
147 fscache_cookie_init_once); 147 fscache_cookie_init_once);
148 if (!fscache_cookie_jar) { 148 if (!fscache_cookie_jar) {
149 printk(KERN_NOTICE 149 pr_notice("Failed to allocate a cookie jar\n");
150 "FS-Cache: Failed to allocate a cookie jar\n");
151 ret = -ENOMEM; 150 ret = -ENOMEM;
152 goto error_cookie_jar; 151 goto error_cookie_jar;
153 } 152 }
@@ -156,7 +155,7 @@ static int __init fscache_init(void)
156 if (!fscache_root) 155 if (!fscache_root)
157 goto error_kobj; 156 goto error_kobj;
158 157
159 printk(KERN_NOTICE "FS-Cache: Loaded\n"); 158 pr_notice("Loaded\n");
160 return 0; 159 return 0;
161 160
162error_kobj: 161error_kobj:
@@ -192,7 +191,7 @@ static void __exit fscache_exit(void)
192 fscache_proc_cleanup(); 191 fscache_proc_cleanup();
193 destroy_workqueue(fscache_op_wq); 192 destroy_workqueue(fscache_op_wq);
194 destroy_workqueue(fscache_object_wq); 193 destroy_workqueue(fscache_object_wq);
195 printk(KERN_NOTICE "FS-Cache: Unloaded\n"); 194 pr_notice("Unloaded\n");
196} 195}
197 196
198module_exit(fscache_exit); 197module_exit(fscache_exit);
diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c
index 989f39401547..6d941f56faf4 100644
--- a/fs/fscache/netfs.c
+++ b/fs/fscache/netfs.c
@@ -65,8 +65,7 @@ int __fscache_register_netfs(struct fscache_netfs *netfs)
65 list_add(&netfs->link, &fscache_netfs_list); 65 list_add(&netfs->link, &fscache_netfs_list);
66 ret = 0; 66 ret = 0;
67 67
68 printk(KERN_NOTICE "FS-Cache: Netfs '%s' registered for caching\n", 68 pr_notice("Netfs '%s' registered for caching\n", netfs->name);
69 netfs->name);
70 69
71already_registered: 70already_registered:
72 up_write(&fscache_addremove_sem); 71 up_write(&fscache_addremove_sem);
@@ -97,8 +96,8 @@ void __fscache_unregister_netfs(struct fscache_netfs *netfs)
97 96
98 up_write(&fscache_addremove_sem); 97 up_write(&fscache_addremove_sem);
99 98
100 printk(KERN_NOTICE "FS-Cache: Netfs '%s' unregistered from caching\n", 99 pr_notice("Netfs '%s' unregistered from caching\n",
101 netfs->name); 100 netfs->name);
102 101
103 _leave(""); 102 _leave("");
104} 103}
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index b5ebc2d7d80d..b8179ca6bf9d 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -285,20 +285,20 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
285 fscache_unuse_cookie(obj); 285 fscache_unuse_cookie(obj);
286 286
287 if (keylen > 0 || auxlen > 0) { 287 if (keylen > 0 || auxlen > 0) {
288 seq_printf(m, " "); 288 seq_puts(m, " ");
289 for (p = buf; keylen > 0; keylen--) 289 for (p = buf; keylen > 0; keylen--)
290 seq_printf(m, "%02x", *p++); 290 seq_printf(m, "%02x", *p++);
291 if (auxlen > 0) { 291 if (auxlen > 0) {
292 if (config & FSCACHE_OBJLIST_CONFIG_KEY) 292 if (config & FSCACHE_OBJLIST_CONFIG_KEY)
293 seq_printf(m, ", "); 293 seq_puts(m, ", ");
294 for (; auxlen > 0; auxlen--) 294 for (; auxlen > 0; auxlen--)
295 seq_printf(m, "%02x", *p++); 295 seq_printf(m, "%02x", *p++);
296 } 296 }
297 } 297 }
298 298
299 seq_printf(m, "\n"); 299 seq_puts(m, "\n");
300 } else { 300 } else {
301 seq_printf(m, "<no_netfs>\n"); 301 seq_puts(m, "<no_netfs>\n");
302 } 302 }
303 return 0; 303 return 0;
304} 304}
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index 318071aca217..e7b87a0e5185 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -51,8 +51,7 @@ void fscache_enqueue_operation(struct fscache_operation *op)
51 _debug("queue for caller's attention"); 51 _debug("queue for caller's attention");
52 break; 52 break;
53 default: 53 default:
54 printk(KERN_ERR "FS-Cache: Unexpected op type %lx", 54 pr_err("Unexpected op type %lx", op->flags);
55 op->flags);
56 BUG(); 55 BUG();
57 break; 56 break;
58 } 57 }
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 7f5c658af755..ed70714503fa 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -1108,10 +1108,8 @@ void fscache_mark_page_cached(struct fscache_retrieval *op, struct page *page)
1108 static bool once_only; 1108 static bool once_only;
1109 if (!once_only) { 1109 if (!once_only) {
1110 once_only = true; 1110 once_only = true;
1111 printk(KERN_WARNING "FS-Cache:" 1111 pr_warn("Cookie type %s marked page %lx multiple times\n",
1112 " Cookie type %s marked page %lx" 1112 cookie->def->name, page->index);
1113 " multiple times\n",
1114 cookie->def->name, page->index);
1115 } 1113 }
1116 } 1114 }
1117 1115
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index aac71ce373e4..098f97bdcf1b 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1614,7 +1614,7 @@ out_finish:
1614 1614
1615static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req) 1615static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req)
1616{ 1616{
1617 release_pages(req->pages, req->num_pages, 0); 1617 release_pages(req->pages, req->num_pages, false);
1618} 1618}
1619 1619
1620static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, 1620static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index f680d2c44e97..903cbc9cd6bd 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1089,8 +1089,6 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
1089 tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes); 1089 tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes);
1090 flush_dcache_page(page); 1090 flush_dcache_page(page);
1091 1091
1092 mark_page_accessed(page);
1093
1094 if (!tmp) { 1092 if (!tmp) {
1095 unlock_page(page); 1093 unlock_page(page);
1096 page_cache_release(page); 1094 page_cache_release(page);
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 5a49b037da81..492123cda64a 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -577,7 +577,6 @@ int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos,
577 p = kmap_atomic(page); 577 p = kmap_atomic(page);
578 memcpy(buf + copied, p + offset, amt); 578 memcpy(buf + copied, p + offset, amt);
579 kunmap_atomic(p); 579 kunmap_atomic(p);
580 mark_page_accessed(page);
581 page_cache_release(page); 580 page_cache_release(page);
582 copied += amt; 581 copied += amt;
583 index++; 582 index++;
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 2cf09b63a6b4..b984a6e190bc 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -136,7 +136,8 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
136 yield(); 136 yield();
137 } 137 }
138 } else { 138 } else {
139 page = find_lock_page(mapping, index); 139 page = find_get_page_flags(mapping, index,
140 FGP_LOCK|FGP_ACCESSED);
140 if (!page) 141 if (!page)
141 return NULL; 142 return NULL;
142 } 143 }
@@ -153,7 +154,6 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
153 map_bh(bh, sdp->sd_vfs, blkno); 154 map_bh(bh, sdp->sd_vfs, blkno);
154 155
155 unlock_page(page); 156 unlock_page(page);
156 mark_page_accessed(page);
157 page_cache_release(page); 157 page_cache_release(page);
158 158
159 return bh; 159 return bh;
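
The gfs2_getbuf() hunk above uses the same helper but also asks for the page lock, replacing the find_lock_page() + mark_page_accessed() pair. A small illustrative sketch; touch_locked_page() is a placeholder and the real error handling is trimmed:

#include <linux/pagemap.h>

static void touch_locked_page(struct address_space *mapping, pgoff_t index)
{
	struct page *page;

	page = find_get_page_flags(mapping, index, FGP_LOCK | FGP_ACCESSED);
	if (!page)
		return;
	/* the page comes back locked and already marked accessed */
	unlock_page(page);
	page_cache_release(page);
}
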
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index e19d4c0cacae..1e2872b25343 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -6,6 +6,8 @@
6 * Copyright (C) 2002 Linus Torvalds. 6 * Copyright (C) 2002 Linus Torvalds.
7 */ 7 */
8 8
9#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
10
9#include <linux/module.h> 11#include <linux/module.h>
10#include <linux/thread_info.h> 12#include <linux/thread_info.h>
11#include <asm/current.h> 13#include <asm/current.h>
@@ -475,7 +477,7 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb,
475 * annotation because huge_pmd_share() does an allocation under 477 * annotation because huge_pmd_share() does an allocation under
476 * i_mmap_mutex. 478 * i_mmap_mutex.
477 */ 479 */
478struct lock_class_key hugetlbfs_i_mmap_mutex_key; 480static struct lock_class_key hugetlbfs_i_mmap_mutex_key;
479 481
480static struct inode *hugetlbfs_get_inode(struct super_block *sb, 482static struct inode *hugetlbfs_get_inode(struct super_block *sb,
481 struct inode *dir, 483 struct inode *dir,
@@ -823,8 +825,7 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
823 ps = memparse(args[0].from, &rest); 825 ps = memparse(args[0].from, &rest);
824 pconfig->hstate = size_to_hstate(ps); 826 pconfig->hstate = size_to_hstate(ps);
825 if (!pconfig->hstate) { 827 if (!pconfig->hstate) {
826 printk(KERN_ERR 828 pr_err("Unsupported page size %lu MB\n",
827 "hugetlbfs: Unsupported page size %lu MB\n",
828 ps >> 20); 829 ps >> 20);
829 return -EINVAL; 830 return -EINVAL;
830 } 831 }
@@ -832,8 +833,7 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
832 } 833 }
833 834
834 default: 835 default:
835 printk(KERN_ERR "hugetlbfs: Bad mount option: \"%s\"\n", 836 pr_err("Bad mount option: \"%s\"\n", p);
836 p);
837 return -EINVAL; 837 return -EINVAL;
838 break; 838 break;
839 } 839 }
@@ -853,8 +853,7 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
853 return 0; 853 return 0;
854 854
855bad_val: 855bad_val:
856 printk(KERN_ERR "hugetlbfs: Bad value '%s' for mount option '%s'\n", 856 pr_err("Bad value '%s' for mount option '%s'\n", args[0].from, p);
857 args[0].from, p);
858 return -EINVAL; 857 return -EINVAL;
859} 858}
860 859
@@ -902,8 +901,7 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
902 goto out_free; 901 goto out_free;
903 return 0; 902 return 0;
904out_free: 903out_free:
905 if (sbinfo->spool) 904 kfree(sbinfo->spool);
906 kfree(sbinfo->spool);
907 kfree(sbinfo); 905 kfree(sbinfo);
908 return -ENOMEM; 906 return -ENOMEM;
909} 907}
@@ -939,7 +937,7 @@ static int get_hstate_idx(int page_size_log)
939 return h - hstates; 937 return h - hstates;
940} 938}
941 939
942static struct dentry_operations anon_ops = { 940static const struct dentry_operations anon_ops = {
943 .d_dname = simple_dname 941 .d_dname = simple_dname
944}; 942};
945 943
@@ -970,8 +968,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
970 *user = current_user(); 968 *user = current_user();
971 if (user_shm_lock(size, *user)) { 969 if (user_shm_lock(size, *user)) {
972 task_lock(current); 970 task_lock(current);
973 printk_once(KERN_WARNING 971 pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n",
974 "%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n",
975 current->comm, current->pid); 972 current->comm, current->pid);
976 task_unlock(current); 973 task_unlock(current);
977 } else { 974 } else {
@@ -1031,7 +1028,7 @@ static int __init init_hugetlbfs_fs(void)
1031 int i; 1028 int i;
1032 1029
1033 if (!hugepages_supported()) { 1030 if (!hugepages_supported()) {
1034 pr_info("hugetlbfs: disabling because there are no supported hugepage sizes\n"); 1031 pr_info("disabling because there are no supported hugepage sizes\n");
1035 return -ENOTSUPP; 1032 return -ENOTSUPP;
1036 } 1033 }
1037 1034
@@ -1060,7 +1057,7 @@ static int __init init_hugetlbfs_fs(void)
1060 buf); 1057 buf);
1061 1058
1062 if (IS_ERR(hugetlbfs_vfsmount[i])) { 1059 if (IS_ERR(hugetlbfs_vfsmount[i])) {
1063 pr_err("hugetlb: Cannot mount internal hugetlbfs for " 1060 pr_err("Cannot mount internal hugetlbfs for "
1064 "page size %uK", ps_kb); 1061 "page size %uK", ps_kb);
1065 error = PTR_ERR(hugetlbfs_vfsmount[i]); 1062 error = PTR_ERR(hugetlbfs_vfsmount[i]);
1066 hugetlbfs_vfsmount[i] = NULL; 1063 hugetlbfs_vfsmount[i] = NULL;
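
Besides the pr_fmt()/printk conversion, the hugetlbfs hunks drop the NULL check before kfree(). A tiny sketch of why that is safe; struct myfs_sb_info is a stand-in, not the real hugetlbfs_sb_info:

#include <linux/slab.h>

struct myfs_sb_info {
	void *spool;
};

static void myfs_free_sbinfo(struct myfs_sb_info *sbinfo)
{
	kfree(sbinfo->spool);	/* kfree(NULL) is a no-op, no guard needed */
	kfree(sbinfo);
}
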
diff --git a/fs/libfs.c b/fs/libfs.c
index a1844244246f..88e3e00e2eca 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -3,6 +3,7 @@
3 * Library for filesystems writers. 3 * Library for filesystems writers.
4 */ 4 */
5 5
6#include <linux/blkdev.h>
6#include <linux/export.h> 7#include <linux/export.h>
7#include <linux/pagemap.h> 8#include <linux/pagemap.h>
8#include <linux/slab.h> 9#include <linux/slab.h>
@@ -923,16 +924,19 @@ struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid,
923EXPORT_SYMBOL_GPL(generic_fh_to_parent); 924EXPORT_SYMBOL_GPL(generic_fh_to_parent);
924 925
925/** 926/**
926 * generic_file_fsync - generic fsync implementation for simple filesystems 927 * __generic_file_fsync - generic fsync implementation for simple filesystems
928 *
927 * @file: file to synchronize 929 * @file: file to synchronize
930 * @start: start offset in bytes
931 * @end: end offset in bytes (inclusive)
928 * @datasync: only synchronize essential metadata if true 932 * @datasync: only synchronize essential metadata if true
929 * 933 *
930 * This is a generic implementation of the fsync method for simple 934 * This is a generic implementation of the fsync method for simple
931 * filesystems which track all non-inode metadata in the buffers list 935 * filesystems which track all non-inode metadata in the buffers list
932 * hanging off the address_space structure. 936 * hanging off the address_space structure.
933 */ 937 */
934int generic_file_fsync(struct file *file, loff_t start, loff_t end, 938int __generic_file_fsync(struct file *file, loff_t start, loff_t end,
935 int datasync) 939 int datasync)
936{ 940{
937 struct inode *inode = file->f_mapping->host; 941 struct inode *inode = file->f_mapping->host;
938 int err; 942 int err;
@@ -952,10 +956,34 @@ int generic_file_fsync(struct file *file, loff_t start, loff_t end,
952 err = sync_inode_metadata(inode, 1); 956 err = sync_inode_metadata(inode, 1);
953 if (ret == 0) 957 if (ret == 0)
954 ret = err; 958 ret = err;
959
955out: 960out:
956 mutex_unlock(&inode->i_mutex); 961 mutex_unlock(&inode->i_mutex);
957 return ret; 962 return ret;
958} 963}
964EXPORT_SYMBOL(__generic_file_fsync);
965
966/**
967 * generic_file_fsync - generic fsync implementation for simple filesystems
968 * with flush
969 * @file: file to synchronize
970 * @start: start offset in bytes
971 * @end: end offset in bytes (inclusive)
972 * @datasync: only synchronize essential metadata if true
973 *
974 */
975
976int generic_file_fsync(struct file *file, loff_t start, loff_t end,
977 int datasync)
978{
979 struct inode *inode = file->f_mapping->host;
980 int err;
981
982 err = __generic_file_fsync(file, start, end, datasync);
983 if (err)
984 return err;
985 return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
986}
959EXPORT_SYMBOL(generic_file_fsync); 987EXPORT_SYMBOL(generic_file_fsync);
960 988
961/** 989/**
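
The libfs change above splits the helper so callers can choose whether the device's volatile write cache gets flushed: __generic_file_fsync() syncs data and inode metadata only, while generic_file_fsync() now adds blkdev_issue_flush(). A hedged sketch of a filesystem fsync method built on that split; myfs_fsync() is a placeholder and simply mirrors what the new generic_file_fsync() does:

#include <linux/fs.h>
#include <linux/blkdev.h>

static int myfs_fsync(struct file *file, loff_t start, loff_t end,
		      int datasync)
{
	struct inode *inode = file->f_mapping->host;
	int err;

	/* write data and inode metadata, without the implicit cache flush */
	err = __generic_file_fsync(file, start, end, datasync);
	if (err)
		return err;

	/* then flush the device's volatile write cache explicitly */
	return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
}
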
diff --git a/fs/mpage.c b/fs/mpage.c
index 4979ffa60aaa..5f9ed622274f 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -48,23 +48,7 @@ static void mpage_end_io(struct bio *bio, int err)
48 48
49 bio_for_each_segment_all(bv, bio, i) { 49 bio_for_each_segment_all(bv, bio, i) {
50 struct page *page = bv->bv_page; 50 struct page *page = bv->bv_page;
51 51 page_endio(page, bio_data_dir(bio), err);
52 if (bio_data_dir(bio) == READ) {
53 if (!err) {
54 SetPageUptodate(page);
55 } else {
56 ClearPageUptodate(page);
57 SetPageError(page);
58 }
59 unlock_page(page);
60 } else { /* bio_data_dir(bio) == WRITE */
61 if (err) {
62 SetPageError(page);
63 if (page->mapping)
64 set_bit(AS_EIO, &page->mapping->flags);
65 }
66 end_page_writeback(page);
67 }
68 } 52 }
69 53
70 bio_put(bio); 54 bio_put(bio);
@@ -285,6 +269,11 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
285 269
286alloc_new: 270alloc_new:
287 if (bio == NULL) { 271 if (bio == NULL) {
272 if (first_hole == blocks_per_page) {
273 if (!bdev_read_page(bdev, blocks[0] << (blkbits - 9),
274 page))
275 goto out;
276 }
288 bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), 277 bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
289 min_t(int, nr_pages, bio_get_nr_vecs(bdev)), 278 min_t(int, nr_pages, bio_get_nr_vecs(bdev)),
290 GFP_KERNEL); 279 GFP_KERNEL);
@@ -439,6 +428,35 @@ struct mpage_data {
439 unsigned use_writepage; 428 unsigned use_writepage;
440}; 429};
441 430
431/*
432 * We have our BIO, so we can now mark the buffers clean. Make
433 * sure to only clean buffers which we know we'll be writing.
434 */
435static void clean_buffers(struct page *page, unsigned first_unmapped)
436{
437 unsigned buffer_counter = 0;
438 struct buffer_head *bh, *head;
439 if (!page_has_buffers(page))
440 return;
441 head = page_buffers(page);
442 bh = head;
443
444 do {
445 if (buffer_counter++ == first_unmapped)
446 break;
447 clear_buffer_dirty(bh);
448 bh = bh->b_this_page;
449 } while (bh != head);
450
451 /*
452 * we cannot drop the bh if the page is not uptodate or a concurrent
453 * readpage would fail to serialize with the bh and it would read from
454 * disk before we reach the platter.
455 */
456 if (buffer_heads_over_limit && PageUptodate(page))
457 try_to_free_buffers(page);
458}
459
442static int __mpage_writepage(struct page *page, struct writeback_control *wbc, 460static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
443 void *data) 461 void *data)
444{ 462{
@@ -574,6 +592,13 @@ page_is_mapped:
574 592
575alloc_new: 593alloc_new:
576 if (bio == NULL) { 594 if (bio == NULL) {
595 if (first_unmapped == blocks_per_page) {
596 if (!bdev_write_page(bdev, blocks[0] << (blkbits - 9),
597 page, wbc)) {
598 clean_buffers(page, first_unmapped);
599 goto out;
600 }
601 }
577 bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), 602 bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
578 bio_get_nr_vecs(bdev), GFP_NOFS|__GFP_HIGH); 603 bio_get_nr_vecs(bdev), GFP_NOFS|__GFP_HIGH);
579 if (bio == NULL) 604 if (bio == NULL)
@@ -591,30 +616,7 @@ alloc_new:
591 goto alloc_new; 616 goto alloc_new;
592 } 617 }
593 618
594 /* 619 clean_buffers(page, first_unmapped);
595 * OK, we have our BIO, so we can now mark the buffers clean. Make
596 * sure to only clean buffers which we know we'll be writing.
597 */
598 if (page_has_buffers(page)) {
599 struct buffer_head *head = page_buffers(page);
600 struct buffer_head *bh = head;
601 unsigned buffer_counter = 0;
602
603 do {
604 if (buffer_counter++ == first_unmapped)
605 break;
606 clear_buffer_dirty(bh);
607 bh = bh->b_this_page;
608 } while (bh != head);
609
610 /*
611 * we cannot drop the bh if the page is not uptodate
612 * or a concurrent readpage would fail to serialize with the bh
613 * and it would read from disk before we reach the platter.
614 */
615 if (buffer_heads_over_limit && PageUptodate(page))
616 try_to_free_buffers(page);
617 }
618 620
619 BUG_ON(PageWriteback(page)); 621 BUG_ON(PageWriteback(page));
620 set_page_writeback(page); 622 set_page_writeback(page);
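
The mpage hunks add a fast path: when a page maps to a single contiguous, hole-free run of blocks, bdev_read_page()/bdev_write_page() hand the whole page straight to drivers that support page-based I/O, and only on failure is a BIO built as before. A simplified sketch of the read side under that assumption; readpage_fast_path() and its parameters are illustrative, not the real do_mpage_readpage():

#include <linux/blkdev.h>

static int readpage_fast_path(struct block_device *bdev, sector_t sector,
			      struct page *page, unsigned int first_hole,
			      unsigned int blocks_per_page)
{
	/* only a fully mapped, hole-free page qualifies */
	if (first_hole == blocks_per_page &&
	    bdev_read_page(bdev, sector, page) == 0)
		return 0;	/* the driver handled the whole page */

	return -EAGAIN;		/* caller falls back to building a BIO */
}
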
diff --git a/fs/ncpfs/getopt.c b/fs/ncpfs/getopt.c
index 03ffde1f44d6..344889cd120e 100644
--- a/fs/ncpfs/getopt.c
+++ b/fs/ncpfs/getopt.c
@@ -53,15 +53,14 @@ int ncp_getopt(const char *caller, char **options, const struct ncp_option *opts
53 return -EINVAL; 53 return -EINVAL;
54 } 54 }
55 if (opts->has_arg & OPT_INT) { 55 if (opts->has_arg & OPT_INT) {
56 char* v; 56 int rc = kstrtoul(val, 0, value);
57 57
58 *value = simple_strtoul(val, &v, 0); 58 if (rc) {
59 if (!*v) { 59 pr_info("%s: invalid numeric value in %s=%s\n",
60 return opts->val; 60 caller, token, val);
61 return rc;
61 } 62 }
62 pr_info("%s: invalid numeric value in %s=%s\n", 63 return opts->val;
63 caller, token, val);
64 return -EDOM;
65 } 64 }
66 if (opts->has_arg & OPT_STRING) { 65 if (opts->has_arg & OPT_STRING) {
67 return opts->val; 66 return opts->val;
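
The getopt change above swaps simple_strtoul() for kstrtoul(), which actually reports bad input instead of silently stopping at the first non-digit. A minimal sketch of the parsing pattern; parse_numeric_option() is a placeholder:

#include <linux/kernel.h>

static int parse_numeric_option(const char *val, unsigned long *value)
{
	/* base 0 accepts decimal, octal (0...) and hex (0x...) */
	int rc = kstrtoul(val, 0, value);

	/* kstrtoul() rejects trailing junk (-EINVAL) and overflow (-ERANGE),
	 * both of which simple_strtoul() would silently swallow */
	return rc;
}
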
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 732648b270dc..3fdc8a3e1134 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -25,6 +25,19 @@
25#define FANOTIFY_DEFAULT_MAX_MARKS 8192 25#define FANOTIFY_DEFAULT_MAX_MARKS 8192
26#define FANOTIFY_DEFAULT_MAX_LISTENERS 128 26#define FANOTIFY_DEFAULT_MAX_LISTENERS 128
27 27
28/*
29 * All flags that may be specified in parameter event_f_flags of fanotify_init.
30 *
31 * Internal and external open flags are stored together in field f_flags of
32 * struct file. Only external open flags shall be allowed in event_f_flags.
33 * Internal flags like FMODE_NONOTIFY, FMODE_EXEC, FMODE_NOCMTIME shall be
34 * excluded.
35 */
36#define FANOTIFY_INIT_ALL_EVENT_F_BITS ( \
37 O_ACCMODE | O_APPEND | O_NONBLOCK | \
38 __O_SYNC | O_DSYNC | O_CLOEXEC | \
39 O_LARGEFILE | O_NOATIME )
40
28extern const struct fsnotify_ops fanotify_fsnotify_ops; 41extern const struct fsnotify_ops fanotify_fsnotify_ops;
29 42
30static struct kmem_cache *fanotify_mark_cache __read_mostly; 43static struct kmem_cache *fanotify_mark_cache __read_mostly;
@@ -669,6 +682,18 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
669 if (flags & ~FAN_ALL_INIT_FLAGS) 682 if (flags & ~FAN_ALL_INIT_FLAGS)
670 return -EINVAL; 683 return -EINVAL;
671 684
685 if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS)
686 return -EINVAL;
687
688 switch (event_f_flags & O_ACCMODE) {
689 case O_RDONLY:
690 case O_RDWR:
691 case O_WRONLY:
692 break;
693 default:
694 return -EINVAL;
695 }
696
672 user = get_current_user(); 697 user = get_current_user();
673 if (atomic_read(&user->fanotify_listeners) > FANOTIFY_DEFAULT_MAX_LISTENERS) { 698 if (atomic_read(&user->fanotify_listeners) > FANOTIFY_DEFAULT_MAX_LISTENERS) {
674 free_uid(user); 699 free_uid(user);
@@ -776,7 +801,10 @@ SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags,
776 case FAN_MARK_REMOVE: 801 case FAN_MARK_REMOVE:
777 if (!mask) 802 if (!mask)
778 return -EINVAL; 803 return -EINVAL;
804 break;
779 case FAN_MARK_FLUSH: 805 case FAN_MARK_FLUSH:
806 if (flags & ~(FAN_MARK_MOUNT | FAN_MARK_FLUSH))
807 return -EINVAL;
780 break; 808 break;
781 default: 809 default:
782 return -EINVAL; 810 return -EINVAL;
@@ -813,6 +841,15 @@ SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags,
813 group->priority == FS_PRIO_0) 841 group->priority == FS_PRIO_0)
814 goto fput_and_out; 842 goto fput_and_out;
815 843
844 if (flags & FAN_MARK_FLUSH) {
845 ret = 0;
846 if (flags & FAN_MARK_MOUNT)
847 fsnotify_clear_vfsmount_marks_by_group(group);
848 else
849 fsnotify_clear_inode_marks_by_group(group);
850 goto fput_and_out;
851 }
852
816 ret = fanotify_find_path(dfd, pathname, &path, flags); 853 ret = fanotify_find_path(dfd, pathname, &path, flags);
817 if (ret) 854 if (ret)
818 goto fput_and_out; 855 goto fput_and_out;
@@ -824,7 +861,7 @@ SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags,
824 mnt = path.mnt; 861 mnt = path.mnt;
825 862
826 /* create/update an inode mark */ 863 /* create/update an inode mark */
827 switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) { 864 switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE)) {
828 case FAN_MARK_ADD: 865 case FAN_MARK_ADD:
829 if (flags & FAN_MARK_MOUNT) 866 if (flags & FAN_MARK_MOUNT)
830 ret = fanotify_add_vfsmount_mark(group, mnt, mask, flags); 867 ret = fanotify_add_vfsmount_mark(group, mnt, mask, flags);
@@ -837,12 +874,6 @@ SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags,
837 else 874 else
838 ret = fanotify_remove_inode_mark(group, inode, mask, flags); 875 ret = fanotify_remove_inode_mark(group, inode, mask, flags);
839 break; 876 break;
840 case FAN_MARK_FLUSH:
841 if (flags & FAN_MARK_MOUNT)
842 fsnotify_clear_vfsmount_marks_by_group(group);
843 else
844 fsnotify_clear_inode_marks_by_group(group);
845 break;
846 default: 877 default:
847 ret = -EINVAL; 878 ret = -EINVAL;
848 } 879 }
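
The fanotify_user changes above tighten ABI validation: fanotify_init() now rejects event_f_flags outside the allowed open-flag set or with an invalid access mode, and FAN_MARK_FLUSH is handled before any path lookup is attempted. A userspace-side sketch of what a caller sees, assuming a kernel with this validation; the call still needs CAP_SYS_ADMIN to succeed:

#include <fcntl.h>
#include <stdio.h>
#include <sys/fanotify.h>

int main(void)
{
	/* O_RDONLY | O_CLOEXEC is within the permitted set; internal flags
	 * or a bogus access mode in event_f_flags now fail with EINVAL */
	int fd = fanotify_init(FAN_CLASS_NOTIF, O_RDONLY | O_CLOEXEC);

	if (fd < 0)
		perror("fanotify_init");
	return 0;
}
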
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 923fe4a5f503..d90deaa08e78 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -340,7 +340,7 @@ void fsnotify_init_mark(struct fsnotify_mark *mark,
340static int fsnotify_mark_destroy(void *ignored) 340static int fsnotify_mark_destroy(void *ignored)
341{ 341{
342 struct fsnotify_mark *mark, *next; 342 struct fsnotify_mark *mark, *next;
343 LIST_HEAD(private_destroy_list); 343 struct list_head private_destroy_list;
344 344
345 for (;;) { 345 for (;;) {
346 spin_lock(&destroy_lock); 346 spin_lock(&destroy_lock);
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index a27e3fecefaf..250ed5b20c8f 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -1748,7 +1748,6 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size)
1748 if (page) { 1748 if (page) {
1749 set_page_dirty(page); 1749 set_page_dirty(page);
1750 unlock_page(page); 1750 unlock_page(page);
1751 mark_page_accessed(page);
1752 page_cache_release(page); 1751 page_cache_release(page);
1753 } 1752 }
1754 ntfs_debug("Done."); 1753 ntfs_debug("Done.");
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
index ee4144ce5d7c..f82498c35e78 100644
--- a/fs/ntfs/compress.c
+++ b/fs/ntfs/compress.c
@@ -58,7 +58,7 @@ typedef enum {
58/** 58/**
59 * ntfs_compression_buffer - one buffer for the decompression engine 59 * ntfs_compression_buffer - one buffer for the decompression engine
60 */ 60 */
61static u8 *ntfs_compression_buffer = NULL; 61static u8 *ntfs_compression_buffer;
62 62
63/** 63/**
64 * ntfs_cb_lock - spinlock which protects ntfs_compression_buffer 64 * ntfs_cb_lock - spinlock which protects ntfs_compression_buffer
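
The one-line change above (and the similar ones in ntfs/super.c, ntfs/sysctl.c and the ocfs2 dlm files further down) drops explicit zero/NULL initializers from static variables. A brief illustration with placeholder names:

/* statics are placed in .bss and zero-filled automatically, so an
 * explicit "= NULL" or "= 0" is redundant and checkpatch warns about it */
static u8 *example_buffer;		/* implicitly NULL */
static unsigned long example_users;	/* implicitly 0 */
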
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index db9bd8a31725..86ddab916b66 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -2060,7 +2060,6 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
2060 } 2060 }
2061 do { 2061 do {
2062 unlock_page(pages[--do_pages]); 2062 unlock_page(pages[--do_pages]);
2063 mark_page_accessed(pages[do_pages]);
2064 page_cache_release(pages[do_pages]); 2063 page_cache_release(pages[do_pages]);
2065 } while (do_pages); 2064 } while (do_pages);
2066 if (unlikely(status)) 2065 if (unlikely(status))
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 9de2491f2926..6c3296e546c3 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -50,8 +50,8 @@
50static unsigned long ntfs_nr_compression_users; 50static unsigned long ntfs_nr_compression_users;
51 51
52/* A global default upcase table and a corresponding reference count. */ 52/* A global default upcase table and a corresponding reference count. */
53static ntfschar *default_upcase = NULL; 53static ntfschar *default_upcase;
54static unsigned long ntfs_nr_upcase_users = 0; 54static unsigned long ntfs_nr_upcase_users;
55 55
56/* Error constants/strings used in inode.c::ntfs_show_options(). */ 56/* Error constants/strings used in inode.c::ntfs_show_options(). */
57typedef enum { 57typedef enum {
diff --git a/fs/ntfs/sysctl.c b/fs/ntfs/sysctl.c
index 79a89184cb5e..1927170a35ce 100644
--- a/fs/ntfs/sysctl.c
+++ b/fs/ntfs/sysctl.c
@@ -56,7 +56,7 @@ static ctl_table sysctls_root[] = {
56}; 56};
57 57
58/* Storage for the sysctls header. */ 58/* Storage for the sysctls header. */
59static struct ctl_table_header *sysctls_root_table = NULL; 59static struct ctl_table_header *sysctls_root_table;
60 60
61/** 61/**
62 * ntfs_sysctl - add or remove the debug sysctl 62 * ntfs_sysctl - add or remove the debug sysctl
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index b4deb5f750d9..9d8fcf2f3b94 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -6046,7 +6046,8 @@ static void ocfs2_truncate_log_worker(struct work_struct *work)
6046void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb, 6046void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
6047 int cancel) 6047 int cancel)
6048{ 6048{
6049 if (osb->osb_tl_inode) { 6049 if (osb->osb_tl_inode &&
6050 atomic_read(&osb->osb_tl_disable) == 0) {
6050 /* We want to push off log flushes while truncates are 6051 /* We want to push off log flushes while truncates are
6051 * still running. */ 6052 * still running. */
6052 if (cancel) 6053 if (cancel)
@@ -6223,6 +6224,8 @@ void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
6223 int status; 6224 int status;
6224 struct inode *tl_inode = osb->osb_tl_inode; 6225 struct inode *tl_inode = osb->osb_tl_inode;
6225 6226
6227 atomic_set(&osb->osb_tl_disable, 1);
6228
6226 if (tl_inode) { 6229 if (tl_inode) {
6227 cancel_delayed_work(&osb->osb_truncate_log_wq); 6230 cancel_delayed_work(&osb->osb_truncate_log_wq);
6228 flush_workqueue(ocfs2_wq); 6231 flush_workqueue(ocfs2_wq);
@@ -6254,6 +6257,7 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
6254 * until we're sure all is well. */ 6257 * until we're sure all is well. */
6255 INIT_DELAYED_WORK(&osb->osb_truncate_log_wq, 6258 INIT_DELAYED_WORK(&osb->osb_truncate_log_wq,
6256 ocfs2_truncate_log_worker); 6259 ocfs2_truncate_log_worker);
6260 atomic_set(&osb->osb_tl_disable, 0);
6257 osb->osb_tl_bh = tl_bh; 6261 osb->osb_tl_bh = tl_bh;
6258 osb->osb_tl_inode = tl_inode; 6262 osb->osb_tl_inode = tl_inode;
6259 6263
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index c6b90e670389..a68e07a9bd46 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -108,7 +108,7 @@ static struct rb_root o2net_handler_tree = RB_ROOT;
108static struct o2net_node o2net_nodes[O2NM_MAX_NODES]; 108static struct o2net_node o2net_nodes[O2NM_MAX_NODES];
109 109
110/* XXX someday we'll need better accounting */ 110/* XXX someday we'll need better accounting */
111static struct socket *o2net_listen_sock = NULL; 111static struct socket *o2net_listen_sock;
112 112
113/* 113/*
114 * listen work is only queued by the listening socket callbacks on the 114 * listen work is only queued by the listening socket callbacks on the
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index e0517762fcc0..a106b3f2b22a 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -108,7 +108,6 @@ static inline int dlm_is_recovery_lock(const char *lock_name, int name_len)
108struct dlm_recovery_ctxt 108struct dlm_recovery_ctxt
109{ 109{
110 struct list_head resources; 110 struct list_head resources;
111 struct list_head received;
112 struct list_head node_data; 111 struct list_head node_data;
113 u8 new_master; 112 u8 new_master;
114 u8 dead_node; 113 u8 dead_node;
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index e33cd7a3c582..18f13c2e4a10 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -338,7 +338,7 @@ void dlm_print_one_mle(struct dlm_master_list_entry *mle)
338 338
339#ifdef CONFIG_DEBUG_FS 339#ifdef CONFIG_DEBUG_FS
340 340
341static struct dentry *dlm_debugfs_root = NULL; 341static struct dentry *dlm_debugfs_root;
342 342
343#define DLM_DEBUGFS_DIR "o2dlm" 343#define DLM_DEBUGFS_DIR "o2dlm"
344#define DLM_DEBUGFS_DLM_STATE "dlm_state" 344#define DLM_DEBUGFS_DLM_STATE "dlm_state"
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index c973690dc0bc..39efc5057a36 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -959,6 +959,14 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
959 * domain. Set him in the map and clean up our 959 * domain. Set him in the map and clean up our
960 * leftover join state. */ 960 * leftover join state. */
961 BUG_ON(dlm->joining_node != assert->node_idx); 961 BUG_ON(dlm->joining_node != assert->node_idx);
962
963 if (dlm->reco.state & DLM_RECO_STATE_ACTIVE) {
964 mlog(0, "dlm recovery is ongoing, disallow join\n");
965 spin_unlock(&dlm->spinlock);
966 spin_unlock(&dlm_domain_lock);
967 return -EAGAIN;
968 }
969
962 set_bit(assert->node_idx, dlm->domain_map); 970 set_bit(assert->node_idx, dlm->domain_map);
963 clear_bit(assert->node_idx, dlm->exit_domain_map); 971 clear_bit(assert->node_idx, dlm->exit_domain_map);
964 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); 972 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
@@ -1517,6 +1525,7 @@ static int dlm_send_one_join_assert(struct dlm_ctxt *dlm,
1517 unsigned int node) 1525 unsigned int node)
1518{ 1526{
1519 int status; 1527 int status;
1528 int ret;
1520 struct dlm_assert_joined assert_msg; 1529 struct dlm_assert_joined assert_msg;
1521 1530
1522 mlog(0, "Sending join assert to node %u\n", node); 1531 mlog(0, "Sending join assert to node %u\n", node);
@@ -1528,11 +1537,13 @@ static int dlm_send_one_join_assert(struct dlm_ctxt *dlm,
1528 1537
1529 status = o2net_send_message(DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY, 1538 status = o2net_send_message(DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY,
1530 &assert_msg, sizeof(assert_msg), node, 1539 &assert_msg, sizeof(assert_msg), node,
1531 NULL); 1540 &ret);
1532 if (status < 0) 1541 if (status < 0)
1533 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " 1542 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
1534 "node %u\n", status, DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY, 1543 "node %u\n", status, DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY,
1535 node); 1544 node);
1545 else
1546 status = ret;
1536 1547
1537 return status; 1548 return status;
1538} 1549}
@@ -2023,7 +2034,6 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
2023 INIT_LIST_HEAD(&dlm->list); 2034 INIT_LIST_HEAD(&dlm->list);
2024 INIT_LIST_HEAD(&dlm->dirty_list); 2035 INIT_LIST_HEAD(&dlm->dirty_list);
2025 INIT_LIST_HEAD(&dlm->reco.resources); 2036 INIT_LIST_HEAD(&dlm->reco.resources);
2026 INIT_LIST_HEAD(&dlm->reco.received);
2027 INIT_LIST_HEAD(&dlm->reco.node_data); 2037 INIT_LIST_HEAD(&dlm->reco.node_data);
2028 INIT_LIST_HEAD(&dlm->purge_list); 2038 INIT_LIST_HEAD(&dlm->purge_list);
2029 INIT_LIST_HEAD(&dlm->dlm_domain_handlers); 2039 INIT_LIST_HEAD(&dlm->dlm_domain_handlers);
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 5d32f7511f74..66c2a491f68d 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -52,7 +52,7 @@
52#define MLOG_MASK_PREFIX ML_DLM 52#define MLOG_MASK_PREFIX ML_DLM
53#include "cluster/masklog.h" 53#include "cluster/masklog.h"
54 54
55static struct kmem_cache *dlm_lock_cache = NULL; 55static struct kmem_cache *dlm_lock_cache;
56 56
57static DEFINE_SPINLOCK(dlm_cookie_lock); 57static DEFINE_SPINLOCK(dlm_cookie_lock);
58static u64 dlm_next_cookie = 1; 58static u64 dlm_next_cookie = 1;
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index ee1f88419cb0..3087a21d32f9 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -82,9 +82,9 @@ static inline int dlm_mle_equal(struct dlm_ctxt *dlm,
82 return 1; 82 return 1;
83} 83}
84 84
85static struct kmem_cache *dlm_lockres_cache = NULL; 85static struct kmem_cache *dlm_lockres_cache;
86static struct kmem_cache *dlm_lockname_cache = NULL; 86static struct kmem_cache *dlm_lockname_cache;
87static struct kmem_cache *dlm_mle_cache = NULL; 87static struct kmem_cache *dlm_mle_cache;
88 88
89static void dlm_mle_release(struct kref *kref); 89static void dlm_mle_release(struct kref *kref);
90static void dlm_init_mle(struct dlm_master_list_entry *mle, 90static void dlm_init_mle(struct dlm_master_list_entry *mle,
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index fe29f7978f81..5de019437ea5 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1986,7 +1986,15 @@ skip_lvb:
1986 } 1986 }
1987 if (!bad) { 1987 if (!bad) {
1988 dlm_lock_get(newlock); 1988 dlm_lock_get(newlock);
1989 list_add_tail(&newlock->list, queue); 1989 if (mres->flags & DLM_MRES_RECOVERY &&
1990 ml->list == DLM_CONVERTING_LIST &&
1991 newlock->ml.type >
1992 newlock->ml.convert_type) {
1993 /* newlock is doing downconvert, add it to the
1994 * head of converting list */
1995 list_add(&newlock->list, queue);
1996 } else
1997 list_add_tail(&newlock->list, queue);
1990 mlog(0, "%s:%.*s: added lock for node %u, " 1998 mlog(0, "%s:%.*s: added lock for node %u, "
1991 "setting refmap bit\n", dlm->name, 1999 "setting refmap bit\n", dlm->name,
1992 res->lockname.len, res->lockname.name, ml->node); 2000 res->lockname.len, res->lockname.name, ml->node);
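
The dlmrecovery hunk above changes where a migrated lock is queued during recovery: a lock that is downconverting (its granted level compares greater than its convert target, per the comment in the hunk) is placed at the head of the converting list so it is processed first. A simplified sketch of that decision; struct sketch_lock and queue_converting() are illustrative stand-ins for the dlm structures:

#include <linux/list.h>

struct sketch_lock {
	struct list_head list;
	int type;		/* currently granted level */
	int convert_type;	/* level being converted to */
};

static void queue_converting(struct sketch_lock *lock,
			     struct list_head *converting)
{
	if (lock->type > lock->convert_type)
		/* downconvert: put it at the head so it is granted first */
		list_add(&lock->list, converting);
	else
		list_add_tail(&lock->list, converting);
}
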
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 6bd690b5a061..52cfe99ae056 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2544,11 +2544,6 @@ int ocfs2_super_lock(struct ocfs2_super *osb,
2544 * refreshed, so we do it here. Of course, making sense of 2544 * refreshed, so we do it here. Of course, making sense of
2545 * everything is up to the caller :) */ 2545 * everything is up to the caller :) */
2546 status = ocfs2_should_refresh_lock_res(lockres); 2546 status = ocfs2_should_refresh_lock_res(lockres);
2547 if (status < 0) {
2548 ocfs2_cluster_unlock(osb, lockres, level);
2549 mlog_errno(status);
2550 goto bail;
2551 }
2552 if (status) { 2547 if (status) {
2553 status = ocfs2_refresh_slot_info(osb); 2548 status = ocfs2_refresh_slot_info(osb);
2554 2549
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 8970dcf74de5..8eb6e5732d3b 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -828,7 +828,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
828 /* 828 /*
829 * fs-writeback will release the dirty pages without page lock 829 * fs-writeback will release the dirty pages without page lock
830 * whose offset are over inode size, the release happens at 830 * whose offset are over inode size, the release happens at
831 * block_write_full_page_endio(). 831 * block_write_full_page().
832 */ 832 */
833 i_size_write(inode, abs_to); 833 i_size_write(inode, abs_to);
834 inode->i_blocks = ocfs2_inode_sector_count(inode); 834 inode->i_blocks = ocfs2_inode_sector_count(inode);
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 490229f43731..6f66b3751ace 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -143,8 +143,8 @@ bail:
143 return status; 143 return status;
144} 144}
145 145
146int ocfs2_info_handle_blocksize(struct inode *inode, 146static int ocfs2_info_handle_blocksize(struct inode *inode,
147 struct ocfs2_info_request __user *req) 147 struct ocfs2_info_request __user *req)
148{ 148{
149 int status = -EFAULT; 149 int status = -EFAULT;
150 struct ocfs2_info_blocksize oib; 150 struct ocfs2_info_blocksize oib;
@@ -167,8 +167,8 @@ bail:
167 return status; 167 return status;
168} 168}
169 169
170int ocfs2_info_handle_clustersize(struct inode *inode, 170static int ocfs2_info_handle_clustersize(struct inode *inode,
171 struct ocfs2_info_request __user *req) 171 struct ocfs2_info_request __user *req)
172{ 172{
173 int status = -EFAULT; 173 int status = -EFAULT;
174 struct ocfs2_info_clustersize oic; 174 struct ocfs2_info_clustersize oic;
@@ -192,8 +192,8 @@ bail:
192 return status; 192 return status;
193} 193}
194 194
195int ocfs2_info_handle_maxslots(struct inode *inode, 195static int ocfs2_info_handle_maxslots(struct inode *inode,
196 struct ocfs2_info_request __user *req) 196 struct ocfs2_info_request __user *req)
197{ 197{
198 int status = -EFAULT; 198 int status = -EFAULT;
199 struct ocfs2_info_maxslots oim; 199 struct ocfs2_info_maxslots oim;
@@ -217,8 +217,8 @@ bail:
217 return status; 217 return status;
218} 218}
219 219
220int ocfs2_info_handle_label(struct inode *inode, 220static int ocfs2_info_handle_label(struct inode *inode,
221 struct ocfs2_info_request __user *req) 221 struct ocfs2_info_request __user *req)
222{ 222{
223 int status = -EFAULT; 223 int status = -EFAULT;
224 struct ocfs2_info_label oil; 224 struct ocfs2_info_label oil;
@@ -242,8 +242,8 @@ bail:
242 return status; 242 return status;
243} 243}
244 244
245int ocfs2_info_handle_uuid(struct inode *inode, 245static int ocfs2_info_handle_uuid(struct inode *inode,
246 struct ocfs2_info_request __user *req) 246 struct ocfs2_info_request __user *req)
247{ 247{
248 int status = -EFAULT; 248 int status = -EFAULT;
249 struct ocfs2_info_uuid oiu; 249 struct ocfs2_info_uuid oiu;
@@ -267,8 +267,8 @@ bail:
267 return status; 267 return status;
268} 268}
269 269
270int ocfs2_info_handle_fs_features(struct inode *inode, 270static int ocfs2_info_handle_fs_features(struct inode *inode,
271 struct ocfs2_info_request __user *req) 271 struct ocfs2_info_request __user *req)
272{ 272{
273 int status = -EFAULT; 273 int status = -EFAULT;
274 struct ocfs2_info_fs_features oif; 274 struct ocfs2_info_fs_features oif;
@@ -294,8 +294,8 @@ bail:
294 return status; 294 return status;
295} 295}
296 296
297int ocfs2_info_handle_journal_size(struct inode *inode, 297static int ocfs2_info_handle_journal_size(struct inode *inode,
298 struct ocfs2_info_request __user *req) 298 struct ocfs2_info_request __user *req)
299{ 299{
300 int status = -EFAULT; 300 int status = -EFAULT;
301 struct ocfs2_info_journal_size oij; 301 struct ocfs2_info_journal_size oij;
@@ -319,9 +319,10 @@ bail:
319 return status; 319 return status;
320} 320}
321 321
322int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb, 322static int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb,
323 struct inode *inode_alloc, u64 blkno, 323 struct inode *inode_alloc, u64 blkno,
324 struct ocfs2_info_freeinode *fi, u32 slot) 324 struct ocfs2_info_freeinode *fi,
325 u32 slot)
325{ 326{
326 int status = 0, unlock = 0; 327 int status = 0, unlock = 0;
327 328
@@ -366,8 +367,8 @@ bail:
366 return status; 367 return status;
367} 368}
368 369
369int ocfs2_info_handle_freeinode(struct inode *inode, 370static int ocfs2_info_handle_freeinode(struct inode *inode,
370 struct ocfs2_info_request __user *req) 371 struct ocfs2_info_request __user *req)
371{ 372{
372 u32 i; 373 u32 i;
373 u64 blkno = -1; 374 u64 blkno = -1;
@@ -462,19 +463,19 @@ static void o2ffg_update_stats(struct ocfs2_info_freefrag_stats *stats,
462 stats->ffs_free_chunks_real++; 463 stats->ffs_free_chunks_real++;
463} 464}
464 465
465void ocfs2_info_update_ffg(struct ocfs2_info_freefrag *ffg, 466static void ocfs2_info_update_ffg(struct ocfs2_info_freefrag *ffg,
466 unsigned int chunksize) 467 unsigned int chunksize)
467{ 468{
468 o2ffg_update_histogram(&(ffg->iff_ffs.ffs_fc_hist), chunksize); 469 o2ffg_update_histogram(&(ffg->iff_ffs.ffs_fc_hist), chunksize);
469 o2ffg_update_stats(&(ffg->iff_ffs), chunksize); 470 o2ffg_update_stats(&(ffg->iff_ffs), chunksize);
470} 471}
471 472
472int ocfs2_info_freefrag_scan_chain(struct ocfs2_super *osb, 473static int ocfs2_info_freefrag_scan_chain(struct ocfs2_super *osb,
473 struct inode *gb_inode, 474 struct inode *gb_inode,
474 struct ocfs2_dinode *gb_dinode, 475 struct ocfs2_dinode *gb_dinode,
475 struct ocfs2_chain_rec *rec, 476 struct ocfs2_chain_rec *rec,
476 struct ocfs2_info_freefrag *ffg, 477 struct ocfs2_info_freefrag *ffg,
477 u32 chunks_in_group) 478 u32 chunks_in_group)
478{ 479{
479 int status = 0, used; 480 int status = 0, used;
480 u64 blkno; 481 u64 blkno;
@@ -572,9 +573,9 @@ bail:
572 return status; 573 return status;
573} 574}
574 575
575int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb, 576static int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb,
576 struct inode *gb_inode, u64 blkno, 577 struct inode *gb_inode, u64 blkno,
577 struct ocfs2_info_freefrag *ffg) 578 struct ocfs2_info_freefrag *ffg)
578{ 579{
579 u32 chunks_in_group; 580 u32 chunks_in_group;
580 int status = 0, unlock = 0, i; 581 int status = 0, unlock = 0, i;
@@ -652,8 +653,8 @@ bail:
652 return status; 653 return status;
653} 654}
654 655
655int ocfs2_info_handle_freefrag(struct inode *inode, 656static int ocfs2_info_handle_freefrag(struct inode *inode,
656 struct ocfs2_info_request __user *req) 657 struct ocfs2_info_request __user *req)
657{ 658{
658 u64 blkno = -1; 659 u64 blkno = -1;
659 char namebuf[40]; 660 char namebuf[40];
@@ -723,8 +724,8 @@ out_err:
723 return status; 724 return status;
724} 725}
725 726
726int ocfs2_info_handle_unknown(struct inode *inode, 727static int ocfs2_info_handle_unknown(struct inode *inode,
727 struct ocfs2_info_request __user *req) 728 struct ocfs2_info_request __user *req)
728{ 729{
729 int status = -EFAULT; 730 int status = -EFAULT;
730 struct ocfs2_info_request oir; 731 struct ocfs2_info_request oir;
@@ -752,8 +753,8 @@ bail:
752 * - distinguish different requests. 753 * - distinguish different requests.
753 * - validate size of different requests. 754 * - validate size of different requests.
754 */ 755 */
755int ocfs2_info_handle_request(struct inode *inode, 756static int ocfs2_info_handle_request(struct inode *inode,
756 struct ocfs2_info_request __user *req) 757 struct ocfs2_info_request __user *req)
757{ 758{
758 int status = -EFAULT; 759 int status = -EFAULT;
759 struct ocfs2_info_request oir; 760 struct ocfs2_info_request oir;
@@ -811,8 +812,8 @@ bail:
811 return status; 812 return status;
812} 813}
813 814
814int ocfs2_get_request_ptr(struct ocfs2_info *info, int idx, 815static int ocfs2_get_request_ptr(struct ocfs2_info *info, int idx,
815 u64 *req_addr, int compat_flag) 816 u64 *req_addr, int compat_flag)
816{ 817{
817 int status = -EFAULT; 818 int status = -EFAULT;
818 u64 __user *bp = NULL; 819 u64 __user *bp = NULL;
@@ -849,8 +850,8 @@ bail:
849 * a better backward&forward compatibility, since a small piece of 850 * a better backward&forward compatibility, since a small piece of
850 * request will be less likely to be broken if disk layout get changed. 851 * request will be less likely to be broken if disk layout get changed.
851 */ 852 */
852int ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info, 853static int ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info,
853 int compat_flag) 854 int compat_flag)
854{ 855{
855 int i, status = 0; 856 int i, status = 0;
856 u64 req_addr; 857 u64 req_addr;
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 03ea9314fecd..4b0c68849b36 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -30,6 +30,7 @@
30#include <linux/kthread.h> 30#include <linux/kthread.h>
31#include <linux/time.h> 31#include <linux/time.h>
32#include <linux/random.h> 32#include <linux/random.h>
33#include <linux/delay.h>
33 34
34#include <cluster/masklog.h> 35#include <cluster/masklog.h>
35 36
@@ -2185,8 +2186,20 @@ static int ocfs2_commit_thread(void *arg)
2185 || kthread_should_stop()); 2186 || kthread_should_stop());
2186 2187
2187 status = ocfs2_commit_cache(osb); 2188 status = ocfs2_commit_cache(osb);
2188 if (status < 0) 2189 if (status < 0) {
2189 mlog_errno(status); 2190 static unsigned long abort_warn_time;
2191
2192 /* Warn about this once per minute */
2193 if (printk_timed_ratelimit(&abort_warn_time, 60*HZ))
2194 mlog(ML_ERROR, "status = %d, journal is "
2195 "already aborted.\n", status);
2196 /*
2197 * After ocfs2_commit_cache() fails, j_num_trans has a
2198 * non-zero value. Sleep here to avoid a busy-wait
2199 * loop.
2200 */
2201 msleep_interruptible(1000);
2202 }
2190 2203
2191 if (kthread_should_stop() && atomic_read(&journal->j_num_trans)){ 2204 if (kthread_should_stop() && atomic_read(&journal->j_num_trans)){
2192 mlog(ML_KTHREAD, 2205 mlog(ML_KTHREAD,
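
A minimal sketch, not part of the patch, of the warn-once-per-interval plus back-off pattern the journal.c hunk above adds to the commit thread; example_warn_and_backoff() and its message are invented, while printk_timed_ratelimit() (interval in milliseconds) and msleep_interruptible() are the real helpers it builds on.

#include <linux/printk.h>
#include <linux/delay.h>
#include <linux/jiffies.h>

/* Warn at most once per minute, then back off instead of spinning. */
static void example_warn_and_backoff(int status)
{
	static unsigned long last_warn;		/* jiffies of the last warning */

	if (printk_timed_ratelimit(&last_warn, 60 * 1000 /* msec */))
		pr_err("commit failed: status = %d\n", status);

	msleep_interruptible(1000);		/* avoid a busy-wait loop */
}
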
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 8d64a97a9d5e..bbec539230fd 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -422,6 +422,7 @@ struct ocfs2_super
422 struct inode *osb_tl_inode; 422 struct inode *osb_tl_inode;
423 struct buffer_head *osb_tl_bh; 423 struct buffer_head *osb_tl_bh;
424 struct delayed_work osb_truncate_log_wq; 424 struct delayed_work osb_truncate_log_wq;
425 atomic_t osb_tl_disable;
425 /* 426 /*
426 * How many clusters in our truncate log. 427 * How many clusters in our truncate log.
427 * It must be protected by osb_tl_inode->i_mutex. 428 * It must be protected by osb_tl_inode->i_mutex.
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 6ba4bcbc4796..714e53b9cc66 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -1408,10 +1408,9 @@ static void swap_refcount_rec(void *a, void *b, int size)
1408{ 1408{
1409 struct ocfs2_refcount_rec *l = a, *r = b, tmp; 1409 struct ocfs2_refcount_rec *l = a, *r = b, tmp;
1410 1410
1411 tmp = *(struct ocfs2_refcount_rec *)l; 1411 tmp = *l;
1412 *(struct ocfs2_refcount_rec *)l = 1412 *l = *r;
1413 *(struct ocfs2_refcount_rec *)r; 1413 *r = tmp;
1414 *(struct ocfs2_refcount_rec *)r = tmp;
1415} 1414}
1416 1415
1417/* 1416/*
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index 822ebc10f281..d5da6f624142 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -53,8 +53,6 @@
53 */ 53 */
54static u16 ocfs2_calc_new_backup_super(struct inode *inode, 54static u16 ocfs2_calc_new_backup_super(struct inode *inode,
55 struct ocfs2_group_desc *gd, 55 struct ocfs2_group_desc *gd,
56 int new_clusters,
57 u32 first_new_cluster,
58 u16 cl_cpg, 56 u16 cl_cpg,
59 int set) 57 int set)
60{ 58{
@@ -127,8 +125,6 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
127 OCFS2_FEATURE_COMPAT_BACKUP_SB)) { 125 OCFS2_FEATURE_COMPAT_BACKUP_SB)) {
128 backups = ocfs2_calc_new_backup_super(bm_inode, 126 backups = ocfs2_calc_new_backup_super(bm_inode,
129 group, 127 group,
130 new_clusters,
131 first_new_cluster,
132 cl_cpg, 1); 128 cl_cpg, 1);
133 le16_add_cpu(&group->bg_free_bits_count, -1 * backups); 129 le16_add_cpu(&group->bg_free_bits_count, -1 * backups);
134 } 130 }
@@ -157,7 +153,7 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
157 153
158 spin_lock(&OCFS2_I(bm_inode)->ip_lock); 154 spin_lock(&OCFS2_I(bm_inode)->ip_lock);
159 OCFS2_I(bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); 155 OCFS2_I(bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
160 le64_add_cpu(&fe->i_size, new_clusters << osb->s_clustersize_bits); 156 le64_add_cpu(&fe->i_size, (u64)new_clusters << osb->s_clustersize_bits);
161 spin_unlock(&OCFS2_I(bm_inode)->ip_lock); 157 spin_unlock(&OCFS2_I(bm_inode)->ip_lock);
162 i_size_write(bm_inode, le64_to_cpu(fe->i_size)); 158 i_size_write(bm_inode, le64_to_cpu(fe->i_size));
163 159
@@ -167,8 +163,6 @@ out_rollback:
167 if (ret < 0) { 163 if (ret < 0) {
168 ocfs2_calc_new_backup_super(bm_inode, 164 ocfs2_calc_new_backup_super(bm_inode,
169 group, 165 group,
170 new_clusters,
171 first_new_cluster,
172 cl_cpg, 0); 166 cl_cpg, 0);
173 le16_add_cpu(&group->bg_free_bits_count, backups); 167 le16_add_cpu(&group->bg_free_bits_count, backups);
174 le16_add_cpu(&group->bg_bits, -1 * num_bits); 168 le16_add_cpu(&group->bg_bits, -1 * num_bits);
@@ -569,7 +563,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
569 563
570 spin_lock(&OCFS2_I(main_bm_inode)->ip_lock); 564 spin_lock(&OCFS2_I(main_bm_inode)->ip_lock);
571 OCFS2_I(main_bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); 565 OCFS2_I(main_bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
572 le64_add_cpu(&fe->i_size, input->clusters << osb->s_clustersize_bits); 566 le64_add_cpu(&fe->i_size, (u64)input->clusters << osb->s_clustersize_bits);
573 spin_unlock(&OCFS2_I(main_bm_inode)->ip_lock); 567 spin_unlock(&OCFS2_I(main_bm_inode)->ip_lock);
574 i_size_write(main_bm_inode, le64_to_cpu(fe->i_size)); 568 i_size_write(main_bm_inode, le64_to_cpu(fe->i_size));
575 569
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 83f1a665ae97..5d965e83bd43 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -709,7 +709,7 @@ static struct ctl_table ocfs2_root_table[] = {
709 { } 709 { }
710}; 710};
711 711
712static struct ctl_table_header *ocfs2_table_header = NULL; 712static struct ctl_table_header *ocfs2_table_header;
713 713
714 714
715/* 715/*
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index a7cdd56f4c79..c7a89cea5c5d 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -75,7 +75,7 @@
75 75
76#include "buffer_head_io.h" 76#include "buffer_head_io.h"
77 77
78static struct kmem_cache *ocfs2_inode_cachep = NULL; 78static struct kmem_cache *ocfs2_inode_cachep;
79struct kmem_cache *ocfs2_dquot_cachep; 79struct kmem_cache *ocfs2_dquot_cachep;
80struct kmem_cache *ocfs2_qf_chunk_cachep; 80struct kmem_cache *ocfs2_qf_chunk_cachep;
81 81
@@ -85,7 +85,7 @@ struct kmem_cache *ocfs2_qf_chunk_cachep;
85 * workqueue and schedule on our own. */ 85 * workqueue and schedule on our own. */
86struct workqueue_struct *ocfs2_wq = NULL; 86struct workqueue_struct *ocfs2_wq = NULL;
87 87
88static struct dentry *ocfs2_debugfs_root = NULL; 88static struct dentry *ocfs2_debugfs_root;
89 89
90MODULE_AUTHOR("Oracle"); 90MODULE_AUTHOR("Oracle");
91MODULE_LICENSE("GPL"); 91MODULE_LICENSE("GPL");
@@ -2292,8 +2292,8 @@ static int ocfs2_initialize_super(struct super_block *sb,
2292 goto bail; 2292 goto bail;
2293 } 2293 }
2294 2294
2295 strncpy(osb->vol_label, di->id2.i_super.s_label, 63); 2295 strlcpy(osb->vol_label, di->id2.i_super.s_label,
2296 osb->vol_label[63] = '\0'; 2296 OCFS2_MAX_VOL_LABEL_LEN);
2297 osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno); 2297 osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno);
2298 osb->system_dir_blkno = le64_to_cpu(di->id2.i_super.s_system_dir_blkno); 2298 osb->system_dir_blkno = le64_to_cpu(di->id2.i_super.s_system_dir_blkno);
2299 osb->first_cluster_group_blkno = 2299 osb->first_cluster_group_blkno =
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c
index 52eaf33d346f..82e17b076ce7 100644
--- a/fs/ocfs2/uptodate.c
+++ b/fs/ocfs2/uptodate.c
@@ -67,7 +67,7 @@ struct ocfs2_meta_cache_item {
67 sector_t c_block; 67 sector_t c_block;
68}; 68};
69 69
70static struct kmem_cache *ocfs2_uptodate_cachep = NULL; 70static struct kmem_cache *ocfs2_uptodate_cachep;
71 71
72u64 ocfs2_metadata_cache_owner(struct ocfs2_caching_info *ci) 72u64 ocfs2_metadata_cache_owner(struct ocfs2_caching_info *ci)
73{ 73{
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 442177b1119a..2101ce46a5d2 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -737,9 +737,6 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
737 ptent = pte_file_clear_soft_dirty(ptent); 737 ptent = pte_file_clear_soft_dirty(ptent);
738 } 738 }
739 739
740 if (vma->vm_flags & VM_SOFTDIRTY)
741 vma->vm_flags &= ~VM_SOFTDIRTY;
742
743 set_pte_at(vma->vm_mm, addr, pte, ptent); 740 set_pte_at(vma->vm_mm, addr, pte, ptent);
744#endif 741#endif
745} 742}
@@ -807,8 +804,9 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
807 804
808 if (type == CLEAR_REFS_SOFT_DIRTY) { 805 if (type == CLEAR_REFS_SOFT_DIRTY) {
809 soft_dirty_cleared = true; 806 soft_dirty_cleared = true;
810 pr_warn_once("The pagemap bits 55-60 has changed their meaning! " 807 pr_warn_once("The pagemap bits 55-60 has changed their meaning!"
811 "See the linux/Documentation/vm/pagemap.txt for details.\n"); 808 " See the linux/Documentation/vm/pagemap.txt for "
809 "details.\n");
812 } 810 }
813 811
814 task = get_proc_task(file_inode(file)); 812 task = get_proc_task(file_inode(file));
@@ -839,11 +837,17 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
839 * 837 *
840 * Writing 3 to /proc/pid/clear_refs only affects file 838 * Writing 3 to /proc/pid/clear_refs only affects file
841 * mapped pages. 839 * mapped pages.
840 *
841 * Writing 4 to /proc/pid/clear_refs affects all pages.
842 */ 842 */
843 if (type == CLEAR_REFS_ANON && vma->vm_file) 843 if (type == CLEAR_REFS_ANON && vma->vm_file)
844 continue; 844 continue;
845 if (type == CLEAR_REFS_MAPPED && !vma->vm_file) 845 if (type == CLEAR_REFS_MAPPED && !vma->vm_file)
846 continue; 846 continue;
847 if (type == CLEAR_REFS_SOFT_DIRTY) {
848 if (vma->vm_flags & VM_SOFTDIRTY)
849 vma->vm_flags &= ~VM_SOFTDIRTY;
850 }
847 walk_page_range(vma->vm_start, vma->vm_end, 851 walk_page_range(vma->vm_start, vma->vm_end,
848 &clear_refs_walk); 852 &clear_refs_walk);
849 } 853 }
diff --git a/fs/readdir.c b/fs/readdir.c
index 5b53d995cae6..33fd92208cb7 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -13,6 +13,7 @@
13#include <linux/stat.h> 13#include <linux/stat.h>
14#include <linux/file.h> 14#include <linux/file.h>
15#include <linux/fs.h> 15#include <linux/fs.h>
16#include <linux/fsnotify.h>
16#include <linux/dirent.h> 17#include <linux/dirent.h>
17#include <linux/security.h> 18#include <linux/security.h>
18#include <linux/syscalls.h> 19#include <linux/syscalls.h>
@@ -40,6 +41,7 @@ int iterate_dir(struct file *file, struct dir_context *ctx)
40 ctx->pos = file->f_pos; 41 ctx->pos = file->f_pos;
41 res = file->f_op->iterate(file, ctx); 42 res = file->f_op->iterate(file, ctx);
42 file->f_pos = ctx->pos; 43 file->f_pos = ctx->pos;
44 fsnotify_access(file);
43 file_accessed(file); 45 file_accessed(file);
44 } 46 }
45 mutex_unlock(&inode->i_mutex); 47 mutex_unlock(&inode->i_mutex);
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index 9e1bb79f7e6f..887d6d270080 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -25,7 +25,7 @@
25 25
26#define ERROR(s, args...) pr_err("SQUASHFS error: "s, ## args) 26#define ERROR(s, args...) pr_err("SQUASHFS error: "s, ## args)
27 27
28#define WARNING(s, args...) pr_warning("SQUASHFS: "s, ## args) 28#define WARNING(s, args...) pr_warn("SQUASHFS: "s, ## args)
29 29
30/* block.c */ 30/* block.c */
31extern int squashfs_read_data(struct super_block *, u64, int, u64 *, 31extern int squashfs_read_data(struct super_block *, u64, int, u64 *,
diff --git a/fs/super.c b/fs/super.c
index 48377f7463c0..d20d5b11dedf 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -112,9 +112,14 @@ static unsigned long super_cache_count(struct shrinker *shrink,
112 112
113 sb = container_of(shrink, struct super_block, s_shrink); 113 sb = container_of(shrink, struct super_block, s_shrink);
114 114
115 if (!grab_super_passive(sb)) 115 /*
116 return 0; 116 * Don't call grab_super_passive as it is a potential
117 117 * scalability bottleneck. The counts could get updated
118 * between super_cache_count and super_cache_scan anyway.
119 * Call to super_cache_count with shrinker_rwsem held
120 * ensures the safety of call to list_lru_count_node() and
121 * s_op->nr_cached_objects().
122 */
118 if (sb->s_op && sb->s_op->nr_cached_objects) 123 if (sb->s_op && sb->s_op->nr_cached_objects)
119 total_objects = sb->s_op->nr_cached_objects(sb, 124 total_objects = sb->s_op->nr_cached_objects(sb,
120 sc->nid); 125 sc->nid);
@@ -125,7 +130,6 @@ static unsigned long super_cache_count(struct shrinker *shrink,
125 sc->nid); 130 sc->nid);
126 131
127 total_objects = vfs_pressure_ratio(total_objects); 132 total_objects = vfs_pressure_ratio(total_objects);
128 drop_super(sb);
129 return total_objects; 133 return total_objects;
130} 134}
131 135
@@ -276,10 +280,8 @@ void deactivate_locked_super(struct super_block *s)
276 struct file_system_type *fs = s->s_type; 280 struct file_system_type *fs = s->s_type;
277 if (atomic_dec_and_test(&s->s_active)) { 281 if (atomic_dec_and_test(&s->s_active)) {
278 cleancache_invalidate_fs(s); 282 cleancache_invalidate_fs(s);
279 fs->kill_sb(s);
280
281 /* caches are now gone, we can safely kill the shrinker now */
282 unregister_shrinker(&s->s_shrink); 283 unregister_shrinker(&s->s_shrink);
284 fs->kill_sb(s);
283 285
284 put_filesystem(fs); 286 put_filesystem(fs);
285 put_super(s); 287 put_super(s);
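
A short sketch, not from the patch, restating the teardown ordering the deactivate_locked_super() hunk above enforces: the shrinker may still be invoked until unregister_shrinker() returns, so it must be removed while the superblock caches it scans still exist; example_teardown() is an invented name.

#include <linux/fs.h>

static void example_teardown(struct super_block *s)
{
	struct file_system_type *fs = s->s_type;

	unregister_shrinker(&s->s_shrink);	/* no more cache scan/count callbacks */
	fs->kill_sb(s);				/* caches can now be destroyed safely */
}
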
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index a8015a7a55bb..53b2acc38213 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -233,6 +233,10 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
233# define pte_accessible(mm, pte) ((void)(pte), 1) 233# define pte_accessible(mm, pte) ((void)(pte), 1)
234#endif 234#endif
235 235
236#ifndef pte_present_nonuma
237#define pte_present_nonuma(pte) pte_present(pte)
238#endif
239
236#ifndef flush_tlb_fix_spurious_fault 240#ifndef flush_tlb_fix_spurious_fault
237#define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address) 241#define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address)
238#endif 242#endif
@@ -670,7 +674,7 @@ static inline int pmd_trans_unstable(pmd_t *pmd)
670static inline int pte_numa(pte_t pte) 674static inline int pte_numa(pte_t pte)
671{ 675{
672 return (pte_flags(pte) & 676 return (pte_flags(pte) &
673 (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA; 677 (_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT)) == _PAGE_NUMA;
674} 678}
675#endif 679#endif
676 680
@@ -678,7 +682,7 @@ static inline int pte_numa(pte_t pte)
678static inline int pmd_numa(pmd_t pmd) 682static inline int pmd_numa(pmd_t pmd)
679{ 683{
680 return (pmd_flags(pmd) & 684 return (pmd_flags(pmd) &
681 (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA; 685 (_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT)) == _PAGE_NUMA;
682} 686}
683#endif 687#endif
684 688
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 5c6f836afa1b..3cd426e971db 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1588,6 +1588,7 @@ static inline bool blk_integrity_is_initialized(struct gendisk *g)
1588struct block_device_operations { 1588struct block_device_operations {
1589 int (*open) (struct block_device *, fmode_t); 1589 int (*open) (struct block_device *, fmode_t);
1590 void (*release) (struct gendisk *, fmode_t); 1590 void (*release) (struct gendisk *, fmode_t);
1591 int (*rw_page)(struct block_device *, sector_t, struct page *, int rw);
1591 int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); 1592 int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
1592 int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); 1593 int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
1593 int (*direct_access) (struct block_device *, sector_t, 1594 int (*direct_access) (struct block_device *, sector_t,
@@ -1606,7 +1607,13 @@ struct block_device_operations {
1606 1607
1607extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int, 1608extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int,
1608 unsigned long); 1609 unsigned long);
1610extern int bdev_read_page(struct block_device *, sector_t, struct page *);
1611extern int bdev_write_page(struct block_device *, sector_t, struct page *,
1612 struct writeback_control *);
1609#else /* CONFIG_BLOCK */ 1613#else /* CONFIG_BLOCK */
1614
1615struct block_device;
1616
1610/* 1617/*
1611 * stubs for when the block layer is configured out 1618 * stubs for when the block layer is configured out
1612 */ 1619 */
@@ -1642,6 +1649,12 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk)
1642 return false; 1649 return false;
1643} 1650}
1644 1651
1652static inline int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
1653 sector_t *error_sector)
1654{
1655 return 0;
1656}
1657
1645#endif /* CONFIG_BLOCK */ 1658#endif /* CONFIG_BLOCK */
1646 1659
1647#endif 1660#endif
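
A hypothetical sketch of how a driver could wire up the new ->rw_page() hook declared above; mydrv_rw_page() is an assumed callback that transfers one page at 'sector' and signals completion on the page itself, and only the relevant fields are shown.

#include <linux/blkdev.h>
#include <linux/module.h>

static int mydrv_rw_page(struct block_device *bdev, sector_t sector,
			 struct page *page, int rw);

static const struct block_device_operations mydrv_fops = {
	.owner   = THIS_MODULE,
	.rw_page = mydrv_rw_page,
};

With a hook like this in place, the bdev_read_page()/bdev_write_page() helpers declared above can use it directly and fall back to the regular bio path when a driver does not provide one.
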
diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index db51fe4fe317..4e2bd4c95b66 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -58,9 +58,9 @@ extern void free_bootmem_late(unsigned long physaddr, unsigned long size);
58 * Flags for reserve_bootmem (also if CONFIG_HAVE_ARCH_BOOTMEM_NODE, 58 * Flags for reserve_bootmem (also if CONFIG_HAVE_ARCH_BOOTMEM_NODE,
59 * the architecture-specific code should honor this). 59 * the architecture-specific code should honor this).
60 * 60 *
61 * If flags is 0, then the return value is always 0 (success). If 61 * If flags is BOOTMEM_DEFAULT, then the return value is always 0 (success).
62 * flags contains BOOTMEM_EXCLUSIVE, then -EBUSY is returned if the 62 * If flags contains BOOTMEM_EXCLUSIVE, then -EBUSY is returned if the memory
63 * memory already was reserved. 63 * already was reserved.
64 */ 64 */
65#define BOOTMEM_DEFAULT 0 65#define BOOTMEM_DEFAULT 0
66#define BOOTMEM_EXCLUSIVE (1<<0) 66#define BOOTMEM_EXCLUSIVE (1<<0)
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 7cbf837a279c..324329ceea1e 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -207,8 +207,6 @@ void block_invalidatepage(struct page *page, unsigned int offset,
207 unsigned int length); 207 unsigned int length);
208int block_write_full_page(struct page *page, get_block_t *get_block, 208int block_write_full_page(struct page *page, get_block_t *get_block,
209 struct writeback_control *wbc); 209 struct writeback_control *wbc);
210int block_write_full_page_endio(struct page *page, get_block_t *get_block,
211 struct writeback_control *wbc, bh_end_io_t *handler);
212int block_read_full_page(struct page*, get_block_t*); 210int block_read_full_page(struct page*, get_block_t*);
213int block_is_partially_uptodate(struct page *page, unsigned long from, 211int block_is_partially_uptodate(struct page *page, unsigned long from,
214 unsigned long count); 212 unsigned long count);
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 7e1c76e3cd68..01e3132820da 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -22,7 +22,7 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write,
22extern int fragmentation_index(struct zone *zone, unsigned int order); 22extern int fragmentation_index(struct zone *zone, unsigned int order);
23extern unsigned long try_to_compact_pages(struct zonelist *zonelist, 23extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
24 int order, gfp_t gfp_mask, nodemask_t *mask, 24 int order, gfp_t gfp_mask, nodemask_t *mask,
25 bool sync, bool *contended); 25 enum migrate_mode mode, bool *contended);
26extern void compact_pgdat(pg_data_t *pgdat, int order); 26extern void compact_pgdat(pg_data_t *pgdat, int order);
27extern void reset_isolation_suitable(pg_data_t *pgdat); 27extern void reset_isolation_suitable(pg_data_t *pgdat);
28extern unsigned long compaction_suitable(struct zone *zone, int order); 28extern unsigned long compaction_suitable(struct zone *zone, int order);
@@ -91,7 +91,7 @@ static inline bool compaction_restarting(struct zone *zone, int order)
91#else 91#else
92static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, 92static inline unsigned long try_to_compact_pages(struct zonelist *zonelist,
93 int order, gfp_t gfp_mask, nodemask_t *nodemask, 93 int order, gfp_t gfp_mask, nodemask_t *nodemask,
94 bool sync, bool *contended) 94 enum migrate_mode mode, bool *contended)
95{ 95{
96 return COMPACT_CONTINUE; 96 return COMPACT_CONTINUE;
97} 97}
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index ee7239ea1583..64fdfe1cfcf0 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -323,9 +323,18 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect);
323#endif 323#endif
324#ifndef __compiletime_error 324#ifndef __compiletime_error
325# define __compiletime_error(message) 325# define __compiletime_error(message)
326# define __compiletime_error_fallback(condition) \ 326/*
327 * Sparse complains of variable sized arrays due to the temporary variable in
328 * __compiletime_assert. Unfortunately we can't just expand it out to make
329 * sparse see a constant array size without breaking compiletime_assert on old
330 * versions of GCC (e.g. 4.2.4), so hide the array from sparse altogether.
331 */
332# ifndef __CHECKER__
333# define __compiletime_error_fallback(condition) \
327 do { ((void)sizeof(char[1 - 2 * condition])); } while (0) 334 do { ((void)sizeof(char[1 - 2 * condition])); } while (0)
328#else 335# endif
336#endif
337#ifndef __compiletime_error_fallback
329# define __compiletime_error_fallback(condition) do { } while (0) 338# define __compiletime_error_fallback(condition) do { } while (0)
330#endif 339#endif
331 340
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index b19d3dc2e651..ade2390ffe92 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -12,10 +12,31 @@
12#include <linux/cpumask.h> 12#include <linux/cpumask.h>
13#include <linux/nodemask.h> 13#include <linux/nodemask.h>
14#include <linux/mm.h> 14#include <linux/mm.h>
15#include <linux/jump_label.h>
15 16
16#ifdef CONFIG_CPUSETS 17#ifdef CONFIG_CPUSETS
17 18
18extern int number_of_cpusets; /* How many cpusets are defined in system? */ 19extern struct static_key cpusets_enabled_key;
20static inline bool cpusets_enabled(void)
21{
22 return static_key_false(&cpusets_enabled_key);
23}
24
25static inline int nr_cpusets(void)
26{
27 /* jump label reference count + the top-level cpuset */
28 return static_key_count(&cpusets_enabled_key) + 1;
29}
30
31static inline void cpuset_inc(void)
32{
33 static_key_slow_inc(&cpusets_enabled_key);
34}
35
36static inline void cpuset_dec(void)
37{
38 static_key_slow_dec(&cpusets_enabled_key);
39}
19 40
20extern int cpuset_init(void); 41extern int cpuset_init(void);
21extern void cpuset_init_smp(void); 42extern void cpuset_init_smp(void);
@@ -32,13 +53,13 @@ extern int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask);
32 53
33static inline int cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) 54static inline int cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
34{ 55{
35 return number_of_cpusets <= 1 || 56 return nr_cpusets() <= 1 ||
36 __cpuset_node_allowed_softwall(node, gfp_mask); 57 __cpuset_node_allowed_softwall(node, gfp_mask);
37} 58}
38 59
39static inline int cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask) 60static inline int cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
40{ 61{
41 return number_of_cpusets <= 1 || 62 return nr_cpusets() <= 1 ||
42 __cpuset_node_allowed_hardwall(node, gfp_mask); 63 __cpuset_node_allowed_hardwall(node, gfp_mask);
43} 64}
44 65
@@ -124,6 +145,8 @@ static inline void set_mems_allowed(nodemask_t nodemask)
124 145
125#else /* !CONFIG_CPUSETS */ 146#else /* !CONFIG_CPUSETS */
126 147
148static inline bool cpusets_enabled(void) { return false; }
149
127static inline int cpuset_init(void) { return 0; } 150static inline int cpuset_init(void) { return 0; }
128static inline void cpuset_init_smp(void) {} 151static inline void cpuset_init_smp(void) {}
129 152
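
A small sketch, not from the patch, of the fast path the new cpuset helpers above enable: with no cpusets configured, the static key keeps the check to a single patched branch; example_node_allowed() is an invented wrapper.

#include <linux/cpuset.h>
#include <linux/gfp.h>

static inline bool example_node_allowed(int node, gfp_t gfp_mask)
{
	if (!cpusets_enabled())		/* static key: cheap when cpusets are off */
		return true;
	return cpuset_node_allowed_softwall(node, gfp_mask);
}
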
diff --git a/include/linux/dma-contiguous.h b/include/linux/dma-contiguous.h
index 3b28f937d959..772eab5d524a 100644
--- a/include/linux/dma-contiguous.h
+++ b/include/linux/dma-contiguous.h
@@ -88,7 +88,8 @@ static inline void dma_contiguous_set_default(struct cma *cma)
88void dma_contiguous_reserve(phys_addr_t addr_limit); 88void dma_contiguous_reserve(phys_addr_t addr_limit);
89 89
90int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base, 90int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base,
91 phys_addr_t limit, struct cma **res_cma); 91 phys_addr_t limit, struct cma **res_cma,
92 bool fixed);
92 93
93/** 94/**
94 * dma_declare_contiguous() - reserve area for contiguous memory handling 95 * dma_declare_contiguous() - reserve area for contiguous memory handling
@@ -108,7 +109,7 @@ static inline int dma_declare_contiguous(struct device *dev, phys_addr_t size,
108{ 109{
109 struct cma *cma; 110 struct cma *cma;
110 int ret; 111 int ret;
111 ret = dma_contiguous_reserve_area(size, base, limit, &cma); 112 ret = dma_contiguous_reserve_area(size, base, limit, &cma, true);
112 if (ret == 0) 113 if (ret == 0)
113 dev_set_cma_area(dev, cma); 114 dev_set_cma_area(dev, cma);
114 115
@@ -136,7 +137,9 @@ static inline void dma_contiguous_set_default(struct cma *cma) { }
136static inline void dma_contiguous_reserve(phys_addr_t limit) { } 137static inline void dma_contiguous_reserve(phys_addr_t limit) { }
137 138
138static inline int dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base, 139static inline int dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base,
139 phys_addr_t limit, struct cma **res_cma) { 140 phys_addr_t limit, struct cma **res_cma,
141 bool fixed)
142{
140 return -ENOSYS; 143 return -ENOSYS;
141} 144}
142 145
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 878031227c57..c3f46e499dd0 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2590,6 +2590,7 @@ extern ssize_t simple_read_from_buffer(void __user *to, size_t count,
2590extern ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos, 2590extern ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos,
2591 const void __user *from, size_t count); 2591 const void __user *from, size_t count);
2592 2592
2593extern int __generic_file_fsync(struct file *, loff_t, loff_t, int);
2593extern int generic_file_fsync(struct file *, loff_t, loff_t, int); 2594extern int generic_file_fsync(struct file *, loff_t, loff_t, int);
2594 2595
2595extern int generic_check_addressable(unsigned, u64); 2596extern int generic_check_addressable(unsigned, u64);
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 39b81dc7d01a..6eb1fb37de9a 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -6,7 +6,6 @@
6#include <linux/stddef.h> 6#include <linux/stddef.h>
7#include <linux/linkage.h> 7#include <linux/linkage.h>
8#include <linux/topology.h> 8#include <linux/topology.h>
9#include <linux/mmdebug.h>
10 9
11struct vm_area_struct; 10struct vm_area_struct;
12 11
@@ -31,7 +30,6 @@ struct vm_area_struct;
31#define ___GFP_HARDWALL 0x20000u 30#define ___GFP_HARDWALL 0x20000u
32#define ___GFP_THISNODE 0x40000u 31#define ___GFP_THISNODE 0x40000u
33#define ___GFP_RECLAIMABLE 0x80000u 32#define ___GFP_RECLAIMABLE 0x80000u
34#define ___GFP_KMEMCG 0x100000u
35#define ___GFP_NOTRACK 0x200000u 33#define ___GFP_NOTRACK 0x200000u
36#define ___GFP_NO_KSWAPD 0x400000u 34#define ___GFP_NO_KSWAPD 0x400000u
37#define ___GFP_OTHER_NODE 0x800000u 35#define ___GFP_OTHER_NODE 0x800000u
@@ -91,7 +89,6 @@ struct vm_area_struct;
91 89
92#define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD) 90#define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD)
93#define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */ 91#define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */
94#define __GFP_KMEMCG ((__force gfp_t)___GFP_KMEMCG) /* Allocation comes from a memcg-accounted resource */
95#define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) /* Allocator intends to dirty page */ 92#define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) /* Allocator intends to dirty page */
96 93
97/* 94/*
@@ -353,6 +350,10 @@ extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
353#define alloc_page_vma_node(gfp_mask, vma, addr, node) \ 350#define alloc_page_vma_node(gfp_mask, vma, addr, node) \
354 alloc_pages_vma(gfp_mask, 0, vma, addr, node) 351 alloc_pages_vma(gfp_mask, 0, vma, addr, node)
355 352
353extern struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order);
354extern struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask,
355 unsigned int order);
356
356extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); 357extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
357extern unsigned long get_zeroed_page(gfp_t gfp_mask); 358extern unsigned long get_zeroed_page(gfp_t gfp_mask);
358 359
@@ -369,11 +370,11 @@ void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask);
369 370
370extern void __free_pages(struct page *page, unsigned int order); 371extern void __free_pages(struct page *page, unsigned int order);
371extern void free_pages(unsigned long addr, unsigned int order); 372extern void free_pages(unsigned long addr, unsigned int order);
372extern void free_hot_cold_page(struct page *page, int cold); 373extern void free_hot_cold_page(struct page *page, bool cold);
373extern void free_hot_cold_page_list(struct list_head *list, int cold); 374extern void free_hot_cold_page_list(struct list_head *list, bool cold);
374 375
375extern void __free_memcg_kmem_pages(struct page *page, unsigned int order); 376extern void __free_kmem_pages(struct page *page, unsigned int order);
376extern void free_memcg_kmem_pages(unsigned long addr, unsigned int order); 377extern void free_kmem_pages(unsigned long addr, unsigned int order);
377 378
378#define __free_page(page) __free_pages((page), 0) 379#define __free_page(page) __free_pages((page), 0)
379#define free_page(addr) free_pages((addr), 0) 380#define free_page(addr) free_pages((addr), 0)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index b65166de1d9d..255cd5cc0754 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -343,6 +343,11 @@ static inline unsigned huge_page_shift(struct hstate *h)
343 return h->order + PAGE_SHIFT; 343 return h->order + PAGE_SHIFT;
344} 344}
345 345
346static inline bool hstate_is_gigantic(struct hstate *h)
347{
348 return huge_page_order(h) >= MAX_ORDER;
349}
350
346static inline unsigned int pages_per_huge_page(struct hstate *h) 351static inline unsigned int pages_per_huge_page(struct hstate *h)
347{ 352{
348 return 1 << h->order; 353 return 1 << h->order;
@@ -392,15 +397,13 @@ static inline pgoff_t basepage_index(struct page *page)
392 397
393extern void dissolve_free_huge_pages(unsigned long start_pfn, 398extern void dissolve_free_huge_pages(unsigned long start_pfn,
394 unsigned long end_pfn); 399 unsigned long end_pfn);
395int pmd_huge_support(void); 400static inline int hugepage_migration_supported(struct hstate *h)
396/*
397 * Currently hugepage migration is enabled only for pmd-based hugepage.
398 * This function will be updated when hugepage migration is more widely
399 * supported.
400 */
401static inline int hugepage_migration_support(struct hstate *h)
402{ 401{
403 return pmd_huge_support() && (huge_page_shift(h) == PMD_SHIFT); 402#ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION
403 return huge_page_shift(h) == PMD_SHIFT;
404#else
405 return 0;
406#endif
404} 407}
405 408
406static inline spinlock_t *huge_pte_lockptr(struct hstate *h, 409static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
@@ -450,8 +453,7 @@ static inline pgoff_t basepage_index(struct page *page)
450 return page->index; 453 return page->index;
451} 454}
452#define dissolve_free_huge_pages(s, e) do {} while (0) 455#define dissolve_free_huge_pages(s, e) do {} while (0)
453#define pmd_huge_support() 0 456#define hugepage_migration_supported(h) 0
454#define hugepage_migration_support(h) 0
455 457
456static inline spinlock_t *huge_pte_lockptr(struct hstate *h, 458static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
457 struct mm_struct *mm, pte_t *pte) 459 struct mm_struct *mm, pte_t *pte)
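
A brief sketch, not in the patch, of where the reworked hugetlb helpers above are meant to be consulted; the walk and its body are hypothetical.

#include <linux/hugetlb.h>

static void example_walk_hstates(void)
{
	struct hstate *h;

	for_each_hstate(h) {
		if (hstate_is_gigantic(h))
			continue;	/* order >= MAX_ORDER, handled separately */
		if (!hugepage_migration_supported(h))
			continue;	/* only PMD-sized pages migrate for now */
		/* ... act on pages of this hstate ... */
	}
}
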
diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index 5c1dfb2a9e73..784304b222b3 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -69,6 +69,10 @@ struct static_key {
69 69
70# include <asm/jump_label.h> 70# include <asm/jump_label.h>
71# define HAVE_JUMP_LABEL 71# define HAVE_JUMP_LABEL
72#else
73struct static_key {
74 atomic_t enabled;
75};
72#endif /* CC_HAVE_ASM_GOTO && CONFIG_JUMP_LABEL */ 76#endif /* CC_HAVE_ASM_GOTO && CONFIG_JUMP_LABEL */
73 77
74enum jump_label_type { 78enum jump_label_type {
@@ -79,6 +83,12 @@ enum jump_label_type {
79struct module; 83struct module;
80 84
81#include <linux/atomic.h> 85#include <linux/atomic.h>
86
87static inline int static_key_count(struct static_key *key)
88{
89 return atomic_read(&key->enabled);
90}
91
82#ifdef HAVE_JUMP_LABEL 92#ifdef HAVE_JUMP_LABEL
83 93
84#define JUMP_LABEL_TYPE_FALSE_BRANCH 0UL 94#define JUMP_LABEL_TYPE_FALSE_BRANCH 0UL
@@ -134,10 +144,6 @@ extern void jump_label_apply_nops(struct module *mod);
134 144
135#else /* !HAVE_JUMP_LABEL */ 145#else /* !HAVE_JUMP_LABEL */
136 146
137struct static_key {
138 atomic_t enabled;
139};
140
141static __always_inline void jump_label_init(void) 147static __always_inline void jump_label_init(void)
142{ 148{
143 static_key_initialized = true; 149 static_key_initialized = true;
@@ -145,14 +151,14 @@ static __always_inline void jump_label_init(void)
145 151
146static __always_inline bool static_key_false(struct static_key *key) 152static __always_inline bool static_key_false(struct static_key *key)
147{ 153{
148 if (unlikely(atomic_read(&key->enabled) > 0)) 154 if (unlikely(static_key_count(key) > 0))
149 return true; 155 return true;
150 return false; 156 return false;
151} 157}
152 158
153static __always_inline bool static_key_true(struct static_key *key) 159static __always_inline bool static_key_true(struct static_key *key)
154{ 160{
155 if (likely(atomic_read(&key->enabled) > 0)) 161 if (likely(static_key_count(key) > 0))
156 return true; 162 return true;
157 return false; 163 return false;
158} 164}
@@ -194,7 +200,7 @@ static inline int jump_label_apply_nops(struct module *mod)
194 200
195static inline bool static_key_enabled(struct static_key *key) 201static inline bool static_key_enabled(struct static_key *key)
196{ 202{
197 return (atomic_read(&key->enabled) > 0); 203 return static_key_count(key) > 0;
198} 204}
199 205
200#endif /* _LINUX_JUMP_LABEL_H */ 206#endif /* _LINUX_JUMP_LABEL_H */
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 73dc382e72d8..b660e05b63d4 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -272,6 +272,8 @@ static inline bool memblock_bottom_up(void) { return false; }
272#define MEMBLOCK_ALLOC_ANYWHERE (~(phys_addr_t)0) 272#define MEMBLOCK_ALLOC_ANYWHERE (~(phys_addr_t)0)
273#define MEMBLOCK_ALLOC_ACCESSIBLE 0 273#define MEMBLOCK_ALLOC_ACCESSIBLE 0
274 274
275phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align,
276 phys_addr_t start, phys_addr_t end);
275phys_addr_t memblock_alloc_base(phys_addr_t size, phys_addr_t align, 277phys_addr_t memblock_alloc_base(phys_addr_t size, phys_addr_t align,
276 phys_addr_t max_addr); 278 phys_addr_t max_addr);
277phys_addr_t __memblock_alloc_base(phys_addr_t size, phys_addr_t align, 279phys_addr_t __memblock_alloc_base(phys_addr_t size, phys_addr_t align,
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index b569b8be5c5a..eb65d29516ca 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -492,13 +492,9 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order);
492 492
493int memcg_cache_id(struct mem_cgroup *memcg); 493int memcg_cache_id(struct mem_cgroup *memcg);
494 494
495char *memcg_create_cache_name(struct mem_cgroup *memcg,
496 struct kmem_cache *root_cache);
497int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, 495int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
498 struct kmem_cache *root_cache); 496 struct kmem_cache *root_cache);
499void memcg_free_cache_params(struct kmem_cache *s); 497void memcg_free_cache_params(struct kmem_cache *s);
500void memcg_register_cache(struct kmem_cache *s);
501void memcg_unregister_cache(struct kmem_cache *s);
502 498
503int memcg_update_cache_size(struct kmem_cache *s, int num_groups); 499int memcg_update_cache_size(struct kmem_cache *s, int num_groups);
504void memcg_update_array_size(int num_groups); 500void memcg_update_array_size(int num_groups);
@@ -506,8 +502,10 @@ void memcg_update_array_size(int num_groups);
506struct kmem_cache * 502struct kmem_cache *
507__memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp); 503__memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp);
508 504
509void mem_cgroup_destroy_cache(struct kmem_cache *cachep); 505int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order);
510int __kmem_cache_destroy_memcg_children(struct kmem_cache *s); 506void __memcg_uncharge_slab(struct kmem_cache *cachep, int order);
507
508int __memcg_cleanup_cache_params(struct kmem_cache *s);
511 509
512/** 510/**
513 * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed. 511 * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed.
@@ -534,7 +532,7 @@ memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order)
534 * res_counter_charge_nofail, but we hope those allocations are rare, 532 * res_counter_charge_nofail, but we hope those allocations are rare,
535 * and won't be worth the trouble. 533 * and won't be worth the trouble.
536 */ 534 */
537 if (!(gfp & __GFP_KMEMCG) || (gfp & __GFP_NOFAIL)) 535 if (gfp & __GFP_NOFAIL)
538 return true; 536 return true;
539 if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD)) 537 if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD))
540 return true; 538 return true;
@@ -583,17 +581,7 @@ memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order)
583 * @cachep: the original global kmem cache 581 * @cachep: the original global kmem cache
584 * @gfp: allocation flags. 582 * @gfp: allocation flags.
585 * 583 *
586 * This function assumes that the task allocating, which determines the memcg 584 * All memory allocated from a per-memcg cache is charged to the owner memcg.
587 * in the page allocator, belongs to the same cgroup throughout the whole
588 * process. Misacounting can happen if the task calls memcg_kmem_get_cache()
589 * while belonging to a cgroup, and later on changes. This is considered
590 * acceptable, and should only happen upon task migration.
591 *
592 * Before the cache is created by the memcg core, there is also a possible
593 * imbalance: the task belongs to a memcg, but the cache being allocated from
594 * is the global cache, since the child cache is not yet guaranteed to be
595 * ready. This case is also fine, since in this case the GFP_KMEMCG will not be
596 * passed and the page allocator will not attempt any cgroup accounting.
597 */ 585 */
598static __always_inline struct kmem_cache * 586static __always_inline struct kmem_cache *
599memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) 587memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
@@ -648,14 +636,6 @@ static inline void memcg_free_cache_params(struct kmem_cache *s)
648{ 636{
649} 637}
650 638
651static inline void memcg_register_cache(struct kmem_cache *s)
652{
653}
654
655static inline void memcg_unregister_cache(struct kmem_cache *s)
656{
657}
658
659static inline struct kmem_cache * 639static inline struct kmem_cache *
660memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) 640memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
661{ 641{
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 4ca3d951fe91..010d125bffbf 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -187,14 +187,8 @@ extern void put_page_bootmem(struct page *page);
187extern void get_page_bootmem(unsigned long ingo, struct page *page, 187extern void get_page_bootmem(unsigned long ingo, struct page *page,
188 unsigned long type); 188 unsigned long type);
189 189
190/* 190void get_online_mems(void);
191 * Lock for memory hotplug guarantees 1) all callbacks for memory hotplug 191void put_online_mems(void);
192 * notifier will be called under this. 2) offline/online/add/remove memory
193 * will not run simultaneously.
194 */
195
196void lock_memory_hotplug(void);
197void unlock_memory_hotplug(void);
198 192
199#else /* ! CONFIG_MEMORY_HOTPLUG */ 193#else /* ! CONFIG_MEMORY_HOTPLUG */
200/* 194/*
@@ -232,8 +226,8 @@ static inline int try_online_node(int nid)
232 return 0; 226 return 0;
233} 227}
234 228
235static inline void lock_memory_hotplug(void) {} 229static inline void get_online_mems(void) {}
236static inline void unlock_memory_hotplug(void) {} 230static inline void put_online_mems(void) {}
237 231
238#endif /* ! CONFIG_MEMORY_HOTPLUG */ 232#endif /* ! CONFIG_MEMORY_HOTPLUG */
239 233
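
A minimal sketch, not from the patch, of the reader-side pattern implied by the renamed interface above; example_stable_present_pages() is an invented name.

#include <linux/memory_hotplug.h>
#include <linux/mmzone.h>

static unsigned long example_stable_present_pages(struct zone *zone)
{
	unsigned long pages;

	get_online_mems();		/* block concurrent memory online/offline */
	pages = zone->present_pages;	/* stable against hotplug while held */
	put_online_mems();

	return pages;
}
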
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 3c1b968da0ca..f230a978e6ba 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -175,6 +175,12 @@ static inline int vma_migratable(struct vm_area_struct *vma)
175{ 175{
176 if (vma->vm_flags & (VM_IO | VM_PFNMAP)) 176 if (vma->vm_flags & (VM_IO | VM_PFNMAP))
177 return 0; 177 return 0;
178
179#ifndef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION
180 if (vma->vm_flags & VM_HUGETLB)
181 return 0;
182#endif
183
178 /* 184 /*
179 * Migration allocates pages in the highest zone. If we cannot 185 * Migration allocates pages in the highest zone. If we cannot
180 * do so then migration (at least from node to node) is not 186 * do so then migration (at least from node to node) is not
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 84a31ad0b791..a2901c414664 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -5,7 +5,9 @@
5#include <linux/mempolicy.h> 5#include <linux/mempolicy.h>
6#include <linux/migrate_mode.h> 6#include <linux/migrate_mode.h>
7 7
8typedef struct page *new_page_t(struct page *, unsigned long private, int **); 8typedef struct page *new_page_t(struct page *page, unsigned long private,
9 int **reason);
10typedef void free_page_t(struct page *page, unsigned long private);
9 11
10/* 12/*
11 * Return values from addresss_space_operations.migratepage(): 13 * Return values from addresss_space_operations.migratepage():
@@ -38,7 +40,7 @@ enum migrate_reason {
38extern void putback_movable_pages(struct list_head *l); 40extern void putback_movable_pages(struct list_head *l);
39extern int migrate_page(struct address_space *, 41extern int migrate_page(struct address_space *,
40 struct page *, struct page *, enum migrate_mode); 42 struct page *, struct page *, enum migrate_mode);
41extern int migrate_pages(struct list_head *l, new_page_t x, 43extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free,
42 unsigned long private, enum migrate_mode mode, int reason); 44 unsigned long private, enum migrate_mode mode, int reason);
43 45
44extern int migrate_prep(void); 46extern int migrate_prep(void);
@@ -56,8 +58,9 @@ extern int migrate_page_move_mapping(struct address_space *mapping,
56#else 58#else
57 59
58static inline void putback_movable_pages(struct list_head *l) {} 60static inline void putback_movable_pages(struct list_head *l) {}
59static inline int migrate_pages(struct list_head *l, new_page_t x, 61static inline int migrate_pages(struct list_head *l, new_page_t new,
60 unsigned long private, enum migrate_mode mode, int reason) 62 free_page_t free, unsigned long private, enum migrate_mode mode,
63 int reason)
61 { return -ENOSYS; } 64 { return -ENOSYS; }
62 65
63static inline int migrate_prep(void) { return -ENOSYS; } 66static inline int migrate_prep(void) { return -ENOSYS; }
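
A hypothetical caller-side pair for the widened migrate_pages() prototype above; example_alloc()/example_free() are invented names and the target node id is assumed to travel in 'private'.

#include <linux/migrate.h>
#include <linux/gfp.h>
#include <linux/mm.h>

static struct page *example_alloc(struct page *page, unsigned long private,
				  int **resultp)
{
	return alloc_pages_node((int)private, GFP_HIGHUSER_MOVABLE, 0);
}

/* Called only for pages example_alloc() handed out but migration did not use. */
static void example_free(struct page *page, unsigned long private)
{
	__free_page(page);
}

/* Typical call: migrate_pages(&pagelist, example_alloc, example_free, nid,
 *		 MIGRATE_SYNC, MR_MEMORY_HOTPLUG); */
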
diff --git a/include/linux/mm.h b/include/linux/mm.h
index d6777060449f..368600628d14 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -407,20 +407,25 @@ static inline void compound_unlock_irqrestore(struct page *page,
407#endif 407#endif
408} 408}
409 409
410static inline struct page *compound_head_by_tail(struct page *tail)
411{
412 struct page *head = tail->first_page;
413
414 /*
415 * page->first_page may be a dangling pointer to an old
416 * compound page, so recheck that it is still a tail
417 * page before returning.
418 */
419 smp_rmb();
420 if (likely(PageTail(tail)))
421 return head;
422 return tail;
423}
424
410static inline struct page *compound_head(struct page *page) 425static inline struct page *compound_head(struct page *page)
411{ 426{
412 if (unlikely(PageTail(page))) { 427 if (unlikely(PageTail(page)))
413 struct page *head = page->first_page; 428 return compound_head_by_tail(page);
414
415 /*
416 * page->first_page may be a dangling pointer to an old
417 * compound page, so recheck that it is still a tail
418 * page before returning.
419 */
420 smp_rmb();
421 if (likely(PageTail(page)))
422 return head;
423 }
424 return page; 429 return page;
425} 430}
426 431
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 8967e20cbe57..de1627232af0 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -406,7 +406,7 @@ struct mm_struct {
406 spinlock_t ioctx_lock; 406 spinlock_t ioctx_lock;
407 struct kioctx_table __rcu *ioctx_table; 407 struct kioctx_table __rcu *ioctx_table;
408#endif 408#endif
409#ifdef CONFIG_MM_OWNER 409#ifdef CONFIG_MEMCG
410 /* 410 /*
411 * "owner" points to a task that is regarded as the canonical 411 * "owner" points to a task that is regarded as the canonical
412 * user/owner of this mm. All of the following must be true in 412 * user/owner of this mm. All of the following must be true in
diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h
index 2d57efa64cc1..edd82a105220 100644
--- a/include/linux/mmdebug.h
+++ b/include/linux/mmdebug.h
@@ -1,6 +1,8 @@
1#ifndef LINUX_MM_DEBUG_H 1#ifndef LINUX_MM_DEBUG_H
2#define LINUX_MM_DEBUG_H 1 2#define LINUX_MM_DEBUG_H 1
3 3
4#include <linux/stringify.h>
5
4struct page; 6struct page;
5 7
6extern void dump_page(struct page *page, const char *reason); 8extern void dump_page(struct page *page, const char *reason);
@@ -9,11 +11,20 @@ extern void dump_page_badflags(struct page *page, const char *reason,
9 11
10#ifdef CONFIG_DEBUG_VM 12#ifdef CONFIG_DEBUG_VM
11#define VM_BUG_ON(cond) BUG_ON(cond) 13#define VM_BUG_ON(cond) BUG_ON(cond)
12#define VM_BUG_ON_PAGE(cond, page) \ 14#define VM_BUG_ON_PAGE(cond, page) \
13 do { if (unlikely(cond)) { dump_page(page, NULL); BUG(); } } while (0) 15 do { \
16 if (unlikely(cond)) { \
17 dump_page(page, "VM_BUG_ON_PAGE(" __stringify(cond)")");\
18 BUG(); \
19 } \
20 } while (0)
21#define VM_WARN_ON(cond) WARN_ON(cond)
22#define VM_WARN_ON_ONCE(cond) WARN_ON_ONCE(cond)
14#else 23#else
15#define VM_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond) 24#define VM_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond)
16#define VM_BUG_ON_PAGE(cond, page) VM_BUG_ON(cond) 25#define VM_BUG_ON_PAGE(cond, page) VM_BUG_ON(cond)
26#define VM_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond)
27#define VM_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond)
17#endif 28#endif
18 29
19#ifdef CONFIG_DEBUG_VIRTUAL 30#ifdef CONFIG_DEBUG_VIRTUAL
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index fac5509c18f0..6cbd1b6c3d20 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -75,9 +75,18 @@ enum {
75 75
76extern int page_group_by_mobility_disabled; 76extern int page_group_by_mobility_disabled;
77 77
78static inline int get_pageblock_migratetype(struct page *page) 78#define NR_MIGRATETYPE_BITS (PB_migrate_end - PB_migrate + 1)
79#define MIGRATETYPE_MASK ((1UL << NR_MIGRATETYPE_BITS) - 1)
80
81#define get_pageblock_migratetype(page) \
82 get_pfnblock_flags_mask(page, page_to_pfn(page), \
83 PB_migrate_end, MIGRATETYPE_MASK)
84
85static inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn)
79{ 86{
80 return get_pageblock_flags_group(page, PB_migrate, PB_migrate_end); 87 BUILD_BUG_ON(PB_migrate_end - PB_migrate != 2);
88 return get_pfnblock_flags_mask(page, pfn, PB_migrate_end,
89 MIGRATETYPE_MASK);
81} 90}
82 91
83struct free_area { 92struct free_area {
@@ -360,9 +369,10 @@ struct zone {
360 /* Set to true when the PG_migrate_skip bits should be cleared */ 369 /* Set to true when the PG_migrate_skip bits should be cleared */
361 bool compact_blockskip_flush; 370 bool compact_blockskip_flush;
362 371
363 /* pfns where compaction scanners should start */ 372 /* pfn where compaction free scanner should start */
364 unsigned long compact_cached_free_pfn; 373 unsigned long compact_cached_free_pfn;
365 unsigned long compact_cached_migrate_pfn; 374 /* pfn where async and sync compaction migration scanner should start */
375 unsigned long compact_cached_migrate_pfn[2];
366#endif 376#endif
367#ifdef CONFIG_MEMORY_HOTPLUG 377#ifdef CONFIG_MEMORY_HOTPLUG
368 /* see spanned/present_pages for more description */ 378 /* see spanned/present_pages for more description */
@@ -481,9 +491,8 @@ struct zone {
481 * give them a chance of being in the same cacheline. 491 * give them a chance of being in the same cacheline.
482 * 492 *
483 * Write access to present_pages at runtime should be protected by 493 * Write access to present_pages at runtime should be protected by
484 * lock_memory_hotplug()/unlock_memory_hotplug(). Any reader who can't 494 * mem_hotplug_begin/end(). Any reader who can't tolerant drift of
485 * tolerant drift of present_pages should hold memory hotplug lock to 495 * present_pages should get_online_mems() to get a stable value.
486 * get a stable value.
487 * 496 *
488 * Read access to managed_pages should be safe because it's unsigned 497 * Read access to managed_pages should be safe because it's unsigned
489 * long. Write access to zone->managed_pages and totalram_pages are 498 * long. Write access to zone->managed_pages and totalram_pages are
@@ -763,10 +772,10 @@ typedef struct pglist_data {
763 unsigned long node_spanned_pages; /* total size of physical page 772 unsigned long node_spanned_pages; /* total size of physical page
764 range, including holes */ 773 range, including holes */
765 int node_id; 774 int node_id;
766 nodemask_t reclaim_nodes; /* Nodes allowed to reclaim from */
767 wait_queue_head_t kswapd_wait; 775 wait_queue_head_t kswapd_wait;
768 wait_queue_head_t pfmemalloc_wait; 776 wait_queue_head_t pfmemalloc_wait;
769 struct task_struct *kswapd; /* Protected by lock_memory_hotplug() */ 777 struct task_struct *kswapd; /* Protected by
778 mem_hotplug_begin/end() */
770 int kswapd_max_order; 779 int kswapd_max_order;
771 enum zone_type classzone_idx; 780 enum zone_type classzone_idx;
772#ifdef CONFIG_NUMA_BALANCING 781#ifdef CONFIG_NUMA_BALANCING
@@ -808,10 +817,10 @@ static inline bool pgdat_is_empty(pg_data_t *pgdat)
808extern struct mutex zonelists_mutex; 817extern struct mutex zonelists_mutex;
809void build_all_zonelists(pg_data_t *pgdat, struct zone *zone); 818void build_all_zonelists(pg_data_t *pgdat, struct zone *zone);
810void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx); 819void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx);
811bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, 820bool zone_watermark_ok(struct zone *z, unsigned int order,
812 int classzone_idx, int alloc_flags); 821 unsigned long mark, int classzone_idx, int alloc_flags);
813bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, 822bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
814 int classzone_idx, int alloc_flags); 823 unsigned long mark, int classzone_idx, int alloc_flags);
815enum memmap_context { 824enum memmap_context {
816 MEMMAP_EARLY, 825 MEMMAP_EARLY,
817 MEMMAP_HOTPLUG, 826 MEMMAP_HOTPLUG,
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index d1fe1a761047..2093eb72785e 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -198,6 +198,7 @@ struct page; /* forward declaration */
198TESTPAGEFLAG(Locked, locked) 198TESTPAGEFLAG(Locked, locked)
199PAGEFLAG(Error, error) TESTCLEARFLAG(Error, error) 199PAGEFLAG(Error, error) TESTCLEARFLAG(Error, error)
200PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced) 200PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced)
201 __SETPAGEFLAG(Referenced, referenced)
201PAGEFLAG(Dirty, dirty) TESTSCFLAG(Dirty, dirty) __CLEARPAGEFLAG(Dirty, dirty) 202PAGEFLAG(Dirty, dirty) TESTSCFLAG(Dirty, dirty) __CLEARPAGEFLAG(Dirty, dirty)
202PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU, lru) 203PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU, lru)
203PAGEFLAG(Active, active) __CLEARPAGEFLAG(Active, active) 204PAGEFLAG(Active, active) __CLEARPAGEFLAG(Active, active)
@@ -208,6 +209,7 @@ PAGEFLAG(Pinned, pinned) TESTSCFLAG(Pinned, pinned) /* Xen */
208PAGEFLAG(SavePinned, savepinned); /* Xen */ 209PAGEFLAG(SavePinned, savepinned); /* Xen */
209PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved) 210PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved)
210PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked) 211PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked)
212 __SETPAGEFLAG(SwapBacked, swapbacked)
211 213
212__PAGEFLAG(SlobFree, slob_free) 214__PAGEFLAG(SlobFree, slob_free)
213 215
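
The two new __SETPAGEFLAG() entries generate non-atomic __SetPageReferenced() and __SetPageSwapBacked() helpers; these are only safe on pages no other CPU can see yet, e.g. a freshly allocated page that has not been added to the LRU or page cache. A minimal sketch of that pattern, assuming a kernel context (the allocation path here is hypothetical, not taken from this series):

	struct page *page = alloc_page(GFP_HIGHUSER_MOVABLE);

	if (page) {
		/* Page is not yet visible to anyone else, so the cheaper
		 * non-atomic flag setter is safe here. */
		__SetPageSwapBacked(page);
		lru_cache_add_anon(page);	/* page becomes visible now */
	}
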
diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h
index 2ee8cd2466b5..2baeee12f48e 100644
--- a/include/linux/pageblock-flags.h
+++ b/include/linux/pageblock-flags.h
@@ -30,9 +30,12 @@ enum pageblock_bits {
30 PB_migrate, 30 PB_migrate,
31 PB_migrate_end = PB_migrate + 3 - 1, 31 PB_migrate_end = PB_migrate + 3 - 1,
32 /* 3 bits required for migrate types */ 32 /* 3 bits required for migrate types */
33#ifdef CONFIG_COMPACTION
34 PB_migrate_skip,/* If set the block is skipped by compaction */ 33 PB_migrate_skip,/* If set the block is skipped by compaction */
35#endif /* CONFIG_COMPACTION */ 34
35 /*
36 * Assume the bits will always align on a word. If this assumption
37 * changes then get/set pageblock needs updating.
38 */
36 NR_PAGEBLOCK_BITS 39 NR_PAGEBLOCK_BITS
37}; 40};
38 41
@@ -62,11 +65,26 @@ extern int pageblock_order;
62/* Forward declaration */ 65/* Forward declaration */
63struct page; 66struct page;
64 67
68unsigned long get_pfnblock_flags_mask(struct page *page,
69 unsigned long pfn,
70 unsigned long end_bitidx,
71 unsigned long mask);
72
73void set_pfnblock_flags_mask(struct page *page,
74 unsigned long flags,
75 unsigned long pfn,
76 unsigned long end_bitidx,
77 unsigned long mask);
78
65/* Declarations for getting and setting flags. See mm/page_alloc.c */ 79/* Declarations for getting and setting flags. See mm/page_alloc.c */
66unsigned long get_pageblock_flags_group(struct page *page, 80#define get_pageblock_flags_group(page, start_bitidx, end_bitidx) \
67 int start_bitidx, int end_bitidx); 81 get_pfnblock_flags_mask(page, page_to_pfn(page), \
68void set_pageblock_flags_group(struct page *page, unsigned long flags, 82 end_bitidx, \
69 int start_bitidx, int end_bitidx); 83 (1 << (end_bitidx - start_bitidx + 1)) - 1)
84#define set_pageblock_flags_group(page, flags, start_bitidx, end_bitidx) \
85 set_pfnblock_flags_mask(page, flags, page_to_pfn(page), \
86 end_bitidx, \
87 (1 << (end_bitidx - start_bitidx + 1)) - 1)
70 88
71#ifdef CONFIG_COMPACTION 89#ifdef CONFIG_COMPACTION
72#define get_pageblock_skip(page) \ 90#define get_pageblock_skip(page) \
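
get_pageblock_flags_group() and set_pageblock_flags_group() are now wrappers that compute the pfn and the bit mask once and hand them to the new *_pfnblock_flags_mask() helpers. The mask expression is ordinary bit arithmetic; the following stand-alone C snippet (illustrative only, not kernel code) shows what the wrapper computes for the 3-bit migratetype field:

	#include <stdio.h>

	int main(void)
	{
		unsigned long start_bitidx = 0, end_bitidx = 2; /* 3 migratetype bits */
		unsigned long mask = (1 << (end_bitidx - start_bitidx + 1)) - 1;

		/* Prints 0x7: the wrapper passes this mask down to
		 * get_pfnblock_flags_mask()/set_pfnblock_flags_mask(). */
		printf("mask = %#lx\n", mask);
		return 0;
	}
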
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 45598f1e9aa3..0a97b583ee8d 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -110,7 +110,7 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask)
110 110
111#define page_cache_get(page) get_page(page) 111#define page_cache_get(page) get_page(page)
112#define page_cache_release(page) put_page(page) 112#define page_cache_release(page) put_page(page)
113void release_pages(struct page **pages, int nr, int cold); 113void release_pages(struct page **pages, int nr, bool cold);
114 114
115/* 115/*
116 * speculatively take a reference to a page. 116 * speculatively take a reference to a page.
@@ -259,12 +259,109 @@ pgoff_t page_cache_next_hole(struct address_space *mapping,
259pgoff_t page_cache_prev_hole(struct address_space *mapping, 259pgoff_t page_cache_prev_hole(struct address_space *mapping,
260 pgoff_t index, unsigned long max_scan); 260 pgoff_t index, unsigned long max_scan);
261 261
262#define FGP_ACCESSED 0x00000001
263#define FGP_LOCK 0x00000002
264#define FGP_CREAT 0x00000004
265#define FGP_WRITE 0x00000008
266#define FGP_NOFS 0x00000010
267#define FGP_NOWAIT 0x00000020
268
269struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
270 int fgp_flags, gfp_t cache_gfp_mask, gfp_t radix_gfp_mask);
271
272/**
273 * find_get_page - find and get a page reference
274 * @mapping: the address_space to search
275 * @offset: the page index
276 *
277 * Looks up the page cache slot at @mapping & @offset. If there is a
278 * page cache page, it is returned with an increased refcount.
279 *
280 * Otherwise, %NULL is returned.
281 */
282static inline struct page *find_get_page(struct address_space *mapping,
283 pgoff_t offset)
284{
285 return pagecache_get_page(mapping, offset, 0, 0, 0);
286}
287
288static inline struct page *find_get_page_flags(struct address_space *mapping,
289 pgoff_t offset, int fgp_flags)
290{
291 return pagecache_get_page(mapping, offset, fgp_flags, 0, 0);
292}
293
294/**
295 * find_lock_page - locate, pin and lock a pagecache page
296 * pagecache_get_page - find and get a page reference
297 * @mapping: the address_space to search
298 * @offset: the page index
299 *
300 * Looks up the page cache slot at @mapping & @offset. If there is a
301 * page cache page, it is returned locked and with an increased
302 * refcount.
303 *
304 * Otherwise, %NULL is returned.
305 *
306 * find_lock_page() may sleep.
307 */
308static inline struct page *find_lock_page(struct address_space *mapping,
309 pgoff_t offset)
310{
311 return pagecache_get_page(mapping, offset, FGP_LOCK, 0, 0);
312}
313
314/**
315 * find_or_create_page - locate or add a pagecache page
316 * @mapping: the page's address_space
317 * @index: the page's index into the mapping
318 * @gfp_mask: page allocation mode
319 *
320 * Looks up the page cache slot at @mapping & @offset. If there is a
321 * page cache page, it is returned locked and with an increased
322 * refcount.
323 *
324 * If the page is not present, a new page is allocated using @gfp_mask
325 * and added to the page cache and the VM's LRU list. The page is
326 * returned locked and with an increased refcount.
327 *
328 * On memory exhaustion, %NULL is returned.
329 *
330 * find_or_create_page() may sleep, even if @gfp_flags specifies an
331 * atomic allocation!
332 */
333static inline struct page *find_or_create_page(struct address_space *mapping,
334 pgoff_t offset, gfp_t gfp_mask)
335{
336 return pagecache_get_page(mapping, offset,
337 FGP_LOCK|FGP_ACCESSED|FGP_CREAT,
338 gfp_mask, gfp_mask & GFP_RECLAIM_MASK);
339}
340
341/**
342 * grab_cache_page_nowait - returns locked page at given index in given cache
343 * @mapping: target address_space
344 * @index: the page index
345 *
346 * Same as grab_cache_page(), but do not wait if the page is unavailable.
347 * This is intended for speculative data generators, where the data can
348 * be regenerated if the page couldn't be grabbed. This routine should
349 * be safe to call while holding the lock for another page.
350 *
351 * Clear __GFP_FS when allocating the page to avoid recursion into the fs
352 * and deadlock against the caller's locked page.
353 */
354static inline struct page *grab_cache_page_nowait(struct address_space *mapping,
355 pgoff_t index)
356{
357 return pagecache_get_page(mapping, index,
358 FGP_LOCK|FGP_CREAT|FGP_NOFS|FGP_NOWAIT,
359 mapping_gfp_mask(mapping),
360 GFP_NOFS);
361}
362
262struct page *find_get_entry(struct address_space *mapping, pgoff_t offset); 363struct page *find_get_entry(struct address_space *mapping, pgoff_t offset);
263struct page *find_get_page(struct address_space *mapping, pgoff_t offset);
264struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset); 364struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset);
265struct page *find_lock_page(struct address_space *mapping, pgoff_t offset);
266struct page *find_or_create_page(struct address_space *mapping, pgoff_t index,
267 gfp_t gfp_mask);
268unsigned find_get_entries(struct address_space *mapping, pgoff_t start, 365unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
269 unsigned int nr_entries, struct page **entries, 366 unsigned int nr_entries, struct page **entries,
270 pgoff_t *indices); 367 pgoff_t *indices);
@@ -287,8 +384,6 @@ static inline struct page *grab_cache_page(struct address_space *mapping,
287 return find_or_create_page(mapping, index, mapping_gfp_mask(mapping)); 384 return find_or_create_page(mapping, index, mapping_gfp_mask(mapping));
288} 385}
289 386
290extern struct page * grab_cache_page_nowait(struct address_space *mapping,
291 pgoff_t index);
292extern struct page * read_cache_page(struct address_space *mapping, 387extern struct page * read_cache_page(struct address_space *mapping,
293 pgoff_t index, filler_t *filler, void *data); 388 pgoff_t index, filler_t *filler, void *data);
294extern struct page * read_cache_page_gfp(struct address_space *mapping, 389extern struct page * read_cache_page_gfp(struct address_space *mapping,
@@ -425,6 +520,8 @@ static inline void wait_on_page_writeback(struct page *page)
425extern void end_page_writeback(struct page *page); 520extern void end_page_writeback(struct page *page);
426void wait_for_stable_page(struct page *page); 521void wait_for_stable_page(struct page *page);
427 522
523void page_endio(struct page *page, int rw, int err);
524
428/* 525/*
429 * Add an arbitrary waiter to a page's wait queue 526 * Add an arbitrary waiter to a page's wait queue
430 */ 527 */
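
pagecache_get_page() replaces the separate find_get_page(), find_lock_page(), find_or_create_page() and grab_cache_page_nowait() implementations with one helper steered by the FGP_* flags, and the old names become static inline wrappers. A hedged sketch of a direct call site (a hypothetical filesystem write path; the surrounding code and error handling are illustrative):

	struct page *page;

	/* Look the page up locked, mark it accessed, create it if it is
	 * missing, and avoid recursing into the filesystem (FGP_NOFS). */
	page = pagecache_get_page(mapping, index,
				  FGP_LOCK | FGP_ACCESSED | FGP_CREAT | FGP_NOFS,
				  mapping_gfp_mask(mapping), GFP_NOFS);
	if (!page)
		return -ENOMEM;
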
diff --git a/include/linux/plist.h b/include/linux/plist.h
index aa0fb390bd29..8b6c970cff6c 100644
--- a/include/linux/plist.h
+++ b/include/linux/plist.h
@@ -98,6 +98,13 @@ struct plist_node {
98} 98}
99 99
100/** 100/**
101 * PLIST_HEAD - declare and init plist_head
102 * @head: name for struct plist_head variable
103 */
104#define PLIST_HEAD(head) \
105 struct plist_head head = PLIST_HEAD_INIT(head)
106
107/**
101 * PLIST_NODE_INIT - static struct plist_node initializer 108 * PLIST_NODE_INIT - static struct plist_node initializer
102 * @node: struct plist_node variable name 109 * @node: struct plist_node variable name
103 * @__prio: initial node priority 110 * @__prio: initial node priority
@@ -134,6 +141,8 @@ static inline void plist_node_init(struct plist_node *node, int prio)
134extern void plist_add(struct plist_node *node, struct plist_head *head); 141extern void plist_add(struct plist_node *node, struct plist_head *head);
135extern void plist_del(struct plist_node *node, struct plist_head *head); 142extern void plist_del(struct plist_node *node, struct plist_head *head);
136 143
144extern void plist_requeue(struct plist_node *node, struct plist_head *head);
145
137/** 146/**
138 * plist_for_each - iterate over the plist 147 * plist_for_each - iterate over the plist
139 * @pos: the type * to use as a loop counter 148 * @pos: the type * to use as a loop counter
@@ -143,6 +152,16 @@ extern void plist_del(struct plist_node *node, struct plist_head *head);
143 list_for_each_entry(pos, &(head)->node_list, node_list) 152 list_for_each_entry(pos, &(head)->node_list, node_list)
144 153
145/** 154/**
155 * plist_for_each_continue - continue iteration over the plist
156 * @pos: the type * to use as a loop cursor
157 * @head: the head for your list
158 *
159 * Continue to iterate over plist, continuing after the current position.
160 */
161#define plist_for_each_continue(pos, head) \
162 list_for_each_entry_continue(pos, &(head)->node_list, node_list)
163
164/**
146 * plist_for_each_safe - iterate safely over a plist of given type 165 * plist_for_each_safe - iterate safely over a plist of given type
147 * @pos: the type * to use as a loop counter 166 * @pos: the type * to use as a loop counter
148 * @n: another type * to use as temporary storage 167 * @n: another type * to use as temporary storage
@@ -163,6 +182,18 @@ extern void plist_del(struct plist_node *node, struct plist_head *head);
163 list_for_each_entry(pos, &(head)->node_list, mem.node_list) 182 list_for_each_entry(pos, &(head)->node_list, mem.node_list)
164 183
165/** 184/**
185 * plist_for_each_entry_continue - continue iteration over list of given type
186 * @pos: the type * to use as a loop cursor
187 * @head: the head for your list
188 * @m: the name of the list_struct within the struct
189 *
190 * Continue to iterate over list of given type, continuing after
191 * the current position.
192 */
193#define plist_for_each_entry_continue(pos, head, m) \
194 list_for_each_entry_continue(pos, &(head)->node_list, m.node_list)
195
196/**
166 * plist_for_each_entry_safe - iterate safely over list of given type 197 * plist_for_each_entry_safe - iterate safely over list of given type
167 * @pos: the type * to use as a loop counter 198 * @pos: the type * to use as a loop counter
168 * @n: another type * to use as temporary storage 199 * @n: another type * to use as temporary storage
@@ -229,6 +260,20 @@ static inline int plist_node_empty(const struct plist_node *node)
229#endif 260#endif
230 261
231/** 262/**
263 * plist_next - get the next entry in list
264 * @pos: the type * to cursor
265 */
266#define plist_next(pos) \
267 list_next_entry(pos, node_list)
268
269/**
270 * plist_prev - get the prev entry in list
271 * @pos: the type * to cursor
272 */
273#define plist_prev(pos) \
274 list_prev_entry(pos, node_list)
275
276/**
232 * plist_first - return the first node (and thus, highest priority) 277 * plist_first - return the first node (and thus, highest priority)
233 * @head: the &struct plist_head pointer 278 * @head: the &struct plist_head pointer
234 * 279 *
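
PLIST_HEAD(), plist_requeue(), the *_continue iterators and plist_next()/plist_prev() round out the priority-list API; the swap changes later in this series are their first user. A minimal usage sketch, assuming a kernel-module context (struct item and the priorities are made up for illustration):

	struct item {
		struct plist_node node;
		int payload;
	};

	static PLIST_HEAD(item_head);	/* new declare-and-init helper */

	static void plist_example(struct item *a, struct item *b)
	{
		struct item *pos;

		plist_node_init(&a->node, 1);
		plist_node_init(&b->node, 2);
		plist_add(&a->node, &item_head);
		plist_add(&b->node, &item_head);

		/* Walk in priority order. */
		plist_for_each_entry(pos, &item_head, node)
			pr_debug("payload %d\n", pos->payload);

		/* plist_requeue() moves a node behind the other entries that
		 * share its priority, which is how the swap code round-robins
		 * between same-priority swap devices. */
		plist_requeue(&a->node, &item_head);
	}
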
diff --git a/include/linux/printk.h b/include/linux/printk.h
index 8752f7595b27..319ff7e53efb 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -30,6 +30,17 @@ static inline const char *printk_skip_level(const char *buffer)
30 return buffer; 30 return buffer;
31} 31}
32 32
33/* printk's without a loglevel use this.. */
34#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL
35
36/* We show everything that is MORE important than this.. */
37#define CONSOLE_LOGLEVEL_SILENT 0 /* Mum's the word */
38#define CONSOLE_LOGLEVEL_MIN 1 /* Minimum loglevel we let people use */
39#define CONSOLE_LOGLEVEL_QUIET 4 /* Shhh ..., when booted with "quiet" */
40#define CONSOLE_LOGLEVEL_DEFAULT 7 /* anything MORE serious than KERN_DEBUG */
41#define CONSOLE_LOGLEVEL_DEBUG 10 /* issue debug messages */
42#define CONSOLE_LOGLEVEL_MOTORMOUTH 15 /* You can't shut this one up */
43
33extern int console_printk[]; 44extern int console_printk[];
34 45
35#define console_loglevel (console_printk[0]) 46#define console_loglevel (console_printk[0])
@@ -39,13 +50,13 @@ extern int console_printk[];
39 50
40static inline void console_silent(void) 51static inline void console_silent(void)
41{ 52{
42 console_loglevel = 0; 53 console_loglevel = CONSOLE_LOGLEVEL_SILENT;
43} 54}
44 55
45static inline void console_verbose(void) 56static inline void console_verbose(void)
46{ 57{
47 if (console_loglevel) 58 if (console_loglevel)
48 console_loglevel = 15; 59 console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH;
49} 60}
50 61
51struct va_format { 62struct va_format {
@@ -128,9 +139,9 @@ asmlinkage __printf(1, 2) __cold
128int printk(const char *fmt, ...); 139int printk(const char *fmt, ...);
129 140
130/* 141/*
131 * Special printk facility for scheduler use only, _DO_NOT_USE_ ! 142 * Special printk facility for scheduler/timekeeping use only, _DO_NOT_USE_ !
132 */ 143 */
133__printf(1, 2) __cold int printk_sched(const char *fmt, ...); 144__printf(1, 2) __cold int printk_deferred(const char *fmt, ...);
134 145
135/* 146/*
136 * Please don't use printk_ratelimit(), because it shares ratelimiting state 147 * Please don't use printk_ratelimit(), because it shares ratelimiting state
@@ -165,7 +176,7 @@ int printk(const char *s, ...)
165 return 0; 176 return 0;
166} 177}
167static inline __printf(1, 2) __cold 178static inline __printf(1, 2) __cold
168int printk_sched(const char *s, ...) 179int printk_deferred(const char *s, ...)
169{ 180{
170 return 0; 181 return 0;
171} 182}
@@ -210,6 +221,12 @@ extern asmlinkage void dump_stack(void) __cold;
210#define pr_fmt(fmt) fmt 221#define pr_fmt(fmt) fmt
211#endif 222#endif
212 223
224/*
225 * These can be used to print at the various log levels.
226 * All of these will print unconditionally, although note that pr_debug()
227 * and other debug macros are compiled out unless either DEBUG is defined
228 * or CONFIG_DYNAMIC_DEBUG is set.
229 */
213#define pr_emerg(fmt, ...) \ 230#define pr_emerg(fmt, ...) \
214 printk(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__) 231 printk(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__)
215#define pr_alert(fmt, ...) \ 232#define pr_alert(fmt, ...) \
@@ -266,9 +283,20 @@ extern asmlinkage void dump_stack(void) __cold;
266 printk(fmt, ##__VA_ARGS__); \ 283 printk(fmt, ##__VA_ARGS__); \
267 } \ 284 } \
268}) 285})
286#define printk_deferred_once(fmt, ...) \
287({ \
288 static bool __print_once __read_mostly; \
289 \
290 if (!__print_once) { \
291 __print_once = true; \
292 printk_deferred(fmt, ##__VA_ARGS__); \
293 } \
294})
269#else 295#else
270#define printk_once(fmt, ...) \ 296#define printk_once(fmt, ...) \
271 no_printk(fmt, ##__VA_ARGS__) 297 no_printk(fmt, ##__VA_ARGS__)
298#define printk_deferred_once(fmt, ...) \
299 no_printk(fmt, ##__VA_ARGS__)
272#endif 300#endif
273 301
274#define pr_emerg_once(fmt, ...) \ 302#define pr_emerg_once(fmt, ...) \
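
Besides naming the magic console loglevel values, the patch renames printk_sched() to printk_deferred() (timekeeping now uses it too) and adds printk_deferred_once(). A hedged usage sketch for code that runs with scheduler or timekeeping locks held, where a synchronous printk() could deadlock (the message text is illustrative):

	/* Deferred variant: the message is stored and flushed later from a
	 * safe context; the _once form prints only the first time. */
	printk_deferred_once(KERN_WARNING "clocksource jumped backwards\n");
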
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 608e60a74c3c..9d117f61d976 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -44,6 +44,10 @@ extern int remove_proc_subtree(const char *, struct proc_dir_entry *);
44 44
45#else /* CONFIG_PROC_FS */ 45#else /* CONFIG_PROC_FS */
46 46
47static inline void proc_root_init(void)
48{
49}
50
47static inline void proc_flush_task(struct task_struct *task) 51static inline void proc_flush_task(struct task_struct *task)
48{ 52{
49} 53}
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index b66c2110cb1f..be574506e6a9 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -72,10 +72,9 @@ struct anon_vma_chain {
72}; 72};
73 73
74enum ttu_flags { 74enum ttu_flags {
75 TTU_UNMAP = 0, /* unmap mode */ 75 TTU_UNMAP = 1, /* unmap mode */
76 TTU_MIGRATION = 1, /* migration mode */ 76 TTU_MIGRATION = 2, /* migration mode */
77 TTU_MUNLOCK = 2, /* munlock mode */ 77 TTU_MUNLOCK = 4, /* munlock mode */
78 TTU_ACTION_MASK = 0xff,
79 78
80 TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */ 79 TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */
81 TTU_IGNORE_ACCESS = (1 << 9), /* don't age */ 80 TTU_IGNORE_ACCESS = (1 << 9), /* don't age */
@@ -183,14 +182,10 @@ static inline void page_dup_rmap(struct page *page)
183 */ 182 */
184int page_referenced(struct page *, int is_locked, 183int page_referenced(struct page *, int is_locked,
185 struct mem_cgroup *memcg, unsigned long *vm_flags); 184 struct mem_cgroup *memcg, unsigned long *vm_flags);
186int page_referenced_one(struct page *, struct vm_area_struct *,
187 unsigned long address, void *arg);
188 185
189#define TTU_ACTION(x) ((x) & TTU_ACTION_MASK) 186#define TTU_ACTION(x) ((x) & TTU_ACTION_MASK)
190 187
191int try_to_unmap(struct page *, enum ttu_flags flags); 188int try_to_unmap(struct page *, enum ttu_flags flags);
192int try_to_unmap_one(struct page *, struct vm_area_struct *,
193 unsigned long address, void *arg);
194 189
195/* 190/*
196 * Called from mm/filemap_xip.c to unmap empty zero page 191 * Called from mm/filemap_xip.c to unmap empty zero page
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 70f67e4e6156..8fcd0e6098d9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -137,12 +137,6 @@ struct filename;
137#define VMACACHE_MASK (VMACACHE_SIZE - 1) 137#define VMACACHE_MASK (VMACACHE_SIZE - 1)
138 138
139/* 139/*
140 * List of flags we want to share for kernel threads,
141 * if only because they are not used by them anyway.
142 */
143#define CLONE_KERNEL (CLONE_FS | CLONE_FILES | CLONE_SIGHAND)
144
145/*
146 * These are the constant used to fake the fixed-point load-average 140 * These are the constant used to fake the fixed-point load-average
147 * counting. Some notes: 141 * counting. Some notes:
148 * - 11 bit fractions expand to 22 bits by the multiplies: this gives 142 * - 11 bit fractions expand to 22 bits by the multiplies: this gives
@@ -745,7 +739,6 @@ static inline int signal_group_exit(const struct signal_struct *sig)
745struct user_struct { 739struct user_struct {
746 atomic_t __count; /* reference count */ 740 atomic_t __count; /* reference count */
747 atomic_t processes; /* How many processes does this user have? */ 741 atomic_t processes; /* How many processes does this user have? */
748 atomic_t files; /* How many open files does this user have? */
749 atomic_t sigpending; /* How many pending signals does this user have? */ 742 atomic_t sigpending; /* How many pending signals does this user have? */
750#ifdef CONFIG_INOTIFY_USER 743#ifdef CONFIG_INOTIFY_USER
751 atomic_t inotify_watches; /* How many inotify watches does this user have? */ 744 atomic_t inotify_watches; /* How many inotify watches does this user have? */
@@ -2967,7 +2960,7 @@ static inline void inc_syscw(struct task_struct *tsk)
2967#define TASK_SIZE_OF(tsk) TASK_SIZE 2960#define TASK_SIZE_OF(tsk) TASK_SIZE
2968#endif 2961#endif
2969 2962
2970#ifdef CONFIG_MM_OWNER 2963#ifdef CONFIG_MEMCG
2971extern void mm_update_next_owner(struct mm_struct *mm); 2964extern void mm_update_next_owner(struct mm_struct *mm);
2972extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p); 2965extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p);
2973#else 2966#else
@@ -2978,7 +2971,7 @@ static inline void mm_update_next_owner(struct mm_struct *mm)
2978static inline void mm_init_owner(struct mm_struct *mm, struct task_struct *p) 2971static inline void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
2979{ 2972{
2980} 2973}
2981#endif /* CONFIG_MM_OWNER */ 2974#endif /* CONFIG_MEMCG */
2982 2975
2983static inline unsigned long task_rlimit(const struct task_struct *tsk, 2976static inline unsigned long task_rlimit(const struct task_struct *tsk,
2984 unsigned int limit) 2977 unsigned int limit)
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 8045a554cafb..596a0e007c62 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -25,6 +25,10 @@ enum { sysctl_hung_task_timeout_secs = 0 };
25 * Because the kernel adds some informative sections to a image of program at 25 * Because the kernel adds some informative sections to a image of program at
26 * generating coredump, we need some margin. The number of extra sections is 26 * generating coredump, we need some margin. The number of extra sections is
27 * 1-3 now and depends on arch. We use "5" as safe margin, here. 27 * 1-3 now and depends on arch. We use "5" as safe margin, here.
28 *
29 * ELF extended numbering allows more than 65535 sections, so 16-bit bound is
30 * not a hard limit any more. Although some userspace tools can be surprised by
31 * that.
28 */ 32 */
29#define MAPCOUNT_ELF_CORE_MARGIN (5) 33#define MAPCOUNT_ELF_CORE_MARGIN (5)
30#define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) 34#define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN)
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 307bfbe62387..1d9abb7d22a0 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -116,7 +116,9 @@ struct kmem_cache *kmem_cache_create(const char *, size_t, size_t,
116 unsigned long, 116 unsigned long,
117 void (*)(void *)); 117 void (*)(void *));
118#ifdef CONFIG_MEMCG_KMEM 118#ifdef CONFIG_MEMCG_KMEM
119void kmem_cache_create_memcg(struct mem_cgroup *, struct kmem_cache *); 119struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *,
120 struct kmem_cache *,
121 const char *);
120#endif 122#endif
121void kmem_cache_destroy(struct kmem_cache *); 123void kmem_cache_destroy(struct kmem_cache *);
122int kmem_cache_shrink(struct kmem_cache *); 124int kmem_cache_shrink(struct kmem_cache *);
@@ -369,16 +371,7 @@ kmem_cache_alloc_node_trace(struct kmem_cache *s,
369#include <linux/slub_def.h> 371#include <linux/slub_def.h>
370#endif 372#endif
371 373
372static __always_inline void * 374extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order);
373kmalloc_order(size_t size, gfp_t flags, unsigned int order)
374{
375 void *ret;
376
377 flags |= (__GFP_COMP | __GFP_KMEMCG);
378 ret = (void *) __get_free_pages(flags, order);
379 kmemleak_alloc(ret, size, 1, flags);
380 return ret;
381}
382 375
383#ifdef CONFIG_TRACING 376#ifdef CONFIG_TRACING
384extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order); 377extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order);
@@ -533,10 +526,7 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
533 * @memcg: pointer to the memcg this cache belongs to 526 * @memcg: pointer to the memcg this cache belongs to
534 * @list: list_head for the list of all caches in this memcg 527 * @list: list_head for the list of all caches in this memcg
535 * @root_cache: pointer to the global, root cache, this cache was derived from 528 * @root_cache: pointer to the global, root cache, this cache was derived from
536 * @dead: set to true after the memcg dies; the cache may still be around.
537 * @nr_pages: number of pages that belongs to this cache. 529 * @nr_pages: number of pages that belongs to this cache.
538 * @destroy: worker to be called whenever we are ready, or believe we may be
539 * ready, to destroy this cache.
540 */ 530 */
541struct memcg_cache_params { 531struct memcg_cache_params {
542 bool is_root_cache; 532 bool is_root_cache;
@@ -549,9 +539,7 @@ struct memcg_cache_params {
549 struct mem_cgroup *memcg; 539 struct mem_cgroup *memcg;
550 struct list_head list; 540 struct list_head list;
551 struct kmem_cache *root_cache; 541 struct kmem_cache *root_cache;
552 bool dead;
553 atomic_t nr_pages; 542 atomic_t nr_pages;
554 struct work_struct destroy;
555 }; 543 };
556 }; 544 };
557}; 545};
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 350711560753..4bdbee80eede 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -166,10 +166,10 @@ enum {
166#define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX 166#define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX
167 167
168/* 168/*
169 * Ratio between the present memory in the zone and the "gap" that 169 * Ratio between zone->managed_pages and the "gap" that above the per-zone
170 * we're allowing kswapd to shrink in addition to the per-zone high 170 * "high_wmark". While balancing nodes, We allow kswapd to shrink zones that
171 * wmark, even for zones that already have the high wmark satisfied, 171 * do not meet the (high_wmark + gap) watermark, even which already met the
172 * in order to provide better per-zone lru behavior. We are ok to 172 * high_wmark, in order to provide better per-zone lru behavior. We are ok to
173 * spend not more than 1% of the memory for this zone balancing "gap". 173 * spend not more than 1% of the memory for this zone balancing "gap".
174 */ 174 */
175#define KSWAPD_ZONE_BALANCE_GAP_RATIO 100 175#define KSWAPD_ZONE_BALANCE_GAP_RATIO 100
@@ -214,8 +214,9 @@ struct percpu_cluster {
214struct swap_info_struct { 214struct swap_info_struct {
215 unsigned long flags; /* SWP_USED etc: see above */ 215 unsigned long flags; /* SWP_USED etc: see above */
216 signed short prio; /* swap priority of this type */ 216 signed short prio; /* swap priority of this type */
217 struct plist_node list; /* entry in swap_active_head */
218 struct plist_node avail_list; /* entry in swap_avail_head */
217 signed char type; /* strange name for an index */ 219 signed char type; /* strange name for an index */
218 signed char next; /* next type on the swap list */
219 unsigned int max; /* extent of the swap_map */ 220 unsigned int max; /* extent of the swap_map */
220 unsigned char *swap_map; /* vmalloc'ed array of usage counts */ 221 unsigned char *swap_map; /* vmalloc'ed array of usage counts */
221 struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ 222 struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
@@ -255,11 +256,6 @@ struct swap_info_struct {
255 struct swap_cluster_info discard_cluster_tail; /* list tail of discard clusters */ 256 struct swap_cluster_info discard_cluster_tail; /* list tail of discard clusters */
256}; 257};
257 258
258struct swap_list_t {
259 int head; /* head of priority-ordered swapfile list */
260 int next; /* swapfile to be used next */
261};
262
263/* linux/mm/workingset.c */ 259/* linux/mm/workingset.c */
264void *workingset_eviction(struct address_space *mapping, struct page *page); 260void *workingset_eviction(struct address_space *mapping, struct page *page);
265bool workingset_refault(void *shadow); 261bool workingset_refault(void *shadow);
@@ -308,12 +304,14 @@ extern unsigned long nr_free_pagecache_pages(void);
308 304
309 305
310/* linux/mm/swap.c */ 306/* linux/mm/swap.c */
311extern void __lru_cache_add(struct page *);
312extern void lru_cache_add(struct page *); 307extern void lru_cache_add(struct page *);
308extern void lru_cache_add_anon(struct page *page);
309extern void lru_cache_add_file(struct page *page);
313extern void lru_add_page_tail(struct page *page, struct page *page_tail, 310extern void lru_add_page_tail(struct page *page, struct page *page_tail,
314 struct lruvec *lruvec, struct list_head *head); 311 struct lruvec *lruvec, struct list_head *head);
315extern void activate_page(struct page *); 312extern void activate_page(struct page *);
316extern void mark_page_accessed(struct page *); 313extern void mark_page_accessed(struct page *);
314extern void init_page_accessed(struct page *page);
317extern void lru_add_drain(void); 315extern void lru_add_drain(void);
318extern void lru_add_drain_cpu(int cpu); 316extern void lru_add_drain_cpu(int cpu);
319extern void lru_add_drain_all(void); 317extern void lru_add_drain_all(void);
@@ -323,22 +321,6 @@ extern void swap_setup(void);
323 321
324extern void add_page_to_unevictable_list(struct page *page); 322extern void add_page_to_unevictable_list(struct page *page);
325 323
326/**
327 * lru_cache_add: add a page to the page lists
328 * @page: the page to add
329 */
330static inline void lru_cache_add_anon(struct page *page)
331{
332 ClearPageActive(page);
333 __lru_cache_add(page);
334}
335
336static inline void lru_cache_add_file(struct page *page)
337{
338 ClearPageActive(page);
339 __lru_cache_add(page);
340}
341
342/* linux/mm/vmscan.c */ 324/* linux/mm/vmscan.c */
343extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, 325extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
344 gfp_t gfp_mask, nodemask_t *mask); 326 gfp_t gfp_mask, nodemask_t *mask);
@@ -496,7 +478,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
496#define free_page_and_swap_cache(page) \ 478#define free_page_and_swap_cache(page) \
497 page_cache_release(page) 479 page_cache_release(page)
498#define free_pages_and_swap_cache(pages, nr) \ 480#define free_pages_and_swap_cache(pages, nr) \
499 release_pages((pages), (nr), 0); 481 release_pages((pages), (nr), false);
500 482
501static inline void show_swap_cache_info(void) 483static inline void show_swap_cache_info(void)
502{ 484{
diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h
index e282624e8c10..388293a91e8c 100644
--- a/include/linux/swapfile.h
+++ b/include/linux/swapfile.h
@@ -6,7 +6,7 @@
6 * want to expose them to the dozens of source files that include swap.h 6 * want to expose them to the dozens of source files that include swap.h
7 */ 7 */
8extern spinlock_t swap_lock; 8extern spinlock_t swap_lock;
9extern struct swap_list_t swap_list; 9extern struct plist_head swap_active_head;
10extern struct swap_info_struct *swap_info[]; 10extern struct swap_info_struct *swap_info[];
11extern int try_to_unuse(unsigned int, bool, unsigned long); 11extern int try_to_unuse(unsigned int, bool, unsigned long);
12 12
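
With struct swap_list_t gone, the swap code keeps active devices on the plist declared here and walks it with the iterators added earlier in this series. A hedged sketch of that traversal style (not a verbatim copy of swapfile.c):

	struct swap_info_struct *si;

	spin_lock(&swap_lock);
	/* 'list' is the plist_node this series adds to swap_info_struct
	 * as its entry in swap_active_head. */
	plist_for_each_entry(si, &swap_active_head, list)
		pr_debug("swap type %d, prio %d\n", si->type, si->prio);
	spin_unlock(&swap_lock);
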
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index c0f75261a728..6adfb7bfbf44 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -54,7 +54,7 @@ static inline pgoff_t swp_offset(swp_entry_t entry)
54/* check whether a pte points to a swap entry */ 54/* check whether a pte points to a swap entry */
55static inline int is_swap_pte(pte_t pte) 55static inline int is_swap_pte(pte_t pte)
56{ 56{
57 return !pte_none(pte) && !pte_present(pte) && !pte_file(pte); 57 return !pte_none(pte) && !pte_present_nonuma(pte) && !pte_file(pte);
58} 58}
59#endif 59#endif
60 60
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index a5ffd32642fd..e7a018eaf3a2 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -116,4 +116,6 @@ static inline void swiotlb_free(void) { }
116#endif 116#endif
117 117
118extern void swiotlb_print_info(void); 118extern void swiotlb_print_info(void);
119extern int is_swiotlb_buffer(phys_addr_t paddr);
120
119#endif /* __LINUX_SWIOTLB_H */ 121#endif /* __LINUX_SWIOTLB_H */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index a4a0588c5397..b0881a0ed322 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -711,7 +711,7 @@ asmlinkage long sys_keyctl(int cmd, unsigned long arg2, unsigned long arg3,
711 711
712asmlinkage long sys_ioprio_set(int which, int who, int ioprio); 712asmlinkage long sys_ioprio_set(int which, int who, int ioprio);
713asmlinkage long sys_ioprio_get(int which, int who); 713asmlinkage long sys_ioprio_get(int which, int who);
714asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, 714asmlinkage long sys_set_mempolicy(int mode, const unsigned long __user *nmask,
715 unsigned long maxnode); 715 unsigned long maxnode);
716asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, 716asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
717 const unsigned long __user *from, 717 const unsigned long __user *from,
@@ -723,7 +723,7 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
723 int flags); 723 int flags);
724asmlinkage long sys_mbind(unsigned long start, unsigned long len, 724asmlinkage long sys_mbind(unsigned long start, unsigned long len,
725 unsigned long mode, 725 unsigned long mode,
726 unsigned long __user *nmask, 726 const unsigned long __user *nmask,
727 unsigned long maxnode, 727 unsigned long maxnode,
728 unsigned flags); 728 unsigned flags);
729asmlinkage long sys_get_mempolicy(int __user *policy, 729asmlinkage long sys_get_mempolicy(int __user *policy,
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index cb0cec94fda3..ff307b548ed3 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -61,8 +61,6 @@ extern long do_no_restart_syscall(struct restart_block *parm);
61# define THREADINFO_GFP (GFP_KERNEL | __GFP_NOTRACK) 61# define THREADINFO_GFP (GFP_KERNEL | __GFP_NOTRACK)
62#endif 62#endif
63 63
64#define THREADINFO_GFP_ACCOUNTED (THREADINFO_GFP | __GFP_KMEMCG)
65
66/* 64/*
67 * flag set/clear/test wrappers 65 * flag set/clear/test wrappers
68 * - pass TIF_xxxx constants to these functions 66 * - pass TIF_xxxx constants to these functions
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 973671ff9e7d..dda6ee521e74 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -58,7 +58,8 @@ int arch_update_cpu_topology(void);
58/* 58/*
59 * If the distance between nodes in a system is larger than RECLAIM_DISTANCE 59 * If the distance between nodes in a system is larger than RECLAIM_DISTANCE
60 * (in whatever arch specific measurement units returned by node_distance()) 60 * (in whatever arch specific measurement units returned by node_distance())
61 * then switch on zone reclaim on boot. 61 * and zone_reclaim_mode is enabled then the VM will only call zone_reclaim()
62 * on nodes within this distance.
62 */ 63 */
63#define RECLAIM_DISTANCE 30 64#define RECLAIM_DISTANCE 30
64#endif 65#endif
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 486c3972c0be..ced92345c963 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -80,6 +80,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
80 NR_TLB_LOCAL_FLUSH_ALL, 80 NR_TLB_LOCAL_FLUSH_ALL,
81 NR_TLB_LOCAL_FLUSH_ONE, 81 NR_TLB_LOCAL_FLUSH_ONE,
82#endif /* CONFIG_DEBUG_TLBFLUSH */ 82#endif /* CONFIG_DEBUG_TLBFLUSH */
83#ifdef CONFIG_DEBUG_VM_VMACACHE
84 VMACACHE_FIND_CALLS,
85 VMACACHE_FIND_HITS,
86#endif
83 NR_VM_EVENT_ITEMS 87 NR_VM_EVENT_ITEMS
84}; 88};
85 89
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 45c9cd1daf7a..82e7db7f7100 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -95,6 +95,12 @@ static inline void vm_events_fold_cpu(int cpu)
95#define count_vm_tlb_events(x, y) do { (void)(y); } while (0) 95#define count_vm_tlb_events(x, y) do { (void)(y); } while (0)
96#endif 96#endif
97 97
98#ifdef CONFIG_DEBUG_VM_VMACACHE
99#define count_vm_vmacache_event(x) count_vm_event(x)
100#else
101#define count_vm_vmacache_event(x) do {} while (0)
102#endif
103
98#define __count_zone_vm_events(item, zone, delta) \ 104#define __count_zone_vm_events(item, zone, delta) \
99 __count_vm_events(item##_NORMAL - ZONE_NORMAL + \ 105 __count_vm_events(item##_NORMAL - ZONE_NORMAL + \
100 zone_idx(zone), delta) 106 zone_idx(zone), delta)
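
The VMACACHE_FIND_* events and the count_vm_vmacache_event() wrapper compile to nothing unless CONFIG_DEBUG_VM_VMACACHE is set, so the per-task VMA cache can be instrumented without a fast-path cost. A hedged sketch of how a lookup path feeds them (the lookup helper is hypothetical, not the real mm/vmacache.c code):

	count_vm_vmacache_event(VMACACHE_FIND_CALLS);

	vma = vmacache_lookup(mm, addr);	/* hypothetical helper */
	if (vma) {
		count_vm_vmacache_event(VMACACHE_FIND_HITS);
		return vma;
	}
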
diff --git a/include/linux/zbud.h b/include/linux/zbud.h
index 2571a5cfa5fc..13af0d450bf6 100644
--- a/include/linux/zbud.h
+++ b/include/linux/zbud.h
@@ -11,7 +11,7 @@ struct zbud_ops {
11 11
12struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops); 12struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops);
13void zbud_destroy_pool(struct zbud_pool *pool); 13void zbud_destroy_pool(struct zbud_pool *pool);
14int zbud_alloc(struct zbud_pool *pool, int size, gfp_t gfp, 14int zbud_alloc(struct zbud_pool *pool, unsigned int size, gfp_t gfp,
15 unsigned long *handle); 15 unsigned long *handle);
16void zbud_free(struct zbud_pool *pool, unsigned long handle); 16void zbud_free(struct zbud_pool *pool, unsigned long handle);
17int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries); 17int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries);
diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h
index 06f544ef2f6f..c6814b917bdf 100644
--- a/include/trace/events/compaction.h
+++ b/include/trace/events/compaction.h
@@ -5,6 +5,7 @@
5#define _TRACE_COMPACTION_H 5#define _TRACE_COMPACTION_H
6 6
7#include <linux/types.h> 7#include <linux/types.h>
8#include <linux/list.h>
8#include <linux/tracepoint.h> 9#include <linux/tracepoint.h>
9#include <trace/events/gfpflags.h> 10#include <trace/events/gfpflags.h>
10 11
@@ -47,10 +48,11 @@ DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_freepages,
47 48
48TRACE_EVENT(mm_compaction_migratepages, 49TRACE_EVENT(mm_compaction_migratepages,
49 50
50 TP_PROTO(unsigned long nr_migrated, 51 TP_PROTO(unsigned long nr_all,
51 unsigned long nr_failed), 52 int migrate_rc,
53 struct list_head *migratepages),
52 54
53 TP_ARGS(nr_migrated, nr_failed), 55 TP_ARGS(nr_all, migrate_rc, migratepages),
54 56
55 TP_STRUCT__entry( 57 TP_STRUCT__entry(
56 __field(unsigned long, nr_migrated) 58 __field(unsigned long, nr_migrated)
@@ -58,7 +60,22 @@ TRACE_EVENT(mm_compaction_migratepages,
58 ), 60 ),
59 61
60 TP_fast_assign( 62 TP_fast_assign(
61 __entry->nr_migrated = nr_migrated; 63 unsigned long nr_failed = 0;
64 struct list_head *page_lru;
65
66 /*
67 * migrate_pages() returns either a non-negative number
68 * with the number of pages that failed migration, or an
69 * error code, in which case we need to count the remaining
70 * pages manually
71 */
72 if (migrate_rc >= 0)
73 nr_failed = migrate_rc;
74 else
75 list_for_each(page_lru, migratepages)
76 nr_failed++;
77
78 __entry->nr_migrated = nr_all - nr_failed;
62 __entry->nr_failed = nr_failed; 79 __entry->nr_failed = nr_failed;
63 ), 80 ),
64 81
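
The tracepoint now receives the raw migrate_pages() return value plus the leftover list and derives nr_failed itself, so callers no longer have to count failures up front. A hedged sketch of the updated call site (the variable names are illustrative, not a verbatim copy of mm/compaction.c):

	/* 'nr_batch' pages were on cc->migratepages before the call and
	 * 'err' is whatever migrate_pages() returned for that batch. */
	trace_mm_compaction_migratepages(nr_batch, err, &cc->migratepages);
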
diff --git a/include/trace/events/gfpflags.h b/include/trace/events/gfpflags.h
index 1eddbf1557f2..d6fd8e5b14b7 100644
--- a/include/trace/events/gfpflags.h
+++ b/include/trace/events/gfpflags.h
@@ -34,7 +34,6 @@
34 {(unsigned long)__GFP_HARDWALL, "GFP_HARDWALL"}, \ 34 {(unsigned long)__GFP_HARDWALL, "GFP_HARDWALL"}, \
35 {(unsigned long)__GFP_THISNODE, "GFP_THISNODE"}, \ 35 {(unsigned long)__GFP_THISNODE, "GFP_THISNODE"}, \
36 {(unsigned long)__GFP_RECLAIMABLE, "GFP_RECLAIMABLE"}, \ 36 {(unsigned long)__GFP_RECLAIMABLE, "GFP_RECLAIMABLE"}, \
37 {(unsigned long)__GFP_KMEMCG, "GFP_KMEMCG"}, \
38 {(unsigned long)__GFP_MOVABLE, "GFP_MOVABLE"}, \ 37 {(unsigned long)__GFP_MOVABLE, "GFP_MOVABLE"}, \
39 {(unsigned long)__GFP_NOTRACK, "GFP_NOTRACK"}, \ 38 {(unsigned long)__GFP_NOTRACK, "GFP_NOTRACK"}, \
40 {(unsigned long)__GFP_NO_KSWAPD, "GFP_NO_KSWAPD"}, \ 39 {(unsigned long)__GFP_NO_KSWAPD, "GFP_NO_KSWAPD"}, \
diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index 132a985aba8b..69590b6ffc09 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -191,6 +191,7 @@ TRACE_EVENT(mm_shrink_slab_start,
191 TP_STRUCT__entry( 191 TP_STRUCT__entry(
192 __field(struct shrinker *, shr) 192 __field(struct shrinker *, shr)
193 __field(void *, shrink) 193 __field(void *, shrink)
194 __field(int, nid)
194 __field(long, nr_objects_to_shrink) 195 __field(long, nr_objects_to_shrink)
195 __field(gfp_t, gfp_flags) 196 __field(gfp_t, gfp_flags)
196 __field(unsigned long, pgs_scanned) 197 __field(unsigned long, pgs_scanned)
@@ -203,6 +204,7 @@ TRACE_EVENT(mm_shrink_slab_start,
203 TP_fast_assign( 204 TP_fast_assign(
204 __entry->shr = shr; 205 __entry->shr = shr;
205 __entry->shrink = shr->scan_objects; 206 __entry->shrink = shr->scan_objects;
207 __entry->nid = sc->nid;
206 __entry->nr_objects_to_shrink = nr_objects_to_shrink; 208 __entry->nr_objects_to_shrink = nr_objects_to_shrink;
207 __entry->gfp_flags = sc->gfp_mask; 209 __entry->gfp_flags = sc->gfp_mask;
208 __entry->pgs_scanned = pgs_scanned; 210 __entry->pgs_scanned = pgs_scanned;
@@ -212,9 +214,10 @@ TRACE_EVENT(mm_shrink_slab_start,
212 __entry->total_scan = total_scan; 214 __entry->total_scan = total_scan;
213 ), 215 ),
214 216
215 TP_printk("%pF %p: objects to shrink %ld gfp_flags %s pgs_scanned %ld lru_pgs %ld cache items %ld delta %lld total_scan %ld", 217 TP_printk("%pF %p: nid: %d objects to shrink %ld gfp_flags %s pgs_scanned %ld lru_pgs %ld cache items %ld delta %lld total_scan %ld",
216 __entry->shrink, 218 __entry->shrink,
217 __entry->shr, 219 __entry->shr,
220 __entry->nid,
218 __entry->nr_objects_to_shrink, 221 __entry->nr_objects_to_shrink,
219 show_gfp_flags(__entry->gfp_flags), 222 show_gfp_flags(__entry->gfp_flags),
220 __entry->pgs_scanned, 223 __entry->pgs_scanned,
@@ -225,13 +228,15 @@ TRACE_EVENT(mm_shrink_slab_start,
225); 228);
226 229
227TRACE_EVENT(mm_shrink_slab_end, 230TRACE_EVENT(mm_shrink_slab_end,
228 TP_PROTO(struct shrinker *shr, int shrinker_retval, 231 TP_PROTO(struct shrinker *shr, int nid, int shrinker_retval,
229 long unused_scan_cnt, long new_scan_cnt), 232 long unused_scan_cnt, long new_scan_cnt, long total_scan),
230 233
231 TP_ARGS(shr, shrinker_retval, unused_scan_cnt, new_scan_cnt), 234 TP_ARGS(shr, nid, shrinker_retval, unused_scan_cnt, new_scan_cnt,
235 total_scan),
232 236
233 TP_STRUCT__entry( 237 TP_STRUCT__entry(
234 __field(struct shrinker *, shr) 238 __field(struct shrinker *, shr)
239 __field(int, nid)
235 __field(void *, shrink) 240 __field(void *, shrink)
236 __field(long, unused_scan) 241 __field(long, unused_scan)
237 __field(long, new_scan) 242 __field(long, new_scan)
@@ -241,16 +246,18 @@ TRACE_EVENT(mm_shrink_slab_end,
241 246
242 TP_fast_assign( 247 TP_fast_assign(
243 __entry->shr = shr; 248 __entry->shr = shr;
249 __entry->nid = nid;
244 __entry->shrink = shr->scan_objects; 250 __entry->shrink = shr->scan_objects;
245 __entry->unused_scan = unused_scan_cnt; 251 __entry->unused_scan = unused_scan_cnt;
246 __entry->new_scan = new_scan_cnt; 252 __entry->new_scan = new_scan_cnt;
247 __entry->retval = shrinker_retval; 253 __entry->retval = shrinker_retval;
248 __entry->total_scan = new_scan_cnt - unused_scan_cnt; 254 __entry->total_scan = total_scan;
249 ), 255 ),
250 256
251 TP_printk("%pF %p: unused scan count %ld new scan count %ld total_scan %ld last shrinker return val %d", 257 TP_printk("%pF %p: nid: %d unused scan count %ld new scan count %ld total_scan %ld last shrinker return val %d",
252 __entry->shrink, 258 __entry->shrink,
253 __entry->shr, 259 __entry->shr,
260 __entry->nid,
254 __entry->unused_scan, 261 __entry->unused_scan,
255 __entry->new_scan, 262 __entry->new_scan,
256 __entry->total_scan, 263 __entry->total_scan,
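
Both shrinker tracepoints now record the NUMA node being shrunk, and mm_shrink_slab_end takes the caller's total_scan instead of recomputing it. A hedged sketch of the end-of-scan call (argument names follow the usual vmscan convention but are illustrative):

	trace_mm_shrink_slab_end(shrinker, nid, freed, nr, new_nr, total_scan);
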
diff --git a/init/Kconfig b/init/Kconfig
index 9d3585bb2a7a..9d76b99af1b9 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -261,6 +261,16 @@ config POSIX_MQUEUE_SYSCTL
261 depends on SYSCTL 261 depends on SYSCTL
262 default y 262 default y
263 263
264config CROSS_MEMORY_ATTACH
265 bool "Enable process_vm_readv/writev syscalls"
266 depends on MMU
267 default y
268 help
269 Enabling this option adds the system calls process_vm_readv and
270 process_vm_writev which allow a process with the correct privileges
271 to directly read from or write to to another process's address space.
272 See the man page for more details.
273
264config FHANDLE 274config FHANDLE
265 bool "open by fhandle syscalls" 275 bool "open by fhandle syscalls"
266 select EXPORTFS 276 select EXPORTFS
@@ -933,7 +943,6 @@ config RESOURCE_COUNTERS
933config MEMCG 943config MEMCG
934 bool "Memory Resource Controller for Control Groups" 944 bool "Memory Resource Controller for Control Groups"
935 depends on RESOURCE_COUNTERS 945 depends on RESOURCE_COUNTERS
936 select MM_OWNER
937 select EVENTFD 946 select EVENTFD
938 help 947 help
939 Provides a memory resource controller that manages both anonymous 948 Provides a memory resource controller that manages both anonymous
@@ -951,9 +960,6 @@ config MEMCG
951 disable memory resource controller and you can avoid overheads. 960 disable memory resource controller and you can avoid overheads.
952 (and lose benefits of memory resource controller) 961 (and lose benefits of memory resource controller)
953 962
954 This config option also selects MM_OWNER config option, which
955 could in turn add some fork/exit overhead.
956
957config MEMCG_SWAP 963config MEMCG_SWAP
958 bool "Memory Resource Controller Swap Extension" 964 bool "Memory Resource Controller Swap Extension"
959 depends on MEMCG && SWAP 965 depends on MEMCG && SWAP
@@ -996,6 +1002,12 @@ config MEMCG_KMEM
996 the kmem extension can use it to guarantee that no group of processes 1002 the kmem extension can use it to guarantee that no group of processes
997 will ever exhaust kernel resources alone. 1003 will ever exhaust kernel resources alone.
998 1004
1005 WARNING: Current implementation lacks reclaim support. That means
1006 allocation attempts will fail when close to the limit even if there
1007 are plenty of kmem available for reclaim. That makes this option
1008 unusable in real life so DO NOT SELECT IT unless for development
1009 purposes.
1010
999config CGROUP_HUGETLB 1011config CGROUP_HUGETLB
1000 bool "HugeTLB Resource Controller for Control Groups" 1012 bool "HugeTLB Resource Controller for Control Groups"
1001 depends on RESOURCE_COUNTERS && HUGETLB_PAGE 1013 depends on RESOURCE_COUNTERS && HUGETLB_PAGE
@@ -1173,9 +1185,6 @@ config SCHED_AUTOGROUP
1173 desktop applications. Task group autogeneration is currently based 1185 desktop applications. Task group autogeneration is currently based
1174 upon task session. 1186 upon task session.
1175 1187
1176config MM_OWNER
1177 bool
1178
1179config SYSFS_DEPRECATED 1188config SYSFS_DEPRECATED
1180 bool "Enable deprecated sysfs features to support old userspace tools" 1189 bool "Enable deprecated sysfs features to support old userspace tools"
1181 depends on SYSFS 1190 depends on SYSFS
@@ -1304,6 +1313,16 @@ config UID16
1304 help 1313 help
1305 This enables the legacy 16-bit UID syscall wrappers. 1314 This enables the legacy 16-bit UID syscall wrappers.
1306 1315
1316config SGETMASK_SYSCALL
1317 bool "sgetmask/ssetmask syscalls support" if EXPERT
1318 def_bool PARISC || MN10300 || BLACKFIN || M68K || PPC || MIPS || X86 || SPARC || CRIS || MICROBLAZE || SUPERH
1319 ---help---
1320 sys_sgetmask and sys_ssetmask are obsolete system calls
1321 no longer supported in libc but still enabled by default in some
1322 architectures.
1323
1324 If unsure, leave the default option here.
1325
1307config SYSFS_SYSCALL 1326config SYSFS_SYSCALL
1308 bool "Sysfs syscall support" if EXPERT 1327 bool "Sysfs syscall support" if EXPERT
1309 default y 1328 default y
diff --git a/init/main.c b/init/main.c
index 48655ceb66f4..17d47bcdf573 100644
--- a/init/main.c
+++ b/init/main.c
@@ -77,6 +77,7 @@
77#include <linux/sched_clock.h> 77#include <linux/sched_clock.h>
78#include <linux/context_tracking.h> 78#include <linux/context_tracking.h>
79#include <linux/random.h> 79#include <linux/random.h>
80#include <linux/list.h>
80 81
81#include <asm/io.h> 82#include <asm/io.h>
82#include <asm/bugs.h> 83#include <asm/bugs.h>
@@ -203,13 +204,13 @@ EXPORT_SYMBOL(loops_per_jiffy);
203 204
204static int __init debug_kernel(char *str) 205static int __init debug_kernel(char *str)
205{ 206{
206 console_loglevel = 10; 207 console_loglevel = CONSOLE_LOGLEVEL_DEBUG;
207 return 0; 208 return 0;
208} 209}
209 210
210static int __init quiet_kernel(char *str) 211static int __init quiet_kernel(char *str)
211{ 212{
212 console_loglevel = 4; 213 console_loglevel = CONSOLE_LOGLEVEL_QUIET;
213 return 0; 214 return 0;
214} 215}
215 216
@@ -379,7 +380,7 @@ static noinline void __init_refok rest_init(void)
379 * the init task will end up wanting to create kthreads, which, if 380 * the init task will end up wanting to create kthreads, which, if
380 * we schedule it before we create kthreadd, will OOPS. 381 * we schedule it before we create kthreadd, will OOPS.
381 */ 382 */
382 kernel_thread(kernel_init, NULL, CLONE_FS | CLONE_SIGHAND); 383 kernel_thread(kernel_init, NULL, CLONE_FS);
383 numa_default_policy(); 384 numa_default_policy();
384 pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES); 385 pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);
385 rcu_read_lock(); 386 rcu_read_lock();
@@ -507,7 +508,6 @@ asmlinkage __visible void __init start_kernel(void)
507 page_address_init(); 508 page_address_init();
508 pr_notice("%s", linux_banner); 509 pr_notice("%s", linux_banner);
509 setup_arch(&command_line); 510 setup_arch(&command_line);
510 mm_init_owner(&init_mm, &init_task);
511 mm_init_cpumask(&init_mm); 511 mm_init_cpumask(&init_mm);
512 setup_command_line(command_line); 512 setup_command_line(command_line);
513 setup_nr_cpu_ids(); 513 setup_nr_cpu_ids();
@@ -629,9 +629,7 @@ asmlinkage __visible void __init start_kernel(void)
629 signals_init(); 629 signals_init();
630 /* rootfs populating might need page-writeback */ 630 /* rootfs populating might need page-writeback */
631 page_writeback_init(); 631 page_writeback_init();
632#ifdef CONFIG_PROC_FS
633 proc_root_init(); 632 proc_root_init();
634#endif
635 cgroup_init(); 633 cgroup_init();
636 cpuset_init(); 634 cpuset_init();
637 taskstats_init_early(); 635 taskstats_init_early();
@@ -666,19 +664,83 @@ static void __init do_ctors(void)
666bool initcall_debug; 664bool initcall_debug;
667core_param(initcall_debug, initcall_debug, bool, 0644); 665core_param(initcall_debug, initcall_debug, bool, 0644);
668 666
667#ifdef CONFIG_KALLSYMS
668struct blacklist_entry {
669 struct list_head next;
670 char *buf;
671};
672
673static __initdata_or_module LIST_HEAD(blacklisted_initcalls);
674
675static int __init initcall_blacklist(char *str)
676{
677 char *str_entry;
678 struct blacklist_entry *entry;
679
680 /* str argument is a comma-separated list of functions */
681 do {
682 str_entry = strsep(&str, ",");
683 if (str_entry) {
684 pr_debug("blacklisting initcall %s\n", str_entry);
685 entry = alloc_bootmem(sizeof(*entry));
686 entry->buf = alloc_bootmem(strlen(str_entry) + 1);
687 strcpy(entry->buf, str_entry);
688 list_add(&entry->next, &blacklisted_initcalls);
689 }
690 } while (str_entry);
691
692 return 0;
693}
694
695static bool __init_or_module initcall_blacklisted(initcall_t fn)
696{
697 struct list_head *tmp;
698 struct blacklist_entry *entry;
699 char *fn_name;
700
701 fn_name = kasprintf(GFP_KERNEL, "%pf", fn);
702 if (!fn_name)
703 return false;
704
705 list_for_each(tmp, &blacklisted_initcalls) {
706 entry = list_entry(tmp, struct blacklist_entry, next);
707 if (!strcmp(fn_name, entry->buf)) {
708 pr_debug("initcall %s blacklisted\n", fn_name);
709 kfree(fn_name);
710 return true;
711 }
712 }
713
714 kfree(fn_name);
715 return false;
716}
717#else
718static int __init initcall_blacklist(char *str)
719{
720 pr_warn("initcall_blacklist requires CONFIG_KALLSYMS\n");
721 return 0;
722}
723
724static bool __init_or_module initcall_blacklisted(initcall_t fn)
725{
726 return false;
727}
728#endif
729__setup("initcall_blacklist=", initcall_blacklist);
730
669static int __init_or_module do_one_initcall_debug(initcall_t fn) 731static int __init_or_module do_one_initcall_debug(initcall_t fn)
670{ 732{
671 ktime_t calltime, delta, rettime; 733 ktime_t calltime, delta, rettime;
672 unsigned long long duration; 734 unsigned long long duration;
673 int ret; 735 int ret;
674 736
675 pr_debug("calling %pF @ %i\n", fn, task_pid_nr(current)); 737 printk(KERN_DEBUG "calling %pF @ %i\n", fn, task_pid_nr(current));
676 calltime = ktime_get(); 738 calltime = ktime_get();
677 ret = fn(); 739 ret = fn();
678 rettime = ktime_get(); 740 rettime = ktime_get();
679 delta = ktime_sub(rettime, calltime); 741 delta = ktime_sub(rettime, calltime);
680 duration = (unsigned long long) ktime_to_ns(delta) >> 10; 742 duration = (unsigned long long) ktime_to_ns(delta) >> 10;
681 pr_debug("initcall %pF returned %d after %lld usecs\n", 743 printk(KERN_DEBUG "initcall %pF returned %d after %lld usecs\n",
682 fn, ret, duration); 744 fn, ret, duration);
683 745
684 return ret; 746 return ret;
@@ -690,6 +752,9 @@ int __init_or_module do_one_initcall(initcall_t fn)
690 int ret; 752 int ret;
691 char msgbuf[64]; 753 char msgbuf[64];
692 754
755 if (initcall_blacklisted(fn))
756 return -EPERM;
757
693 if (initcall_debug) 758 if (initcall_debug)
694 ret = do_one_initcall_debug(fn); 759 ret = do_one_initcall_debug(fn);
695 else 760 else
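
initcall_blacklist= takes a comma-separated list of initcall names and splits it with strsep(), allocating the entries from bootmem because it runs before the normal allocators exist. The same strsep() splitting pattern, as a small stand-alone user-space program (purely illustrative; the initcall names are made up):

	#define _DEFAULT_SOURCE		/* for strsep() in glibc */
	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		char buf[] = "acpi_init,foo_bar_init";
		char *str = buf, *entry;

		while ((entry = strsep(&str, ",")) != NULL)
			printf("blacklisting initcall %s\n", entry);
		return 0;
	}
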
diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c
index a5e026bc45c4..1323360d90e3 100644
--- a/kernel/backtracetest.c
+++ b/kernel/backtracetest.c
@@ -19,8 +19,8 @@
19 19
20static void backtrace_test_normal(void) 20static void backtrace_test_normal(void)
21{ 21{
22 printk("Testing a backtrace from process context.\n"); 22 pr_info("Testing a backtrace from process context.\n");
23 printk("The following trace is a kernel self test and not a bug!\n"); 23 pr_info("The following trace is a kernel self test and not a bug!\n");
24 24
25 dump_stack(); 25 dump_stack();
26} 26}
@@ -37,8 +37,8 @@ static DECLARE_TASKLET(backtrace_tasklet, &backtrace_test_irq_callback, 0);
37 37
38static void backtrace_test_irq(void) 38static void backtrace_test_irq(void)
39{ 39{
40 printk("Testing a backtrace from irq context.\n"); 40 pr_info("Testing a backtrace from irq context.\n");
41 printk("The following trace is a kernel self test and not a bug!\n"); 41 pr_info("The following trace is a kernel self test and not a bug!\n");
42 42
43 init_completion(&backtrace_work); 43 init_completion(&backtrace_work);
44 tasklet_schedule(&backtrace_tasklet); 44 tasklet_schedule(&backtrace_tasklet);
@@ -51,8 +51,8 @@ static void backtrace_test_saved(void)
51 struct stack_trace trace; 51 struct stack_trace trace;
52 unsigned long entries[8]; 52 unsigned long entries[8];
53 53
54 printk("Testing a saved backtrace.\n"); 54 pr_info("Testing a saved backtrace.\n");
55 printk("The following trace is a kernel self test and not a bug!\n"); 55 pr_info("The following trace is a kernel self test and not a bug!\n");
56 56
57 trace.nr_entries = 0; 57 trace.nr_entries = 0;
58 trace.max_entries = ARRAY_SIZE(entries); 58 trace.max_entries = ARRAY_SIZE(entries);
@@ -65,19 +65,19 @@ static void backtrace_test_saved(void)
65#else 65#else
66static void backtrace_test_saved(void) 66static void backtrace_test_saved(void)
67{ 67{
68 printk("Saved backtrace test skipped.\n"); 68 pr_info("Saved backtrace test skipped.\n");
69} 69}
70#endif 70#endif
71 71
72static int backtrace_regression_test(void) 72static int backtrace_regression_test(void)
73{ 73{
74 printk("====[ backtrace testing ]===========\n"); 74 pr_info("====[ backtrace testing ]===========\n");
75 75
76 backtrace_test_normal(); 76 backtrace_test_normal();
77 backtrace_test_irq(); 77 backtrace_test_irq();
78 backtrace_test_saved(); 78 backtrace_test_saved();
79 79
80 printk("====[ end of backtrace testing ]====\n"); 80 pr_info("====[ end of backtrace testing ]====\n");
81 return 0; 81 return 0;
82} 82}
83 83
diff --git a/kernel/capability.c b/kernel/capability.c
index a8d63df0c322..84b2bbf443e7 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -24,7 +24,6 @@
24 */ 24 */
25 25
26const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET; 26const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET;
27
28EXPORT_SYMBOL(__cap_empty_set); 27EXPORT_SYMBOL(__cap_empty_set);
29 28
30int file_caps_enabled = 1; 29int file_caps_enabled = 1;
@@ -189,7 +188,7 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
189 * 188 *
190 * An alternative would be to return an error here 189 * An alternative would be to return an error here
191 * (-ERANGE), but that causes legacy applications to 190 * (-ERANGE), but that causes legacy applications to
192 * unexpectidly fail; the capget/modify/capset aborts 191 * unexpectedly fail; the capget/modify/capset aborts
193 * before modification is attempted and the application 192 * before modification is attempted and the application
194 * fails. 193 * fails.
195 */ 194 */
@@ -395,7 +394,8 @@ EXPORT_SYMBOL(ns_capable);
395 * This does not set PF_SUPERPRIV because the caller may not 394 * This does not set PF_SUPERPRIV because the caller may not
396 * actually be privileged. 395 * actually be privileged.
397 */ 396 */
398bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap) 397bool file_ns_capable(const struct file *file, struct user_namespace *ns,
398 int cap)
399{ 399{
400 if (WARN_ON_ONCE(!cap_valid(cap))) 400 if (WARN_ON_ONCE(!cap_valid(cap)))
401 return false; 401 return false;
diff --git a/kernel/compat.c b/kernel/compat.c
index e40b0430b562..633394f442f8 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -157,7 +157,7 @@ static int __compat_put_timespec(const struct timespec *ts, struct compat_timesp
157int compat_get_timeval(struct timeval *tv, const void __user *utv) 157int compat_get_timeval(struct timeval *tv, const void __user *utv)
158{ 158{
159 if (COMPAT_USE_64BIT_TIME) 159 if (COMPAT_USE_64BIT_TIME)
160 return copy_from_user(tv, utv, sizeof *tv) ? -EFAULT : 0; 160 return copy_from_user(tv, utv, sizeof(*tv)) ? -EFAULT : 0;
161 else 161 else
162 return __compat_get_timeval(tv, utv); 162 return __compat_get_timeval(tv, utv);
163} 163}
@@ -166,7 +166,7 @@ EXPORT_SYMBOL_GPL(compat_get_timeval);
166int compat_put_timeval(const struct timeval *tv, void __user *utv) 166int compat_put_timeval(const struct timeval *tv, void __user *utv)
167{ 167{
168 if (COMPAT_USE_64BIT_TIME) 168 if (COMPAT_USE_64BIT_TIME)
169 return copy_to_user(utv, tv, sizeof *tv) ? -EFAULT : 0; 169 return copy_to_user(utv, tv, sizeof(*tv)) ? -EFAULT : 0;
170 else 170 else
171 return __compat_put_timeval(tv, utv); 171 return __compat_put_timeval(tv, utv);
172} 172}
@@ -175,7 +175,7 @@ EXPORT_SYMBOL_GPL(compat_put_timeval);
175int compat_get_timespec(struct timespec *ts, const void __user *uts) 175int compat_get_timespec(struct timespec *ts, const void __user *uts)
176{ 176{
177 if (COMPAT_USE_64BIT_TIME) 177 if (COMPAT_USE_64BIT_TIME)
178 return copy_from_user(ts, uts, sizeof *ts) ? -EFAULT : 0; 178 return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0;
179 else 179 else
180 return __compat_get_timespec(ts, uts); 180 return __compat_get_timespec(ts, uts);
181} 181}
@@ -184,7 +184,7 @@ EXPORT_SYMBOL_GPL(compat_get_timespec);
184int compat_put_timespec(const struct timespec *ts, void __user *uts) 184int compat_put_timespec(const struct timespec *ts, void __user *uts)
185{ 185{
186 if (COMPAT_USE_64BIT_TIME) 186 if (COMPAT_USE_64BIT_TIME)
187 return copy_to_user(uts, ts, sizeof *ts) ? -EFAULT : 0; 187 return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0;
188 else 188 else
189 return __compat_put_timespec(ts, uts); 189 return __compat_put_timespec(ts, uts);
190} 190}
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 247979a1b815..acf791c55b71 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -283,8 +283,7 @@ static inline void check_for_tasks(int cpu)
283 task_cputime(p, &utime, &stime); 283 task_cputime(p, &utime, &stime);
284 if (task_cpu(p) == cpu && p->state == TASK_RUNNING && 284 if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
285 (utime || stime)) 285 (utime || stime))
286 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d " 286 pr_warn("Task %s (pid = %d) is on cpu %d (state = %ld, flags = %x)\n",
287 "(state = %ld, flags = %x)\n",
288 p->comm, task_pid_nr(p), cpu, 287 p->comm, task_pid_nr(p), cpu,
289 p->state, p->flags); 288 p->state, p->flags);
290 } 289 }
@@ -336,8 +335,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
336 if (err) { 335 if (err) {
337 nr_calls--; 336 nr_calls--;
338 __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); 337 __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL);
339 printk("%s: attempt to take down CPU %u failed\n", 338 pr_warn("%s: attempt to take down CPU %u failed\n",
340 __func__, cpu); 339 __func__, cpu);
341 goto out_release; 340 goto out_release;
342 } 341 }
343 342
@@ -444,8 +443,8 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen)
444 ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); 443 ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
445 if (ret) { 444 if (ret) {
446 nr_calls--; 445 nr_calls--;
447 printk(KERN_WARNING "%s: attempt to bring up CPU %u failed\n", 446 pr_warn("%s: attempt to bring up CPU %u failed\n",
448 __func__, cpu); 447 __func__, cpu);
449 goto out_notify; 448 goto out_notify;
450 } 449 }
451 450
@@ -475,11 +474,10 @@ int cpu_up(unsigned int cpu)
475 int err = 0; 474 int err = 0;
476 475
477 if (!cpu_possible(cpu)) { 476 if (!cpu_possible(cpu)) {
478 printk(KERN_ERR "can't online cpu %d because it is not " 477 pr_err("can't online cpu %d because it is not configured as may-hotadd at boot time\n",
479 "configured as may-hotadd at boot time\n", cpu); 478 cpu);
480#if defined(CONFIG_IA64) 479#if defined(CONFIG_IA64)
481 printk(KERN_ERR "please check additional_cpus= boot " 480 pr_err("please check additional_cpus= boot parameter\n");
482 "parameter\n");
483#endif 481#endif
484 return -EINVAL; 482 return -EINVAL;
485 } 483 }
@@ -518,7 +516,7 @@ int disable_nonboot_cpus(void)
518 */ 516 */
519 cpumask_clear(frozen_cpus); 517 cpumask_clear(frozen_cpus);
520 518
521 printk("Disabling non-boot CPUs ...\n"); 519 pr_info("Disabling non-boot CPUs ...\n");
522 for_each_online_cpu(cpu) { 520 for_each_online_cpu(cpu) {
523 if (cpu == first_cpu) 521 if (cpu == first_cpu)
524 continue; 522 continue;
@@ -526,8 +524,7 @@ int disable_nonboot_cpus(void)
526 if (!error) 524 if (!error)
527 cpumask_set_cpu(cpu, frozen_cpus); 525 cpumask_set_cpu(cpu, frozen_cpus);
528 else { 526 else {
529 printk(KERN_ERR "Error taking CPU%d down: %d\n", 527 pr_err("Error taking CPU%d down: %d\n", cpu, error);
530 cpu, error);
531 break; 528 break;
532 } 529 }
533 } 530 }
@@ -537,7 +534,7 @@ int disable_nonboot_cpus(void)
537 /* Make sure the CPUs won't be enabled by someone else */ 534 /* Make sure the CPUs won't be enabled by someone else */
538 cpu_hotplug_disabled = 1; 535 cpu_hotplug_disabled = 1;
539 } else { 536 } else {
540 printk(KERN_ERR "Non-boot CPUs are not disabled\n"); 537 pr_err("Non-boot CPUs are not disabled\n");
541 } 538 }
542 cpu_maps_update_done(); 539 cpu_maps_update_done();
543 return error; 540 return error;
@@ -561,17 +558,17 @@ void __ref enable_nonboot_cpus(void)
561 if (cpumask_empty(frozen_cpus)) 558 if (cpumask_empty(frozen_cpus))
562 goto out; 559 goto out;
563 560
564 printk(KERN_INFO "Enabling non-boot CPUs ...\n"); 561 pr_info("Enabling non-boot CPUs ...\n");
565 562
566 arch_enable_nonboot_cpus_begin(); 563 arch_enable_nonboot_cpus_begin();
567 564
568 for_each_cpu(cpu, frozen_cpus) { 565 for_each_cpu(cpu, frozen_cpus) {
569 error = _cpu_up(cpu, 1); 566 error = _cpu_up(cpu, 1);
570 if (!error) { 567 if (!error) {
571 printk(KERN_INFO "CPU%d is up\n", cpu); 568 pr_info("CPU%d is up\n", cpu);
572 continue; 569 continue;
573 } 570 }
574 printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); 571 pr_warn("Error taking CPU%d up: %d\n", cpu, error);
575 } 572 }
576 573
577 arch_enable_nonboot_cpus_end(); 574 arch_enable_nonboot_cpus_end();
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 3d54c418bd06..130017843899 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -61,12 +61,7 @@
61#include <linux/cgroup.h> 61#include <linux/cgroup.h>
62#include <linux/wait.h> 62#include <linux/wait.h>
63 63
64/* 64struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE;
65 * Tracks how many cpusets are currently defined in system.
66 * When there is only one cpuset (the root cpuset) we can
67 * short circuit some hooks.
68 */
69int number_of_cpusets __read_mostly;
70 65
71/* See "Frequency meter" comments, below. */ 66/* See "Frequency meter" comments, below. */
72 67
@@ -611,7 +606,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
611 goto done; 606 goto done;
612 } 607 }
613 608
614 csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); 609 csa = kmalloc(nr_cpusets() * sizeof(cp), GFP_KERNEL);
615 if (!csa) 610 if (!csa)
616 goto done; 611 goto done;
617 csn = 0; 612 csn = 0;
@@ -1888,7 +1883,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
1888 if (is_spread_slab(parent)) 1883 if (is_spread_slab(parent))
1889 set_bit(CS_SPREAD_SLAB, &cs->flags); 1884 set_bit(CS_SPREAD_SLAB, &cs->flags);
1890 1885
1891 number_of_cpusets++; 1886 cpuset_inc();
1892 1887
1893 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) 1888 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
1894 goto out_unlock; 1889 goto out_unlock;
@@ -1939,7 +1934,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
1939 if (is_sched_load_balance(cs)) 1934 if (is_sched_load_balance(cs))
1940 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); 1935 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
1941 1936
1942 number_of_cpusets--; 1937 cpuset_dec();
1943 clear_bit(CS_ONLINE, &cs->flags); 1938 clear_bit(CS_ONLINE, &cs->flags);
1944 1939
1945 mutex_unlock(&cpuset_mutex); 1940 mutex_unlock(&cpuset_mutex);
@@ -1992,7 +1987,6 @@ int __init cpuset_init(void)
1992 if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)) 1987 if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL))
1993 BUG(); 1988 BUG();
1994 1989
1995 number_of_cpusets = 1;
1996 return 0; 1990 return 0;
1997} 1991}
1998 1992
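
The cpuset.c hunks above drop the number_of_cpusets counter in favour of a static key plus cpuset_inc()/cpuset_dec()/nr_cpusets() helpers that live in a header not shown in this diff. A reconstructed sketch of what such helpers could look like (the bodies below are an assumption based on the calls above, not copied from the patch; the real definitions may differ):

#include <linux/jump_label.h>
#include <linux/atomic.h>

extern struct static_key cpusets_enabled_key;	/* defined in kernel/cpuset.c above */

static inline bool cpusets_enabled(void)
{
	/* patched-out branch while only the root cpuset exists */
	return static_key_false(&cpusets_enabled_key);
}

static inline void cpuset_inc(void)
{
	static_key_slow_inc(&cpusets_enabled_key);
}

static inline void cpuset_dec(void)
{
	static_key_slow_dec(&cpusets_enabled_key);
}

static inline int nr_cpusets(void)
{
	/* key reference count plus the ever-present root cpuset */
	return atomic_read(&cpusets_enabled_key.enabled) + 1;
}
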
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
index b03e0e814e43..fe15fff5df53 100644
--- a/kernel/debug/kdb/kdb_bt.c
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -21,7 +21,7 @@
21static void kdb_show_stack(struct task_struct *p, void *addr) 21static void kdb_show_stack(struct task_struct *p, void *addr)
22{ 22{
23 int old_lvl = console_loglevel; 23 int old_lvl = console_loglevel;
24 console_loglevel = 15; 24 console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH;
25 kdb_trap_printk++; 25 kdb_trap_printk++;
26 kdb_set_current_task(p); 26 kdb_set_current_task(p);
27 if (addr) { 27 if (addr) {
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 14ff4849262c..7c70812caea5 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -710,7 +710,7 @@ kdb_printit:
710 } 710 }
711 if (logging) { 711 if (logging) {
712 saved_loglevel = console_loglevel; 712 saved_loglevel = console_loglevel;
713 console_loglevel = 0; 713 console_loglevel = CONSOLE_LOGLEVEL_SILENT;
714 printk(KERN_INFO "%s", kdb_buffer); 714 printk(KERN_INFO "%s", kdb_buffer);
715 } 715 }
716 716
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 0b097c8a1e50..2f7c760305ca 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -1091,7 +1091,7 @@ static int kdb_reboot(int argc, const char **argv)
1091static void kdb_dumpregs(struct pt_regs *regs) 1091static void kdb_dumpregs(struct pt_regs *regs)
1092{ 1092{
1093 int old_lvl = console_loglevel; 1093 int old_lvl = console_loglevel;
1094 console_loglevel = 15; 1094 console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH;
1095 kdb_trap_printk++; 1095 kdb_trap_printk++;
1096 show_regs(regs); 1096 show_regs(regs);
1097 kdb_trap_printk--; 1097 kdb_trap_printk--;
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index 0dbeae374225..83d4382f5699 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -37,7 +37,7 @@ static unsigned long ident_map[32] = {
37struct exec_domain default_exec_domain = { 37struct exec_domain default_exec_domain = {
38 .name = "Linux", /* name */ 38 .name = "Linux", /* name */
39 .handler = default_handler, /* lcall7 causes a seg fault. */ 39 .handler = default_handler, /* lcall7 causes a seg fault. */
40 .pers_low = 0, /* PER_LINUX personality. */ 40 .pers_low = 0, /* PER_LINUX personality. */
41 .pers_high = 0, /* PER_LINUX personality. */ 41 .pers_high = 0, /* PER_LINUX personality. */
42 .signal_map = ident_map, /* Identity map signals. */ 42 .signal_map = ident_map, /* Identity map signals. */
43 .signal_invmap = ident_map, /* - both ways. */ 43 .signal_invmap = ident_map, /* - both ways. */
@@ -83,7 +83,7 @@ lookup_exec_domain(unsigned int personality)
83 ep = &default_exec_domain; 83 ep = &default_exec_domain;
84out: 84out:
85 read_unlock(&exec_domains_lock); 85 read_unlock(&exec_domains_lock);
86 return (ep); 86 return ep;
87} 87}
88 88
89int 89int
@@ -110,8 +110,9 @@ register_exec_domain(struct exec_domain *ep)
110 110
111out: 111out:
112 write_unlock(&exec_domains_lock); 112 write_unlock(&exec_domains_lock);
113 return (err); 113 return err;
114} 114}
115EXPORT_SYMBOL(register_exec_domain);
115 116
116int 117int
117unregister_exec_domain(struct exec_domain *ep) 118unregister_exec_domain(struct exec_domain *ep)
@@ -133,6 +134,7 @@ unregister:
133 write_unlock(&exec_domains_lock); 134 write_unlock(&exec_domains_lock);
134 return 0; 135 return 0;
135} 136}
137EXPORT_SYMBOL(unregister_exec_domain);
136 138
137int __set_personality(unsigned int personality) 139int __set_personality(unsigned int personality)
138{ 140{
@@ -144,6 +146,7 @@ int __set_personality(unsigned int personality)
144 146
145 return 0; 147 return 0;
146} 148}
149EXPORT_SYMBOL(__set_personality);
147 150
148#ifdef CONFIG_PROC_FS 151#ifdef CONFIG_PROC_FS
149static int execdomains_proc_show(struct seq_file *m, void *v) 152static int execdomains_proc_show(struct seq_file *m, void *v)
@@ -188,8 +191,3 @@ SYSCALL_DEFINE1(personality, unsigned int, personality)
188 191
189 return old; 192 return old;
190} 193}
191
192
193EXPORT_SYMBOL(register_exec_domain);
194EXPORT_SYMBOL(unregister_exec_domain);
195EXPORT_SYMBOL(__set_personality);
diff --git a/kernel/exit.c b/kernel/exit.c
index 6ed6a1d552b5..750c2e594617 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -352,7 +352,7 @@ int disallow_signal(int sig)
352 352
353EXPORT_SYMBOL(disallow_signal); 353EXPORT_SYMBOL(disallow_signal);
354 354
355#ifdef CONFIG_MM_OWNER 355#ifdef CONFIG_MEMCG
356/* 356/*
357 * A task is exiting. If it owned this mm, find a new owner for the mm. 357 * A task is exiting. If it owned this mm, find a new owner for the mm.
358 */ 358 */
@@ -395,14 +395,18 @@ retry:
395 } 395 }
396 396
397 /* 397 /*
398 * Search through everything else. We should not get 398 * Search through everything else, we should not get here often.
399 * here often
400 */ 399 */
401 do_each_thread(g, c) { 400 for_each_process(g) {
402 if (c->mm == mm) 401 if (g->flags & PF_KTHREAD)
403 goto assign_new_owner; 402 continue;
404 } while_each_thread(g, c); 403 for_each_thread(g, c) {
405 404 if (c->mm == mm)
405 goto assign_new_owner;
406 if (c->mm)
407 break;
408 }
409 }
406 read_unlock(&tasklist_lock); 410 read_unlock(&tasklist_lock);
407 /* 411 /*
408 * We found no owner yet mm_users > 1: this implies that we are 412 * We found no owner yet mm_users > 1: this implies that we are
@@ -434,7 +438,7 @@ assign_new_owner:
434 task_unlock(c); 438 task_unlock(c);
435 put_task_struct(c); 439 put_task_struct(c);
436} 440}
437#endif /* CONFIG_MM_OWNER */ 441#endif /* CONFIG_MEMCG */
438 442
439/* 443/*
440 * Turn us into a lazy TLB process if we 444 * Turn us into a lazy TLB process if we
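
The mm_update_next_owner() rework above replaces do_each_thread()/while_each_thread() with an explicit for_each_process()/for_each_thread() walk that skips kernel threads and stops scanning a process once a thread with a different mm is seen. As a standalone illustration of the outer pattern only (the function and its purpose are invented for the sketch):

#include <linux/sched.h>

static int count_mm_users_slow(struct mm_struct *mm)
{
	struct task_struct *g, *t;
	int users = 0;

	read_lock(&tasklist_lock);
	for_each_process(g) {
		if (g->flags & PF_KTHREAD)
			continue;	/* kernel threads never own a user mm */
		for_each_thread(g, t)
			if (t->mm == mm)
				users++;
	}
	read_unlock(&tasklist_lock);
	return users;
}
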
diff --git a/kernel/fork.c b/kernel/fork.c
index 54a8d26f612f..0d53eb0dfb6f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -150,15 +150,15 @@ void __weak arch_release_thread_info(struct thread_info *ti)
150static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, 150static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
151 int node) 151 int node)
152{ 152{
153 struct page *page = alloc_pages_node(node, THREADINFO_GFP_ACCOUNTED, 153 struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP,
154 THREAD_SIZE_ORDER); 154 THREAD_SIZE_ORDER);
155 155
156 return page ? page_address(page) : NULL; 156 return page ? page_address(page) : NULL;
157} 157}
158 158
159static inline void free_thread_info(struct thread_info *ti) 159static inline void free_thread_info(struct thread_info *ti)
160{ 160{
161 free_memcg_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER); 161 free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);
162} 162}
163# else 163# else
164static struct kmem_cache *thread_info_cache; 164static struct kmem_cache *thread_info_cache;
@@ -1099,12 +1099,12 @@ static void rt_mutex_init_task(struct task_struct *p)
1099#endif 1099#endif
1100} 1100}
1101 1101
1102#ifdef CONFIG_MM_OWNER 1102#ifdef CONFIG_MEMCG
1103void mm_init_owner(struct mm_struct *mm, struct task_struct *p) 1103void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
1104{ 1104{
1105 mm->owner = p; 1105 mm->owner = p;
1106} 1106}
1107#endif /* CONFIG_MM_OWNER */ 1107#endif /* CONFIG_MEMCG */
1108 1108
1109/* 1109/*
1110 * Initialize POSIX timer handling for a single task. 1110 * Initialize POSIX timer handling for a single task.
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 06bb1417b063..06db12434d72 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -52,8 +52,10 @@ unsigned int __read_mostly sysctl_hung_task_panic =
52 52
53static int __init hung_task_panic_setup(char *str) 53static int __init hung_task_panic_setup(char *str)
54{ 54{
55 sysctl_hung_task_panic = simple_strtoul(str, NULL, 0); 55 int rc = kstrtouint(str, 0, &sysctl_hung_task_panic);
56 56
57 if (rc)
58 return rc;
57 return 1; 59 return 1;
58} 60}
59__setup("hung_task_panic=", hung_task_panic_setup); 61__setup("hung_task_panic=", hung_task_panic_setup);
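
The hung_task hunk above is one of several conversions in this series from simple_strtoul() to the checked kstrto*() helpers (kernel/reboot.c and kernel/res_counter.c below follow the same pattern). A minimal sketch of the idiom for a hypothetical boot parameter (the parameter name and variable are made up):

#include <linux/init.h>
#include <linux/kernel.h>

static unsigned int example_threshold;	/* hypothetical tunable */

static int __init example_threshold_setup(char *str)
{
	int rc = kstrtouint(str, 0, &example_threshold);

	if (rc)
		return rc;	/* malformed value: keep the default */
	return 1;		/* non-zero: parameter consumed */
}
__setup("example_threshold=", example_threshold_setup);
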
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 9a130ec06f7a..c2390f41307b 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -262,7 +262,7 @@ static void create_kthread(struct kthread_create_info *create)
262 * kthread_stop() has been called). The return value should be zero 262 * kthread_stop() has been called). The return value should be zero
263 * or a negative error number; it will be passed to kthread_stop(). 263 * or a negative error number; it will be passed to kthread_stop().
264 * 264 *
265 * Returns a task_struct or ERR_PTR(-ENOMEM). 265 * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR).
266 */ 266 */
267struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), 267struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
268 void *data, int node, 268 void *data, int node,
@@ -298,7 +298,7 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
298 * that thread. 298 * that thread.
299 */ 299 */
300 if (xchg(&create->done, NULL)) 300 if (xchg(&create->done, NULL))
301 return ERR_PTR(-ENOMEM); 301 return ERR_PTR(-EINTR);
302 /* 302 /*
303 * kthreadd (or new kernel thread) will call complete() 303 * kthreadd (or new kernel thread) will call complete()
304 * shortly. 304 * shortly.
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index a462b317f9a0..a02812743a7e 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -88,7 +88,8 @@ static void clear_global_latency_tracing(void)
88} 88}
89 89
90static void __sched 90static void __sched
91account_global_scheduler_latency(struct task_struct *tsk, struct latency_record *lat) 91account_global_scheduler_latency(struct task_struct *tsk,
92 struct latency_record *lat)
92{ 93{
93 int firstnonnull = MAXLR + 1; 94 int firstnonnull = MAXLR + 1;
94 int i; 95 int i;
@@ -255,7 +256,7 @@ static int lstats_show(struct seq_file *m, void *v)
255 break; 256 break;
256 seq_printf(m, " %ps", (void *)bt); 257 seq_printf(m, " %ps", (void *)bt);
257 } 258 }
258 seq_printf(m, "\n"); 259 seq_puts(m, "\n");
259 } 260 }
260 } 261 }
261 return 0; 262 return 0;
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 221229cf0190..ea2d5f6962ed 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -54,20 +54,16 @@
54#include "console_cmdline.h" 54#include "console_cmdline.h"
55#include "braille.h" 55#include "braille.h"
56 56
57/* printk's without a loglevel use this.. */
58#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL
59
60/* We show everything that is MORE important than this.. */
61#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */
62#define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */
63
64int console_printk[4] = { 57int console_printk[4] = {
65 DEFAULT_CONSOLE_LOGLEVEL, /* console_loglevel */ 58 CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */
66 DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */ 59 DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */
67 MINIMUM_CONSOLE_LOGLEVEL, /* minimum_console_loglevel */ 60 CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */
68 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ 61 CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */
69}; 62};
70 63
 64/* Deferred messages from sched code are marked by this special level */
65#define SCHED_MESSAGE_LOGLEVEL -2
66
71/* 67/*
72 * Low level drivers may need that to know if they can schedule in 68 * Low level drivers may need that to know if they can schedule in
73 * their unblank() callback or not. So let's export it. 69 * their unblank() callback or not. So let's export it.
@@ -91,6 +87,29 @@ static struct lockdep_map console_lock_dep_map = {
91#endif 87#endif
92 88
93/* 89/*
90 * Helper macros to handle lockdep when locking/unlocking console_sem. We use
91 * macros instead of functions so that _RET_IP_ contains useful information.
92 */
93#define down_console_sem() do { \
94 down(&console_sem);\
95 mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_);\
96} while (0)
97
98static int __down_trylock_console_sem(unsigned long ip)
99{
100 if (down_trylock(&console_sem))
101 return 1;
102 mutex_acquire(&console_lock_dep_map, 0, 1, ip);
103 return 0;
104}
105#define down_trylock_console_sem() __down_trylock_console_sem(_RET_IP_)
106
107#define up_console_sem() do { \
108 mutex_release(&console_lock_dep_map, 1, _RET_IP_);\
109 up(&console_sem);\
110} while (0)
111
112/*
94 * This is used for debugging the mess that is the VT code by 113 * This is used for debugging the mess that is the VT code by
95 * keeping track if we have the console semaphore held. It's 114 * keeping track if we have the console semaphore held. It's
96 * definitely not the perfect debug tool (we don't know if _WE_ 115 * definitely not the perfect debug tool (we don't know if _WE_
@@ -206,8 +225,9 @@ struct printk_log {
206}; 225};
207 226
208/* 227/*
209 * The logbuf_lock protects kmsg buffer, indices, counters. It is also 228 * The logbuf_lock protects kmsg buffer, indices, counters. This can be taken
210 * used in interesting ways to provide interlocking in console_unlock(); 229 * within the scheduler's rq lock. It must be released before calling
230 * console_unlock() or anything else that might wake up a process.
211 */ 231 */
212static DEFINE_RAW_SPINLOCK(logbuf_lock); 232static DEFINE_RAW_SPINLOCK(logbuf_lock);
213 233
@@ -250,9 +270,6 @@ static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
250static char *log_buf = __log_buf; 270static char *log_buf = __log_buf;
251static u32 log_buf_len = __LOG_BUF_LEN; 271static u32 log_buf_len = __LOG_BUF_LEN;
252 272
253/* cpu currently holding logbuf_lock */
254static volatile unsigned int logbuf_cpu = UINT_MAX;
255
256/* human readable text of the record */ 273/* human readable text of the record */
257static char *log_text(const struct printk_log *msg) 274static char *log_text(const struct printk_log *msg)
258{ 275{
@@ -297,34 +314,106 @@ static u32 log_next(u32 idx)
297 return idx + msg->len; 314 return idx + msg->len;
298} 315}
299 316
300/* insert record into the buffer, discard old ones, update heads */ 317/*
301static void log_store(int facility, int level, 318 * Check whether there is enough free space for the given message.
302 enum log_flags flags, u64 ts_nsec, 319 *
303 const char *dict, u16 dict_len, 320 * The same values of first_idx and next_idx mean that the buffer
304 const char *text, u16 text_len) 321 * is either empty or full.
322 *
323 * If the buffer is empty, we must respect the position of the indexes.
324 * They cannot be reset to the beginning of the buffer.
325 */
326static int logbuf_has_space(u32 msg_size, bool empty)
305{ 327{
306 struct printk_log *msg; 328 u32 free;
307 u32 size, pad_len;
308 329
309 /* number of '\0' padding bytes to next message */ 330 if (log_next_idx > log_first_idx || empty)
310 size = sizeof(struct printk_log) + text_len + dict_len; 331 free = max(log_buf_len - log_next_idx, log_first_idx);
311 pad_len = (-size) & (LOG_ALIGN - 1); 332 else
312 size += pad_len; 333 free = log_first_idx - log_next_idx;
313 334
335 /*
336 * We need space also for an empty header that signalizes wrapping
337 * of the buffer.
338 */
339 return free >= msg_size + sizeof(struct printk_log);
340}
341
342static int log_make_free_space(u32 msg_size)
343{
314 while (log_first_seq < log_next_seq) { 344 while (log_first_seq < log_next_seq) {
315 u32 free; 345 if (logbuf_has_space(msg_size, false))
346 return 0;
347 /* drop old messages until we have enough continuous space */
348 log_first_idx = log_next(log_first_idx);
349 log_first_seq++;
350 }
316 351
317 if (log_next_idx > log_first_idx) 352 /* sequence numbers are equal, so the log buffer is empty */
318 free = max(log_buf_len - log_next_idx, log_first_idx); 353 if (logbuf_has_space(msg_size, true))
319 else 354 return 0;
320 free = log_first_idx - log_next_idx;
321 355
322 if (free >= size + sizeof(struct printk_log)) 356 return -ENOMEM;
323 break; 357}
324 358
325 /* drop old messages until we have enough contiuous space */ 359/* compute the message size including the padding bytes */
326 log_first_idx = log_next(log_first_idx); 360static u32 msg_used_size(u16 text_len, u16 dict_len, u32 *pad_len)
327 log_first_seq++; 361{
362 u32 size;
363
364 size = sizeof(struct printk_log) + text_len + dict_len;
365 *pad_len = (-size) & (LOG_ALIGN - 1);
366 size += *pad_len;
367
368 return size;
369}
370
371/*
372 * Define how much of the log buffer we could take at maximum. The value
373 * must be greater than two. Note that only half of the buffer is available
374 * when the index points to the middle.
375 */
376#define MAX_LOG_TAKE_PART 4
377static const char trunc_msg[] = "<truncated>";
378
379static u32 truncate_msg(u16 *text_len, u16 *trunc_msg_len,
380 u16 *dict_len, u32 *pad_len)
381{
382 /*
383 * The message should not take the whole buffer. Otherwise, it might
384 * get removed too soon.
385 */
386 u32 max_text_len = log_buf_len / MAX_LOG_TAKE_PART;
387 if (*text_len > max_text_len)
388 *text_len = max_text_len;
389 /* enable the warning message */
390 *trunc_msg_len = strlen(trunc_msg);
391 /* disable the "dict" completely */
392 *dict_len = 0;
393 /* compute the size again, count also the warning message */
394 return msg_used_size(*text_len + *trunc_msg_len, 0, pad_len);
395}
396
397/* insert record into the buffer, discard old ones, update heads */
398static int log_store(int facility, int level,
399 enum log_flags flags, u64 ts_nsec,
400 const char *dict, u16 dict_len,
401 const char *text, u16 text_len)
402{
403 struct printk_log *msg;
404 u32 size, pad_len;
405 u16 trunc_msg_len = 0;
406
407 /* number of '\0' padding bytes to next message */
408 size = msg_used_size(text_len, dict_len, &pad_len);
409
410 if (log_make_free_space(size)) {
411 /* truncate the message if it is too long for empty buffer */
412 size = truncate_msg(&text_len, &trunc_msg_len,
413 &dict_len, &pad_len);
414 /* survive when the log buffer is too small for trunc_msg */
415 if (log_make_free_space(size))
416 return 0;
328 } 417 }
329 418
330 if (log_next_idx + size + sizeof(struct printk_log) > log_buf_len) { 419 if (log_next_idx + size + sizeof(struct printk_log) > log_buf_len) {
@@ -341,6 +430,10 @@ static void log_store(int facility, int level,
341 msg = (struct printk_log *)(log_buf + log_next_idx); 430 msg = (struct printk_log *)(log_buf + log_next_idx);
342 memcpy(log_text(msg), text, text_len); 431 memcpy(log_text(msg), text, text_len);
343 msg->text_len = text_len; 432 msg->text_len = text_len;
433 if (trunc_msg_len) {
434 memcpy(log_text(msg) + text_len, trunc_msg, trunc_msg_len);
435 msg->text_len += trunc_msg_len;
436 }
344 memcpy(log_dict(msg), dict, dict_len); 437 memcpy(log_dict(msg), dict, dict_len);
345 msg->dict_len = dict_len; 438 msg->dict_len = dict_len;
346 msg->facility = facility; 439 msg->facility = facility;
@@ -356,6 +449,8 @@ static void log_store(int facility, int level,
356 /* insert message */ 449 /* insert message */
357 log_next_idx += msg->len; 450 log_next_idx += msg->len;
358 log_next_seq++; 451 log_next_seq++;
452
453 return msg->text_len;
359} 454}
360 455
361#ifdef CONFIG_SECURITY_DMESG_RESTRICT 456#ifdef CONFIG_SECURITY_DMESG_RESTRICT
@@ -1303,7 +1398,10 @@ static void zap_locks(void)
1303 sema_init(&console_sem, 1); 1398 sema_init(&console_sem, 1);
1304} 1399}
1305 1400
1306/* Check if we have any console registered that can be called early in boot. */ 1401/*
1402 * Check if we have any console that is capable of printing while cpu is
1403 * booting or shutting down. Requires console_sem.
1404 */
1307static int have_callable_console(void) 1405static int have_callable_console(void)
1308{ 1406{
1309 struct console *con; 1407 struct console *con;
@@ -1318,10 +1416,9 @@ static int have_callable_console(void)
1318/* 1416/*
1319 * Can we actually use the console at this time on this cpu? 1417 * Can we actually use the console at this time on this cpu?
1320 * 1418 *
1321 * Console drivers may assume that per-cpu resources have 1419 * Console drivers may assume that per-cpu resources have been allocated. So
1322 * been allocated. So unless they're explicitly marked as 1420 * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't
1323 * being able to cope (CON_ANYTIME) don't call them until 1421 * call them until this CPU is officially up.
1324 * this CPU is officially up.
1325 */ 1422 */
1326static inline int can_use_console(unsigned int cpu) 1423static inline int can_use_console(unsigned int cpu)
1327{ 1424{
@@ -1333,36 +1430,24 @@ static inline int can_use_console(unsigned int cpu)
1333 * messages from a 'printk'. Return true (and with the 1430 * messages from a 'printk'. Return true (and with the
1334 * console_lock held, and 'console_locked' set) if it 1431 * console_lock held, and 'console_locked' set) if it
1335 * is successful, false otherwise. 1432 * is successful, false otherwise.
1336 *
1337 * This gets called with the 'logbuf_lock' spinlock held and
1338 * interrupts disabled. It should return with 'lockbuf_lock'
1339 * released but interrupts still disabled.
1340 */ 1433 */
1341static int console_trylock_for_printk(unsigned int cpu) 1434static int console_trylock_for_printk(void)
1342 __releases(&logbuf_lock)
1343{ 1435{
1344 int retval = 0, wake = 0; 1436 unsigned int cpu = smp_processor_id();
1345
1346 if (console_trylock()) {
1347 retval = 1;
1348 1437
1349 /* 1438 if (!console_trylock())
1350 * If we can't use the console, we need to release 1439 return 0;
1351 * the console semaphore by hand to avoid flushing 1440 /*
1352 * the buffer. We need to hold the console semaphore 1441 * If we can't use the console, we need to release the console
1353 * in order to do this test safely. 1442 * semaphore by hand to avoid flushing the buffer. We need to hold the
1354 */ 1443 * console semaphore in order to do this test safely.
1355 if (!can_use_console(cpu)) { 1444 */
1356 console_locked = 0; 1445 if (!can_use_console(cpu)) {
1357 wake = 1; 1446 console_locked = 0;
1358 retval = 0; 1447 up_console_sem();
1359 } 1448 return 0;
1360 } 1449 }
1361 logbuf_cpu = UINT_MAX; 1450 return 1;
1362 raw_spin_unlock(&logbuf_lock);
1363 if (wake)
1364 up(&console_sem);
1365 return retval;
1366} 1451}
1367 1452
1368int printk_delay_msec __read_mostly; 1453int printk_delay_msec __read_mostly;
@@ -1490,11 +1575,19 @@ asmlinkage int vprintk_emit(int facility, int level,
1490 static int recursion_bug; 1575 static int recursion_bug;
1491 static char textbuf[LOG_LINE_MAX]; 1576 static char textbuf[LOG_LINE_MAX];
1492 char *text = textbuf; 1577 char *text = textbuf;
1493 size_t text_len; 1578 size_t text_len = 0;
1494 enum log_flags lflags = 0; 1579 enum log_flags lflags = 0;
1495 unsigned long flags; 1580 unsigned long flags;
1496 int this_cpu; 1581 int this_cpu;
1497 int printed_len = 0; 1582 int printed_len = 0;
1583 bool in_sched = false;
1584 /* cpu currently holding logbuf_lock in this function */
1585 static volatile unsigned int logbuf_cpu = UINT_MAX;
1586
1587 if (level == SCHED_MESSAGE_LOGLEVEL) {
1588 level = -1;
1589 in_sched = true;
1590 }
1498 1591
1499 boot_delay_msec(level); 1592 boot_delay_msec(level);
1500 printk_delay(); 1593 printk_delay();
@@ -1516,7 +1609,8 @@ asmlinkage int vprintk_emit(int facility, int level,
1516 */ 1609 */
1517 if (!oops_in_progress && !lockdep_recursing(current)) { 1610 if (!oops_in_progress && !lockdep_recursing(current)) {
1518 recursion_bug = 1; 1611 recursion_bug = 1;
1519 goto out_restore_irqs; 1612 local_irq_restore(flags);
1613 return 0;
1520 } 1614 }
1521 zap_locks(); 1615 zap_locks();
1522 } 1616 }
@@ -1530,17 +1624,22 @@ asmlinkage int vprintk_emit(int facility, int level,
1530 "BUG: recent printk recursion!"; 1624 "BUG: recent printk recursion!";
1531 1625
1532 recursion_bug = 0; 1626 recursion_bug = 0;
1533 printed_len += strlen(recursion_msg); 1627 text_len = strlen(recursion_msg);
1534 /* emit KERN_CRIT message */ 1628 /* emit KERN_CRIT message */
1535 log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, 1629 printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0,
1536 NULL, 0, recursion_msg, printed_len); 1630 NULL, 0, recursion_msg, text_len);
1537 } 1631 }
1538 1632
1539 /* 1633 /*
1540 * The printf needs to come first; we need the syslog 1634 * The printf needs to come first; we need the syslog
1541 * prefix which might be passed-in as a parameter. 1635 * prefix which might be passed-in as a parameter.
1542 */ 1636 */
1543 text_len = vscnprintf(text, sizeof(textbuf), fmt, args); 1637 if (in_sched)
1638 text_len = scnprintf(text, sizeof(textbuf),
1639 KERN_WARNING "[sched_delayed] ");
1640
1641 text_len += vscnprintf(text + text_len,
1642 sizeof(textbuf) - text_len, fmt, args);
1544 1643
1545 /* mark and strip a trailing newline */ 1644 /* mark and strip a trailing newline */
1546 if (text_len && text[text_len-1] == '\n') { 1645 if (text_len && text[text_len-1] == '\n') {
@@ -1586,9 +1685,12 @@ asmlinkage int vprintk_emit(int facility, int level,
1586 cont_flush(LOG_NEWLINE); 1685 cont_flush(LOG_NEWLINE);
1587 1686
1588 /* buffer line if possible, otherwise store it right away */ 1687 /* buffer line if possible, otherwise store it right away */
1589 if (!cont_add(facility, level, text, text_len)) 1688 if (cont_add(facility, level, text, text_len))
1590 log_store(facility, level, lflags | LOG_CONT, 0, 1689 printed_len += text_len;
1591 dict, dictlen, text, text_len); 1690 else
1691 printed_len += log_store(facility, level,
1692 lflags | LOG_CONT, 0,
1693 dict, dictlen, text, text_len);
1592 } else { 1694 } else {
1593 bool stored = false; 1695 bool stored = false;
1594 1696
@@ -1607,26 +1709,35 @@ asmlinkage int vprintk_emit(int facility, int level,
1607 cont_flush(LOG_NEWLINE); 1709 cont_flush(LOG_NEWLINE);
1608 } 1710 }
1609 1711
1610 if (!stored) 1712 if (stored)
1611 log_store(facility, level, lflags, 0, 1713 printed_len += text_len;
1612 dict, dictlen, text, text_len); 1714 else
1715 printed_len += log_store(facility, level, lflags, 0,
1716 dict, dictlen, text, text_len);
1613 } 1717 }
1614 printed_len += text_len;
1615 1718
1719 logbuf_cpu = UINT_MAX;
1720 raw_spin_unlock(&logbuf_lock);
1721 lockdep_on();
1722 local_irq_restore(flags);
1723
1724 /* If called from the scheduler, we can not call up(). */
1725 if (in_sched)
1726 return printed_len;
1727
1728 /*
1729 * Disable preemption to avoid being preempted while holding
1730 * console_sem which would prevent anyone from printing to console
1731 */
1732 preempt_disable();
1616 /* 1733 /*
1617 * Try to acquire and then immediately release the console semaphore. 1734 * Try to acquire and then immediately release the console semaphore.
1618 * The release will print out buffers and wake up /dev/kmsg and syslog() 1735 * The release will print out buffers and wake up /dev/kmsg and syslog()
1619 * users. 1736 * users.
1620 *
1621 * The console_trylock_for_printk() function will release 'logbuf_lock'
1622 * regardless of whether it actually gets the console semaphore or not.
1623 */ 1737 */
1624 if (console_trylock_for_printk(this_cpu)) 1738 if (console_trylock_for_printk())
1625 console_unlock(); 1739 console_unlock();
1626 1740 preempt_enable();
1627 lockdep_on();
1628out_restore_irqs:
1629 local_irq_restore(flags);
1630 1741
1631 return printed_len; 1742 return printed_len;
1632} 1743}
@@ -1882,16 +1993,14 @@ void suspend_console(void)
1882 printk("Suspending console(s) (use no_console_suspend to debug)\n"); 1993 printk("Suspending console(s) (use no_console_suspend to debug)\n");
1883 console_lock(); 1994 console_lock();
1884 console_suspended = 1; 1995 console_suspended = 1;
1885 up(&console_sem); 1996 up_console_sem();
1886 mutex_release(&console_lock_dep_map, 1, _RET_IP_);
1887} 1997}
1888 1998
1889void resume_console(void) 1999void resume_console(void)
1890{ 2000{
1891 if (!console_suspend_enabled) 2001 if (!console_suspend_enabled)
1892 return; 2002 return;
1893 down(&console_sem); 2003 down_console_sem();
1894 mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_);
1895 console_suspended = 0; 2004 console_suspended = 0;
1896 console_unlock(); 2005 console_unlock();
1897} 2006}
@@ -1933,12 +2042,11 @@ void console_lock(void)
1933{ 2042{
1934 might_sleep(); 2043 might_sleep();
1935 2044
1936 down(&console_sem); 2045 down_console_sem();
1937 if (console_suspended) 2046 if (console_suspended)
1938 return; 2047 return;
1939 console_locked = 1; 2048 console_locked = 1;
1940 console_may_schedule = 1; 2049 console_may_schedule = 1;
1941 mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_);
1942} 2050}
1943EXPORT_SYMBOL(console_lock); 2051EXPORT_SYMBOL(console_lock);
1944 2052
@@ -1952,15 +2060,14 @@ EXPORT_SYMBOL(console_lock);
1952 */ 2060 */
1953int console_trylock(void) 2061int console_trylock(void)
1954{ 2062{
1955 if (down_trylock(&console_sem)) 2063 if (down_trylock_console_sem())
1956 return 0; 2064 return 0;
1957 if (console_suspended) { 2065 if (console_suspended) {
1958 up(&console_sem); 2066 up_console_sem();
1959 return 0; 2067 return 0;
1960 } 2068 }
1961 console_locked = 1; 2069 console_locked = 1;
1962 console_may_schedule = 0; 2070 console_may_schedule = 0;
1963 mutex_acquire(&console_lock_dep_map, 0, 1, _RET_IP_);
1964 return 1; 2071 return 1;
1965} 2072}
1966EXPORT_SYMBOL(console_trylock); 2073EXPORT_SYMBOL(console_trylock);
@@ -2022,7 +2129,7 @@ void console_unlock(void)
2022 bool retry; 2129 bool retry;
2023 2130
2024 if (console_suspended) { 2131 if (console_suspended) {
2025 up(&console_sem); 2132 up_console_sem();
2026 return; 2133 return;
2027 } 2134 }
2028 2135
@@ -2043,10 +2150,15 @@ again:
2043 } 2150 }
2044 2151
2045 if (console_seq < log_first_seq) { 2152 if (console_seq < log_first_seq) {
2153 len = sprintf(text, "** %u printk messages dropped ** ",
2154 (unsigned)(log_first_seq - console_seq));
2155
2046 /* messages are gone, move to first one */ 2156 /* messages are gone, move to first one */
2047 console_seq = log_first_seq; 2157 console_seq = log_first_seq;
2048 console_idx = log_first_idx; 2158 console_idx = log_first_idx;
2049 console_prev = 0; 2159 console_prev = 0;
2160 } else {
2161 len = 0;
2050 } 2162 }
2051skip: 2163skip:
2052 if (console_seq == log_next_seq) 2164 if (console_seq == log_next_seq)
@@ -2071,8 +2183,8 @@ skip:
2071 } 2183 }
2072 2184
2073 level = msg->level; 2185 level = msg->level;
2074 len = msg_print_text(msg, console_prev, false, 2186 len += msg_print_text(msg, console_prev, false,
2075 text, sizeof(text)); 2187 text + len, sizeof(text) - len);
2076 console_idx = log_next(console_idx); 2188 console_idx = log_next(console_idx);
2077 console_seq++; 2189 console_seq++;
2078 console_prev = msg->flags; 2190 console_prev = msg->flags;
@@ -2084,7 +2196,6 @@ skip:
2084 local_irq_restore(flags); 2196 local_irq_restore(flags);
2085 } 2197 }
2086 console_locked = 0; 2198 console_locked = 0;
2087 mutex_release(&console_lock_dep_map, 1, _RET_IP_);
2088 2199
2089 /* Release the exclusive_console once it is used */ 2200 /* Release the exclusive_console once it is used */
2090 if (unlikely(exclusive_console)) 2201 if (unlikely(exclusive_console))
@@ -2092,7 +2203,7 @@ skip:
2092 2203
2093 raw_spin_unlock(&logbuf_lock); 2204 raw_spin_unlock(&logbuf_lock);
2094 2205
2095 up(&console_sem); 2206 up_console_sem();
2096 2207
2097 /* 2208 /*
2098 * Someone could have filled up the buffer again, so re-check if there's 2209 * Someone could have filled up the buffer again, so re-check if there's
@@ -2137,7 +2248,7 @@ void console_unblank(void)
2137 * oops_in_progress is set to 1.. 2248 * oops_in_progress is set to 1..
2138 */ 2249 */
2139 if (oops_in_progress) { 2250 if (oops_in_progress) {
2140 if (down_trylock(&console_sem) != 0) 2251 if (down_trylock_console_sem() != 0)
2141 return; 2252 return;
2142 } else 2253 } else
2143 console_lock(); 2254 console_lock();
@@ -2438,21 +2549,19 @@ late_initcall(printk_late_init);
2438/* 2549/*
2439 * Delayed printk version, for scheduler-internal messages: 2550 * Delayed printk version, for scheduler-internal messages:
2440 */ 2551 */
2441#define PRINTK_BUF_SIZE 512
2442
2443#define PRINTK_PENDING_WAKEUP 0x01 2552#define PRINTK_PENDING_WAKEUP 0x01
2444#define PRINTK_PENDING_SCHED 0x02 2553#define PRINTK_PENDING_OUTPUT 0x02
2445 2554
2446static DEFINE_PER_CPU(int, printk_pending); 2555static DEFINE_PER_CPU(int, printk_pending);
2447static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf);
2448 2556
2449static void wake_up_klogd_work_func(struct irq_work *irq_work) 2557static void wake_up_klogd_work_func(struct irq_work *irq_work)
2450{ 2558{
2451 int pending = __this_cpu_xchg(printk_pending, 0); 2559 int pending = __this_cpu_xchg(printk_pending, 0);
2452 2560
2453 if (pending & PRINTK_PENDING_SCHED) { 2561 if (pending & PRINTK_PENDING_OUTPUT) {
2454 char *buf = __get_cpu_var(printk_sched_buf); 2562 /* If trylock fails, someone else is doing the printing */
2455 pr_warn("[sched_delayed] %s", buf); 2563 if (console_trylock())
2564 console_unlock();
2456 } 2565 }
2457 2566
2458 if (pending & PRINTK_PENDING_WAKEUP) 2567 if (pending & PRINTK_PENDING_WAKEUP)
@@ -2474,23 +2583,19 @@ void wake_up_klogd(void)
2474 preempt_enable(); 2583 preempt_enable();
2475} 2584}
2476 2585
2477int printk_sched(const char *fmt, ...) 2586int printk_deferred(const char *fmt, ...)
2478{ 2587{
2479 unsigned long flags;
2480 va_list args; 2588 va_list args;
2481 char *buf;
2482 int r; 2589 int r;
2483 2590
2484 local_irq_save(flags); 2591 preempt_disable();
2485 buf = __get_cpu_var(printk_sched_buf);
2486
2487 va_start(args, fmt); 2592 va_start(args, fmt);
2488 r = vsnprintf(buf, PRINTK_BUF_SIZE, fmt, args); 2593 r = vprintk_emit(0, SCHED_MESSAGE_LOGLEVEL, NULL, 0, fmt, args);
2489 va_end(args); 2594 va_end(args);
2490 2595
2491 __this_cpu_or(printk_pending, PRINTK_PENDING_SCHED); 2596 __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT);
2492 irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); 2597 irq_work_queue(&__get_cpu_var(wake_up_klogd_work));
2493 local_irq_restore(flags); 2598 preempt_enable();
2494 2599
2495 return r; 2600 return r;
2496} 2601}
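
The new logbuf_has_space()/log_make_free_space()/truncate_msg() helpers above centralize the record-buffer space accounting that used to be open-coded in log_store(), and log_store() now reports how much text it actually stored so vprintk_emit() can return an accurate length. Restating the space rule on its own, with the file-scope indices passed in as parameters (names invented for the sketch):

#include <linux/kernel.h>

static bool ringbuf_has_space(u32 buf_len, u32 first_idx, u32 next_idx,
			      u32 msg_size, u32 hdr_size, bool empty)
{
	u32 free;

	if (next_idx > first_idx || empty)
		/* writer ahead of reader: a record never wraps, so usable
		 * space is the larger of the tail gap and the head gap */
		free = max(buf_len - next_idx, first_idx);
	else
		free = first_idx - next_idx;

	/* keep room for one empty header that marks a wrap */
	return free >= msg_size + hdr_size;
}
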
diff --git a/kernel/reboot.c b/kernel/reboot.c
index 662c83fc16b7..a3a9e240fcdb 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -388,15 +388,22 @@ static int __init reboot_setup(char *str)
388 break; 388 break;
389 389
390 case 's': 390 case 's':
391 if (isdigit(*(str+1))) 391 {
392 reboot_cpu = simple_strtoul(str+1, NULL, 0); 392 int rc;
393 else if (str[1] == 'm' && str[2] == 'p' && 393
394 isdigit(*(str+3))) 394 if (isdigit(*(str+1))) {
395 reboot_cpu = simple_strtoul(str+3, NULL, 0); 395 rc = kstrtoint(str+1, 0, &reboot_cpu);
396 else 396 if (rc)
397 return rc;
398 } else if (str[1] == 'm' && str[2] == 'p' &&
399 isdigit(*(str+3))) {
400 rc = kstrtoint(str+3, 0, &reboot_cpu);
401 if (rc)
402 return rc;
403 } else
397 reboot_mode = REBOOT_SOFT; 404 reboot_mode = REBOOT_SOFT;
398 break; 405 break;
399 406 }
400 case 'g': 407 case 'g':
401 reboot_mode = REBOOT_GPIO; 408 reboot_mode = REBOOT_GPIO;
402 break; 409 break;
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index 51dbac6a3633..e791130f85a7 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -186,8 +186,11 @@ int res_counter_memparse_write_strategy(const char *buf,
186 186
187 /* return RES_COUNTER_MAX(unlimited) if "-1" is specified */ 187 /* return RES_COUNTER_MAX(unlimited) if "-1" is specified */
188 if (*buf == '-') { 188 if (*buf == '-') {
189 res = simple_strtoull(buf + 1, &end, 10); 189 int rc = kstrtoull(buf + 1, 10, &res);
190 if (res != 1 || *end != '\0') 190
191 if (rc)
192 return rc;
193 if (res != 1)
191 return -EINVAL; 194 return -EINVAL;
192 *resp = RES_COUNTER_MAX; 195 *resp = RES_COUNTER_MAX;
193 return 0; 196 return 0;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 913c6d6cc2c1..caf03e89a068 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1367,7 +1367,7 @@ out:
1367 * leave kernel. 1367 * leave kernel.
1368 */ 1368 */
1369 if (p->mm && printk_ratelimit()) { 1369 if (p->mm && printk_ratelimit()) {
1370 printk_sched("process %d (%s) no longer affine to cpu%d\n", 1370 printk_deferred("process %d (%s) no longer affine to cpu%d\n",
1371 task_pid_nr(p), p->comm, cpu); 1371 task_pid_nr(p), p->comm, cpu);
1372 } 1372 }
1373 } 1373 }
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index f9ca7d19781a..e1574fca03b5 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -348,12 +348,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
348 * entity. 348 * entity.
349 */ 349 */
350 if (dl_time_before(dl_se->deadline, rq_clock(rq))) { 350 if (dl_time_before(dl_se->deadline, rq_clock(rq))) {
 351 static bool lag_once = false; 351 printk_deferred_once("sched: DL replenish lagged too much\n");
352
353 if (!lag_once) {
354 lag_once = true;
355 printk_sched("sched: DL replenish lagged to much\n");
356 }
357 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; 352 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
358 dl_se->runtime = pi_se->dl_runtime; 353 dl_se->runtime = pi_se->dl_runtime;
359 } 354 }
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 0ebfd7a29472..b3512f1afce9 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -890,14 +890,8 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
890 * but accrue some time due to boosting. 890 * but accrue some time due to boosting.
891 */ 891 */
892 if (likely(rt_b->rt_runtime)) { 892 if (likely(rt_b->rt_runtime)) {
893 static bool once = false;
894
895 rt_rq->rt_throttled = 1; 893 rt_rq->rt_throttled = 1;
896 894 printk_deferred_once("sched: RT throttling activated\n");
897 if (!once) {
898 once = true;
899 printk_sched("sched: RT throttling activated\n");
900 }
901 } else { 895 } else {
902 /* 896 /*
903 * In case we did anyway, make it go away, 897 * In case we did anyway, make it go away,
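
The scheduler hunks above (and the time/ntp and timekeeping hunks below) move from printk_sched()/printk_once() to the new printk_deferred()/printk_deferred_once(), which log the message immediately but punt console output to irq_work so the call is safe under the runqueue lock. A hedged usage sketch (the function, message, and overrun scenario are invented; only the printk_deferred APIs come from the patch):

#include <linux/kernel.h>
#include <linux/printk.h>

static void report_overrun(int cpu, unsigned long long delta_ns)
{
	/* safe while holding rq->lock: the console flush happens later */
	printk_deferred(KERN_WARNING
			"sched: cpu%d ran %llu ns past its slice\n",
			cpu, delta_ns);

	/* one-shot variant, as used by the deadline/RT throttling paths */
	printk_deferred_once("sched: overrun reporting active\n");
}
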
diff --git a/kernel/signal.c b/kernel/signal.c
index 6ea13c09ae56..6e600aaa2af4 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3496,7 +3496,7 @@ COMPAT_SYSCALL_DEFINE3(sigaction, int, sig,
3496} 3496}
3497#endif 3497#endif
3498 3498
3499#ifdef __ARCH_WANT_SYS_SGETMASK 3499#ifdef CONFIG_SGETMASK_SYSCALL
3500 3500
3501/* 3501/*
3502 * For backwards compatibility. Functionality superseded by sigprocmask. 3502 * For backwards compatibility. Functionality superseded by sigprocmask.
@@ -3517,7 +3517,7 @@ SYSCALL_DEFINE1(ssetmask, int, newmask)
3517 3517
3518 return old; 3518 return old;
3519} 3519}
3520#endif /* __ARCH_WANT_SGETMASK */ 3520#endif /* CONFIG_SGETMASK_SYSCALL */
3521 3521
3522#ifdef __ARCH_WANT_SYS_SIGNAL 3522#ifdef __ARCH_WANT_SYS_SIGNAL
3523/* 3523/*
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 01fbae5b97b7..695f0c6cd169 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -307,6 +307,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
307 * @cpu: cpu to stop 307 * @cpu: cpu to stop
308 * @fn: function to execute 308 * @fn: function to execute
309 * @arg: argument to @fn 309 * @arg: argument to @fn
310 * @work_buf: pointer to cpu_stop_work structure
310 * 311 *
311 * Similar to stop_one_cpu() but doesn't wait for completion. The 312 * Similar to stop_one_cpu() but doesn't wait for completion. The
312 * caller is responsible for ensuring @work_buf is currently unused 313 * caller is responsible for ensuring @work_buf is currently unused
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index bc8d1b74a6b9..36441b51b5df 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -135,6 +135,8 @@ cond_syscall(sys_setresgid16);
135cond_syscall(sys_setresuid16); 135cond_syscall(sys_setresuid16);
136cond_syscall(sys_setreuid16); 136cond_syscall(sys_setreuid16);
137cond_syscall(sys_setuid16); 137cond_syscall(sys_setuid16);
138cond_syscall(sys_sgetmask);
139cond_syscall(sys_ssetmask);
138cond_syscall(sys_vm86old); 140cond_syscall(sys_vm86old);
139cond_syscall(sys_vm86); 141cond_syscall(sys_vm86);
140cond_syscall(sys_ipc); 142cond_syscall(sys_ipc);
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index c8780cdaf852..33db43a39515 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -786,8 +786,9 @@ static long hardpps_update_freq(struct pps_normtime freq_norm)
786 time_status |= STA_PPSERROR; 786 time_status |= STA_PPSERROR;
787 pps_errcnt++; 787 pps_errcnt++;
788 pps_dec_freq_interval(); 788 pps_dec_freq_interval();
789 pr_err("hardpps: PPSERROR: interval too long - %ld s\n", 789 printk_deferred(KERN_ERR
790 freq_norm.sec); 790 "hardpps: PPSERROR: interval too long - %ld s\n",
791 freq_norm.sec);
791 return 0; 792 return 0;
792 } 793 }
793 794
@@ -800,7 +801,8 @@ static long hardpps_update_freq(struct pps_normtime freq_norm)
800 delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT); 801 delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT);
801 pps_freq = ftemp; 802 pps_freq = ftemp;
802 if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) { 803 if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) {
803 pr_warning("hardpps: PPSWANDER: change=%ld\n", delta); 804 printk_deferred(KERN_WARNING
805 "hardpps: PPSWANDER: change=%ld\n", delta);
804 time_status |= STA_PPSWANDER; 806 time_status |= STA_PPSWANDER;
805 pps_stbcnt++; 807 pps_stbcnt++;
806 pps_dec_freq_interval(); 808 pps_dec_freq_interval();
@@ -844,8 +846,9 @@ static void hardpps_update_phase(long error)
844 * the time offset is updated. 846 * the time offset is updated.
845 */ 847 */
846 if (jitter > (pps_jitter << PPS_POPCORN)) { 848 if (jitter > (pps_jitter << PPS_POPCORN)) {
847 pr_warning("hardpps: PPSJITTER: jitter=%ld, limit=%ld\n", 849 printk_deferred(KERN_WARNING
848 jitter, (pps_jitter << PPS_POPCORN)); 850 "hardpps: PPSJITTER: jitter=%ld, limit=%ld\n",
851 jitter, (pps_jitter << PPS_POPCORN));
849 time_status |= STA_PPSJITTER; 852 time_status |= STA_PPSJITTER;
850 pps_jitcnt++; 853 pps_jitcnt++;
851 } else if (time_status & STA_PPSTIME) { 854 } else if (time_status & STA_PPSTIME) {
@@ -902,7 +905,7 @@ void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
902 time_status |= STA_PPSJITTER; 905 time_status |= STA_PPSJITTER;
903 /* restart the frequency calibration interval */ 906 /* restart the frequency calibration interval */
904 pps_fbase = *raw_ts; 907 pps_fbase = *raw_ts;
905 pr_err("hardpps: PPSJITTER: bad pulse\n"); 908 printk_deferred(KERN_ERR "hardpps: PPSJITTER: bad pulse\n");
906 return; 909 return;
907 } 910 }
908 911
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index f7df8ea21707..32d8d6aaedb8 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -852,8 +852,9 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
852 struct timespec *delta) 852 struct timespec *delta)
853{ 853{
854 if (!timespec_valid_strict(delta)) { 854 if (!timespec_valid_strict(delta)) {
855 printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " 855 printk_deferred(KERN_WARNING
856 "sleep delta value!\n"); 856 "__timekeeping_inject_sleeptime: Invalid "
857 "sleep delta value!\n");
857 return; 858 return;
858 } 859 }
859 tk_xtime_add(tk, delta); 860 tk_xtime_add(tk, delta);
@@ -1157,7 +1158,7 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
1157 1158
1158 if (unlikely(tk->clock->maxadj && 1159 if (unlikely(tk->clock->maxadj &&
1159 (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) { 1160 (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) {
1160 printk_once(KERN_WARNING 1161 printk_deferred_once(KERN_WARNING
1161 "Adjusting %s more than 11%% (%ld vs %ld)\n", 1162 "Adjusting %s more than 11%% (%ld vs %ld)\n",
1162 tk->clock->name, (long)tk->mult + adj, 1163 tk->clock->name, (long)tk->mult + adj,
1163 (long)tk->clock->mult + tk->clock->maxadj); 1164 (long)tk->clock->mult + tk->clock->maxadj);
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 6620e5837ce2..33cbd8c203f8 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -239,6 +239,7 @@ static int tracepoint_remove_func(struct tracepoint *tp,
239 * tracepoint_probe_register - Connect a probe to a tracepoint 239 * tracepoint_probe_register - Connect a probe to a tracepoint
240 * @tp: tracepoint 240 * @tp: tracepoint
241 * @probe: probe handler 241 * @probe: probe handler
242 * @data: tracepoint data
242 * 243 *
243 * Returns 0 if ok, error value on error. 244 * Returns 0 if ok, error value on error.
244 * Note: if @tp is within a module, the caller is responsible for 245 * Note: if @tp is within a module, the caller is responsible for
@@ -264,6 +265,7 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_register);
264 * tracepoint_probe_unregister - Disconnect a probe from a tracepoint 265 * tracepoint_probe_unregister - Disconnect a probe from a tracepoint
265 * @tp: tracepoint 266 * @tp: tracepoint
266 * @probe: probe function pointer 267 * @probe: probe function pointer
268 * @data: tracepoint data
267 * 269 *
268 * Returns 0 if ok, error value on error. 270 * Returns 0 if ok, error value on error.
269 */ 271 */
diff --git a/kernel/user.c b/kernel/user.c
index 294fc6a94168..4efa39350e44 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -87,7 +87,6 @@ static DEFINE_SPINLOCK(uidhash_lock);
87struct user_struct root_user = { 87struct user_struct root_user = {
88 .__count = ATOMIC_INIT(1), 88 .__count = ATOMIC_INIT(1),
89 .processes = ATOMIC_INIT(1), 89 .processes = ATOMIC_INIT(1),
90 .files = ATOMIC_INIT(0),
91 .sigpending = ATOMIC_INIT(0), 90 .sigpending = ATOMIC_INIT(0),
92 .locked_shm = 0, 91 .locked_shm = 0,
93 .uid = GLOBAL_ROOT_UID, 92 .uid = GLOBAL_ROOT_UID,
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 4f69f9a5e221..6fbe811c7ad1 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -51,7 +51,7 @@ static int proc_do_uts_string(ctl_table *table, int write,
51 int r; 51 int r;
52 memcpy(&uts_table, table, sizeof(uts_table)); 52 memcpy(&uts_table, table, sizeof(uts_table));
53 uts_table.data = get_uts(table, write); 53 uts_table.data = get_uts(table, write);
54 r = proc_dostring(&uts_table,write,buffer,lenp, ppos); 54 r = proc_dostring(&uts_table, write, buffer, lenp, ppos);
55 put_uts(table, write, uts_table.data); 55 put_uts(table, write, uts_table.data);
56 56
57 if (write) 57 if (write)
@@ -135,4 +135,4 @@ static int __init utsname_sysctl_init(void)
135 return 0; 135 return 0;
136} 136}
137 137
138__initcall(utsname_sysctl_init); 138device_initcall(utsname_sysctl_init);
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 99c8bfee1b00..ccca32264748 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -501,6 +501,16 @@ config DEBUG_VM
501 501
502 If unsure, say N. 502 If unsure, say N.
503 503
504config DEBUG_VM_VMACACHE
505 bool "Debug VMA caching"
506 depends on DEBUG_VM
507 help
508 Enable this to turn on VMA caching debug information. Doing so
509 can cause significant overhead, so only enable it in non-production
510 environments.
511
512 If unsure, say N.
513
504config DEBUG_VM_RB 514config DEBUG_VM_RB
505 bool "Debug VM red-black trees" 515 bool "Debug VM red-black trees"
506 depends on DEBUG_VM 516 depends on DEBUG_VM
@@ -823,11 +833,6 @@ config DEBUG_RT_MUTEXES
823 This allows rt mutex semantics violations and rt mutex related 833 This allows rt mutex semantics violations and rt mutex related
824 deadlocks (lockups) to be detected and reported automatically. 834 deadlocks (lockups) to be detected and reported automatically.
825 835
826config DEBUG_PI_LIST
827 bool
828 default y
829 depends on DEBUG_RT_MUTEXES
830
831config RT_MUTEX_TESTER 836config RT_MUTEX_TESTER
832 bool "Built-in scriptable tester for rt-mutexes" 837 bool "Built-in scriptable tester for rt-mutexes"
833 depends on DEBUG_KERNEL && RT_MUTEXES 838 depends on DEBUG_KERNEL && RT_MUTEXES
@@ -1053,6 +1058,16 @@ config DEBUG_LIST
1053 1058
1054 If unsure, say N. 1059 If unsure, say N.
1055 1060
1061config DEBUG_PI_LIST
1062 bool "Debug priority linked list manipulation"
1063 depends on DEBUG_KERNEL
1064 help
1065 Enable this to turn on extended checks in the priority-ordered
1066 linked-list (plist) walking routines. This checks the entire
1067 list multiple times during each manipulation.
1068
1069 If unsure, say N.
1070
1056config DEBUG_SG 1071config DEBUG_SG
1057 bool "Debug SG table operations" 1072 bool "Debug SG table operations"
1058 depends on DEBUG_KERNEL 1073 depends on DEBUG_KERNEL
diff --git a/lib/asn1_decoder.c b/lib/asn1_decoder.c
index 11b9b01fda6b..1a000bb050f9 100644
--- a/lib/asn1_decoder.c
+++ b/lib/asn1_decoder.c
@@ -140,7 +140,7 @@ error:
140 * @decoder: The decoder definition (produced by asn1_compiler) 140 * @decoder: The decoder definition (produced by asn1_compiler)
141 * @context: The caller's context (to be passed to the action functions) 141 * @context: The caller's context (to be passed to the action functions)
142 * @data: The encoded data 142 * @data: The encoded data
143 * @datasize: The size of the encoded data 143 * @datalen: The size of the encoded data
144 * 144 *
145 * Decode BER/DER/CER encoded ASN.1 data according to a bytecode pattern 145 * Decode BER/DER/CER encoded ASN.1 data according to a bytecode pattern
146 * produced by asn1_compiler. Action functions are called on marked tags to 146 * produced by asn1_compiler. Action functions are called on marked tags to
diff --git a/lib/atomic64_test.c b/lib/atomic64_test.c
index 00bca223d1e1..0211d30d8c39 100644
--- a/lib/atomic64_test.c
+++ b/lib/atomic64_test.c
@@ -8,6 +8,9 @@
8 * the Free Software Foundation; either version 2 of the License, or 8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version. 9 * (at your option) any later version.
10 */ 10 */
11
12#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
13
11#include <linux/init.h> 14#include <linux/init.h>
12#include <linux/bug.h> 15#include <linux/bug.h>
13#include <linux/kernel.h> 16#include <linux/kernel.h>
@@ -146,18 +149,18 @@ static __init int test_atomic64(void)
146 BUG_ON(v.counter != r); 149 BUG_ON(v.counter != r);
147 150
148#ifdef CONFIG_X86 151#ifdef CONFIG_X86
149 printk(KERN_INFO "atomic64 test passed for %s platform %s CX8 and %s SSE\n", 152 pr_info("passed for %s platform %s CX8 and %s SSE\n",
150#ifdef CONFIG_X86_64 153#ifdef CONFIG_X86_64
151 "x86-64", 154 "x86-64",
152#elif defined(CONFIG_X86_CMPXCHG64) 155#elif defined(CONFIG_X86_CMPXCHG64)
153 "i586+", 156 "i586+",
154#else 157#else
155 "i386+", 158 "i386+",
156#endif 159#endif
157 boot_cpu_has(X86_FEATURE_CX8) ? "with" : "without", 160 boot_cpu_has(X86_FEATURE_CX8) ? "with" : "without",
158 boot_cpu_has(X86_FEATURE_XMM) ? "with" : "without"); 161 boot_cpu_has(X86_FEATURE_XMM) ? "with" : "without");
159#else 162#else
160 printk(KERN_INFO "atomic64 test passed\n"); 163 pr_info("passed\n");
161#endif 164#endif
162 165
163 return 0; 166 return 0;
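
The pr_fmt() define added above is what lets the shortened pr_info() strings keep their module prefix; a rough sketch of the mechanism, using a hypothetical module not taken from this patch:

/* Must be defined before the first include so <linux/printk.h> picks it up. */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/module.h>

static int __init pr_fmt_demo_init(void)
{
	/* Expands to printk(KERN_INFO KBUILD_MODNAME ": passed\n"). */
	pr_info("passed\n");
	return 0;
}
module_init(pr_fmt_demo_init);
MODULE_LICENSE("GPL");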
diff --git a/lib/btree.c b/lib/btree.c
index f9a484676cb6..4264871ea1a0 100644
--- a/lib/btree.c
+++ b/lib/btree.c
@@ -198,6 +198,7 @@ EXPORT_SYMBOL_GPL(btree_init);
198 198
199void btree_destroy(struct btree_head *head) 199void btree_destroy(struct btree_head *head)
200{ 200{
201 mempool_free(head->node, head->mempool);
201 mempool_destroy(head->mempool); 202 mempool_destroy(head->mempool);
202 head->mempool = NULL; 203 head->mempool = NULL;
203} 204}
diff --git a/lib/bug.c b/lib/bug.c
index 168603477f02..d1d7c7878900 100644
--- a/lib/bug.c
+++ b/lib/bug.c
@@ -37,6 +37,9 @@
37 37
38 Jeremy Fitzhardinge <jeremy@goop.org> 2006 38 Jeremy Fitzhardinge <jeremy@goop.org> 2006
39 */ 39 */
40
41#define pr_fmt(fmt) fmt
42
40#include <linux/list.h> 43#include <linux/list.h>
41#include <linux/module.h> 44#include <linux/module.h>
42#include <linux/kernel.h> 45#include <linux/kernel.h>
@@ -153,15 +156,13 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs)
153 156
154 if (warning) { 157 if (warning) {
155 /* this is a WARN_ON rather than BUG/BUG_ON */ 158 /* this is a WARN_ON rather than BUG/BUG_ON */
156 printk(KERN_WARNING "------------[ cut here ]------------\n"); 159 pr_warn("------------[ cut here ]------------\n");
157 160
158 if (file) 161 if (file)
159 printk(KERN_WARNING "WARNING: at %s:%u\n", 162 pr_warn("WARNING: at %s:%u\n", file, line);
160 file, line);
161 else 163 else
162 printk(KERN_WARNING "WARNING: at %p " 164 pr_warn("WARNING: at %p [verbose debug info unavailable]\n",
163 "[verbose debug info unavailable]\n", 165 (void *)bugaddr);
164 (void *)bugaddr);
165 166
166 print_modules(); 167 print_modules();
167 show_regs(regs); 168 show_regs(regs);
@@ -174,12 +175,10 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs)
174 printk(KERN_DEFAULT "------------[ cut here ]------------\n"); 175 printk(KERN_DEFAULT "------------[ cut here ]------------\n");
175 176
176 if (file) 177 if (file)
177 printk(KERN_CRIT "kernel BUG at %s:%u!\n", 178 pr_crit("kernel BUG at %s:%u!\n", file, line);
178 file, line);
179 else 179 else
180 printk(KERN_CRIT "Kernel BUG at %p " 180 pr_crit("Kernel BUG at %p [verbose debug info unavailable]\n",
181 "[verbose debug info unavailable]\n", 181 (void *)bugaddr);
182 (void *)bugaddr);
183 182
184 return BUG_TRAP_TYPE_BUG; 183 return BUG_TRAP_TYPE_BUG;
185} 184}
diff --git a/lib/crc32.c b/lib/crc32.c
index 70f00ca5ef1e..21a7b2135af6 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -33,13 +33,13 @@
33#include "crc32defs.h" 33#include "crc32defs.h"
34 34
35#if CRC_LE_BITS > 8 35#if CRC_LE_BITS > 8
36# define tole(x) ((__force u32) __constant_cpu_to_le32(x)) 36# define tole(x) ((__force u32) cpu_to_le32(x))
37#else 37#else
38# define tole(x) (x) 38# define tole(x) (x)
39#endif 39#endif
40 40
41#if CRC_BE_BITS > 8 41#if CRC_BE_BITS > 8
42# define tobe(x) ((__force u32) __constant_cpu_to_be32(x)) 42# define tobe(x) ((__force u32) cpu_to_be32(x))
43#else 43#else
44# define tobe(x) (x) 44# define tobe(x) (x)
45#endif 45#endif
diff --git a/lib/debugobjects.c b/lib/debugobjects.c
index e0731c3db706..547f7f923dbc 100644
--- a/lib/debugobjects.c
+++ b/lib/debugobjects.c
@@ -7,6 +7,9 @@
7 * 7 *
8 * For licencing details see kernel-base/COPYING 8 * For licencing details see kernel-base/COPYING
9 */ 9 */
10
11#define pr_fmt(fmt) "ODEBUG: " fmt
12
10#include <linux/debugobjects.h> 13#include <linux/debugobjects.h>
11#include <linux/interrupt.h> 14#include <linux/interrupt.h>
12#include <linux/sched.h> 15#include <linux/sched.h>
@@ -218,7 +221,7 @@ static void debug_objects_oom(void)
218 unsigned long flags; 221 unsigned long flags;
219 int i; 222 int i;
220 223
221 printk(KERN_WARNING "ODEBUG: Out of memory. ODEBUG disabled\n"); 224 pr_warn("Out of memory. ODEBUG disabled\n");
222 225
223 for (i = 0; i < ODEBUG_HASH_SIZE; i++, db++) { 226 for (i = 0; i < ODEBUG_HASH_SIZE; i++, db++) {
224 raw_spin_lock_irqsave(&db->lock, flags); 227 raw_spin_lock_irqsave(&db->lock, flags);
@@ -292,11 +295,9 @@ static void debug_object_is_on_stack(void *addr, int onstack)
292 295
293 limit++; 296 limit++;
294 if (is_on_stack) 297 if (is_on_stack)
295 printk(KERN_WARNING 298 pr_warn("object is on stack, but not annotated\n");
296 "ODEBUG: object is on stack, but not annotated\n");
297 else 299 else
298 printk(KERN_WARNING 300 pr_warn("object is not on stack, but annotated\n");
299 "ODEBUG: object is not on stack, but annotated\n");
300 WARN_ON(1); 301 WARN_ON(1);
301} 302}
302 303
@@ -985,7 +986,7 @@ static void __init debug_objects_selftest(void)
985 if (check_results(&obj, ODEBUG_STATE_NONE, ++fixups, ++warnings)) 986 if (check_results(&obj, ODEBUG_STATE_NONE, ++fixups, ++warnings))
986 goto out; 987 goto out;
987#endif 988#endif
988 printk(KERN_INFO "ODEBUG: selftest passed\n"); 989 pr_info("selftest passed\n");
989 990
990out: 991out:
991 debug_objects_fixups = oldfixups; 992 debug_objects_fixups = oldfixups;
@@ -1060,8 +1061,8 @@ static int __init debug_objects_replace_static_objects(void)
1060 } 1061 }
1061 local_irq_enable(); 1062 local_irq_enable();
1062 1063
1063 printk(KERN_DEBUG "ODEBUG: %d of %d active objects replaced\n", cnt, 1064 pr_debug("%d of %d active objects replaced\n",
1064 obj_pool_used); 1065 cnt, obj_pool_used);
1065 return 0; 1066 return 0;
1066free: 1067free:
1067 hlist_for_each_entry_safe(obj, tmp, &objects, node) { 1068 hlist_for_each_entry_safe(obj, tmp, &objects, node) {
@@ -1090,7 +1091,7 @@ void __init debug_objects_mem_init(void)
1090 debug_objects_enabled = 0; 1091 debug_objects_enabled = 0;
1091 if (obj_cache) 1092 if (obj_cache)
1092 kmem_cache_destroy(obj_cache); 1093 kmem_cache_destroy(obj_cache);
1093 printk(KERN_WARNING "ODEBUG: out of memory.\n"); 1094 pr_warn("out of memory.\n");
1094 } else 1095 } else
1095 debug_objects_selftest(); 1096 debug_objects_selftest();
1096} 1097}
diff --git a/lib/digsig.c b/lib/digsig.c
index 8793aeda30ca..ae05ea393fc8 100644
--- a/lib/digsig.c
+++ b/lib/digsig.c
@@ -175,10 +175,11 @@ err1:
175 * digsig_verify() - digital signature verification with public key 175 * digsig_verify() - digital signature verification with public key
176 * @keyring: keyring to search key in 176 * @keyring: keyring to search key in
177 * @sig: digital signature 177 * @sig: digital signature
178 * @sigen: length of the signature 178 * @siglen: length of the signature
179 * @data: data 179 * @data: data
180 * @datalen: length of the data 180 * @datalen: length of the data
181 * @return: 0 on success, -EINVAL otherwise 181 *
182 * Returns 0 on success, -EINVAL otherwise
182 * 183 *
183 * Verifies data integrity against digital signature. 184 * Verifies data integrity against digital signature.
184 * Currently only RSA is supported. 185 * Currently only RSA is supported.
diff --git a/lib/libcrc32c.c b/lib/libcrc32c.c
index 244f5480c898..b3131f5cf8a2 100644
--- a/lib/libcrc32c.c
+++ b/lib/libcrc32c.c
@@ -62,10 +62,7 @@ EXPORT_SYMBOL(crc32c);
62static int __init libcrc32c_mod_init(void) 62static int __init libcrc32c_mod_init(void)
63{ 63{
64 tfm = crypto_alloc_shash("crc32c", 0, 0); 64 tfm = crypto_alloc_shash("crc32c", 0, 0);
65 if (IS_ERR(tfm)) 65 return PTR_ERR_OR_ZERO(tfm);
66 return PTR_ERR(tfm);
67
68 return 0;
69} 66}
70 67
71static void __exit libcrc32c_mod_fini(void) 68static void __exit libcrc32c_mod_fini(void)
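
PTR_ERR_OR_ZERO() used above is roughly equivalent to the open-coded pattern it replaces; a sketch, with a hypothetical helper name:

#include <linux/err.h>

static inline int ptr_err_or_zero_open_coded(const void *ptr)
{
	if (IS_ERR(ptr))
		return PTR_ERR(ptr);	/* propagate the encoded -errno */
	return 0;			/* valid pointer: success */
}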
diff --git a/lib/nlattr.c b/lib/nlattr.c
index fc6754720ced..0c5778752aec 100644
--- a/lib/nlattr.c
+++ b/lib/nlattr.c
@@ -136,6 +136,7 @@ int nla_validate(const struct nlattr *head, int len, int maxtype,
136errout: 136errout:
137 return err; 137 return err;
138} 138}
139EXPORT_SYMBOL(nla_validate);
139 140
140/** 141/**
141 * nla_policy_len - Determine the max. length of a policy 142
@@ -162,6 +163,7 @@ nla_policy_len(const struct nla_policy *p, int n)
162 163
163 return len; 164 return len;
164} 165}
166EXPORT_SYMBOL(nla_policy_len);
165 167
166/** 168/**
167 * nla_parse - Parse a stream of attributes into a tb buffer 169 * nla_parse - Parse a stream of attributes into a tb buffer
@@ -208,6 +210,7 @@ int nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head,
208errout: 210errout:
209 return err; 211 return err;
210} 212}
213EXPORT_SYMBOL(nla_parse);
211 214
212/** 215/**
213 * nla_find - Find a specific attribute in a stream of attributes 216 * nla_find - Find a specific attribute in a stream of attributes
@@ -228,6 +231,7 @@ struct nlattr *nla_find(const struct nlattr *head, int len, int attrtype)
228 231
229 return NULL; 232 return NULL;
230} 233}
234EXPORT_SYMBOL(nla_find);
231 235
232/** 236/**
233 * nla_strlcpy - Copy string attribute payload into a sized buffer 237 * nla_strlcpy - Copy string attribute payload into a sized buffer
@@ -258,6 +262,7 @@ size_t nla_strlcpy(char *dst, const struct nlattr *nla, size_t dstsize)
258 262
259 return srclen; 263 return srclen;
260} 264}
265EXPORT_SYMBOL(nla_strlcpy);
261 266
262/** 267/**
263 * nla_memcpy - Copy a netlink attribute into another memory area 268 * nla_memcpy - Copy a netlink attribute into another memory area
@@ -278,6 +283,7 @@ int nla_memcpy(void *dest, const struct nlattr *src, int count)
278 283
279 return minlen; 284 return minlen;
280} 285}
286EXPORT_SYMBOL(nla_memcpy);
281 287
282/** 288/**
283 * nla_memcmp - Compare an attribute with sized memory area 289 * nla_memcmp - Compare an attribute with sized memory area
@@ -295,6 +301,7 @@ int nla_memcmp(const struct nlattr *nla, const void *data,
295 301
296 return d; 302 return d;
297} 303}
304EXPORT_SYMBOL(nla_memcmp);
298 305
299/** 306/**
300 * nla_strcmp - Compare a string attribute against a string 307 * nla_strcmp - Compare a string attribute against a string
@@ -317,6 +324,7 @@ int nla_strcmp(const struct nlattr *nla, const char *str)
317 324
318 return d; 325 return d;
319} 326}
327EXPORT_SYMBOL(nla_strcmp);
320 328
321#ifdef CONFIG_NET 329#ifdef CONFIG_NET
322/** 330/**
@@ -502,12 +510,3 @@ int nla_append(struct sk_buff *skb, int attrlen, const void *data)
502} 510}
503EXPORT_SYMBOL(nla_append); 511EXPORT_SYMBOL(nla_append);
504#endif 512#endif
505
506EXPORT_SYMBOL(nla_validate);
507EXPORT_SYMBOL(nla_policy_len);
508EXPORT_SYMBOL(nla_parse);
509EXPORT_SYMBOL(nla_find);
510EXPORT_SYMBOL(nla_strlcpy);
511EXPORT_SYMBOL(nla_memcpy);
512EXPORT_SYMBOL(nla_memcmp);
513EXPORT_SYMBOL(nla_strcmp);
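
The hunks above only move the EXPORT_SYMBOL() lines so each one directly follows the function it exports, e.g. (hypothetical function, shown as a sketch only):

#include <linux/export.h>
#include <net/netlink.h>

int nla_example_len(const struct nlattr *nla)
{
	return nla_len(nla);
}
EXPORT_SYMBOL(nla_example_len);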
diff --git a/lib/plist.c b/lib/plist.c
index 1ebc95f7a46f..d408e774b746 100644
--- a/lib/plist.c
+++ b/lib/plist.c
@@ -134,6 +134,46 @@ void plist_del(struct plist_node *node, struct plist_head *head)
134 plist_check_head(head); 134 plist_check_head(head);
135} 135}
136 136
137/**
138 * plist_requeue - Requeue @node at end of same-prio entries.
139 *
140 * This is essentially an optimized plist_del() followed by
141 * plist_add(). It moves an entry already in the plist to
142 * after any other same-priority entries.
143 *
144 * @node: &struct plist_node pointer - entry to be moved
145 * @head: &struct plist_head pointer - list head
146 */
147void plist_requeue(struct plist_node *node, struct plist_head *head)
148{
149 struct plist_node *iter;
150 struct list_head *node_next = &head->node_list;
151
152 plist_check_head(head);
153 BUG_ON(plist_head_empty(head));
154 BUG_ON(plist_node_empty(node));
155
156 if (node == plist_last(head))
157 return;
158
159 iter = plist_next(node);
160
161 if (node->prio != iter->prio)
162 return;
163
164 plist_del(node, head);
165
166 plist_for_each_continue(iter, head) {
167 if (node->prio != iter->prio) {
168 node_next = &iter->node_list;
169 break;
170 }
171 }
172 list_add_tail(&node->node_list, node_next);
173
174 plist_check_head(head);
175}
176
137#ifdef CONFIG_DEBUG_PI_LIST 177#ifdef CONFIG_DEBUG_PI_LIST
138#include <linux/sched.h> 178#include <linux/sched.h>
139#include <linux/module.h> 179#include <linux/module.h>
@@ -170,12 +210,20 @@ static void __init plist_test_check(int nr_expect)
170 BUG_ON(prio_pos->prio_list.next != &first->prio_list); 210 BUG_ON(prio_pos->prio_list.next != &first->prio_list);
171} 211}
172 212
213static void __init plist_test_requeue(struct plist_node *node)
214{
215 plist_requeue(node, &test_head);
216
217 if (node != plist_last(&test_head))
218 BUG_ON(node->prio == plist_next(node)->prio);
219}
220
173static int __init plist_test(void) 221static int __init plist_test(void)
174{ 222{
175 int nr_expect = 0, i, loop; 223 int nr_expect = 0, i, loop;
176 unsigned int r = local_clock(); 224 unsigned int r = local_clock();
177 225
178 pr_debug("start plist test\n"); 226 printk(KERN_DEBUG "start plist test\n");
179 plist_head_init(&test_head); 227 plist_head_init(&test_head);
180 for (i = 0; i < ARRAY_SIZE(test_node); i++) 228 for (i = 0; i < ARRAY_SIZE(test_node); i++)
181 plist_node_init(test_node + i, 0); 229 plist_node_init(test_node + i, 0);
@@ -193,6 +241,10 @@ static int __init plist_test(void)
193 nr_expect--; 241 nr_expect--;
194 } 242 }
195 plist_test_check(nr_expect); 243 plist_test_check(nr_expect);
244 if (!plist_node_empty(test_node + i)) {
245 plist_test_requeue(test_node + i);
246 plist_test_check(nr_expect);
247 }
196 } 248 }
197 249
198 for (i = 0; i < ARRAY_SIZE(test_node); i++) { 250 for (i = 0; i < ARRAY_SIZE(test_node); i++) {
@@ -203,7 +255,7 @@ static int __init plist_test(void)
203 plist_test_check(nr_expect); 255 plist_test_check(nr_expect);
204 } 256 }
205 257
206 pr_debug("end plist test\n"); 258 printk(KERN_DEBUG "end plist test\n");
207 return 0; 259 return 0;
208} 260}
209 261
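
A minimal sketch of what the plist_requeue() helper added above is for, rotating service among same-priority entries; the list and function names below are hypothetical:

#include <linux/plist.h>

/* Serve the head entry, then push it behind any peers of equal priority. */
static void serve_round_robin(struct plist_head *head)
{
	struct plist_node *node;

	if (plist_head_empty(head))
		return;

	node = plist_first(head);
	/* ... process the entry referenced by node ... */
	plist_requeue(node, head);
}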
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 9599aa72d7a0..d64815651e90 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -194,7 +194,7 @@ radix_tree_node_alloc(struct radix_tree_root *root)
194 * succeed in getting a node here (and never reach 194 * succeed in getting a node here (and never reach
195 * kmem_cache_alloc) 195 * kmem_cache_alloc)
196 */ 196 */
197 rtp = &__get_cpu_var(radix_tree_preloads); 197 rtp = this_cpu_ptr(&radix_tree_preloads);
198 if (rtp->nr) { 198 if (rtp->nr) {
199 ret = rtp->nodes[rtp->nr - 1]; 199 ret = rtp->nodes[rtp->nr - 1];
200 rtp->nodes[rtp->nr - 1] = NULL; 200 rtp->nodes[rtp->nr - 1] = NULL;
@@ -250,14 +250,14 @@ static int __radix_tree_preload(gfp_t gfp_mask)
250 int ret = -ENOMEM; 250 int ret = -ENOMEM;
251 251
252 preempt_disable(); 252 preempt_disable();
253 rtp = &__get_cpu_var(radix_tree_preloads); 253 rtp = this_cpu_ptr(&radix_tree_preloads);
254 while (rtp->nr < ARRAY_SIZE(rtp->nodes)) { 254 while (rtp->nr < ARRAY_SIZE(rtp->nodes)) {
255 preempt_enable(); 255 preempt_enable();
256 node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask); 256 node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
257 if (node == NULL) 257 if (node == NULL)
258 goto out; 258 goto out;
259 preempt_disable(); 259 preempt_disable();
260 rtp = &__get_cpu_var(radix_tree_preloads); 260 rtp = this_cpu_ptr(&radix_tree_preloads);
261 if (rtp->nr < ARRAY_SIZE(rtp->nodes)) 261 if (rtp->nr < ARRAY_SIZE(rtp->nodes))
262 rtp->nodes[rtp->nr++] = node; 262 rtp->nodes[rtp->nr++] = node;
263 else 263 else
@@ -1296,7 +1296,6 @@ static inline void radix_tree_shrink(struct radix_tree_root *root)
1296/** 1296/**
1297 * __radix_tree_delete_node - try to free node after clearing a slot 1297 * __radix_tree_delete_node - try to free node after clearing a slot
1298 * @root: radix tree root 1298 * @root: radix tree root
1299 * @index: index key
1300 * @node: node containing @index 1299 * @node: node containing @index
1301 * 1300 *
1302 * After clearing the slot at @index in @node from radix tree 1301 * After clearing the slot at @index in @node from radix tree
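
The radix-tree hunks above switch from the older &__get_cpu_var(var) accessor to this_cpu_ptr(&var); a sketch of the idiom with a hypothetical per-CPU variable:

#include <linux/percpu.h>
#include <linux/preempt.h>

struct my_stats {
	unsigned long hits;
};

static DEFINE_PER_CPU(struct my_stats, my_stats);

static void my_stats_bump(void)
{
	struct my_stats *s;

	preempt_disable();			/* stay on this CPU */
	s = this_cpu_ptr(&my_stats);		/* was: &__get_cpu_var(my_stats) */
	s->hits++;
	preempt_enable();
}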
diff --git a/lib/string.c b/lib/string.c
index e0c20eb362f0..992bf30af759 100644
--- a/lib/string.c
+++ b/lib/string.c
@@ -107,7 +107,7 @@ EXPORT_SYMBOL(strcpy);
107 107
108#ifndef __HAVE_ARCH_STRNCPY 108#ifndef __HAVE_ARCH_STRNCPY
109/** 109/**
110 * strncpy - Copy a length-limited, %NUL-terminated string 110 * strncpy - Copy a length-limited, C-string
111 * @dest: Where to copy the string to 111 * @dest: Where to copy the string to
112 * @src: Where to copy the string from 112 * @src: Where to copy the string from
113 * @count: The maximum number of bytes to copy 113 * @count: The maximum number of bytes to copy
@@ -136,7 +136,7 @@ EXPORT_SYMBOL(strncpy);
136 136
137#ifndef __HAVE_ARCH_STRLCPY 137#ifndef __HAVE_ARCH_STRLCPY
138/** 138/**
139 * strlcpy - Copy a %NUL terminated string into a sized buffer 139 * strlcpy - Copy a C-string into a sized buffer
140 * @dest: Where to copy the string to 140 * @dest: Where to copy the string to
141 * @src: Where to copy the string from 141 * @src: Where to copy the string from
142 * @size: size of destination buffer 142 * @size: size of destination buffer
@@ -182,7 +182,7 @@ EXPORT_SYMBOL(strcat);
182 182
183#ifndef __HAVE_ARCH_STRNCAT 183#ifndef __HAVE_ARCH_STRNCAT
184/** 184/**
185 * strncat - Append a length-limited, %NUL-terminated string to another 185 * strncat - Append a length-limited, C-string to another
186 * @dest: The string to be appended to 186 * @dest: The string to be appended to
187 * @src: The string to append to it 187 * @src: The string to append to it
188 * @count: The maximum numbers of bytes to copy 188 * @count: The maximum numbers of bytes to copy
@@ -211,7 +211,7 @@ EXPORT_SYMBOL(strncat);
211 211
212#ifndef __HAVE_ARCH_STRLCAT 212#ifndef __HAVE_ARCH_STRLCAT
213/** 213/**
214 * strlcat - Append a length-limited, %NUL-terminated string to another 214 * strlcat - Append a length-limited, C-string to another
215 * @dest: The string to be appended to 215 * @dest: The string to be appended to
216 * @src: The string to append to it 216 * @src: The string to append to it
217 * @count: The size of the destination buffer. 217 * @count: The size of the destination buffer.
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index b604b831f4d1..649d097853a1 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -374,7 +374,7 @@ void __init swiotlb_free(void)
374 io_tlb_nslabs = 0; 374 io_tlb_nslabs = 0;
375} 375}
376 376
377static int is_swiotlb_buffer(phys_addr_t paddr) 377int is_swiotlb_buffer(phys_addr_t paddr)
378{ 378{
379 return paddr >= io_tlb_start && paddr < io_tlb_end; 379 return paddr >= io_tlb_start && paddr < io_tlb_end;
380} 380}
diff --git a/lib/textsearch.c b/lib/textsearch.c
index e0cc0146ae62..0c7e9ab2d88f 100644
--- a/lib/textsearch.c
+++ b/lib/textsearch.c
@@ -159,6 +159,7 @@ errout:
159 spin_unlock(&ts_mod_lock); 159 spin_unlock(&ts_mod_lock);
160 return err; 160 return err;
161} 161}
162EXPORT_SYMBOL(textsearch_register);
162 163
163/** 164/**
164 * textsearch_unregister - unregister a textsearch module 165 * textsearch_unregister - unregister a textsearch module
@@ -190,6 +191,7 @@ out:
190 spin_unlock(&ts_mod_lock); 191 spin_unlock(&ts_mod_lock);
191 return err; 192 return err;
192} 193}
194EXPORT_SYMBOL(textsearch_unregister);
193 195
194struct ts_linear_state 196struct ts_linear_state
195{ 197{
@@ -236,6 +238,7 @@ unsigned int textsearch_find_continuous(struct ts_config *conf,
236 238
237 return textsearch_find(conf, state); 239 return textsearch_find(conf, state);
238} 240}
241EXPORT_SYMBOL(textsearch_find_continuous);
239 242
240/** 243/**
241 * textsearch_prepare - Prepare a search 244 * textsearch_prepare - Prepare a search
@@ -298,6 +301,7 @@ errout:
298 301
299 return ERR_PTR(err); 302 return ERR_PTR(err);
300} 303}
304EXPORT_SYMBOL(textsearch_prepare);
301 305
302/** 306/**
303 * textsearch_destroy - destroy a search configuration 307 * textsearch_destroy - destroy a search configuration
@@ -316,9 +320,4 @@ void textsearch_destroy(struct ts_config *conf)
316 320
317 kfree(conf); 321 kfree(conf);
318} 322}
319
320EXPORT_SYMBOL(textsearch_register);
321EXPORT_SYMBOL(textsearch_unregister);
322EXPORT_SYMBOL(textsearch_prepare);
323EXPORT_SYMBOL(textsearch_find_continuous);
324EXPORT_SYMBOL(textsearch_destroy); 323EXPORT_SYMBOL(textsearch_destroy);
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index 0648291cdafe..6fe2c84eb055 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -2347,7 +2347,7 @@ int vsscanf(const char *buf, const char *fmt, va_list args)
2347 break; 2347 break;
2348 2348
2349 base = 10; 2349 base = 10;
2350 is_sign = 0; 2350 is_sign = false;
2351 2351
2352 switch (*fmt++) { 2352 switch (*fmt++) {
2353 case 'c': 2353 case 'c':
@@ -2386,7 +2386,7 @@ int vsscanf(const char *buf, const char *fmt, va_list args)
2386 case 'i': 2386 case 'i':
2387 base = 0; 2387 base = 0;
2388 case 'd': 2388 case 'd':
2389 is_sign = 1; 2389 is_sign = true;
2390 case 'u': 2390 case 'u':
2391 break; 2391 break;
2392 case '%': 2392 case '%':
diff --git a/lib/xz/Kconfig b/lib/xz/Kconfig
index 08837db52d94..12d2d777f36b 100644
--- a/lib/xz/Kconfig
+++ b/lib/xz/Kconfig
@@ -9,33 +9,33 @@ config XZ_DEC
9if XZ_DEC 9if XZ_DEC
10 10
11config XZ_DEC_X86 11config XZ_DEC_X86
12 bool "x86 BCJ filter decoder" 12 bool "x86 BCJ filter decoder" if EXPERT
13 default y if X86 13 default y
14 select XZ_DEC_BCJ 14 select XZ_DEC_BCJ
15 15
16config XZ_DEC_POWERPC 16config XZ_DEC_POWERPC
17 bool "PowerPC BCJ filter decoder" 17 bool "PowerPC BCJ filter decoder" if EXPERT
18 default y if PPC 18 default y
19 select XZ_DEC_BCJ 19 select XZ_DEC_BCJ
20 20
21config XZ_DEC_IA64 21config XZ_DEC_IA64
22 bool "IA-64 BCJ filter decoder" 22 bool "IA-64 BCJ filter decoder" if EXPERT
23 default y if IA64 23 default y
24 select XZ_DEC_BCJ 24 select XZ_DEC_BCJ
25 25
26config XZ_DEC_ARM 26config XZ_DEC_ARM
27 bool "ARM BCJ filter decoder" 27 bool "ARM BCJ filter decoder" if EXPERT
28 default y if ARM 28 default y
29 select XZ_DEC_BCJ 29 select XZ_DEC_BCJ
30 30
31config XZ_DEC_ARMTHUMB 31config XZ_DEC_ARMTHUMB
32 bool "ARM-Thumb BCJ filter decoder" 32 bool "ARM-Thumb BCJ filter decoder" if EXPERT
33 default y if (ARM && ARM_THUMB) 33 default y
34 select XZ_DEC_BCJ 34 select XZ_DEC_BCJ
35 35
36config XZ_DEC_SPARC 36config XZ_DEC_SPARC
37 bool "SPARC BCJ filter decoder" 37 bool "SPARC BCJ filter decoder" if EXPERT
38 default y if SPARC 38 default y
39 select XZ_DEC_BCJ 39 select XZ_DEC_BCJ
40 40
41endif 41endif
diff --git a/lib/xz/xz_dec_lzma2.c b/lib/xz/xz_dec_lzma2.c
index a6cdc969ea42..08c3c8049998 100644
--- a/lib/xz/xz_dec_lzma2.c
+++ b/lib/xz/xz_dec_lzma2.c
@@ -1043,6 +1043,8 @@ XZ_EXTERN enum xz_ret xz_dec_lzma2_run(struct xz_dec_lzma2 *s,
1043 1043
1044 s->lzma2.sequence = SEQ_LZMA_PREPARE; 1044 s->lzma2.sequence = SEQ_LZMA_PREPARE;
1045 1045
1046 /* Fall through */
1047
1046 case SEQ_LZMA_PREPARE: 1048 case SEQ_LZMA_PREPARE:
1047 if (s->lzma2.compressed < RC_INIT_BYTES) 1049 if (s->lzma2.compressed < RC_INIT_BYTES)
1048 return XZ_DATA_ERROR; 1050 return XZ_DATA_ERROR;
@@ -1053,6 +1055,8 @@ XZ_EXTERN enum xz_ret xz_dec_lzma2_run(struct xz_dec_lzma2 *s,
1053 s->lzma2.compressed -= RC_INIT_BYTES; 1055 s->lzma2.compressed -= RC_INIT_BYTES;
1054 s->lzma2.sequence = SEQ_LZMA_RUN; 1056 s->lzma2.sequence = SEQ_LZMA_RUN;
1055 1057
1058 /* Fall through */
1059
1056 case SEQ_LZMA_RUN: 1060 case SEQ_LZMA_RUN:
1057 /* 1061 /*
1058 * Set dictionary limit to indicate how much we want 1062 * Set dictionary limit to indicate how much we want
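
The /* Fall through */ comments added above mark the missing break as intentional; the pattern, shown on a hypothetical two-state machine rather than the real decoder:

enum seq { SEQ_PREPARE, SEQ_RUN };

static int step(enum seq *state, int have_input)
{
	switch (*state) {
	case SEQ_PREPARE:
		if (!have_input)
			return -1;
		*state = SEQ_RUN;
		/* Fall through */
	case SEQ_RUN:
		return 0;
	}
	return -1;
}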
diff --git a/mm/Kconfig b/mm/Kconfig
index 28cec518f4d4..3e9977a9d657 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -267,6 +267,9 @@ config MIGRATION
267 pages as migration can relocate pages to satisfy a huge page 267 pages as migration can relocate pages to satisfy a huge page
268 allocation instead of reclaiming. 268 allocation instead of reclaiming.
269 269
270config ARCH_ENABLE_HUGEPAGE_MIGRATION
271 boolean
272
270config PHYS_ADDR_T_64BIT 273config PHYS_ADDR_T_64BIT
271 def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT 274 def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT
272 275
@@ -433,16 +436,6 @@ choice
433 benefit. 436 benefit.
434endchoice 437endchoice
435 438
436config CROSS_MEMORY_ATTACH
437 bool "Cross Memory Support"
438 depends on MMU
439 default y
440 help
441 Enabling this option adds the system calls process_vm_readv and
442 process_vm_writev which allow a process with the correct privileges
443 to directly read from or write to another process's address space.
444 See the man page for more details.
445
446# 439#
447# UP and nommu archs use km based percpu allocator 440# UP and nommu archs use km based percpu allocator
448# 441#
@@ -558,7 +551,7 @@ config MEM_SOFT_DIRTY
558 See Documentation/vm/soft-dirty.txt for more details. 551 See Documentation/vm/soft-dirty.txt for more details.
559 552
560config ZSMALLOC 553config ZSMALLOC
561 bool "Memory allocator for compressed pages" 554 tristate "Memory allocator for compressed pages"
562 depends on MMU 555 depends on MMU
563 default n 556 default n
564 help 557 help
diff --git a/mm/Makefile b/mm/Makefile
index 0173940407f6..4064f3ec145e 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -3,7 +3,7 @@
3# 3#
4 4
5mmu-y := nommu.o 5mmu-y := nommu.o
6mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ 6mmu-$(CONFIG_MMU) := fremap.o gup.o highmem.o madvise.o memory.o mincore.o \
7 mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ 7 mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
8 vmalloc.o pagewalk.o pgtable-generic.o 8 vmalloc.o pagewalk.o pgtable-generic.o
9 9
diff --git a/mm/compaction.c b/mm/compaction.c
index 627dc2e4320f..21bf292b642a 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -89,7 +89,8 @@ static void __reset_isolation_suitable(struct zone *zone)
89 unsigned long end_pfn = zone_end_pfn(zone); 89 unsigned long end_pfn = zone_end_pfn(zone);
90 unsigned long pfn; 90 unsigned long pfn;
91 91
92 zone->compact_cached_migrate_pfn = start_pfn; 92 zone->compact_cached_migrate_pfn[0] = start_pfn;
93 zone->compact_cached_migrate_pfn[1] = start_pfn;
93 zone->compact_cached_free_pfn = end_pfn; 94 zone->compact_cached_free_pfn = end_pfn;
94 zone->compact_blockskip_flush = false; 95 zone->compact_blockskip_flush = false;
95 96
@@ -131,9 +132,10 @@ void reset_isolation_suitable(pg_data_t *pgdat)
131 */ 132 */
132static void update_pageblock_skip(struct compact_control *cc, 133static void update_pageblock_skip(struct compact_control *cc,
133 struct page *page, unsigned long nr_isolated, 134 struct page *page, unsigned long nr_isolated,
134 bool migrate_scanner) 135 bool set_unsuitable, bool migrate_scanner)
135{ 136{
136 struct zone *zone = cc->zone; 137 struct zone *zone = cc->zone;
138 unsigned long pfn;
137 139
138 if (cc->ignore_skip_hint) 140 if (cc->ignore_skip_hint)
139 return; 141 return;
@@ -141,20 +143,32 @@ static void update_pageblock_skip(struct compact_control *cc,
141 if (!page) 143 if (!page)
142 return; 144 return;
143 145
144 if (!nr_isolated) { 146 if (nr_isolated)
145 unsigned long pfn = page_to_pfn(page); 147 return;
148
149 /*
150 * Only skip pageblocks when all forms of compaction will be known to
151 * fail in the near future.
152 */
153 if (set_unsuitable)
146 set_pageblock_skip(page); 154 set_pageblock_skip(page);
147 155
148 /* Update where compaction should restart */ 156 pfn = page_to_pfn(page);
149 if (migrate_scanner) { 157
150 if (!cc->finished_update_migrate && 158 /* Update where async and sync compaction should restart */
151 pfn > zone->compact_cached_migrate_pfn) 159 if (migrate_scanner) {
152 zone->compact_cached_migrate_pfn = pfn; 160 if (cc->finished_update_migrate)
153 } else { 161 return;
154 if (!cc->finished_update_free && 162 if (pfn > zone->compact_cached_migrate_pfn[0])
155 pfn < zone->compact_cached_free_pfn) 163 zone->compact_cached_migrate_pfn[0] = pfn;
156 zone->compact_cached_free_pfn = pfn; 164 if (cc->mode != MIGRATE_ASYNC &&
157 } 165 pfn > zone->compact_cached_migrate_pfn[1])
166 zone->compact_cached_migrate_pfn[1] = pfn;
167 } else {
168 if (cc->finished_update_free)
169 return;
170 if (pfn < zone->compact_cached_free_pfn)
171 zone->compact_cached_free_pfn = pfn;
158 } 172 }
159} 173}
160#else 174#else
@@ -166,7 +180,7 @@ static inline bool isolation_suitable(struct compact_control *cc,
166 180
167static void update_pageblock_skip(struct compact_control *cc, 181static void update_pageblock_skip(struct compact_control *cc,
168 struct page *page, unsigned long nr_isolated, 182 struct page *page, unsigned long nr_isolated,
169 bool migrate_scanner) 183 bool set_unsuitable, bool migrate_scanner)
170{ 184{
171} 185}
172#endif /* CONFIG_COMPACTION */ 186#endif /* CONFIG_COMPACTION */
@@ -195,7 +209,7 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
195 } 209 }
196 210
197 /* async aborts if taking too long or contended */ 211 /* async aborts if taking too long or contended */
198 if (!cc->sync) { 212 if (cc->mode == MIGRATE_ASYNC) {
199 cc->contended = true; 213 cc->contended = true;
200 return false; 214 return false;
201 } 215 }
@@ -208,10 +222,28 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
208 return true; 222 return true;
209} 223}
210 224
211static inline bool compact_trylock_irqsave(spinlock_t *lock, 225/*
212 unsigned long *flags, struct compact_control *cc) 226 * Aside from avoiding lock contention, compaction also periodically checks
227 * need_resched() and either schedules in sync compaction or aborts async
228 * compaction. This is similar to what compact_checklock_irqsave() does, but
229 * is used where no lock is concerned.
230 *
231 * Returns false when no scheduling was needed, or sync compaction scheduled.
232 * Returns true when async compaction should abort.
233 */
234static inline bool compact_should_abort(struct compact_control *cc)
213{ 235{
214 return compact_checklock_irqsave(lock, flags, false, cc); 236 /* async compaction aborts if contended */
237 if (need_resched()) {
238 if (cc->mode == MIGRATE_ASYNC) {
239 cc->contended = true;
240 return true;
241 }
242
243 cond_resched();
244 }
245
246 return false;
215} 247}
216 248
217/* Returns true if the page is within a block suitable for migration to */ 249/* Returns true if the page is within a block suitable for migration to */
@@ -329,7 +361,8 @@ isolate_fail:
329 361
330 /* Update the pageblock-skip if the whole pageblock was scanned */ 362 /* Update the pageblock-skip if the whole pageblock was scanned */
331 if (blockpfn == end_pfn) 363 if (blockpfn == end_pfn)
332 update_pageblock_skip(cc, valid_page, total_isolated, false); 364 update_pageblock_skip(cc, valid_page, total_isolated, true,
365 false);
333 366
334 count_compact_events(COMPACTFREE_SCANNED, nr_scanned); 367 count_compact_events(COMPACTFREE_SCANNED, nr_scanned);
335 if (total_isolated) 368 if (total_isolated)
@@ -464,8 +497,9 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
464 unsigned long flags; 497 unsigned long flags;
465 bool locked = false; 498 bool locked = false;
466 struct page *page = NULL, *valid_page = NULL; 499 struct page *page = NULL, *valid_page = NULL;
467 bool skipped_async_unsuitable = false; 500 bool set_unsuitable = true;
468 const isolate_mode_t mode = (!cc->sync ? ISOLATE_ASYNC_MIGRATE : 0) | 501 const isolate_mode_t mode = (cc->mode == MIGRATE_ASYNC ?
502 ISOLATE_ASYNC_MIGRATE : 0) |
469 (unevictable ? ISOLATE_UNEVICTABLE : 0); 503 (unevictable ? ISOLATE_UNEVICTABLE : 0);
470 504
471 /* 505 /*
@@ -475,7 +509,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
475 */ 509 */
476 while (unlikely(too_many_isolated(zone))) { 510 while (unlikely(too_many_isolated(zone))) {
477 /* async migration should just abort */ 511 /* async migration should just abort */
478 if (!cc->sync) 512 if (cc->mode == MIGRATE_ASYNC)
479 return 0; 513 return 0;
480 514
481 congestion_wait(BLK_RW_ASYNC, HZ/10); 515 congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -484,8 +518,10 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
484 return 0; 518 return 0;
485 } 519 }
486 520
521 if (compact_should_abort(cc))
522 return 0;
523
487 /* Time to isolate some pages for migration */ 524 /* Time to isolate some pages for migration */
488 cond_resched();
489 for (; low_pfn < end_pfn; low_pfn++) { 525 for (; low_pfn < end_pfn; low_pfn++) {
490 /* give a chance to irqs before checking need_resched() */ 526 /* give a chance to irqs before checking need_resched() */
491 if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) { 527 if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) {
@@ -540,9 +576,9 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
540 * the minimum amount of work satisfies the allocation 576 * the minimum amount of work satisfies the allocation
541 */ 577 */
542 mt = get_pageblock_migratetype(page); 578 mt = get_pageblock_migratetype(page);
543 if (!cc->sync && !migrate_async_suitable(mt)) { 579 if (cc->mode == MIGRATE_ASYNC &&
544 cc->finished_update_migrate = true; 580 !migrate_async_suitable(mt)) {
545 skipped_async_unsuitable = true; 581 set_unsuitable = false;
546 goto next_pageblock; 582 goto next_pageblock;
547 } 583 }
548 } 584 }
@@ -646,11 +682,10 @@ next_pageblock:
646 /* 682 /*
647 * Update the pageblock-skip information and cached scanner pfn, 683 * Update the pageblock-skip information and cached scanner pfn,
648 * if the whole pageblock was scanned without isolating any page. 684 * if the whole pageblock was scanned without isolating any page.
649 * This is not done when pageblock was skipped due to being unsuitable
650 * for async compaction, so that eventual sync compaction can try.
651 */ 685 */
652 if (low_pfn == end_pfn && !skipped_async_unsuitable) 686 if (low_pfn == end_pfn)
653 update_pageblock_skip(cc, valid_page, nr_isolated, true); 687 update_pageblock_skip(cc, valid_page, nr_isolated,
688 set_unsuitable, true);
654 689
655 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); 690 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
656 691
@@ -671,7 +706,9 @@ static void isolate_freepages(struct zone *zone,
671 struct compact_control *cc) 706 struct compact_control *cc)
672{ 707{
673 struct page *page; 708 struct page *page;
674 unsigned long high_pfn, low_pfn, pfn, z_end_pfn; 709 unsigned long block_start_pfn; /* start of current pageblock */
710 unsigned long block_end_pfn; /* end of current pageblock */
711 unsigned long low_pfn; /* lowest pfn scanner is able to scan */
675 int nr_freepages = cc->nr_freepages; 712 int nr_freepages = cc->nr_freepages;
676 struct list_head *freelist = &cc->freepages; 713 struct list_head *freelist = &cc->freepages;
677 714
@@ -679,41 +716,38 @@ static void isolate_freepages(struct zone *zone,
679 * Initialise the free scanner. The starting point is where we last 716 * Initialise the free scanner. The starting point is where we last
680 * successfully isolated from, zone-cached value, or the end of the 717 * successfully isolated from, zone-cached value, or the end of the
681 * zone when isolating for the first time. We need this aligned to 718 * zone when isolating for the first time. We need this aligned to
682 * the pageblock boundary, because we do pfn -= pageblock_nr_pages 719 * the pageblock boundary, because we do
683 * in the for loop. 720 * block_start_pfn -= pageblock_nr_pages in the for loop.
721 * For ending point, take care when isolating in last pageblock of a
722 * a zone which ends in the middle of a pageblock.
684 * The low boundary is the end of the pageblock the migration scanner 723 * The low boundary is the end of the pageblock the migration scanner
685 * is using. 724 * is using.
686 */ 725 */
687 pfn = cc->free_pfn & ~(pageblock_nr_pages-1); 726 block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages-1);
727 block_end_pfn = min(block_start_pfn + pageblock_nr_pages,
728 zone_end_pfn(zone));
688 low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages); 729 low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages);
689 730
690 /* 731 /*
691 * Take care that if the migration scanner is at the end of the zone
692 * that the free scanner does not accidentally move to the next zone
693 * in the next isolation cycle.
694 */
695 high_pfn = min(low_pfn, pfn);
696
697 z_end_pfn = zone_end_pfn(zone);
698
699 /*
700 * Isolate free pages until enough are available to migrate the 732 * Isolate free pages until enough are available to migrate the
701 * pages on cc->migratepages. We stop searching if the migrate 733 * pages on cc->migratepages. We stop searching if the migrate
702 * and free page scanners meet or enough free pages are isolated. 734 * and free page scanners meet or enough free pages are isolated.
703 */ 735 */
704 for (; pfn >= low_pfn && cc->nr_migratepages > nr_freepages; 736 for (; block_start_pfn >= low_pfn && cc->nr_migratepages > nr_freepages;
705 pfn -= pageblock_nr_pages) { 737 block_end_pfn = block_start_pfn,
738 block_start_pfn -= pageblock_nr_pages) {
706 unsigned long isolated; 739 unsigned long isolated;
707 unsigned long end_pfn;
708 740
709 /* 741 /*
710 * This can iterate a massively long zone without finding any 742 * This can iterate a massively long zone without finding any
711 * suitable migration targets, so periodically check if we need 743 * suitable migration targets, so periodically check if we need
712 * to schedule. 744 * to schedule, or even abort async compaction.
713 */ 745 */
714 cond_resched(); 746 if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
747 && compact_should_abort(cc))
748 break;
715 749
716 if (!pfn_valid(pfn)) 750 if (!pfn_valid(block_start_pfn))
717 continue; 751 continue;
718 752
719 /* 753 /*
@@ -723,7 +757,7 @@ static void isolate_freepages(struct zone *zone,
723 * i.e. it's possible that all pages within a zones range of 757 * i.e. it's possible that all pages within a zones range of
724 * pages do not belong to a single zone. 758 * pages do not belong to a single zone.
725 */ 759 */
726 page = pfn_to_page(pfn); 760 page = pfn_to_page(block_start_pfn);
727 if (page_zone(page) != zone) 761 if (page_zone(page) != zone)
728 continue; 762 continue;
729 763
@@ -736,26 +770,26 @@ static void isolate_freepages(struct zone *zone,
736 continue; 770 continue;
737 771
738 /* Found a block suitable for isolating free pages from */ 772 /* Found a block suitable for isolating free pages from */
739 isolated = 0; 773 cc->free_pfn = block_start_pfn;
774 isolated = isolate_freepages_block(cc, block_start_pfn,
775 block_end_pfn, freelist, false);
776 nr_freepages += isolated;
740 777
741 /* 778 /*
742 * Take care when isolating in last pageblock of a zone which 779 * Set a flag that we successfully isolated in this pageblock.
743 * ends in the middle of a pageblock. 780 * In the next loop iteration, zone->compact_cached_free_pfn
781 * will not be updated and thus it will effectively contain the
782 * highest pageblock we isolated pages from.
744 */ 783 */
745 end_pfn = min(pfn + pageblock_nr_pages, z_end_pfn); 784 if (isolated)
746 isolated = isolate_freepages_block(cc, pfn, end_pfn, 785 cc->finished_update_free = true;
747 freelist, false);
748 nr_freepages += isolated;
749 786
750 /* 787 /*
751 * Record the highest PFN we isolated pages from. When next 788 * isolate_freepages_block() might have aborted due to async
752 * looking for free pages, the search will restart here as 789 * compaction being contended
753 * page migration may have returned some pages to the allocator
754 */ 790 */
755 if (isolated) { 791 if (cc->contended)
756 cc->finished_update_free = true; 792 break;
757 high_pfn = max(high_pfn, pfn);
758 }
759 } 793 }
760 794
761 /* split_free_page does not map the pages */ 795 /* split_free_page does not map the pages */
@@ -765,10 +799,9 @@ static void isolate_freepages(struct zone *zone,
765 * If we crossed the migrate scanner, we want to keep it that way 799 * If we crossed the migrate scanner, we want to keep it that way
766 * so that compact_finished() may detect this 800 * so that compact_finished() may detect this
767 */ 801 */
768 if (pfn < low_pfn) 802 if (block_start_pfn < low_pfn)
769 cc->free_pfn = max(pfn, zone->zone_start_pfn); 803 cc->free_pfn = cc->migrate_pfn;
770 else 804
771 cc->free_pfn = high_pfn;
772 cc->nr_freepages = nr_freepages; 805 cc->nr_freepages = nr_freepages;
773} 806}
774 807
@@ -783,9 +816,13 @@ static struct page *compaction_alloc(struct page *migratepage,
783 struct compact_control *cc = (struct compact_control *)data; 816 struct compact_control *cc = (struct compact_control *)data;
784 struct page *freepage; 817 struct page *freepage;
785 818
786 /* Isolate free pages if necessary */ 819 /*
820 * Isolate free pages if necessary, and if we are not aborting due to
821 * contention.
822 */
787 if (list_empty(&cc->freepages)) { 823 if (list_empty(&cc->freepages)) {
788 isolate_freepages(cc->zone, cc); 824 if (!cc->contended)
825 isolate_freepages(cc->zone, cc);
789 826
790 if (list_empty(&cc->freepages)) 827 if (list_empty(&cc->freepages))
791 return NULL; 828 return NULL;
@@ -799,23 +836,16 @@ static struct page *compaction_alloc(struct page *migratepage,
799} 836}
800 837
801/* 838/*
802 * We cannot control nr_migratepages and nr_freepages fully when migration is 839 * This is a migrate-callback that "frees" freepages back to the isolated
803 * running as migrate_pages() has no knowledge of compact_control. When 840 * freelist. All pages on the freelist are from the same zone, so there is no
804 * migration is complete, we count the number of pages on the lists by hand. 841 * special handling needed for NUMA.
805 */ 842 */
806static void update_nr_listpages(struct compact_control *cc) 843static void compaction_free(struct page *page, unsigned long data)
807{ 844{
808 int nr_migratepages = 0; 845 struct compact_control *cc = (struct compact_control *)data;
809 int nr_freepages = 0;
810 struct page *page;
811
812 list_for_each_entry(page, &cc->migratepages, lru)
813 nr_migratepages++;
814 list_for_each_entry(page, &cc->freepages, lru)
815 nr_freepages++;
816 846
817 cc->nr_migratepages = nr_migratepages; 847 list_add(&page->lru, &cc->freepages);
818 cc->nr_freepages = nr_freepages; 848 cc->nr_freepages++;
819} 849}
820 850
821/* possible outcome of isolate_migratepages */ 851/* possible outcome of isolate_migratepages */
@@ -862,13 +892,14 @@ static int compact_finished(struct zone *zone,
862 unsigned int order; 892 unsigned int order;
863 unsigned long watermark; 893 unsigned long watermark;
864 894
865 if (fatal_signal_pending(current)) 895 if (cc->contended || fatal_signal_pending(current))
866 return COMPACT_PARTIAL; 896 return COMPACT_PARTIAL;
867 897
868 /* Compaction run completes if the migrate and free scanner meet */ 898 /* Compaction run completes if the migrate and free scanner meet */
869 if (cc->free_pfn <= cc->migrate_pfn) { 899 if (cc->free_pfn <= cc->migrate_pfn) {
870 /* Let the next compaction start anew. */ 900 /* Let the next compaction start anew. */
871 zone->compact_cached_migrate_pfn = zone->zone_start_pfn; 901 zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
902 zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
872 zone->compact_cached_free_pfn = zone_end_pfn(zone); 903 zone->compact_cached_free_pfn = zone_end_pfn(zone);
873 904
874 /* 905 /*
@@ -968,6 +999,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
968 int ret; 999 int ret;
969 unsigned long start_pfn = zone->zone_start_pfn; 1000 unsigned long start_pfn = zone->zone_start_pfn;
970 unsigned long end_pfn = zone_end_pfn(zone); 1001 unsigned long end_pfn = zone_end_pfn(zone);
1002 const bool sync = cc->mode != MIGRATE_ASYNC;
971 1003
972 ret = compaction_suitable(zone, cc->order); 1004 ret = compaction_suitable(zone, cc->order);
973 switch (ret) { 1005 switch (ret) {
@@ -993,7 +1025,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
993 * information on where the scanners should start but check that it 1025 * information on where the scanners should start but check that it
994 * is initialised by ensuring the values are within zone boundaries. 1026 * is initialised by ensuring the values are within zone boundaries.
995 */ 1027 */
996 cc->migrate_pfn = zone->compact_cached_migrate_pfn; 1028 cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
997 cc->free_pfn = zone->compact_cached_free_pfn; 1029 cc->free_pfn = zone->compact_cached_free_pfn;
998 if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) { 1030 if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
999 cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1); 1031 cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
@@ -1001,7 +1033,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
1001 } 1033 }
1002 if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) { 1034 if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) {
1003 cc->migrate_pfn = start_pfn; 1035 cc->migrate_pfn = start_pfn;
1004 zone->compact_cached_migrate_pfn = cc->migrate_pfn; 1036 zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
1037 zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
1005 } 1038 }
1006 1039
1007 trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn); 1040 trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn);
@@ -1009,7 +1042,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
1009 migrate_prep_local(); 1042 migrate_prep_local();
1010 1043
1011 while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { 1044 while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
1012 unsigned long nr_migrate, nr_remaining;
1013 int err; 1045 int err;
1014 1046
1015 switch (isolate_migratepages(zone, cc)) { 1047 switch (isolate_migratepages(zone, cc)) {
@@ -1024,21 +1056,20 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
1024 ; 1056 ;
1025 } 1057 }
1026 1058
1027 nr_migrate = cc->nr_migratepages; 1059 if (!cc->nr_migratepages)
1060 continue;
1061
1028 err = migrate_pages(&cc->migratepages, compaction_alloc, 1062 err = migrate_pages(&cc->migratepages, compaction_alloc,
1029 (unsigned long)cc, 1063 compaction_free, (unsigned long)cc, cc->mode,
1030 cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
1031 MR_COMPACTION); 1064 MR_COMPACTION);
1032 update_nr_listpages(cc);
1033 nr_remaining = cc->nr_migratepages;
1034 1065
1035 trace_mm_compaction_migratepages(nr_migrate - nr_remaining, 1066 trace_mm_compaction_migratepages(cc->nr_migratepages, err,
1036 nr_remaining); 1067 &cc->migratepages);
1037 1068
1038 /* Release isolated pages not migrated */ 1069 /* All pages were either migrated or will be released */
1070 cc->nr_migratepages = 0;
1039 if (err) { 1071 if (err) {
1040 putback_movable_pages(&cc->migratepages); 1072 putback_movable_pages(&cc->migratepages);
1041 cc->nr_migratepages = 0;
1042 /* 1073 /*
1043 * migrate_pages() may return -ENOMEM when scanners meet 1074 * migrate_pages() may return -ENOMEM when scanners meet
1044 * and we want compact_finished() to detect it 1075 * and we want compact_finished() to detect it
@@ -1060,9 +1091,8 @@ out:
1060 return ret; 1091 return ret;
1061} 1092}
1062 1093
1063static unsigned long compact_zone_order(struct zone *zone, 1094static unsigned long compact_zone_order(struct zone *zone, int order,
1064 int order, gfp_t gfp_mask, 1095 gfp_t gfp_mask, enum migrate_mode mode, bool *contended)
1065 bool sync, bool *contended)
1066{ 1096{
1067 unsigned long ret; 1097 unsigned long ret;
1068 struct compact_control cc = { 1098 struct compact_control cc = {
@@ -1071,7 +1101,7 @@ static unsigned long compact_zone_order(struct zone *zone,
1071 .order = order, 1101 .order = order,
1072 .migratetype = allocflags_to_migratetype(gfp_mask), 1102 .migratetype = allocflags_to_migratetype(gfp_mask),
1073 .zone = zone, 1103 .zone = zone,
1074 .sync = sync, 1104 .mode = mode,
1075 }; 1105 };
1076 INIT_LIST_HEAD(&cc.freepages); 1106 INIT_LIST_HEAD(&cc.freepages);
1077 INIT_LIST_HEAD(&cc.migratepages); 1107 INIT_LIST_HEAD(&cc.migratepages);
@@ -1093,7 +1123,7 @@ int sysctl_extfrag_threshold = 500;
1093 * @order: The order of the current allocation 1123 * @order: The order of the current allocation
1094 * @gfp_mask: The GFP mask of the current allocation 1124 * @gfp_mask: The GFP mask of the current allocation
1095 * @nodemask: The allowed nodes to allocate from 1125 * @nodemask: The allowed nodes to allocate from
1096 * @sync: Whether migration is synchronous or not 1126 * @mode: The migration mode for async, sync light, or sync migration
1097 * @contended: Return value that is true if compaction was aborted due to lock contention 1127 * @contended: Return value that is true if compaction was aborted due to lock contention
1098 * @page: Optionally capture a free page of the requested order during compaction 1128 * @page: Optionally capture a free page of the requested order during compaction
1099 * 1129 *
@@ -1101,7 +1131,7 @@ int sysctl_extfrag_threshold = 500;
1101 */ 1131 */
1102unsigned long try_to_compact_pages(struct zonelist *zonelist, 1132unsigned long try_to_compact_pages(struct zonelist *zonelist,
1103 int order, gfp_t gfp_mask, nodemask_t *nodemask, 1133 int order, gfp_t gfp_mask, nodemask_t *nodemask,
1104 bool sync, bool *contended) 1134 enum migrate_mode mode, bool *contended)
1105{ 1135{
1106 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 1136 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
1107 int may_enter_fs = gfp_mask & __GFP_FS; 1137 int may_enter_fs = gfp_mask & __GFP_FS;
@@ -1126,7 +1156,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
1126 nodemask) { 1156 nodemask) {
1127 int status; 1157 int status;
1128 1158
1129 status = compact_zone_order(zone, order, gfp_mask, sync, 1159 status = compact_zone_order(zone, order, gfp_mask, mode,
1130 contended); 1160 contended);
1131 rc = max(status, rc); 1161 rc = max(status, rc);
1132 1162
@@ -1165,9 +1195,6 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
1165 if (zone_watermark_ok(zone, cc->order, 1195 if (zone_watermark_ok(zone, cc->order,
1166 low_wmark_pages(zone), 0, 0)) 1196 low_wmark_pages(zone), 0, 0))
1167 compaction_defer_reset(zone, cc->order, false); 1197 compaction_defer_reset(zone, cc->order, false);
1168 /* Currently async compaction is never deferred. */
1169 else if (cc->sync)
1170 defer_compaction(zone, cc->order);
1171 } 1198 }
1172 1199
1173 VM_BUG_ON(!list_empty(&cc->freepages)); 1200 VM_BUG_ON(!list_empty(&cc->freepages));
@@ -1179,7 +1206,7 @@ void compact_pgdat(pg_data_t *pgdat, int order)
1179{ 1206{
1180 struct compact_control cc = { 1207 struct compact_control cc = {
1181 .order = order, 1208 .order = order,
1182 .sync = false, 1209 .mode = MIGRATE_ASYNC,
1183 }; 1210 };
1184 1211
1185 if (!order) 1212 if (!order)
@@ -1192,7 +1219,7 @@ static void compact_node(int nid)
1192{ 1219{
1193 struct compact_control cc = { 1220 struct compact_control cc = {
1194 .order = -1, 1221 .order = -1,
1195 .sync = true, 1222 .mode = MIGRATE_SYNC,
1196 .ignore_skip_hint = true, 1223 .ignore_skip_hint = true,
1197 }; 1224 };
1198 1225
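
For reference, the cc->mode field that replaces cc->sync throughout the hunks above takes the migrate_mode values, roughly as defined in include/linux/migrate_mode.h:

enum migrate_mode {
	MIGRATE_ASYNC,		/* never blocks; async compaction uses this */
	MIGRATE_SYNC_LIGHT,	/* may block on most operations, but not on writeback */
	MIGRATE_SYNC,		/* may block on everything */
};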
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 8058fcd7ae91..306baa594f95 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -170,24 +170,16 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
170 retval->boundary = boundary; 170 retval->boundary = boundary;
171 retval->allocation = allocation; 171 retval->allocation = allocation;
172 172
173 if (dev) { 173 INIT_LIST_HEAD(&retval->pools);
174 int ret;
175 174
176 mutex_lock(&pools_lock); 175 mutex_lock(&pools_lock);
177 if (list_empty(&dev->dma_pools)) 176 if (list_empty(&dev->dma_pools) &&
178 ret = device_create_file(dev, &dev_attr_pools); 177 device_create_file(dev, &dev_attr_pools)) {
179 else 178 kfree(retval);
180 ret = 0; 179 return NULL;
181 /* note: not currently insisting "name" be unique */
182 if (!ret)
183 list_add(&retval->pools, &dev->dma_pools);
184 else {
185 kfree(retval);
186 retval = NULL;
187 }
188 mutex_unlock(&pools_lock);
189 } else 180 } else
190 INIT_LIST_HEAD(&retval->pools); 181 list_add(&retval->pools, &dev->dma_pools);
182 mutex_unlock(&pools_lock);
191 183
192 return retval; 184 return retval;
193} 185}
@@ -508,7 +500,6 @@ void dmam_pool_destroy(struct dma_pool *pool)
508{ 500{
509 struct device *dev = pool->dev; 501 struct device *dev = pool->dev;
510 502
511 WARN_ON(devres_destroy(dev, dmam_pool_release, dmam_pool_match, pool)); 503 WARN_ON(devres_release(dev, dmam_pool_release, dmam_pool_match, pool));
512 dma_pool_destroy(pool);
513} 504}
514EXPORT_SYMBOL(dmam_pool_destroy); 505EXPORT_SYMBOL(dmam_pool_destroy);
diff --git a/mm/filemap.c b/mm/filemap.c
index 021056c324e6..7fadf1c62838 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -753,8 +753,17 @@ EXPORT_SYMBOL(unlock_page);
753 */ 753 */
754void end_page_writeback(struct page *page) 754void end_page_writeback(struct page *page)
755{ 755{
756 if (TestClearPageReclaim(page)) 756 /*
757 * TestClearPageReclaim could be used here but it is an atomic
758 * operation and overkill in this particular case. Failing to
759 * shuffle a page marked for immediate reclaim is too mild to
760 * justify taking an atomic operation penalty at the end of
                                                                     761	 * every page writeback.
762 */
763 if (PageReclaim(page)) {
764 ClearPageReclaim(page);
757 rotate_reclaimable_page(page); 765 rotate_reclaimable_page(page);
766 }
758 767
759 if (!test_clear_page_writeback(page)) 768 if (!test_clear_page_writeback(page))
760 BUG(); 769 BUG();
@@ -764,6 +773,31 @@ void end_page_writeback(struct page *page)
764} 773}
765EXPORT_SYMBOL(end_page_writeback); 774EXPORT_SYMBOL(end_page_writeback);
766 775
776/*
777 * After completing I/O on a page, call this routine to update the page
778 * flags appropriately
779 */
780void page_endio(struct page *page, int rw, int err)
781{
782 if (rw == READ) {
783 if (!err) {
784 SetPageUptodate(page);
785 } else {
786 ClearPageUptodate(page);
787 SetPageError(page);
788 }
789 unlock_page(page);
790 } else { /* rw == WRITE */
791 if (err) {
792 SetPageError(page);
793 if (page->mapping)
794 mapping_set_error(page->mapping, err);
795 }
796 end_page_writeback(page);
797 }
798}
799EXPORT_SYMBOL_GPL(page_endio);
800
767/** 801/**
768 * __lock_page - get a lock on the page, assuming we need to sleep to get it 802 * __lock_page - get a lock on the page, assuming we need to sleep to get it
769 * @page: the page to lock 803 * @page: the page to lock
@@ -957,26 +991,6 @@ out:
957EXPORT_SYMBOL(find_get_entry); 991EXPORT_SYMBOL(find_get_entry);
958 992
959/** 993/**
960 * find_get_page - find and get a page reference
961 * @mapping: the address_space to search
962 * @offset: the page index
963 *
964 * Looks up the page cache slot at @mapping & @offset. If there is a
965 * page cache page, it is returned with an increased refcount.
966 *
967 * Otherwise, %NULL is returned.
968 */
969struct page *find_get_page(struct address_space *mapping, pgoff_t offset)
970{
971 struct page *page = find_get_entry(mapping, offset);
972
973 if (radix_tree_exceptional_entry(page))
974 page = NULL;
975 return page;
976}
977EXPORT_SYMBOL(find_get_page);
978
979/**
980 * find_lock_entry - locate, pin and lock a page cache entry 994 * find_lock_entry - locate, pin and lock a page cache entry
981 * @mapping: the address_space to search 995 * @mapping: the address_space to search
982 * @offset: the page cache index 996 * @offset: the page cache index
@@ -1013,66 +1027,84 @@ repeat:
1013EXPORT_SYMBOL(find_lock_entry); 1027EXPORT_SYMBOL(find_lock_entry);
1014 1028
1015/** 1029/**
1016 * find_lock_page - locate, pin and lock a pagecache page 1030 * pagecache_get_page - find and get a page reference
1017 * @mapping: the address_space to search 1031 * @mapping: the address_space to search
1018 * @offset: the page index 1032 * @offset: the page index
                                                                     1033	 * @fgp_flags: FGP flags
1034 * @gfp_mask: gfp mask to use if a page is to be allocated
1019 * 1035 *
1020 * Looks up the page cache slot at @mapping & @offset. If there is a 1036 * Looks up the page cache slot at @mapping & @offset.
1021 * page cache page, it is returned locked and with an increased
1022 * refcount.
1023 *
1024 * Otherwise, %NULL is returned.
1025 *
1026 * find_lock_page() may sleep.
1027 */
1028struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
1029{
1030 struct page *page = find_lock_entry(mapping, offset);
1031
1032 if (radix_tree_exceptional_entry(page))
1033 page = NULL;
1034 return page;
1035}
1036EXPORT_SYMBOL(find_lock_page);
1037
1038/**
1039 * find_or_create_page - locate or add a pagecache page
1040 * @mapping: the page's address_space
1041 * @index: the page's index into the mapping
1042 * @gfp_mask: page allocation mode
1043 * 1037 *
 1044	 * Looks up the page cache slot at @mapping & @offset. If there is a	 1038	 * FGP flags modify how the page is returned
1045 * page cache page, it is returned locked and with an increased
1046 * refcount.
1047 * 1039 *
1048 * If the page is not present, a new page is allocated using @gfp_mask 1040 * FGP_ACCESSED: the page will be marked accessed
 1049	 * and added to the page cache and the VM's LRU list. The page is	 1041	 * FGP_LOCK: Page is returned locked
1050 * returned locked and with an increased refcount. 1042 * FGP_CREAT: If page is not present then a new page is allocated using
1043 * @gfp_mask and added to the page cache and the VM's LRU
1044 * list. The page is returned locked and with an increased
1045 * refcount. Otherwise, %NULL is returned.
1051 * 1046 *
1052 * On memory exhaustion, %NULL is returned. 1047 * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even
1048 * if the GFP flags specified for FGP_CREAT are atomic.
1053 * 1049 *
1054 * find_or_create_page() may sleep, even if @gfp_flags specifies an 1050 * If there is a page cache page, it is returned with an increased refcount.
1055 * atomic allocation!
1056 */ 1051 */
1057struct page *find_or_create_page(struct address_space *mapping, 1052struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
1058 pgoff_t index, gfp_t gfp_mask) 1053 int fgp_flags, gfp_t cache_gfp_mask, gfp_t radix_gfp_mask)
1059{ 1054{
1060 struct page *page; 1055 struct page *page;
1061 int err; 1056
1062repeat: 1057repeat:
1063 page = find_lock_page(mapping, index); 1058 page = find_get_entry(mapping, offset);
1064 if (!page) { 1059 if (radix_tree_exceptional_entry(page))
1065 page = __page_cache_alloc(gfp_mask); 1060 page = NULL;
1061 if (!page)
1062 goto no_page;
1063
1064 if (fgp_flags & FGP_LOCK) {
1065 if (fgp_flags & FGP_NOWAIT) {
1066 if (!trylock_page(page)) {
1067 page_cache_release(page);
1068 return NULL;
1069 }
1070 } else {
1071 lock_page(page);
1072 }
1073
1074 /* Has the page been truncated? */
1075 if (unlikely(page->mapping != mapping)) {
1076 unlock_page(page);
1077 page_cache_release(page);
1078 goto repeat;
1079 }
1080 VM_BUG_ON_PAGE(page->index != offset, page);
1081 }
1082
1083 if (page && (fgp_flags & FGP_ACCESSED))
1084 mark_page_accessed(page);
1085
1086no_page:
1087 if (!page && (fgp_flags & FGP_CREAT)) {
1088 int err;
1089 if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping))
1090 cache_gfp_mask |= __GFP_WRITE;
1091 if (fgp_flags & FGP_NOFS) {
1092 cache_gfp_mask &= ~__GFP_FS;
1093 radix_gfp_mask &= ~__GFP_FS;
1094 }
1095
1096 page = __page_cache_alloc(cache_gfp_mask);
1066 if (!page) 1097 if (!page)
1067 return NULL; 1098 return NULL;
1068 /* 1099
1069 * We want a regular kernel memory (not highmem or DMA etc) 1100 if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK)))
1070 * allocation for the radix tree nodes, but we need to honour 1101 fgp_flags |= FGP_LOCK;
1071 * the context-specific requirements the caller has asked for. 1102
 1072	 * GFP_RECLAIM_MASK collects those requirements.	 1103		/* Init accessed so avoid atomic mark_page_accessed later */
1073 */ 1104 if (fgp_flags & FGP_ACCESSED)
1074 err = add_to_page_cache_lru(page, mapping, index, 1105 init_page_accessed(page);
1075 (gfp_mask & GFP_RECLAIM_MASK)); 1106
1107 err = add_to_page_cache_lru(page, mapping, offset, radix_gfp_mask);
1076 if (unlikely(err)) { 1108 if (unlikely(err)) {
1077 page_cache_release(page); 1109 page_cache_release(page);
1078 page = NULL; 1110 page = NULL;
@@ -1080,9 +1112,10 @@ repeat:
1080 goto repeat; 1112 goto repeat;
1081 } 1113 }
1082 } 1114 }
1115
1083 return page; 1116 return page;
1084} 1117}
1085EXPORT_SYMBOL(find_or_create_page); 1118EXPORT_SYMBOL(pagecache_get_page);
1086 1119
1087/** 1120/**
1088 * find_get_entries - gang pagecache lookup 1121 * find_get_entries - gang pagecache lookup
@@ -1379,39 +1412,6 @@ repeat:
1379} 1412}
1380EXPORT_SYMBOL(find_get_pages_tag); 1413EXPORT_SYMBOL(find_get_pages_tag);
1381 1414
1382/**
1383 * grab_cache_page_nowait - returns locked page at given index in given cache
1384 * @mapping: target address_space
1385 * @index: the page index
1386 *
1387 * Same as grab_cache_page(), but do not wait if the page is unavailable.
1388 * This is intended for speculative data generators, where the data can
1389 * be regenerated if the page couldn't be grabbed. This routine should
1390 * be safe to call while holding the lock for another page.
1391 *
1392 * Clear __GFP_FS when allocating the page to avoid recursion into the fs
1393 * and deadlock against the caller's locked page.
1394 */
1395struct page *
1396grab_cache_page_nowait(struct address_space *mapping, pgoff_t index)
1397{
1398 struct page *page = find_get_page(mapping, index);
1399
1400 if (page) {
1401 if (trylock_page(page))
1402 return page;
1403 page_cache_release(page);
1404 return NULL;
1405 }
1406 page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS);
1407 if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) {
1408 page_cache_release(page);
1409 page = NULL;
1410 }
1411 return page;
1412}
1413EXPORT_SYMBOL(grab_cache_page_nowait);
1414
1415/* 1415/*
1416 * CD/DVDs are error prone. When a medium error occurs, the driver may fail 1416 * CD/DVDs are error prone. When a medium error occurs, the driver may fail
1417 * a _large_ part of the i/o request. Imagine the worst scenario: 1417 * a _large_ part of the i/o request. Imagine the worst scenario:
@@ -2381,7 +2381,6 @@ int pagecache_write_end(struct file *file, struct address_space *mapping,
2381{ 2381{
2382 const struct address_space_operations *aops = mapping->a_ops; 2382 const struct address_space_operations *aops = mapping->a_ops;
2383 2383
2384 mark_page_accessed(page);
2385 return aops->write_end(file, mapping, pos, len, copied, page, fsdata); 2384 return aops->write_end(file, mapping, pos, len, copied, page, fsdata);
2386} 2385}
2387EXPORT_SYMBOL(pagecache_write_end); 2386EXPORT_SYMBOL(pagecache_write_end);
@@ -2463,34 +2462,18 @@ EXPORT_SYMBOL(generic_file_direct_write);
2463struct page *grab_cache_page_write_begin(struct address_space *mapping, 2462struct page *grab_cache_page_write_begin(struct address_space *mapping,
2464 pgoff_t index, unsigned flags) 2463 pgoff_t index, unsigned flags)
2465{ 2464{
2466 int status;
2467 gfp_t gfp_mask;
2468 struct page *page; 2465 struct page *page;
2469 gfp_t gfp_notmask = 0; 2466 int fgp_flags = FGP_LOCK|FGP_ACCESSED|FGP_WRITE|FGP_CREAT;
2470 2467
2471 gfp_mask = mapping_gfp_mask(mapping);
2472 if (mapping_cap_account_dirty(mapping))
2473 gfp_mask |= __GFP_WRITE;
2474 if (flags & AOP_FLAG_NOFS) 2468 if (flags & AOP_FLAG_NOFS)
2475 gfp_notmask = __GFP_FS; 2469 fgp_flags |= FGP_NOFS;
2476repeat: 2470
2477 page = find_lock_page(mapping, index); 2471 page = pagecache_get_page(mapping, index, fgp_flags,
2472 mapping_gfp_mask(mapping),
2473 GFP_KERNEL);
2478 if (page) 2474 if (page)
2479 goto found; 2475 wait_for_stable_page(page);
2480 2476
2481 page = __page_cache_alloc(gfp_mask & ~gfp_notmask);
2482 if (!page)
2483 return NULL;
2484 status = add_to_page_cache_lru(page, mapping, index,
2485 GFP_KERNEL & ~gfp_notmask);
2486 if (unlikely(status)) {
2487 page_cache_release(page);
2488 if (status == -EEXIST)
2489 goto repeat;
2490 return NULL;
2491 }
2492found:
2493 wait_for_stable_page(page);
2494 return page; 2477 return page;
2495} 2478}
2496EXPORT_SYMBOL(grab_cache_page_write_begin); 2479EXPORT_SYMBOL(grab_cache_page_write_begin);
@@ -2539,7 +2522,7 @@ again:
2539 2522
2540 status = a_ops->write_begin(file, mapping, pos, bytes, flags, 2523 status = a_ops->write_begin(file, mapping, pos, bytes, flags,
2541 &page, &fsdata); 2524 &page, &fsdata);
2542 if (unlikely(status)) 2525 if (unlikely(status < 0))
2543 break; 2526 break;
2544 2527
2545 if (mapping_writably_mapped(mapping)) 2528 if (mapping_writably_mapped(mapping))
@@ -2548,7 +2531,6 @@ again:
2548 copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); 2531 copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
2549 flush_dcache_page(page); 2532 flush_dcache_page(page);
2550 2533
2551 mark_page_accessed(page);
2552 status = a_ops->write_end(file, mapping, pos, bytes, copied, 2534 status = a_ops->write_end(file, mapping, pos, bytes, copied,
2553 page, fsdata); 2535 page, fsdata);
2554 if (unlikely(status < 0)) 2536 if (unlikely(status < 0))
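
The filemap hunks above replace the find_get_page()/find_lock_page()/find_or_create_page()/grab_cache_page_nowait() family with a single pagecache_get_page() driven by FGP_* flags, add page_endio() as a common I/O-completion helper, and drop the mark_page_accessed() calls from pagecache_write_end() and the generic write loop so that newly allocated pages can be marked accessed without an atomic operation (see init_page_accessed() above). grab_cache_page_write_begin() is the in-tree example of the new convention; a minimal sketch of a lookup-or-create call against the 5-argument signature introduced here (wrapper name hypothetical):

    #include <linux/pagemap.h>

    /*
     * Return the page at @index locked and marked accessed, creating it
     * if needed.  Separate gfp masks cover the page itself and the
     * radix-tree nodes, as in this version of the helper.
     */
    static struct page *example_grab_page(struct address_space *mapping,
                                          pgoff_t index)
    {
            return pagecache_get_page(mapping, index,
                                      FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
                                      mapping_gfp_mask(mapping),
                                      GFP_KERNEL);
    }

Dropping FGP_CREAT from the flags gives a pure lookup: the existing page, locked, or NULL.
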
diff --git a/mm/fremap.c b/mm/fremap.c
index 34feba60a17e..2c5646f11f41 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -82,13 +82,10 @@ static int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
82 82
83 ptfile = pgoff_to_pte(pgoff); 83 ptfile = pgoff_to_pte(pgoff);
84 84
85 if (!pte_none(*pte)) { 85 if (!pte_none(*pte))
86 if (pte_present(*pte) && pte_soft_dirty(*pte))
87 pte_file_mksoft_dirty(ptfile);
88 zap_pte(mm, vma, addr, pte); 86 zap_pte(mm, vma, addr, pte);
89 }
90 87
91 set_pte_at(mm, addr, pte, ptfile); 88 set_pte_at(mm, addr, pte, pte_file_mksoft_dirty(ptfile));
92 /* 89 /*
93 * We don't need to run update_mmu_cache() here because the "file pte" 90 * We don't need to run update_mmu_cache() here because the "file pte"
94 * being installed by install_file_pte() is not a real pte - it's a 91 * being installed by install_file_pte() is not a real pte - it's a
diff --git a/mm/frontswap.c b/mm/frontswap.c
index 1b24bdcb3197..c30eec536f03 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -327,15 +327,12 @@ EXPORT_SYMBOL(__frontswap_invalidate_area);
327 327
328static unsigned long __frontswap_curr_pages(void) 328static unsigned long __frontswap_curr_pages(void)
329{ 329{
330 int type;
331 unsigned long totalpages = 0; 330 unsigned long totalpages = 0;
332 struct swap_info_struct *si = NULL; 331 struct swap_info_struct *si = NULL;
333 332
334 assert_spin_locked(&swap_lock); 333 assert_spin_locked(&swap_lock);
335 for (type = swap_list.head; type >= 0; type = si->next) { 334 plist_for_each_entry(si, &swap_active_head, list)
336 si = swap_info[type];
337 totalpages += atomic_read(&si->frontswap_pages); 335 totalpages += atomic_read(&si->frontswap_pages);
338 }
339 return totalpages; 336 return totalpages;
340} 337}
341 338
@@ -347,11 +344,9 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
347 int si_frontswap_pages; 344 int si_frontswap_pages;
348 unsigned long total_pages_to_unuse = total; 345 unsigned long total_pages_to_unuse = total;
349 unsigned long pages = 0, pages_to_unuse = 0; 346 unsigned long pages = 0, pages_to_unuse = 0;
350 int type;
351 347
352 assert_spin_locked(&swap_lock); 348 assert_spin_locked(&swap_lock);
353 for (type = swap_list.head; type >= 0; type = si->next) { 349 plist_for_each_entry(si, &swap_active_head, list) {
354 si = swap_info[type];
355 si_frontswap_pages = atomic_read(&si->frontswap_pages); 350 si_frontswap_pages = atomic_read(&si->frontswap_pages);
356 if (total_pages_to_unuse < si_frontswap_pages) { 351 if (total_pages_to_unuse < si_frontswap_pages) {
357 pages = pages_to_unuse = total_pages_to_unuse; 352 pages = pages_to_unuse = total_pages_to_unuse;
@@ -366,7 +361,7 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
366 } 361 }
367 vm_unacct_memory(pages); 362 vm_unacct_memory(pages);
368 *unused = pages_to_unuse; 363 *unused = pages_to_unuse;
369 *swapid = type; 364 *swapid = si->type;
370 ret = 0; 365 ret = 0;
371 break; 366 break;
372 } 367 }
@@ -413,7 +408,7 @@ void frontswap_shrink(unsigned long target_pages)
413 /* 408 /*
414 * we don't want to hold swap_lock while doing a very 409 * we don't want to hold swap_lock while doing a very
415 * lengthy try_to_unuse, but swap_list may change 410 * lengthy try_to_unuse, but swap_list may change
416 * so restart scan from swap_list.head each time 411 * so restart scan from swap_active_head each time
417 */ 412 */
418 spin_lock(&swap_lock); 413 spin_lock(&swap_lock);
419 ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type); 414 ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type);
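
The frontswap hunks above track the swap-list rework elsewhere in this series: the swap_list.head / si->next index chain is gone, so these helpers simply walk the priority-sorted plist swap_active_head and read the device number from si->type when they need it. A sketch of the resulting iteration idiom, assuming swap_lock and swap_active_head remain the mm-internal globals declared via linux/swapfile.h; the function itself is hypothetical:

    #include <linux/plist.h>
    #include <linux/swap.h>
    #include <linux/swapfile.h>

    /* Walk the active swap devices the same way the helpers above now
     * do; the caller must hold swap_lock. */
    static int example_count_active_swap_devices(void)
    {
            struct swap_info_struct *si;
            int n = 0;

            assert_spin_locked(&swap_lock);
            plist_for_each_entry(si, &swap_active_head, list)
                    n++;
            return n;
    }
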
diff --git a/mm/gup.c b/mm/gup.c
new file mode 100644
index 000000000000..cc5a9e7adea7
--- /dev/null
+++ b/mm/gup.c
@@ -0,0 +1,662 @@
1#include <linux/kernel.h>
2#include <linux/errno.h>
3#include <linux/err.h>
4#include <linux/spinlock.h>
5
6#include <linux/hugetlb.h>
7#include <linux/mm.h>
8#include <linux/pagemap.h>
9#include <linux/rmap.h>
10#include <linux/swap.h>
11#include <linux/swapops.h>
12
13#include "internal.h"
14
15static struct page *no_page_table(struct vm_area_struct *vma,
16 unsigned int flags)
17{
18 /*
19 * When core dumping an enormous anonymous area that nobody
20 * has touched so far, we don't want to allocate unnecessary pages or
21 * page tables. Return error instead of NULL to skip handle_mm_fault,
22 * then get_dump_page() will return NULL to leave a hole in the dump.
23 * But we can only make this optimization where a hole would surely
24 * be zero-filled if handle_mm_fault() actually did handle it.
25 */
26 if ((flags & FOLL_DUMP) && (!vma->vm_ops || !vma->vm_ops->fault))
27 return ERR_PTR(-EFAULT);
28 return NULL;
29}
30
31static struct page *follow_page_pte(struct vm_area_struct *vma,
32 unsigned long address, pmd_t *pmd, unsigned int flags)
33{
34 struct mm_struct *mm = vma->vm_mm;
35 struct page *page;
36 spinlock_t *ptl;
37 pte_t *ptep, pte;
38
39retry:
40 if (unlikely(pmd_bad(*pmd)))
41 return no_page_table(vma, flags);
42
43 ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
44 pte = *ptep;
45 if (!pte_present(pte)) {
46 swp_entry_t entry;
47 /*
48 * KSM's break_ksm() relies upon recognizing a ksm page
49 * even while it is being migrated, so for that case we
50 * need migration_entry_wait().
51 */
52 if (likely(!(flags & FOLL_MIGRATION)))
53 goto no_page;
54 if (pte_none(pte) || pte_file(pte))
55 goto no_page;
56 entry = pte_to_swp_entry(pte);
57 if (!is_migration_entry(entry))
58 goto no_page;
59 pte_unmap_unlock(ptep, ptl);
60 migration_entry_wait(mm, pmd, address);
61 goto retry;
62 }
63 if ((flags & FOLL_NUMA) && pte_numa(pte))
64 goto no_page;
65 if ((flags & FOLL_WRITE) && !pte_write(pte)) {
66 pte_unmap_unlock(ptep, ptl);
67 return NULL;
68 }
69
70 page = vm_normal_page(vma, address, pte);
71 if (unlikely(!page)) {
72 if ((flags & FOLL_DUMP) ||
73 !is_zero_pfn(pte_pfn(pte)))
74 goto bad_page;
75 page = pte_page(pte);
76 }
77
78 if (flags & FOLL_GET)
79 get_page_foll(page);
80 if (flags & FOLL_TOUCH) {
81 if ((flags & FOLL_WRITE) &&
82 !pte_dirty(pte) && !PageDirty(page))
83 set_page_dirty(page);
84 /*
85 * pte_mkyoung() would be more correct here, but atomic care
86 * is needed to avoid losing the dirty bit: it is easier to use
87 * mark_page_accessed().
88 */
89 mark_page_accessed(page);
90 }
91 if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
92 /*
93 * The preliminary mapping check is mainly to avoid the
94 * pointless overhead of lock_page on the ZERO_PAGE
95 * which might bounce very badly if there is contention.
96 *
97 * If the page is already locked, we don't need to
98 * handle it now - vmscan will handle it later if and
99 * when it attempts to reclaim the page.
100 */
101 if (page->mapping && trylock_page(page)) {
102 lru_add_drain(); /* push cached pages to LRU */
103 /*
104 * Because we lock page here, and migration is
105 * blocked by the pte's page reference, and we
106 * know the page is still mapped, we don't even
107 * need to check for file-cache page truncation.
108 */
109 mlock_vma_page(page);
110 unlock_page(page);
111 }
112 }
113 pte_unmap_unlock(ptep, ptl);
114 return page;
115bad_page:
116 pte_unmap_unlock(ptep, ptl);
117 return ERR_PTR(-EFAULT);
118
119no_page:
120 pte_unmap_unlock(ptep, ptl);
121 if (!pte_none(pte))
122 return NULL;
123 return no_page_table(vma, flags);
124}
125
126/**
127 * follow_page_mask - look up a page descriptor from a user-virtual address
128 * @vma: vm_area_struct mapping @address
129 * @address: virtual address to look up
130 * @flags: flags modifying lookup behaviour
131 * @page_mask: on output, *page_mask is set according to the size of the page
132 *
133 * @flags can have FOLL_ flags set, defined in <linux/mm.h>
134 *
135 * Returns the mapped (struct page *), %NULL if no mapping exists, or
136 * an error pointer if there is a mapping to something not represented
137 * by a page descriptor (see also vm_normal_page()).
138 */
139struct page *follow_page_mask(struct vm_area_struct *vma,
140 unsigned long address, unsigned int flags,
141 unsigned int *page_mask)
142{
143 pgd_t *pgd;
144 pud_t *pud;
145 pmd_t *pmd;
146 spinlock_t *ptl;
147 struct page *page;
148 struct mm_struct *mm = vma->vm_mm;
149
150 *page_mask = 0;
151
152 page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
153 if (!IS_ERR(page)) {
154 BUG_ON(flags & FOLL_GET);
155 return page;
156 }
157
158 pgd = pgd_offset(mm, address);
159 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
160 return no_page_table(vma, flags);
161
162 pud = pud_offset(pgd, address);
163 if (pud_none(*pud))
164 return no_page_table(vma, flags);
165 if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
166 if (flags & FOLL_GET)
167 return NULL;
168 page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
169 return page;
170 }
171 if (unlikely(pud_bad(*pud)))
172 return no_page_table(vma, flags);
173
174 pmd = pmd_offset(pud, address);
175 if (pmd_none(*pmd))
176 return no_page_table(vma, flags);
177 if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
178 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
179 if (flags & FOLL_GET) {
180 /*
181 * Refcount on tail pages are not well-defined and
182 * shouldn't be taken. The caller should handle a NULL
183 * return when trying to follow tail pages.
184 */
185 if (PageHead(page))
186 get_page(page);
187 else
188 page = NULL;
189 }
190 return page;
191 }
192 if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
193 return no_page_table(vma, flags);
194 if (pmd_trans_huge(*pmd)) {
195 if (flags & FOLL_SPLIT) {
196 split_huge_page_pmd(vma, address, pmd);
197 return follow_page_pte(vma, address, pmd, flags);
198 }
199 ptl = pmd_lock(mm, pmd);
200 if (likely(pmd_trans_huge(*pmd))) {
201 if (unlikely(pmd_trans_splitting(*pmd))) {
202 spin_unlock(ptl);
203 wait_split_huge_page(vma->anon_vma, pmd);
204 } else {
205 page = follow_trans_huge_pmd(vma, address,
206 pmd, flags);
207 spin_unlock(ptl);
208 *page_mask = HPAGE_PMD_NR - 1;
209 return page;
210 }
211 } else
212 spin_unlock(ptl);
213 }
214 return follow_page_pte(vma, address, pmd, flags);
215}
216
217static int get_gate_page(struct mm_struct *mm, unsigned long address,
218 unsigned int gup_flags, struct vm_area_struct **vma,
219 struct page **page)
220{
221 pgd_t *pgd;
222 pud_t *pud;
223 pmd_t *pmd;
224 pte_t *pte;
225 int ret = -EFAULT;
226
227 /* user gate pages are read-only */
228 if (gup_flags & FOLL_WRITE)
229 return -EFAULT;
230 if (address > TASK_SIZE)
231 pgd = pgd_offset_k(address);
232 else
233 pgd = pgd_offset_gate(mm, address);
234 BUG_ON(pgd_none(*pgd));
235 pud = pud_offset(pgd, address);
236 BUG_ON(pud_none(*pud));
237 pmd = pmd_offset(pud, address);
238 if (pmd_none(*pmd))
239 return -EFAULT;
240 VM_BUG_ON(pmd_trans_huge(*pmd));
241 pte = pte_offset_map(pmd, address);
242 if (pte_none(*pte))
243 goto unmap;
244 *vma = get_gate_vma(mm);
245 if (!page)
246 goto out;
247 *page = vm_normal_page(*vma, address, *pte);
248 if (!*page) {
249 if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte)))
250 goto unmap;
251 *page = pte_page(*pte);
252 }
253 get_page(*page);
254out:
255 ret = 0;
256unmap:
257 pte_unmap(pte);
258 return ret;
259}
260
261static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
262 unsigned long address, unsigned int *flags, int *nonblocking)
263{
264 struct mm_struct *mm = vma->vm_mm;
265 unsigned int fault_flags = 0;
266 int ret;
267
268 /* For mlock, just skip the stack guard page. */
269 if ((*flags & FOLL_MLOCK) &&
270 (stack_guard_page_start(vma, address) ||
271 stack_guard_page_end(vma, address + PAGE_SIZE)))
272 return -ENOENT;
273 if (*flags & FOLL_WRITE)
274 fault_flags |= FAULT_FLAG_WRITE;
275 if (nonblocking)
276 fault_flags |= FAULT_FLAG_ALLOW_RETRY;
277 if (*flags & FOLL_NOWAIT)
278 fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
279
280 ret = handle_mm_fault(mm, vma, address, fault_flags);
281 if (ret & VM_FAULT_ERROR) {
282 if (ret & VM_FAULT_OOM)
283 return -ENOMEM;
284 if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
285 return *flags & FOLL_HWPOISON ? -EHWPOISON : -EFAULT;
286 if (ret & VM_FAULT_SIGBUS)
287 return -EFAULT;
288 BUG();
289 }
290
291 if (tsk) {
292 if (ret & VM_FAULT_MAJOR)
293 tsk->maj_flt++;
294 else
295 tsk->min_flt++;
296 }
297
298 if (ret & VM_FAULT_RETRY) {
299 if (nonblocking)
300 *nonblocking = 0;
301 return -EBUSY;
302 }
303
304 /*
305 * The VM_FAULT_WRITE bit tells us that do_wp_page has broken COW when
306 * necessary, even if maybe_mkwrite decided not to set pte_write. We
307 * can thus safely do subsequent page lookups as if they were reads.
308 * But only do so when looping for pte_write is futile: in some cases
309 * userspace may also be wanting to write to the gotten user page,
310 * which a read fault here might prevent (a readonly page might get
311 * reCOWed by userspace write).
312 */
313 if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE))
314 *flags &= ~FOLL_WRITE;
315 return 0;
316}
317
318static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
319{
320 vm_flags_t vm_flags = vma->vm_flags;
321
322 if (vm_flags & (VM_IO | VM_PFNMAP))
323 return -EFAULT;
324
325 if (gup_flags & FOLL_WRITE) {
326 if (!(vm_flags & VM_WRITE)) {
327 if (!(gup_flags & FOLL_FORCE))
328 return -EFAULT;
329 /*
330 * We used to let the write,force case do COW in a
331 * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could
332 * set a breakpoint in a read-only mapping of an
333 * executable, without corrupting the file (yet only
334 * when that file had been opened for writing!).
335 * Anon pages in shared mappings are surprising: now
336 * just reject it.
337 */
338 if (!is_cow_mapping(vm_flags)) {
339 WARN_ON_ONCE(vm_flags & VM_MAYWRITE);
340 return -EFAULT;
341 }
342 }
343 } else if (!(vm_flags & VM_READ)) {
344 if (!(gup_flags & FOLL_FORCE))
345 return -EFAULT;
346 /*
347 * Is there actually any vma we can reach here which does not
348 * have VM_MAYREAD set?
349 */
350 if (!(vm_flags & VM_MAYREAD))
351 return -EFAULT;
352 }
353 return 0;
354}
355
356/**
357 * __get_user_pages() - pin user pages in memory
358 * @tsk: task_struct of target task
359 * @mm: mm_struct of target mm
360 * @start: starting user address
361 * @nr_pages: number of pages from start to pin
362 * @gup_flags: flags modifying pin behaviour
363 * @pages: array that receives pointers to the pages pinned.
364 * Should be at least nr_pages long. Or NULL, if caller
365 * only intends to ensure the pages are faulted in.
366 * @vmas: array of pointers to vmas corresponding to each page.
367 * Or NULL if the caller does not require them.
368 * @nonblocking: whether waiting for disk IO or mmap_sem contention
369 *
370 * Returns number of pages pinned. This may be fewer than the number
371 * requested. If nr_pages is 0 or negative, returns 0. If no pages
372 * were pinned, returns -errno. Each page returned must be released
373 * with a put_page() call when it is finished with. vmas will only
374 * remain valid while mmap_sem is held.
375 *
376 * Must be called with mmap_sem held for read or write.
377 *
378 * __get_user_pages walks a process's page tables and takes a reference to
379 * each struct page that each user address corresponds to at a given
380 * instant. That is, it takes the page that would be accessed if a user
381 * thread accesses the given user virtual address at that instant.
382 *
383 * This does not guarantee that the page exists in the user mappings when
384 * __get_user_pages returns, and there may even be a completely different
385 * page there in some cases (eg. if mmapped pagecache has been invalidated
 386 * and subsequently refaulted). However it does guarantee that the page
387 * won't be freed completely. And mostly callers simply care that the page
388 * contains data that was valid *at some point in time*. Typically, an IO
389 * or similar operation cannot guarantee anything stronger anyway because
390 * locks can't be held over the syscall boundary.
391 *
392 * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
393 * the page is written to, set_page_dirty (or set_page_dirty_lock, as
394 * appropriate) must be called after the page is finished with, and
395 * before put_page is called.
396 *
397 * If @nonblocking != NULL, __get_user_pages will not wait for disk IO
398 * or mmap_sem contention, and if waiting is needed to pin all pages,
399 * *@nonblocking will be set to 0.
400 *
401 * In most cases, get_user_pages or get_user_pages_fast should be used
402 * instead of __get_user_pages. __get_user_pages should be used only if
403 * you need some special @gup_flags.
404 */
405long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
406 unsigned long start, unsigned long nr_pages,
407 unsigned int gup_flags, struct page **pages,
408 struct vm_area_struct **vmas, int *nonblocking)
409{
410 long i = 0;
411 unsigned int page_mask;
412 struct vm_area_struct *vma = NULL;
413
414 if (!nr_pages)
415 return 0;
416
417 VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
418
419 /*
420 * If FOLL_FORCE is set then do not force a full fault as the hinting
421 * fault information is unrelated to the reference behaviour of a task
422 * using the address space
423 */
424 if (!(gup_flags & FOLL_FORCE))
425 gup_flags |= FOLL_NUMA;
426
427 do {
428 struct page *page;
429 unsigned int foll_flags = gup_flags;
430 unsigned int page_increm;
431
432 /* first iteration or cross vma bound */
433 if (!vma || start >= vma->vm_end) {
434 vma = find_extend_vma(mm, start);
435 if (!vma && in_gate_area(mm, start)) {
436 int ret;
437 ret = get_gate_page(mm, start & PAGE_MASK,
438 gup_flags, &vma,
439 pages ? &pages[i] : NULL);
440 if (ret)
441 return i ? : ret;
442 page_mask = 0;
443 goto next_page;
444 }
445
446 if (!vma || check_vma_flags(vma, gup_flags))
447 return i ? : -EFAULT;
448 if (is_vm_hugetlb_page(vma)) {
449 i = follow_hugetlb_page(mm, vma, pages, vmas,
450 &start, &nr_pages, i,
451 gup_flags);
452 continue;
453 }
454 }
455retry:
456 /*
457 * If we have a pending SIGKILL, don't keep faulting pages and
458 * potentially allocating memory.
459 */
460 if (unlikely(fatal_signal_pending(current)))
461 return i ? i : -ERESTARTSYS;
462 cond_resched();
463 page = follow_page_mask(vma, start, foll_flags, &page_mask);
464 if (!page) {
465 int ret;
466 ret = faultin_page(tsk, vma, start, &foll_flags,
467 nonblocking);
468 switch (ret) {
469 case 0:
470 goto retry;
471 case -EFAULT:
472 case -ENOMEM:
473 case -EHWPOISON:
474 return i ? i : ret;
475 case -EBUSY:
476 return i;
477 case -ENOENT:
478 goto next_page;
479 }
480 BUG();
481 }
482 if (IS_ERR(page))
483 return i ? i : PTR_ERR(page);
484 if (pages) {
485 pages[i] = page;
486 flush_anon_page(vma, page, start);
487 flush_dcache_page(page);
488 page_mask = 0;
489 }
490next_page:
491 if (vmas) {
492 vmas[i] = vma;
493 page_mask = 0;
494 }
495 page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
496 if (page_increm > nr_pages)
497 page_increm = nr_pages;
498 i += page_increm;
499 start += page_increm * PAGE_SIZE;
500 nr_pages -= page_increm;
501 } while (nr_pages);
502 return i;
503}
504EXPORT_SYMBOL(__get_user_pages);
505
506/*
507 * fixup_user_fault() - manually resolve a user page fault
508 * @tsk: the task_struct to use for page fault accounting, or
509 * NULL if faults are not to be recorded.
510 * @mm: mm_struct of target mm
511 * @address: user address
512 * @fault_flags:flags to pass down to handle_mm_fault()
513 *
514 * This is meant to be called in the specific scenario where for locking reasons
515 * we try to access user memory in atomic context (within a pagefault_disable()
516 * section), this returns -EFAULT, and we want to resolve the user fault before
517 * trying again.
518 *
519 * Typically this is meant to be used by the futex code.
520 *
521 * The main difference with get_user_pages() is that this function will
522 * unconditionally call handle_mm_fault() which will in turn perform all the
523 * necessary SW fixup of the dirty and young bits in the PTE, while
 524 * get_user_pages() only guarantees to update these in the struct page.
525 *
526 * This is important for some architectures where those bits also gate the
527 * access permission to the page because they are maintained in software. On
528 * such architectures, gup() will not be enough to make a subsequent access
529 * succeed.
530 *
 531 * This should be called with the mmap_sem held for read.
532 */
533int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
534 unsigned long address, unsigned int fault_flags)
535{
536 struct vm_area_struct *vma;
537 vm_flags_t vm_flags;
538 int ret;
539
540 vma = find_extend_vma(mm, address);
541 if (!vma || address < vma->vm_start)
542 return -EFAULT;
543
544 vm_flags = (fault_flags & FAULT_FLAG_WRITE) ? VM_WRITE : VM_READ;
545 if (!(vm_flags & vma->vm_flags))
546 return -EFAULT;
547
548 ret = handle_mm_fault(mm, vma, address, fault_flags);
549 if (ret & VM_FAULT_ERROR) {
550 if (ret & VM_FAULT_OOM)
551 return -ENOMEM;
552 if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
553 return -EHWPOISON;
554 if (ret & VM_FAULT_SIGBUS)
555 return -EFAULT;
556 BUG();
557 }
558 if (tsk) {
559 if (ret & VM_FAULT_MAJOR)
560 tsk->maj_flt++;
561 else
562 tsk->min_flt++;
563 }
564 return 0;
565}
566
567/*
568 * get_user_pages() - pin user pages in memory
569 * @tsk: the task_struct to use for page fault accounting, or
570 * NULL if faults are not to be recorded.
571 * @mm: mm_struct of target mm
572 * @start: starting user address
573 * @nr_pages: number of pages from start to pin
574 * @write: whether pages will be written to by the caller
575 * @force: whether to force access even when user mapping is currently
576 * protected (but never forces write access to shared mapping).
577 * @pages: array that receives pointers to the pages pinned.
578 * Should be at least nr_pages long. Or NULL, if caller
579 * only intends to ensure the pages are faulted in.
580 * @vmas: array of pointers to vmas corresponding to each page.
581 * Or NULL if the caller does not require them.
582 *
583 * Returns number of pages pinned. This may be fewer than the number
584 * requested. If nr_pages is 0 or negative, returns 0. If no pages
585 * were pinned, returns -errno. Each page returned must be released
586 * with a put_page() call when it is finished with. vmas will only
587 * remain valid while mmap_sem is held.
588 *
589 * Must be called with mmap_sem held for read or write.
590 *
591 * get_user_pages walks a process's page tables and takes a reference to
592 * each struct page that each user address corresponds to at a given
593 * instant. That is, it takes the page that would be accessed if a user
594 * thread accesses the given user virtual address at that instant.
595 *
596 * This does not guarantee that the page exists in the user mappings when
597 * get_user_pages returns, and there may even be a completely different
598 * page there in some cases (eg. if mmapped pagecache has been invalidated
 599 * and subsequently refaulted). However it does guarantee that the page
600 * won't be freed completely. And mostly callers simply care that the page
601 * contains data that was valid *at some point in time*. Typically, an IO
602 * or similar operation cannot guarantee anything stronger anyway because
603 * locks can't be held over the syscall boundary.
604 *
605 * If write=0, the page must not be written to. If the page is written to,
606 * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called
607 * after the page is finished with, and before put_page is called.
608 *
609 * get_user_pages is typically used for fewer-copy IO operations, to get a
610 * handle on the memory by some means other than accesses via the user virtual
611 * addresses. The pages may be submitted for DMA to devices or accessed via
612 * their kernel linear mapping (via the kmap APIs). Care should be taken to
613 * use the correct cache flushing APIs.
614 *
615 * See also get_user_pages_fast, for performance critical applications.
616 */
617long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
618 unsigned long start, unsigned long nr_pages, int write,
619 int force, struct page **pages, struct vm_area_struct **vmas)
620{
621 int flags = FOLL_TOUCH;
622
623 if (pages)
624 flags |= FOLL_GET;
625 if (write)
626 flags |= FOLL_WRITE;
627 if (force)
628 flags |= FOLL_FORCE;
629
630 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
631 NULL);
632}
633EXPORT_SYMBOL(get_user_pages);
634
635/**
636 * get_dump_page() - pin user page in memory while writing it to core dump
637 * @addr: user address
638 *
639 * Returns struct page pointer of user page pinned for dump,
640 * to be freed afterwards by page_cache_release() or put_page().
641 *
642 * Returns NULL on any kind of failure - a hole must then be inserted into
643 * the corefile, to preserve alignment with its headers; and also returns
644 * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
645 * allowing a hole to be left in the corefile to save diskspace.
646 *
647 * Called without mmap_sem, but after all other threads have been killed.
648 */
649#ifdef CONFIG_ELF_CORE
650struct page *get_dump_page(unsigned long addr)
651{
652 struct vm_area_struct *vma;
653 struct page *page;
654
655 if (__get_user_pages(current, current->mm, addr, 1,
656 FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
657 NULL) < 1)
658 return NULL;
659 flush_cache_page(vma, addr, page_to_pfn(page));
660 return page;
661}
662#endif /* CONFIG_ELF_CORE */
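
mm/gup.c is new as a file but is essentially a code move: follow_page_mask(), __get_user_pages(), fixup_user_fault(), get_user_pages() and get_dump_page() are split out of mm/memory.c. For reference, a sketch of the classic calling pattern described in the __get_user_pages() comment above, using the 8-argument get_user_pages() of this kernel; the wrapper is hypothetical and error handling is trimmed:

    #include <linux/mm.h>
    #include <linux/pagemap.h>
    #include <linux/sched.h>

    static long example_pin_user_buffer(unsigned long uaddr,
                                        unsigned long nr_pages,
                                        struct page **pages)
    {
            struct mm_struct *mm = current->mm;
            long pinned, i;

            down_read(&mm->mmap_sem);
            pinned = get_user_pages(current, mm, uaddr & PAGE_MASK, nr_pages,
                                    1 /* write */, 0 /* force */, pages, NULL);
            up_read(&mm->mmap_sem);

            if (pinned < 0)
                    return pinned;

            /* ... access the pinned pages, e.g. via kmap() or DMA ... */

            for (i = 0; i < pinned; i++) {
                    set_page_dirty_lock(pages[i]);  /* we asked for write access */
                    put_page(pages[i]);
            }
            return pinned;
    }
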
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index d199d2d91946..e60837dc785c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -5,6 +5,8 @@
5 * the COPYING file in the top-level directory. 5 * the COPYING file in the top-level directory.
6 */ 6 */
7 7
8#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
9
8#include <linux/mm.h> 10#include <linux/mm.h>
9#include <linux/sched.h> 11#include <linux/sched.h>
10#include <linux/highmem.h> 12#include <linux/highmem.h>
@@ -151,8 +153,7 @@ static int start_khugepaged(void)
151 khugepaged_thread = kthread_run(khugepaged, NULL, 153 khugepaged_thread = kthread_run(khugepaged, NULL,
152 "khugepaged"); 154 "khugepaged");
153 if (unlikely(IS_ERR(khugepaged_thread))) { 155 if (unlikely(IS_ERR(khugepaged_thread))) {
154 printk(KERN_ERR 156 pr_err("khugepaged: kthread_run(khugepaged) failed\n");
155 "khugepaged: kthread_run(khugepaged) failed\n");
156 err = PTR_ERR(khugepaged_thread); 157 err = PTR_ERR(khugepaged_thread);
157 khugepaged_thread = NULL; 158 khugepaged_thread = NULL;
158 } 159 }
@@ -584,19 +585,19 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
584 585
585 *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); 586 *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
586 if (unlikely(!*hugepage_kobj)) { 587 if (unlikely(!*hugepage_kobj)) {
587 printk(KERN_ERR "hugepage: failed to create transparent hugepage kobject\n"); 588 pr_err("failed to create transparent hugepage kobject\n");
588 return -ENOMEM; 589 return -ENOMEM;
589 } 590 }
590 591
591 err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group); 592 err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
592 if (err) { 593 if (err) {
593 printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n"); 594 pr_err("failed to register transparent hugepage group\n");
594 goto delete_obj; 595 goto delete_obj;
595 } 596 }
596 597
597 err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group); 598 err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
598 if (err) { 599 if (err) {
599 printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n"); 600 pr_err("failed to register transparent hugepage group\n");
600 goto remove_hp_group; 601 goto remove_hp_group;
601 } 602 }
602 603
@@ -689,8 +690,7 @@ static int __init setup_transparent_hugepage(char *str)
689 } 690 }
690out: 691out:
691 if (!ret) 692 if (!ret)
692 printk(KERN_WARNING 693 pr_warn("transparent_hugepage= cannot parse, ignored\n");
693 "transparent_hugepage= cannot parse, ignored\n");
694 return ret; 694 return ret;
695} 695}
696__setup("transparent_hugepage=", setup_transparent_hugepage); 696__setup("transparent_hugepage=", setup_transparent_hugepage);
@@ -1830,10 +1830,11 @@ static void __split_huge_page(struct page *page,
1830 * the newly established pmd of the child later during the 1830 * the newly established pmd of the child later during the
1831 * walk, to be able to set it as pmd_trans_splitting too. 1831 * walk, to be able to set it as pmd_trans_splitting too.
1832 */ 1832 */
1833 if (mapcount != page_mapcount(page)) 1833 if (mapcount != page_mapcount(page)) {
1834 printk(KERN_ERR "mapcount %d page_mapcount %d\n", 1834 pr_err("mapcount %d page_mapcount %d\n",
1835 mapcount, page_mapcount(page)); 1835 mapcount, page_mapcount(page));
1836 BUG_ON(mapcount != page_mapcount(page)); 1836 BUG();
1837 }
1837 1838
1838 __split_huge_page_refcount(page, list); 1839 __split_huge_page_refcount(page, list);
1839 1840
@@ -1844,10 +1845,11 @@ static void __split_huge_page(struct page *page,
1844 BUG_ON(is_vma_temporary_stack(vma)); 1845 BUG_ON(is_vma_temporary_stack(vma));
1845 mapcount2 += __split_huge_page_map(page, vma, addr); 1846 mapcount2 += __split_huge_page_map(page, vma, addr);
1846 } 1847 }
1847 if (mapcount != mapcount2) 1848 if (mapcount != mapcount2) {
1848 printk(KERN_ERR "mapcount %d mapcount2 %d page_mapcount %d\n", 1849 pr_err("mapcount %d mapcount2 %d page_mapcount %d\n",
1849 mapcount, mapcount2, page_mapcount(page)); 1850 mapcount, mapcount2, page_mapcount(page));
1850 BUG_ON(mapcount != mapcount2); 1851 BUG();
1852 }
1851} 1853}
1852 1854
1853/* 1855/*
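
The huge_memory.c hunks above are a printk cleanup plus a small hardening tweak: messages move to pr_err()/pr_warn() under a file-wide pr_fmt() prefix, and the mapcount sanity checks now print their diagnostics and BUG() inside a single branch instead of testing the condition twice (once for the printk, once for BUG_ON). A standalone sketch of how the pr_fmt() convention behaves; the function is illustrative only:

    /* pr_fmt() must be defined before the includes, because
     * linux/printk.h supplies a default definition otherwise. */
    #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

    #include <linux/kernel.h>
    #include <linux/printk.h>

    static void example_report(int mapcount, int page_mapcount)
    {
            /* prints something like "huge_memory: mapcount 2 page_mapcount 3" */
            pr_err("mapcount %d page_mapcount %d\n", mapcount, page_mapcount);
    }
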
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c82290b9c1fc..226910cb7c9b 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -544,7 +544,7 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
544/* Movability of hugepages depends on migration support. */ 544/* Movability of hugepages depends on migration support. */
545static inline gfp_t htlb_alloc_mask(struct hstate *h) 545static inline gfp_t htlb_alloc_mask(struct hstate *h)
546{ 546{
547 if (hugepages_treat_as_movable || hugepage_migration_support(h)) 547 if (hugepages_treat_as_movable || hugepage_migration_supported(h))
548 return GFP_HIGHUSER_MOVABLE; 548 return GFP_HIGHUSER_MOVABLE;
549 else 549 else
550 return GFP_HIGHUSER; 550 return GFP_HIGHUSER;
@@ -607,25 +607,242 @@ err:
607 return NULL; 607 return NULL;
608} 608}
609 609
610/*
611 * common helper functions for hstate_next_node_to_{alloc|free}.
612 * We may have allocated or freed a huge page based on a different
613 * nodes_allowed previously, so h->next_node_to_{alloc|free} might
614 * be outside of *nodes_allowed. Ensure that we use an allowed
615 * node for alloc or free.
616 */
617static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
618{
619 nid = next_node(nid, *nodes_allowed);
620 if (nid == MAX_NUMNODES)
621 nid = first_node(*nodes_allowed);
622 VM_BUG_ON(nid >= MAX_NUMNODES);
623
624 return nid;
625}
626
627static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
628{
629 if (!node_isset(nid, *nodes_allowed))
630 nid = next_node_allowed(nid, nodes_allowed);
631 return nid;
632}
633
634/*
635 * returns the previously saved node ["this node"] from which to
636 * allocate a persistent huge page for the pool and advance the
637 * next node from which to allocate, handling wrap at end of node
638 * mask.
639 */
640static int hstate_next_node_to_alloc(struct hstate *h,
641 nodemask_t *nodes_allowed)
642{
643 int nid;
644
645 VM_BUG_ON(!nodes_allowed);
646
647 nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
648 h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
649
650 return nid;
651}
652
653/*
654 * helper for free_pool_huge_page() - return the previously saved
655 * node ["this node"] from which to free a huge page. Advance the
656 * next node id whether or not we find a free huge page to free so
657 * that the next attempt to free addresses the next node.
658 */
659static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
660{
661 int nid;
662
663 VM_BUG_ON(!nodes_allowed);
664
665 nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
666 h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
667
668 return nid;
669}
670
671#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \
672 for (nr_nodes = nodes_weight(*mask); \
673 nr_nodes > 0 && \
674 ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \
675 nr_nodes--)
676
677#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \
678 for (nr_nodes = nodes_weight(*mask); \
679 nr_nodes > 0 && \
680 ((node = hstate_next_node_to_free(hs, mask)) || 1); \
681 nr_nodes--)
682
683#if defined(CONFIG_CMA) && defined(CONFIG_X86_64)
684static void destroy_compound_gigantic_page(struct page *page,
685 unsigned long order)
686{
687 int i;
688 int nr_pages = 1 << order;
689 struct page *p = page + 1;
690
691 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
692 __ClearPageTail(p);
693 set_page_refcounted(p);
694 p->first_page = NULL;
695 }
696
697 set_compound_order(page, 0);
698 __ClearPageHead(page);
699}
700
701static void free_gigantic_page(struct page *page, unsigned order)
702{
703 free_contig_range(page_to_pfn(page), 1 << order);
704}
705
706static int __alloc_gigantic_page(unsigned long start_pfn,
707 unsigned long nr_pages)
708{
709 unsigned long end_pfn = start_pfn + nr_pages;
710 return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
711}
712
713static bool pfn_range_valid_gigantic(unsigned long start_pfn,
714 unsigned long nr_pages)
715{
716 unsigned long i, end_pfn = start_pfn + nr_pages;
717 struct page *page;
718
719 for (i = start_pfn; i < end_pfn; i++) {
720 if (!pfn_valid(i))
721 return false;
722
723 page = pfn_to_page(i);
724
725 if (PageReserved(page))
726 return false;
727
728 if (page_count(page) > 0)
729 return false;
730
731 if (PageHuge(page))
732 return false;
733 }
734
735 return true;
736}
737
738static bool zone_spans_last_pfn(const struct zone *zone,
739 unsigned long start_pfn, unsigned long nr_pages)
740{
741 unsigned long last_pfn = start_pfn + nr_pages - 1;
742 return zone_spans_pfn(zone, last_pfn);
743}
744
745static struct page *alloc_gigantic_page(int nid, unsigned order)
746{
747 unsigned long nr_pages = 1 << order;
748 unsigned long ret, pfn, flags;
749 struct zone *z;
750
751 z = NODE_DATA(nid)->node_zones;
752 for (; z - NODE_DATA(nid)->node_zones < MAX_NR_ZONES; z++) {
753 spin_lock_irqsave(&z->lock, flags);
754
755 pfn = ALIGN(z->zone_start_pfn, nr_pages);
756 while (zone_spans_last_pfn(z, pfn, nr_pages)) {
757 if (pfn_range_valid_gigantic(pfn, nr_pages)) {
758 /*
759 * We release the zone lock here because
760 * alloc_contig_range() will also lock the zone
761 * at some point. If there's an allocation
762 * spinning on this lock, it may win the race
763 * and cause alloc_contig_range() to fail...
764 */
765 spin_unlock_irqrestore(&z->lock, flags);
766 ret = __alloc_gigantic_page(pfn, nr_pages);
767 if (!ret)
768 return pfn_to_page(pfn);
769 spin_lock_irqsave(&z->lock, flags);
770 }
771 pfn += nr_pages;
772 }
773
774 spin_unlock_irqrestore(&z->lock, flags);
775 }
776
777 return NULL;
778}
779
780static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
781static void prep_compound_gigantic_page(struct page *page, unsigned long order);
782
783static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid)
784{
785 struct page *page;
786
787 page = alloc_gigantic_page(nid, huge_page_order(h));
788 if (page) {
789 prep_compound_gigantic_page(page, huge_page_order(h));
790 prep_new_huge_page(h, page, nid);
791 }
792
793 return page;
794}
795
796static int alloc_fresh_gigantic_page(struct hstate *h,
797 nodemask_t *nodes_allowed)
798{
799 struct page *page = NULL;
800 int nr_nodes, node;
801
802 for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
803 page = alloc_fresh_gigantic_page_node(h, node);
804 if (page)
805 return 1;
806 }
807
808 return 0;
809}
810
811static inline bool gigantic_page_supported(void) { return true; }
812#else
813static inline bool gigantic_page_supported(void) { return false; }
814static inline void free_gigantic_page(struct page *page, unsigned order) { }
815static inline void destroy_compound_gigantic_page(struct page *page,
816 unsigned long order) { }
817static inline int alloc_fresh_gigantic_page(struct hstate *h,
818 nodemask_t *nodes_allowed) { return 0; }
819#endif
820
610static void update_and_free_page(struct hstate *h, struct page *page) 821static void update_and_free_page(struct hstate *h, struct page *page)
611{ 822{
612 int i; 823 int i;
613 824
614 VM_BUG_ON(h->order >= MAX_ORDER); 825 if (hstate_is_gigantic(h) && !gigantic_page_supported())
826 return;
615 827
616 h->nr_huge_pages--; 828 h->nr_huge_pages--;
617 h->nr_huge_pages_node[page_to_nid(page)]--; 829 h->nr_huge_pages_node[page_to_nid(page)]--;
618 for (i = 0; i < pages_per_huge_page(h); i++) { 830 for (i = 0; i < pages_per_huge_page(h); i++) {
619 page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 831 page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
620 1 << PG_referenced | 1 << PG_dirty | 832 1 << PG_referenced | 1 << PG_dirty |
621 1 << PG_active | 1 << PG_reserved | 833 1 << PG_active | 1 << PG_private |
622 1 << PG_private | 1 << PG_writeback); 834 1 << PG_writeback);
623 } 835 }
624 VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page); 836 VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
625 set_compound_page_dtor(page, NULL); 837 set_compound_page_dtor(page, NULL);
626 set_page_refcounted(page); 838 set_page_refcounted(page);
627 arch_release_hugepage(page); 839 if (hstate_is_gigantic(h)) {
628 __free_pages(page, huge_page_order(h)); 840 destroy_compound_gigantic_page(page, huge_page_order(h));
841 free_gigantic_page(page, huge_page_order(h));
842 } else {
843 arch_release_hugepage(page);
844 __free_pages(page, huge_page_order(h));
845 }
629} 846}
630 847
631struct hstate *size_to_hstate(unsigned long size) 848struct hstate *size_to_hstate(unsigned long size)
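
The hugetlb hunk above moves the node round-robin helpers (hstate_next_node_to_alloc()/hstate_next_node_to_free() and the for_each_node_mask_to_* macros) earlier in the file and, under CONFIG_CMA on x86_64, adds a runtime allocator for gigantic pages built on alloc_contig_range(), which is why update_and_free_page() and free_huge_page() no longer refuse orders at or above MAX_ORDER. The open-coded "h->order >= MAX_ORDER" tests become hstate_is_gigantic(); that helper is added to include/linux/hugetlb.h elsewhere in this series, presumably along these lines (sketch, not taken from this diff):

    /* A gigantic hstate is one whose order the buddy allocator cannot
     * satisfy, i.e. huge_page_order(h) >= MAX_ORDER. */
    static inline bool hstate_is_gigantic(struct hstate *h)
    {
            return huge_page_order(h) >= MAX_ORDER;
    }
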
@@ -664,7 +881,7 @@ static void free_huge_page(struct page *page)
664 if (restore_reserve) 881 if (restore_reserve)
665 h->resv_huge_pages++; 882 h->resv_huge_pages++;
666 883
667 if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) { 884 if (h->surplus_huge_pages_node[nid]) {
668 /* remove the page from active list */ 885 /* remove the page from active list */
669 list_del(&page->lru); 886 list_del(&page->lru);
670 update_and_free_page(h, page); 887 update_and_free_page(h, page);
@@ -690,8 +907,7 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
690 put_page(page); /* free it into the hugepage allocator */ 907 put_page(page); /* free it into the hugepage allocator */
691} 908}
692 909
693static void __init prep_compound_gigantic_page(struct page *page, 910static void prep_compound_gigantic_page(struct page *page, unsigned long order)
694 unsigned long order)
695{ 911{
696 int i; 912 int i;
697 int nr_pages = 1 << order; 913 int nr_pages = 1 << order;
@@ -769,9 +985,6 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
769{ 985{
770 struct page *page; 986 struct page *page;
771 987
772 if (h->order >= MAX_ORDER)
773 return NULL;
774
775 page = alloc_pages_exact_node(nid, 988 page = alloc_pages_exact_node(nid,
776 htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| 989 htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
777 __GFP_REPEAT|__GFP_NOWARN, 990 __GFP_REPEAT|__GFP_NOWARN,
@@ -787,79 +1000,6 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
787 return page; 1000 return page;
788} 1001}
789 1002
790/*
791 * common helper functions for hstate_next_node_to_{alloc|free}.
792 * We may have allocated or freed a huge page based on a different
793 * nodes_allowed previously, so h->next_node_to_{alloc|free} might
794 * be outside of *nodes_allowed. Ensure that we use an allowed
795 * node for alloc or free.
796 */
797static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
798{
799 nid = next_node(nid, *nodes_allowed);
800 if (nid == MAX_NUMNODES)
801 nid = first_node(*nodes_allowed);
802 VM_BUG_ON(nid >= MAX_NUMNODES);
803
804 return nid;
805}
806
807static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
808{
809 if (!node_isset(nid, *nodes_allowed))
810 nid = next_node_allowed(nid, nodes_allowed);
811 return nid;
812}
813
814/*
815 * returns the previously saved node ["this node"] from which to
816 * allocate a persistent huge page for the pool and advance the
817 * next node from which to allocate, handling wrap at end of node
818 * mask.
819 */
820static int hstate_next_node_to_alloc(struct hstate *h,
821 nodemask_t *nodes_allowed)
822{
823 int nid;
824
825 VM_BUG_ON(!nodes_allowed);
826
827 nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
828 h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
829
830 return nid;
831}
832
833/*
834 * helper for free_pool_huge_page() - return the previously saved
835 * node ["this node"] from which to free a huge page. Advance the
836 * next node id whether or not we find a free huge page to free so
837 * that the next attempt to free addresses the next node.
838 */
839static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
840{
841 int nid;
842
843 VM_BUG_ON(!nodes_allowed);
844
845 nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
846 h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
847
848 return nid;
849}
850
851#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \
852 for (nr_nodes = nodes_weight(*mask); \
853 nr_nodes > 0 && \
854 ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \
855 nr_nodes--)
856
857#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \
858 for (nr_nodes = nodes_weight(*mask); \
859 nr_nodes > 0 && \
860 ((node = hstate_next_node_to_free(hs, mask)) || 1); \
861 nr_nodes--)
862
863static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed) 1003static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
864{ 1004{
865 struct page *page; 1005 struct page *page;
@@ -963,7 +1103,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
963 struct page *page; 1103 struct page *page;
964 unsigned int r_nid; 1104 unsigned int r_nid;
965 1105
966 if (h->order >= MAX_ORDER) 1106 if (hstate_is_gigantic(h))
967 return NULL; 1107 return NULL;
968 1108
969 /* 1109 /*
@@ -1156,7 +1296,7 @@ static void return_unused_surplus_pages(struct hstate *h,
1156 h->resv_huge_pages -= unused_resv_pages; 1296 h->resv_huge_pages -= unused_resv_pages;
1157 1297
1158 /* Cannot return gigantic pages currently */ 1298 /* Cannot return gigantic pages currently */
1159 if (h->order >= MAX_ORDER) 1299 if (hstate_is_gigantic(h))
1160 return; 1300 return;
1161 1301
1162 nr_pages = min(unused_resv_pages, h->surplus_huge_pages); 1302 nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
@@ -1246,24 +1386,17 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1246 return ERR_PTR(-ENOSPC); 1386 return ERR_PTR(-ENOSPC);
1247 1387
1248 ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); 1388 ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
1249 if (ret) { 1389 if (ret)
1250 if (chg || avoid_reserve) 1390 goto out_subpool_put;
1251 hugepage_subpool_put_pages(spool, 1); 1391
1252 return ERR_PTR(-ENOSPC);
1253 }
1254 spin_lock(&hugetlb_lock); 1392 spin_lock(&hugetlb_lock);
1255 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg); 1393 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg);
1256 if (!page) { 1394 if (!page) {
1257 spin_unlock(&hugetlb_lock); 1395 spin_unlock(&hugetlb_lock);
1258 page = alloc_buddy_huge_page(h, NUMA_NO_NODE); 1396 page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
1259 if (!page) { 1397 if (!page)
1260 hugetlb_cgroup_uncharge_cgroup(idx, 1398 goto out_uncharge_cgroup;
1261 pages_per_huge_page(h), 1399
1262 h_cg);
1263 if (chg || avoid_reserve)
1264 hugepage_subpool_put_pages(spool, 1);
1265 return ERR_PTR(-ENOSPC);
1266 }
1267 spin_lock(&hugetlb_lock); 1400 spin_lock(&hugetlb_lock);
1268 list_move(&page->lru, &h->hugepage_activelist); 1401 list_move(&page->lru, &h->hugepage_activelist);
1269 /* Fall through */ 1402 /* Fall through */
@@ -1275,6 +1408,13 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1275 1408
1276 vma_commit_reservation(h, vma, addr); 1409 vma_commit_reservation(h, vma, addr);
1277 return page; 1410 return page;
1411
1412out_uncharge_cgroup:
1413 hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
1414out_subpool_put:
1415 if (chg || avoid_reserve)
1416 hugepage_subpool_put_pages(spool, 1);
1417 return ERR_PTR(-ENOSPC);
1278} 1418}
1279 1419
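The hunk above replaces the duplicated inline cleanup with the usual kernel goto-unwind layout: one success path, plus error labels that undo earlier steps in reverse order. A minimal standalone sketch of that shape (all names are made up):

static int demo_setup(void)
{
        int err;

        err = demo_grab_a();            /* step 1 */
        if (err)
                return err;             /* nothing to undo yet */

        err = demo_grab_b();            /* step 2 */
        if (err)
                goto out_put_a;

        err = demo_grab_c();            /* step 3 */
        if (err)
                goto out_put_b;

        return 0;

out_put_b:
        demo_put_b();
out_put_a:
        demo_put_a();
        return err;
}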
1280/* 1420/*
@@ -1356,7 +1496,7 @@ static void __init gather_bootmem_prealloc(void)
1356 * fix confusing memory reports from free(1) and another 1496 * fix confusing memory reports from free(1) and another
1357 * side-effects, like CommitLimit going negative. 1497 * side-effects, like CommitLimit going negative.
1358 */ 1498 */
1359 if (h->order > (MAX_ORDER - 1)) 1499 if (hstate_is_gigantic(h))
1360 adjust_managed_page_count(page, 1 << h->order); 1500 adjust_managed_page_count(page, 1 << h->order);
1361 } 1501 }
1362} 1502}
@@ -1366,7 +1506,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
1366 unsigned long i; 1506 unsigned long i;
1367 1507
1368 for (i = 0; i < h->max_huge_pages; ++i) { 1508 for (i = 0; i < h->max_huge_pages; ++i) {
1369 if (h->order >= MAX_ORDER) { 1509 if (hstate_is_gigantic(h)) {
1370 if (!alloc_bootmem_huge_page(h)) 1510 if (!alloc_bootmem_huge_page(h))
1371 break; 1511 break;
1372 } else if (!alloc_fresh_huge_page(h, 1512 } else if (!alloc_fresh_huge_page(h,
@@ -1382,7 +1522,7 @@ static void __init hugetlb_init_hstates(void)
1382 1522
1383 for_each_hstate(h) { 1523 for_each_hstate(h) {
1384 /* oversize hugepages were init'ed in early boot */ 1524 /* oversize hugepages were init'ed in early boot */
1385 if (h->order < MAX_ORDER) 1525 if (!hstate_is_gigantic(h))
1386 hugetlb_hstate_alloc_pages(h); 1526 hugetlb_hstate_alloc_pages(h);
1387 } 1527 }
1388} 1528}
@@ -1416,7 +1556,7 @@ static void try_to_free_low(struct hstate *h, unsigned long count,
1416{ 1556{
1417 int i; 1557 int i;
1418 1558
1419 if (h->order >= MAX_ORDER) 1559 if (hstate_is_gigantic(h))
1420 return; 1560 return;
1421 1561
1422 for_each_node_mask(i, *nodes_allowed) { 1562 for_each_node_mask(i, *nodes_allowed) {
@@ -1479,7 +1619,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
1479{ 1619{
1480 unsigned long min_count, ret; 1620 unsigned long min_count, ret;
1481 1621
1482 if (h->order >= MAX_ORDER) 1622 if (hstate_is_gigantic(h) && !gigantic_page_supported())
1483 return h->max_huge_pages; 1623 return h->max_huge_pages;
1484 1624
1485 /* 1625 /*
@@ -1506,7 +1646,10 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
1506 * and reducing the surplus. 1646 * and reducing the surplus.
1507 */ 1647 */
1508 spin_unlock(&hugetlb_lock); 1648 spin_unlock(&hugetlb_lock);
1509 ret = alloc_fresh_huge_page(h, nodes_allowed); 1649 if (hstate_is_gigantic(h))
1650 ret = alloc_fresh_gigantic_page(h, nodes_allowed);
1651 else
1652 ret = alloc_fresh_huge_page(h, nodes_allowed);
1510 spin_lock(&hugetlb_lock); 1653 spin_lock(&hugetlb_lock);
1511 if (!ret) 1654 if (!ret)
1512 goto out; 1655 goto out;
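set_max_huge_pages() now splits into a gigantic and a regular allocation path and additionally checks gigantic_page_supported(), a predicate added earlier in this series and not visible here. Its shape is presumably a compile-time switch on whether runtime allocation of gigantic pages (via alloc_contig_range()) is possible; the condition below is a placeholder, not the real one:

#ifdef CONFIG_DEMO_RUNTIME_GIGANTIC_PAGES       /* placeholder condition */
static inline bool gigantic_page_supported(void) { return true; }
#else
static inline bool gigantic_page_supported(void) { return false; }
#endif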
@@ -1606,7 +1749,7 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
1606 goto out; 1749 goto out;
1607 1750
1608 h = kobj_to_hstate(kobj, &nid); 1751 h = kobj_to_hstate(kobj, &nid);
1609 if (h->order >= MAX_ORDER) { 1752 if (hstate_is_gigantic(h) && !gigantic_page_supported()) {
1610 err = -EINVAL; 1753 err = -EINVAL;
1611 goto out; 1754 goto out;
1612 } 1755 }
@@ -1689,7 +1832,7 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
1689 unsigned long input; 1832 unsigned long input;
1690 struct hstate *h = kobj_to_hstate(kobj, NULL); 1833 struct hstate *h = kobj_to_hstate(kobj, NULL);
1691 1834
1692 if (h->order >= MAX_ORDER) 1835 if (hstate_is_gigantic(h))
1693 return -EINVAL; 1836 return -EINVAL;
1694 1837
1695 err = kstrtoul(buf, 10, &input); 1838 err = kstrtoul(buf, 10, &input);
@@ -2113,7 +2256,7 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
2113 2256
2114 tmp = h->max_huge_pages; 2257 tmp = h->max_huge_pages;
2115 2258
2116 if (write && h->order >= MAX_ORDER) 2259 if (write && hstate_is_gigantic(h) && !gigantic_page_supported())
2117 return -EINVAL; 2260 return -EINVAL;
2118 2261
2119 table->data = &tmp; 2262 table->data = &tmp;
@@ -2169,7 +2312,7 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
2169 2312
2170 tmp = h->nr_overcommit_huge_pages; 2313 tmp = h->nr_overcommit_huge_pages;
2171 2314
2172 if (write && h->order >= MAX_ORDER) 2315 if (write && hstate_is_gigantic(h))
2173 return -EINVAL; 2316 return -EINVAL;
2174 2317
2175 table->data = &tmp; 2318 table->data = &tmp;
diff --git a/mm/internal.h b/mm/internal.h
index 07b67361a40a..7f22a11fcc66 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -134,7 +134,7 @@ struct compact_control {
134 unsigned long nr_migratepages; /* Number of pages to migrate */ 134 unsigned long nr_migratepages; /* Number of pages to migrate */
135 unsigned long free_pfn; /* isolate_freepages search base */ 135 unsigned long free_pfn; /* isolate_freepages search base */
136 unsigned long migrate_pfn; /* isolate_migratepages search base */ 136 unsigned long migrate_pfn; /* isolate_migratepages search base */
137 bool sync; /* Synchronous migration */ 137 enum migrate_mode mode; /* Async or sync migration mode */
138 bool ignore_skip_hint; /* Scan blocks even if marked skip */ 138 bool ignore_skip_hint; /* Scan blocks even if marked skip */
139 bool finished_update_free; /* True when the zone cached pfns are 139 bool finished_update_free; /* True when the zone cached pfns are
140 * no longer being updated 140 * no longer being updated
@@ -144,7 +144,10 @@ struct compact_control {
144 int order; /* order a direct compactor needs */ 144 int order; /* order a direct compactor needs */
145 int migratetype; /* MOVABLE, RECLAIMABLE etc */ 145 int migratetype; /* MOVABLE, RECLAIMABLE etc */
146 struct zone *zone; 146 struct zone *zone;
147 bool contended; /* True if a lock was contended */ 147 bool contended; /* True if a lock was contended, or
148 * need_resched() true during async
149 * compaction
150 */
148}; 151};
149 152
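For reference, the enum that replaces the old `bool sync` field (paraphrased from include/linux/migrate_mode.h; the summary comments are mine, not the header's):

enum migrate_mode {
        MIGRATE_ASYNC,          /* never block */
        MIGRATE_SYNC_LIGHT,     /* may block, but not e.g. on page writeback */
        MIGRATE_SYNC,           /* may block and wait for migration to finish */
};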
150unsigned long 153unsigned long
@@ -169,6 +172,11 @@ static inline unsigned long page_order(struct page *page)
169 return page_private(page); 172 return page_private(page);
170} 173}
171 174
175static inline bool is_cow_mapping(vm_flags_t flags)
176{
177 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
178}
179
172/* mm/util.c */ 180/* mm/util.c */
173void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, 181void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
174 struct vm_area_struct *prev, struct rb_node *rb_parent); 182 struct vm_area_struct *prev, struct rb_node *rb_parent);
@@ -184,26 +192,6 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
184} 192}
185 193
186/* 194/*
187 * Called only in fault path, to determine if a new page is being
188 * mapped into a LOCKED vma. If it is, mark page as mlocked.
189 */
190static inline int mlocked_vma_newpage(struct vm_area_struct *vma,
191 struct page *page)
192{
193 VM_BUG_ON_PAGE(PageLRU(page), page);
194
195 if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED))
196 return 0;
197
198 if (!TestSetPageMlocked(page)) {
199 mod_zone_page_state(page_zone(page), NR_MLOCK,
200 hpage_nr_pages(page));
201 count_vm_event(UNEVICTABLE_PGMLOCKED);
202 }
203 return 1;
204}
205
206/*
207 * must be called with vma's mmap_sem held for read or write, and page locked. 195 * must be called with vma's mmap_sem held for read or write, and page locked.
208 */ 196 */
209extern void mlock_vma_page(struct page *page); 197extern void mlock_vma_page(struct page *page);
@@ -245,10 +233,6 @@ extern unsigned long vma_address(struct page *page,
245 struct vm_area_struct *vma); 233 struct vm_area_struct *vma);
246#endif 234#endif
247#else /* !CONFIG_MMU */ 235#else /* !CONFIG_MMU */
248static inline int mlocked_vma_newpage(struct vm_area_struct *v, struct page *p)
249{
250 return 0;
251}
252static inline void clear_page_mlock(struct page *page) { } 236static inline void clear_page_mlock(struct page *page) { }
253static inline void mlock_vma_page(struct page *page) { } 237static inline void mlock_vma_page(struct page *page) { }
254static inline void mlock_migrate_page(struct page *new, struct page *old) { } 238static inline void mlock_migrate_page(struct page *new, struct page *old) { }
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 8d2fcdfeff7f..736ade31d1dc 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1300,7 +1300,7 @@ static void kmemleak_scan(void)
1300 /* 1300 /*
1301 * Struct page scanning for each node. 1301 * Struct page scanning for each node.
1302 */ 1302 */
1303 lock_memory_hotplug(); 1303 get_online_mems();
1304 for_each_online_node(i) { 1304 for_each_online_node(i) {
1305 unsigned long start_pfn = node_start_pfn(i); 1305 unsigned long start_pfn = node_start_pfn(i);
1306 unsigned long end_pfn = node_end_pfn(i); 1306 unsigned long end_pfn = node_end_pfn(i);
@@ -1318,7 +1318,7 @@ static void kmemleak_scan(void)
1318 scan_block(page, page + 1, NULL, 1); 1318 scan_block(page, page + 1, NULL, 1);
1319 } 1319 }
1320 } 1320 }
1321 unlock_memory_hotplug(); 1321 put_online_mems();
1322 1322
1323 /* 1323 /*
1324 * Scanning the task stacks (may introduce false negatives). 1324 * Scanning the task stacks (may introduce false negatives).
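lock_memory_hotplug()/unlock_memory_hotplug() give way here to the get_online_mems()/put_online_mems() pair introduced elsewhere in this patch set, mirroring get_online_cpus()/put_online_cpus(). The assumed calling pattern, sketched:

/* pin memory hotplug state for the duration of a struct page walk */
static void demo_walk_struct_pages(void)
{
        int nid;

        get_online_mems();
        for_each_online_node(nid) {
                /* ... safely iterate this node's pfn range ... */
        }
        put_online_mems();
}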
diff --git a/mm/memblock.c b/mm/memblock.c
index a810ba923cdd..0aa0d2b07624 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1033,22 +1033,35 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
1033} 1033}
1034#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 1034#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
1035 1035
1036static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, 1036static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
1037 phys_addr_t align, phys_addr_t max_addr, 1037 phys_addr_t align, phys_addr_t start,
1038 int nid) 1038 phys_addr_t end, int nid)
1039{ 1039{
1040 phys_addr_t found; 1040 phys_addr_t found;
1041 1041
1042 if (!align) 1042 if (!align)
1043 align = SMP_CACHE_BYTES; 1043 align = SMP_CACHE_BYTES;
1044 1044
1045 found = memblock_find_in_range_node(size, align, 0, max_addr, nid); 1045 found = memblock_find_in_range_node(size, align, start, end, nid);
1046 if (found && !memblock_reserve(found, size)) 1046 if (found && !memblock_reserve(found, size))
1047 return found; 1047 return found;
1048 1048
1049 return 0; 1049 return 0;
1050} 1050}
1051 1051
1052phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align,
1053 phys_addr_t start, phys_addr_t end)
1054{
1055 return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE);
1056}
1057
1058static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size,
1059 phys_addr_t align, phys_addr_t max_addr,
1060 int nid)
1061{
1062 return memblock_alloc_range_nid(size, align, 0, max_addr, nid);
1063}
1064
1052phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid) 1065phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid)
1053{ 1066{
1054 return memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE, nid); 1067 return memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
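The newly exported memblock_alloc_range() lets early-boot code constrain both ends of the physical window instead of only capping the upper bound. A hypothetical caller (not taken from this patch set):

static void __init demo_reserve_low_buffer(void)
{
        phys_addr_t base;

        /* 16MB, 2MB-aligned, anywhere in the first 1GB of RAM */
        base = memblock_alloc_range(SZ_16M, SZ_2M, 0, SZ_1G);
        if (!base)
                pr_warn("demo: no 16MB block available below 1GB\n");
}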
@@ -1389,9 +1402,8 @@ int __init_memblock memblock_search_pfn_nid(unsigned long pfn,
1389 if (mid == -1) 1402 if (mid == -1)
1390 return -1; 1403 return -1;
1391 1404
1392 *start_pfn = type->regions[mid].base >> PAGE_SHIFT; 1405 *start_pfn = PFN_DOWN(type->regions[mid].base);
1393 *end_pfn = (type->regions[mid].base + type->regions[mid].size) 1406 *end_pfn = PFN_DOWN(type->regions[mid].base + type->regions[mid].size);
1394 >> PAGE_SHIFT;
1395 1407
1396 return type->regions[mid].nid; 1408 return type->regions[mid].nid;
1397} 1409}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5177c6d4a2dd..a500cb0594c4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -80,7 +80,7 @@ int do_swap_account __read_mostly;
80#ifdef CONFIG_MEMCG_SWAP_ENABLED 80#ifdef CONFIG_MEMCG_SWAP_ENABLED
81static int really_do_swap_account __initdata = 1; 81static int really_do_swap_account __initdata = 1;
82#else 82#else
83static int really_do_swap_account __initdata = 0; 83static int really_do_swap_account __initdata;
84#endif 84#endif
85 85
86#else 86#else
@@ -357,10 +357,9 @@ struct mem_cgroup {
357 struct cg_proto tcp_mem; 357 struct cg_proto tcp_mem;
358#endif 358#endif
359#if defined(CONFIG_MEMCG_KMEM) 359#if defined(CONFIG_MEMCG_KMEM)
360 /* analogous to slab_common's slab_caches list. per-memcg */ 360 /* analogous to slab_common's slab_caches list, but per-memcg;
361 * protected by memcg_slab_mutex */
361 struct list_head memcg_slab_caches; 362 struct list_head memcg_slab_caches;
362 /* Not a spinlock, we can take a lot of time walking the list */
363 struct mutex slab_caches_mutex;
364 /* Index in the kmem_cache->memcg_params->memcg_caches array */ 363 /* Index in the kmem_cache->memcg_params->memcg_caches array */
365 int kmemcg_id; 364 int kmemcg_id;
366#endif 365#endif
@@ -1595,23 +1594,12 @@ static void mem_cgroup_end_move(struct mem_cgroup *memcg)
1595} 1594}
1596 1595
1597/* 1596/*
1598 * 2 routines for checking "mem" is under move_account() or not. 1597 * A routine for checking "mem" is under move_account() or not.
1599 * 1598 *
1600 * mem_cgroup_stolen() - checking whether a cgroup is mc.from or not. This 1599 * Checking a cgroup is mc.from or mc.to or under hierarchy of
1601 * is used for avoiding races in accounting. If true, 1600 * moving cgroups. This is for waiting at high-memory pressure
1602 * pc->mem_cgroup may be overwritten. 1601 * caused by "move".
1603 *
1604 * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or
1605 * under hierarchy of moving cgroups. This is for
1606 * waiting at hith-memory prressure caused by "move".
1607 */ 1602 */
1608
1609static bool mem_cgroup_stolen(struct mem_cgroup *memcg)
1610{
1611 VM_BUG_ON(!rcu_read_lock_held());
1612 return atomic_read(&memcg->moving_account) > 0;
1613}
1614
1615static bool mem_cgroup_under_move(struct mem_cgroup *memcg) 1603static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
1616{ 1604{
1617 struct mem_cgroup *from; 1605 struct mem_cgroup *from;
@@ -1654,7 +1642,6 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1654 * Take this lock when 1642 * Take this lock when
1655 * - a code tries to modify page's memcg while it's USED. 1643 * - a code tries to modify page's memcg while it's USED.
1656 * - a code tries to modify page state accounting in a memcg. 1644 * - a code tries to modify page state accounting in a memcg.
1657 * see mem_cgroup_stolen(), too.
1658 */ 1645 */
1659static void move_lock_mem_cgroup(struct mem_cgroup *memcg, 1646static void move_lock_mem_cgroup(struct mem_cgroup *memcg,
1660 unsigned long *flags) 1647 unsigned long *flags)
@@ -2289,12 +2276,11 @@ cleanup:
2289} 2276}
2290 2277
2291/* 2278/*
2292 * Currently used to update mapped file statistics, but the routine can be 2279 * Used to update mapped file or writeback or other statistics.
2293 * generalized to update other statistics as well.
2294 * 2280 *
2295 * Notes: Race condition 2281 * Notes: Race condition
2296 * 2282 *
2297 * We usually use page_cgroup_lock() for accessing page_cgroup member but 2283 * We usually use lock_page_cgroup() for accessing page_cgroup member but
2298 * it tends to be costly. But considering some conditions, we doesn't need 2284 * it tends to be costly. But considering some conditions, we doesn't need
2299 * to do so _always_. 2285 * to do so _always_.
2300 * 2286 *
@@ -2308,8 +2294,8 @@ cleanup:
2308 * by flags. 2294 * by flags.
2309 * 2295 *
2310 * Considering "move", this is an only case we see a race. To make the race 2296 * Considering "move", this is an only case we see a race. To make the race
2311 * small, we check mm->moving_account and detect there are possibility of race 2297 * small, we check memcg->moving_account and detect there are possibility
2312 * If there is, we take a lock. 2298 * of race or not. If there is, we take a lock.
2313 */ 2299 */
2314 2300
2315void __mem_cgroup_begin_update_page_stat(struct page *page, 2301void __mem_cgroup_begin_update_page_stat(struct page *page,
@@ -2327,9 +2313,10 @@ again:
2327 * If this memory cgroup is not under account moving, we don't 2313 * If this memory cgroup is not under account moving, we don't
2328 * need to take move_lock_mem_cgroup(). Because we already hold 2314 * need to take move_lock_mem_cgroup(). Because we already hold
2329 * rcu_read_lock(), any calls to move_account will be delayed until 2315 * rcu_read_lock(), any calls to move_account will be delayed until
2330 * rcu_read_unlock() if mem_cgroup_stolen() == true. 2316 * rcu_read_unlock().
2331 */ 2317 */
2332 if (!mem_cgroup_stolen(memcg)) 2318 VM_BUG_ON(!rcu_read_lock_held());
2319 if (atomic_read(&memcg->moving_account) <= 0)
2333 return; 2320 return;
2334 2321
2335 move_lock_mem_cgroup(memcg, flags); 2322 move_lock_mem_cgroup(memcg, flags);
@@ -2437,7 +2424,7 @@ static void drain_stock(struct memcg_stock_pcp *stock)
2437 */ 2424 */
2438static void drain_local_stock(struct work_struct *dummy) 2425static void drain_local_stock(struct work_struct *dummy)
2439{ 2426{
2440 struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock); 2427 struct memcg_stock_pcp *stock = this_cpu_ptr(&memcg_stock);
2441 drain_stock(stock); 2428 drain_stock(stock);
2442 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); 2429 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
2443} 2430}
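The drain_local_stock() change is part of the tree-wide move from __get_cpu_var() to this_cpu_ptr(); the same conversion appears again in mm/memory-failure.c further down. The general shape, for reference (example_stock is a made-up DEFINE_PER_CPU variable; access is assumed to be serialized, as it is here by the per-cpu work item):

static DEFINE_PER_CPU(struct memcg_stock_pcp, example_stock);

static void demo_old_vs_new(void)
{
        struct memcg_stock_pcp *p_old = &__get_cpu_var(example_stock);  /* deprecated */
        struct memcg_stock_pcp *p_new = this_cpu_ptr(&example_stock);   /* replacement */

        (void)p_old;
        (void)p_new;
}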
@@ -2684,7 +2671,8 @@ static int mem_cgroup_try_charge(struct mem_cgroup *memcg,
2684 * free their memory. 2671 * free their memory.
2685 */ 2672 */
2686 if (unlikely(test_thread_flag(TIF_MEMDIE) || 2673 if (unlikely(test_thread_flag(TIF_MEMDIE) ||
2687 fatal_signal_pending(current))) 2674 fatal_signal_pending(current) ||
2675 current->flags & PF_EXITING))
2688 goto bypass; 2676 goto bypass;
2689 2677
2690 if (unlikely(task_in_memcg_oom(current))) 2678 if (unlikely(task_in_memcg_oom(current)))
@@ -2912,6 +2900,12 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2912static DEFINE_MUTEX(set_limit_mutex); 2900static DEFINE_MUTEX(set_limit_mutex);
2913 2901
2914#ifdef CONFIG_MEMCG_KMEM 2902#ifdef CONFIG_MEMCG_KMEM
2903/*
2904 * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or
2905 * destroyed. It protects memcg_caches arrays and memcg_slab_caches lists.
2906 */
2907static DEFINE_MUTEX(memcg_slab_mutex);
2908
2915static DEFINE_MUTEX(activate_kmem_mutex); 2909static DEFINE_MUTEX(activate_kmem_mutex);
2916 2910
2917static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) 2911static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
@@ -2944,10 +2938,10 @@ static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v)
2944 2938
2945 print_slabinfo_header(m); 2939 print_slabinfo_header(m);
2946 2940
2947 mutex_lock(&memcg->slab_caches_mutex); 2941 mutex_lock(&memcg_slab_mutex);
2948 list_for_each_entry(params, &memcg->memcg_slab_caches, list) 2942 list_for_each_entry(params, &memcg->memcg_slab_caches, list)
2949 cache_show(memcg_params_to_cache(params), m); 2943 cache_show(memcg_params_to_cache(params), m);
2950 mutex_unlock(&memcg->slab_caches_mutex); 2944 mutex_unlock(&memcg_slab_mutex);
2951 2945
2952 return 0; 2946 return 0;
2953} 2947}
@@ -3049,8 +3043,6 @@ void memcg_update_array_size(int num)
3049 memcg_limited_groups_array_size = memcg_caches_array_size(num); 3043 memcg_limited_groups_array_size = memcg_caches_array_size(num);
3050} 3044}
3051 3045
3052static void kmem_cache_destroy_work_func(struct work_struct *w);
3053
3054int memcg_update_cache_size(struct kmem_cache *s, int num_groups) 3046int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
3055{ 3047{
3056 struct memcg_cache_params *cur_params = s->memcg_params; 3048 struct memcg_cache_params *cur_params = s->memcg_params;
@@ -3103,29 +3095,6 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
3103 return 0; 3095 return 0;
3104} 3096}
3105 3097
3106char *memcg_create_cache_name(struct mem_cgroup *memcg,
3107 struct kmem_cache *root_cache)
3108{
3109 static char *buf = NULL;
3110
3111 /*
3112 * We need a mutex here to protect the shared buffer. Since this is
3113 * expected to be called only on cache creation, we can employ the
3114 * slab_mutex for that purpose.
3115 */
3116 lockdep_assert_held(&slab_mutex);
3117
3118 if (!buf) {
3119 buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
3120 if (!buf)
3121 return NULL;
3122 }
3123
3124 cgroup_name(memcg->css.cgroup, buf, NAME_MAX + 1);
3125 return kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name,
3126 memcg_cache_id(memcg), buf);
3127}
3128
3129int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, 3098int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
3130 struct kmem_cache *root_cache) 3099 struct kmem_cache *root_cache)
3131{ 3100{
@@ -3147,8 +3116,6 @@ int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
3147 if (memcg) { 3116 if (memcg) {
3148 s->memcg_params->memcg = memcg; 3117 s->memcg_params->memcg = memcg;
3149 s->memcg_params->root_cache = root_cache; 3118 s->memcg_params->root_cache = root_cache;
3150 INIT_WORK(&s->memcg_params->destroy,
3151 kmem_cache_destroy_work_func);
3152 css_get(&memcg->css); 3119 css_get(&memcg->css);
3153 } else 3120 } else
3154 s->memcg_params->is_root_cache = true; 3121 s->memcg_params->is_root_cache = true;
@@ -3165,24 +3132,37 @@ void memcg_free_cache_params(struct kmem_cache *s)
3165 kfree(s->memcg_params); 3132 kfree(s->memcg_params);
3166} 3133}
3167 3134
3168void memcg_register_cache(struct kmem_cache *s) 3135static void memcg_register_cache(struct mem_cgroup *memcg,
3136 struct kmem_cache *root_cache)
3169{ 3137{
3170 struct kmem_cache *root; 3138 static char memcg_name_buf[NAME_MAX + 1]; /* protected by
3171 struct mem_cgroup *memcg; 3139 memcg_slab_mutex */
3140 struct kmem_cache *cachep;
3172 int id; 3141 int id;
3173 3142
3174 if (is_root_cache(s)) 3143 lockdep_assert_held(&memcg_slab_mutex);
3144
3145 id = memcg_cache_id(memcg);
3146
3147 /*
3148 * Since per-memcg caches are created asynchronously on first
3149 * allocation (see memcg_kmem_get_cache()), several threads can try to
3150 * create the same cache, but only one of them may succeed.
3151 */
3152 if (cache_from_memcg_idx(root_cache, id))
3175 return; 3153 return;
3176 3154
3155 cgroup_name(memcg->css.cgroup, memcg_name_buf, NAME_MAX + 1);
3156 cachep = memcg_create_kmem_cache(memcg, root_cache, memcg_name_buf);
3177 /* 3157 /*
3178 * Holding the slab_mutex assures nobody will touch the memcg_caches 3158 * If we could not create a memcg cache, do not complain, because
3179 * array while we are modifying it. 3159 * that's not critical at all as we can always proceed with the root
3160 * cache.
3180 */ 3161 */
3181 lockdep_assert_held(&slab_mutex); 3162 if (!cachep)
3163 return;
3182 3164
3183 root = s->memcg_params->root_cache; 3165 list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
3184 memcg = s->memcg_params->memcg;
3185 id = memcg_cache_id(memcg);
3186 3166
3187 /* 3167 /*
3188 * Since readers won't lock (see cache_from_memcg_idx()), we need a 3168 * Since readers won't lock (see cache_from_memcg_idx()), we need a
@@ -3191,49 +3171,30 @@ void memcg_register_cache(struct kmem_cache *s)
3191 */ 3171 */
3192 smp_wmb(); 3172 smp_wmb();
3193 3173
3194 /* 3174 BUG_ON(root_cache->memcg_params->memcg_caches[id]);
3195 * Initialize the pointer to this cache in its parent's memcg_params 3175 root_cache->memcg_params->memcg_caches[id] = cachep;
3196 * before adding it to the memcg_slab_caches list, otherwise we can
3197 * fail to convert memcg_params_to_cache() while traversing the list.
3198 */
3199 VM_BUG_ON(root->memcg_params->memcg_caches[id]);
3200 root->memcg_params->memcg_caches[id] = s;
3201
3202 mutex_lock(&memcg->slab_caches_mutex);
3203 list_add(&s->memcg_params->list, &memcg->memcg_slab_caches);
3204 mutex_unlock(&memcg->slab_caches_mutex);
3205} 3176}
3206 3177
3207void memcg_unregister_cache(struct kmem_cache *s) 3178static void memcg_unregister_cache(struct kmem_cache *cachep)
3208{ 3179{
3209 struct kmem_cache *root; 3180 struct kmem_cache *root_cache;
3210 struct mem_cgroup *memcg; 3181 struct mem_cgroup *memcg;
3211 int id; 3182 int id;
3212 3183
3213 if (is_root_cache(s)) 3184 lockdep_assert_held(&memcg_slab_mutex);
3214 return;
3215 3185
3216 /* 3186 BUG_ON(is_root_cache(cachep));
3217 * Holding the slab_mutex assures nobody will touch the memcg_caches
3218 * array while we are modifying it.
3219 */
3220 lockdep_assert_held(&slab_mutex);
3221 3187
3222 root = s->memcg_params->root_cache; 3188 root_cache = cachep->memcg_params->root_cache;
3223 memcg = s->memcg_params->memcg; 3189 memcg = cachep->memcg_params->memcg;
3224 id = memcg_cache_id(memcg); 3190 id = memcg_cache_id(memcg);
3225 3191
3226 mutex_lock(&memcg->slab_caches_mutex); 3192 BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep);
3227 list_del(&s->memcg_params->list); 3193 root_cache->memcg_params->memcg_caches[id] = NULL;
3228 mutex_unlock(&memcg->slab_caches_mutex);
3229 3194
3230 /* 3195 list_del(&cachep->memcg_params->list);
3231 * Clear the pointer to this cache in its parent's memcg_params only 3196
3232 * after removing it from the memcg_slab_caches list, otherwise we can 3197 kmem_cache_destroy(cachep);
3233 * fail to convert memcg_params_to_cache() while traversing the list.
3234 */
3235 VM_BUG_ON(root->memcg_params->memcg_caches[id] != s);
3236 root->memcg_params->memcg_caches[id] = NULL;
3237} 3198}
3238 3199
3239/* 3200/*
@@ -3267,144 +3228,61 @@ static inline void memcg_resume_kmem_account(void)
3267 current->memcg_kmem_skip_account--; 3228 current->memcg_kmem_skip_account--;
3268} 3229}
3269 3230
3270static void kmem_cache_destroy_work_func(struct work_struct *w) 3231int __memcg_cleanup_cache_params(struct kmem_cache *s)
3271{
3272 struct kmem_cache *cachep;
3273 struct memcg_cache_params *p;
3274
3275 p = container_of(w, struct memcg_cache_params, destroy);
3276
3277 cachep = memcg_params_to_cache(p);
3278
3279 /*
3280 * If we get down to 0 after shrink, we could delete right away.
3281 * However, memcg_release_pages() already puts us back in the workqueue
3282 * in that case. If we proceed deleting, we'll get a dangling
3283 * reference, and removing the object from the workqueue in that case
3284 * is unnecessary complication. We are not a fast path.
3285 *
3286 * Note that this case is fundamentally different from racing with
3287 * shrink_slab(): if memcg_cgroup_destroy_cache() is called in
3288 * kmem_cache_shrink, not only we would be reinserting a dead cache
3289 * into the queue, but doing so from inside the worker racing to
3290 * destroy it.
3291 *
3292 * So if we aren't down to zero, we'll just schedule a worker and try
3293 * again
3294 */
3295 if (atomic_read(&cachep->memcg_params->nr_pages) != 0)
3296 kmem_cache_shrink(cachep);
3297 else
3298 kmem_cache_destroy(cachep);
3299}
3300
3301void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
3302{
3303 if (!cachep->memcg_params->dead)
3304 return;
3305
3306 /*
3307 * There are many ways in which we can get here.
3308 *
3309 * We can get to a memory-pressure situation while the delayed work is
3310 * still pending to run. The vmscan shrinkers can then release all
3311 * cache memory and get us to destruction. If this is the case, we'll
3312 * be executed twice, which is a bug (the second time will execute over
3313 * bogus data). In this case, cancelling the work should be fine.
3314 *
3315 * But we can also get here from the worker itself, if
3316 * kmem_cache_shrink is enough to shake all the remaining objects and
3317 * get the page count to 0. In this case, we'll deadlock if we try to
3318 * cancel the work (the worker runs with an internal lock held, which
3319 * is the same lock we would hold for cancel_work_sync().)
3320 *
3321 * Since we can't possibly know who got us here, just refrain from
3322 * running if there is already work pending
3323 */
3324 if (work_pending(&cachep->memcg_params->destroy))
3325 return;
3326 /*
3327 * We have to defer the actual destroying to a workqueue, because
3328 * we might currently be in a context that cannot sleep.
3329 */
3330 schedule_work(&cachep->memcg_params->destroy);
3331}
3332
3333int __kmem_cache_destroy_memcg_children(struct kmem_cache *s)
3334{ 3232{
3335 struct kmem_cache *c; 3233 struct kmem_cache *c;
3336 int i, failed = 0; 3234 int i, failed = 0;
3337 3235
3338 /* 3236 mutex_lock(&memcg_slab_mutex);
3339 * If the cache is being destroyed, we trust that there is no one else
3340 * requesting objects from it. Even if there are, the sanity checks in
3341 * kmem_cache_destroy should caught this ill-case.
3342 *
3343 * Still, we don't want anyone else freeing memcg_caches under our
3344 * noses, which can happen if a new memcg comes to life. As usual,
3345 * we'll take the activate_kmem_mutex to protect ourselves against
3346 * this.
3347 */
3348 mutex_lock(&activate_kmem_mutex);
3349 for_each_memcg_cache_index(i) { 3237 for_each_memcg_cache_index(i) {
3350 c = cache_from_memcg_idx(s, i); 3238 c = cache_from_memcg_idx(s, i);
3351 if (!c) 3239 if (!c)
3352 continue; 3240 continue;
3353 3241
3354 /* 3242 memcg_unregister_cache(c);
3355 * We will now manually delete the caches, so to avoid races
3356 * we need to cancel all pending destruction workers and
3357 * proceed with destruction ourselves.
3358 *
3359 * kmem_cache_destroy() will call kmem_cache_shrink internally,
3360 * and that could spawn the workers again: it is likely that
3361 * the cache still have active pages until this very moment.
3362 * This would lead us back to mem_cgroup_destroy_cache.
3363 *
3364 * But that will not execute at all if the "dead" flag is not
3365 * set, so flip it down to guarantee we are in control.
3366 */
3367 c->memcg_params->dead = false;
3368 cancel_work_sync(&c->memcg_params->destroy);
3369 kmem_cache_destroy(c);
3370 3243
3371 if (cache_from_memcg_idx(s, i)) 3244 if (cache_from_memcg_idx(s, i))
3372 failed++; 3245 failed++;
3373 } 3246 }
3374 mutex_unlock(&activate_kmem_mutex); 3247 mutex_unlock(&memcg_slab_mutex);
3375 return failed; 3248 return failed;
3376} 3249}
3377 3250
3378static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) 3251static void memcg_unregister_all_caches(struct mem_cgroup *memcg)
3379{ 3252{
3380 struct kmem_cache *cachep; 3253 struct kmem_cache *cachep;
3381 struct memcg_cache_params *params; 3254 struct memcg_cache_params *params, *tmp;
3382 3255
3383 if (!memcg_kmem_is_active(memcg)) 3256 if (!memcg_kmem_is_active(memcg))
3384 return; 3257 return;
3385 3258
3386 mutex_lock(&memcg->slab_caches_mutex); 3259 mutex_lock(&memcg_slab_mutex);
3387 list_for_each_entry(params, &memcg->memcg_slab_caches, list) { 3260 list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) {
3388 cachep = memcg_params_to_cache(params); 3261 cachep = memcg_params_to_cache(params);
3389 cachep->memcg_params->dead = true; 3262 kmem_cache_shrink(cachep);
3390 schedule_work(&cachep->memcg_params->destroy); 3263 if (atomic_read(&cachep->memcg_params->nr_pages) == 0)
3264 memcg_unregister_cache(cachep);
3391 } 3265 }
3392 mutex_unlock(&memcg->slab_caches_mutex); 3266 mutex_unlock(&memcg_slab_mutex);
3393} 3267}
3394 3268
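memcg_unregister_all_caches() above switches to list_for_each_entry_safe() because the loop body may now unregister, and thereby free, the entry it is looking at. The generic shape of that idiom, for reference (made-up types):

struct demo_item {
        struct list_head list;
};

static void demo_drain(struct list_head *head)
{
        struct demo_item *item, *tmp;

        /* 'tmp' caches the next entry, so deleting 'item' is safe */
        list_for_each_entry_safe(item, tmp, head, list) {
                list_del(&item->list);
                kfree(item);
        }
}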
3395struct create_work { 3269struct memcg_register_cache_work {
3396 struct mem_cgroup *memcg; 3270 struct mem_cgroup *memcg;
3397 struct kmem_cache *cachep; 3271 struct kmem_cache *cachep;
3398 struct work_struct work; 3272 struct work_struct work;
3399}; 3273};
3400 3274
3401static void memcg_create_cache_work_func(struct work_struct *w) 3275static void memcg_register_cache_func(struct work_struct *w)
3402{ 3276{
3403 struct create_work *cw = container_of(w, struct create_work, work); 3277 struct memcg_register_cache_work *cw =
3278 container_of(w, struct memcg_register_cache_work, work);
3404 struct mem_cgroup *memcg = cw->memcg; 3279 struct mem_cgroup *memcg = cw->memcg;
3405 struct kmem_cache *cachep = cw->cachep; 3280 struct kmem_cache *cachep = cw->cachep;
3406 3281
3407 kmem_cache_create_memcg(memcg, cachep); 3282 mutex_lock(&memcg_slab_mutex);
3283 memcg_register_cache(memcg, cachep);
3284 mutex_unlock(&memcg_slab_mutex);
3285
3408 css_put(&memcg->css); 3286 css_put(&memcg->css);
3409 kfree(cw); 3287 kfree(cw);
3410} 3288}
@@ -3412,12 +3290,12 @@ static void memcg_create_cache_work_func(struct work_struct *w)
3412/* 3290/*
3413 * Enqueue the creation of a per-memcg kmem_cache. 3291 * Enqueue the creation of a per-memcg kmem_cache.
3414 */ 3292 */
3415static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg, 3293static void __memcg_schedule_register_cache(struct mem_cgroup *memcg,
3416 struct kmem_cache *cachep) 3294 struct kmem_cache *cachep)
3417{ 3295{
3418 struct create_work *cw; 3296 struct memcg_register_cache_work *cw;
3419 3297
3420 cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT); 3298 cw = kmalloc(sizeof(*cw), GFP_NOWAIT);
3421 if (cw == NULL) { 3299 if (cw == NULL) {
3422 css_put(&memcg->css); 3300 css_put(&memcg->css);
3423 return; 3301 return;
@@ -3426,17 +3304,17 @@ static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg,
3426 cw->memcg = memcg; 3304 cw->memcg = memcg;
3427 cw->cachep = cachep; 3305 cw->cachep = cachep;
3428 3306
3429 INIT_WORK(&cw->work, memcg_create_cache_work_func); 3307 INIT_WORK(&cw->work, memcg_register_cache_func);
3430 schedule_work(&cw->work); 3308 schedule_work(&cw->work);
3431} 3309}
3432 3310
3433static void memcg_create_cache_enqueue(struct mem_cgroup *memcg, 3311static void memcg_schedule_register_cache(struct mem_cgroup *memcg,
3434 struct kmem_cache *cachep) 3312 struct kmem_cache *cachep)
3435{ 3313{
3436 /* 3314 /*
3437 * We need to stop accounting when we kmalloc, because if the 3315 * We need to stop accounting when we kmalloc, because if the
3438 * corresponding kmalloc cache is not yet created, the first allocation 3316 * corresponding kmalloc cache is not yet created, the first allocation
3439 * in __memcg_create_cache_enqueue will recurse. 3317 * in __memcg_schedule_register_cache will recurse.
3440 * 3318 *
3441 * However, it is better to enclose the whole function. Depending on 3319 * However, it is better to enclose the whole function. Depending on
3442 * the debugging options enabled, INIT_WORK(), for instance, can 3320 * the debugging options enabled, INIT_WORK(), for instance, can
@@ -3445,9 +3323,27 @@ static void memcg_create_cache_enqueue(struct mem_cgroup *memcg,
3445 * the safest choice is to do it like this, wrapping the whole function. 3323 * the safest choice is to do it like this, wrapping the whole function.
3446 */ 3324 */
3447 memcg_stop_kmem_account(); 3325 memcg_stop_kmem_account();
3448 __memcg_create_cache_enqueue(memcg, cachep); 3326 __memcg_schedule_register_cache(memcg, cachep);
3449 memcg_resume_kmem_account(); 3327 memcg_resume_kmem_account();
3450} 3328}
3329
3330int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order)
3331{
3332 int res;
3333
3334 res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp,
3335 PAGE_SIZE << order);
3336 if (!res)
3337 atomic_add(1 << order, &cachep->memcg_params->nr_pages);
3338 return res;
3339}
3340
3341void __memcg_uncharge_slab(struct kmem_cache *cachep, int order)
3342{
3343 memcg_uncharge_kmem(cachep->memcg_params->memcg, PAGE_SIZE << order);
3344 atomic_sub(1 << order, &cachep->memcg_params->nr_pages);
3345}
3346
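__memcg_charge_slab()/__memcg_uncharge_slab() above are meant to be called when a backing page is allocated for, or freed from, a per-memcg cache. The caller-side wrapper presumably lives in mm/slab.h in this series and is not shown here; something along these lines:

static __always_inline int memcg_charge_slab(struct kmem_cache *s,
                                             gfp_t gfp, int order)
{
        if (!memcg_kmem_enabled())
                return 0;
        if (is_root_cache(s))
                return 0;       /* root caches are not accounted per-memcg */
        return __memcg_charge_slab(s, gfp, order);
}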
3451/* 3347/*
3452 * Return the kmem_cache we're supposed to use for a slab allocation. 3348 * Return the kmem_cache we're supposed to use for a slab allocation.
3453 * We try to use the current memcg's version of the cache. 3349 * We try to use the current memcg's version of the cache.
@@ -3498,22 +3394,16 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
3498 * 3394 *
3499 * However, there are some clashes that can arrive from locking. 3395 * However, there are some clashes that can arrive from locking.
3500 * For instance, because we acquire the slab_mutex while doing 3396 * For instance, because we acquire the slab_mutex while doing
3501 * kmem_cache_dup, this means no further allocation could happen 3397 * memcg_create_kmem_cache, this means no further allocation
3502 * with the slab_mutex held. 3398 * could happen with the slab_mutex held. So it's better to
3503 * 3399 * defer everything.
3504 * Also, because cache creation issue get_online_cpus(), this
3505 * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex,
3506 * that ends up reversed during cpu hotplug. (cpuset allocates
3507 * a bunch of GFP_KERNEL memory during cpuup). Due to all that,
3508 * better to defer everything.
3509 */ 3400 */
3510 memcg_create_cache_enqueue(memcg, cachep); 3401 memcg_schedule_register_cache(memcg, cachep);
3511 return cachep; 3402 return cachep;
3512out: 3403out:
3513 rcu_read_unlock(); 3404 rcu_read_unlock();
3514 return cachep; 3405 return cachep;
3515} 3406}
3516EXPORT_SYMBOL(__memcg_kmem_get_cache);
3517 3407
3518/* 3408/*
3519 * We need to verify if the allocation against current->mm->owner's memcg is 3409 * We need to verify if the allocation against current->mm->owner's memcg is
@@ -3540,11 +3430,12 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
3540 /* 3430 /*
3541 * Disabling accounting is only relevant for some specific memcg 3431 * Disabling accounting is only relevant for some specific memcg
3542 * internal allocations. Therefore we would initially not have such 3432 * internal allocations. Therefore we would initially not have such
3543 * check here, since direct calls to the page allocator that are marked 3433 * check here, since direct calls to the page allocator that are
3544 * with GFP_KMEMCG only happen outside memcg core. We are mostly 3434 * accounted to kmemcg (alloc_kmem_pages and friends) only happen
3545 * concerned with cache allocations, and by having this test at 3435 * outside memcg core. We are mostly concerned with cache allocations,
3546 * memcg_kmem_get_cache, we are already able to relay the allocation to 3436 * and by having this test at memcg_kmem_get_cache, we are already able
3547 * the root cache and bypass the memcg cache altogether. 3437 * to relay the allocation to the root cache and bypass the memcg cache
3438 * altogether.
3548 * 3439 *
3549 * There is one exception, though: the SLUB allocator does not create 3440 * There is one exception, though: the SLUB allocator does not create
3550 * large order caches, but rather service large kmallocs directly from 3441 * large order caches, but rather service large kmallocs directly from
@@ -3631,7 +3522,7 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order)
3631 memcg_uncharge_kmem(memcg, PAGE_SIZE << order); 3522 memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
3632} 3523}
3633#else 3524#else
3634static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) 3525static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg)
3635{ 3526{
3636} 3527}
3637#endif /* CONFIG_MEMCG_KMEM */ 3528#endif /* CONFIG_MEMCG_KMEM */
@@ -4784,9 +4675,9 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
4784 if (mem_cgroup_move_parent(page, pc, memcg)) { 4675 if (mem_cgroup_move_parent(page, pc, memcg)) {
4785 /* found lock contention or "pc" is obsolete. */ 4676 /* found lock contention or "pc" is obsolete. */
4786 busy = page; 4677 busy = page;
4787 cond_resched();
4788 } else 4678 } else
4789 busy = NULL; 4679 busy = NULL;
4680 cond_resched();
4790 } while (!list_empty(list)); 4681 } while (!list_empty(list));
4791} 4682}
4792 4683
@@ -5062,13 +4953,14 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg,
5062 * Make sure we have enough space for this cgroup in each root cache's 4953 * Make sure we have enough space for this cgroup in each root cache's
5063 * memcg_params. 4954 * memcg_params.
5064 */ 4955 */
4956 mutex_lock(&memcg_slab_mutex);
5065 err = memcg_update_all_caches(memcg_id + 1); 4957 err = memcg_update_all_caches(memcg_id + 1);
4958 mutex_unlock(&memcg_slab_mutex);
5066 if (err) 4959 if (err)
5067 goto out_rmid; 4960 goto out_rmid;
5068 4961
5069 memcg->kmemcg_id = memcg_id; 4962 memcg->kmemcg_id = memcg_id;
5070 INIT_LIST_HEAD(&memcg->memcg_slab_caches); 4963 INIT_LIST_HEAD(&memcg->memcg_slab_caches);
5071 mutex_init(&memcg->slab_caches_mutex);
5072 4964
5073 /* 4965 /*
5074 * We couldn't have accounted to this cgroup, because it hasn't got the 4966 * We couldn't have accounted to this cgroup, because it hasn't got the
@@ -5443,22 +5335,14 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
5443 struct cftype *cft, u64 val) 5335 struct cftype *cft, u64 val)
5444{ 5336{
5445 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5337 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5446 struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css));
5447 5338
5448 if (val > 100 || !parent) 5339 if (val > 100)
5449 return -EINVAL; 5340 return -EINVAL;
5450 5341
5451 mutex_lock(&memcg_create_mutex); 5342 if (css_parent(css))
5452 5343 memcg->swappiness = val;
5453 /* If under hierarchy, only empty-root can set this value */ 5344 else
5454 if ((parent->use_hierarchy) || memcg_has_children(memcg)) { 5345 vm_swappiness = val;
5455 mutex_unlock(&memcg_create_mutex);
5456 return -EINVAL;
5457 }
5458
5459 memcg->swappiness = val;
5460
5461 mutex_unlock(&memcg_create_mutex);
5462 5346
5463 return 0; 5347 return 0;
5464} 5348}
@@ -5790,22 +5674,15 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
5790 struct cftype *cft, u64 val) 5674 struct cftype *cft, u64 val)
5791{ 5675{
5792 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5676 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5793 struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css));
5794 5677
5795 /* cannot set to root cgroup and only 0 and 1 are allowed */ 5678 /* cannot set to root cgroup and only 0 and 1 are allowed */
5796 if (!parent || !((val == 0) || (val == 1))) 5679 if (!css_parent(css) || !((val == 0) || (val == 1)))
5797 return -EINVAL; 5680 return -EINVAL;
5798 5681
5799 mutex_lock(&memcg_create_mutex);
5800 /* oom-kill-disable is a flag for subhierarchy. */
5801 if ((parent->use_hierarchy) || memcg_has_children(memcg)) {
5802 mutex_unlock(&memcg_create_mutex);
5803 return -EINVAL;
5804 }
5805 memcg->oom_kill_disable = val; 5682 memcg->oom_kill_disable = val;
5806 if (!val) 5683 if (!val)
5807 memcg_oom_recover(memcg); 5684 memcg_oom_recover(memcg);
5808 mutex_unlock(&memcg_create_mutex); 5685
5809 return 0; 5686 return 0;
5810} 5687}
5811 5688
@@ -6491,7 +6368,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
6491 css_for_each_descendant_post(iter, css) 6368 css_for_each_descendant_post(iter, css)
6492 mem_cgroup_reparent_charges(mem_cgroup_from_css(iter)); 6369 mem_cgroup_reparent_charges(mem_cgroup_from_css(iter));
6493 6370
6494 mem_cgroup_destroy_all_caches(memcg); 6371 memcg_unregister_all_caches(memcg);
6495 vmpressure_cleanup(&memcg->vmpressure); 6372 vmpressure_cleanup(&memcg->vmpressure);
6496} 6373}
6497 6374
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 9ccef39a9de2..cd8989c1027e 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -204,9 +204,9 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno,
204#endif 204#endif
205 si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT; 205 si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT;
206 206
207 if ((flags & MF_ACTION_REQUIRED) && t == current) { 207 if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) {
208 si.si_code = BUS_MCEERR_AR; 208 si.si_code = BUS_MCEERR_AR;
209 ret = force_sig_info(SIGBUS, &si, t); 209 ret = force_sig_info(SIGBUS, &si, current);
210 } else { 210 } else {
211 /* 211 /*
212 * Don't use force here, it's convenient if the signal 212 * Don't use force here, it's convenient if the signal
@@ -380,20 +380,51 @@ static void kill_procs(struct list_head *to_kill, int forcekill, int trapno,
380 } 380 }
381} 381}
382 382
383static int task_early_kill(struct task_struct *tsk) 383/*
 384 * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO)
 385 * on behalf of the thread group. Return the task_struct of the first such
 386 * dedicated thread found, or NULL if there is none.
387 *
388 * We already hold read_lock(&tasklist_lock) in the caller, so we don't
389 * have to call rcu_read_lock/unlock() in this function.
390 */
391static struct task_struct *find_early_kill_thread(struct task_struct *tsk)
384{ 392{
393 struct task_struct *t;
394
395 for_each_thread(tsk, t)
396 if ((t->flags & PF_MCE_PROCESS) && (t->flags & PF_MCE_EARLY))
397 return t;
398 return NULL;
399}
400
401/*
 402 * Determine whether a given process is an "early kill" process which expects
 403 * to be signaled when some page under the process is hwpoisoned.
 404 * Return the task_struct of the dedicated thread (main thread unless explicitly
 405 * specified) if the process is "early kill", and NULL otherwise.
406 */
407static struct task_struct *task_early_kill(struct task_struct *tsk,
408 int force_early)
409{
410 struct task_struct *t;
385 if (!tsk->mm) 411 if (!tsk->mm)
386 return 0; 412 return NULL;
387 if (tsk->flags & PF_MCE_PROCESS) 413 if (force_early)
388 return !!(tsk->flags & PF_MCE_EARLY); 414 return tsk;
389 return sysctl_memory_failure_early_kill; 415 t = find_early_kill_thread(tsk);
416 if (t)
417 return t;
418 if (sysctl_memory_failure_early_kill)
419 return tsk;
420 return NULL;
390} 421}
391 422
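The PF_MCE_PROCESS / PF_MCE_EARLY flags tested by find_early_kill_thread() are set per thread from userspace through the existing prctl() interface; this is background, not part of the patch. Roughly:

#include <sys/prctl.h>
#ifndef PR_MCE_KILL                     /* values from linux/prctl.h */
#define PR_MCE_KILL             33
#define PR_MCE_KILL_SET         1
#define PR_MCE_KILL_EARLY       1
#endif

/* run this in the thread that should receive early BUS_MCEERR_AO signals */
static void opt_in_to_early_kill(void)
{
        prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
}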
392/* 423/*
393 * Collect processes when the error hit an anonymous page. 424 * Collect processes when the error hit an anonymous page.
394 */ 425 */
395static void collect_procs_anon(struct page *page, struct list_head *to_kill, 426static void collect_procs_anon(struct page *page, struct list_head *to_kill,
396 struct to_kill **tkc) 427 struct to_kill **tkc, int force_early)
397{ 428{
398 struct vm_area_struct *vma; 429 struct vm_area_struct *vma;
399 struct task_struct *tsk; 430 struct task_struct *tsk;
@@ -408,16 +439,17 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
408 read_lock(&tasklist_lock); 439 read_lock(&tasklist_lock);
409 for_each_process (tsk) { 440 for_each_process (tsk) {
410 struct anon_vma_chain *vmac; 441 struct anon_vma_chain *vmac;
442 struct task_struct *t = task_early_kill(tsk, force_early);
411 443
412 if (!task_early_kill(tsk)) 444 if (!t)
413 continue; 445 continue;
414 anon_vma_interval_tree_foreach(vmac, &av->rb_root, 446 anon_vma_interval_tree_foreach(vmac, &av->rb_root,
415 pgoff, pgoff) { 447 pgoff, pgoff) {
416 vma = vmac->vma; 448 vma = vmac->vma;
417 if (!page_mapped_in_vma(page, vma)) 449 if (!page_mapped_in_vma(page, vma))
418 continue; 450 continue;
419 if (vma->vm_mm == tsk->mm) 451 if (vma->vm_mm == t->mm)
420 add_to_kill(tsk, page, vma, to_kill, tkc); 452 add_to_kill(t, page, vma, to_kill, tkc);
421 } 453 }
422 } 454 }
423 read_unlock(&tasklist_lock); 455 read_unlock(&tasklist_lock);
@@ -428,7 +460,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
428 * Collect processes when the error hit a file mapped page. 460 * Collect processes when the error hit a file mapped page.
429 */ 461 */
430static void collect_procs_file(struct page *page, struct list_head *to_kill, 462static void collect_procs_file(struct page *page, struct list_head *to_kill,
431 struct to_kill **tkc) 463 struct to_kill **tkc, int force_early)
432{ 464{
433 struct vm_area_struct *vma; 465 struct vm_area_struct *vma;
434 struct task_struct *tsk; 466 struct task_struct *tsk;
@@ -438,10 +470,10 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
438 read_lock(&tasklist_lock); 470 read_lock(&tasklist_lock);
439 for_each_process(tsk) { 471 for_each_process(tsk) {
440 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 472 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
473 struct task_struct *t = task_early_kill(tsk, force_early);
441 474
442 if (!task_early_kill(tsk)) 475 if (!t)
443 continue; 476 continue;
444
445 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, 477 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
446 pgoff) { 478 pgoff) {
447 /* 479 /*
@@ -451,8 +483,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
451 * Assume applications who requested early kill want 483 * Assume applications who requested early kill want
452 * to be informed of all such data corruptions. 484 * to be informed of all such data corruptions.
453 */ 485 */
454 if (vma->vm_mm == tsk->mm) 486 if (vma->vm_mm == t->mm)
455 add_to_kill(tsk, page, vma, to_kill, tkc); 487 add_to_kill(t, page, vma, to_kill, tkc);
456 } 488 }
457 } 489 }
458 read_unlock(&tasklist_lock); 490 read_unlock(&tasklist_lock);
@@ -465,7 +497,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
465 * First preallocate one tokill structure outside the spin locks, 497 * First preallocate one tokill structure outside the spin locks,
466 * so that we can kill at least one process reasonably reliable. 498 * so that we can kill at least one process reasonably reliable.
467 */ 499 */
468static void collect_procs(struct page *page, struct list_head *tokill) 500static void collect_procs(struct page *page, struct list_head *tokill,
501 int force_early)
469{ 502{
470 struct to_kill *tk; 503 struct to_kill *tk;
471 504
@@ -476,9 +509,9 @@ static void collect_procs(struct page *page, struct list_head *tokill)
476 if (!tk) 509 if (!tk)
477 return; 510 return;
478 if (PageAnon(page)) 511 if (PageAnon(page))
479 collect_procs_anon(page, tokill, &tk); 512 collect_procs_anon(page, tokill, &tk, force_early);
480 else 513 else
481 collect_procs_file(page, tokill, &tk); 514 collect_procs_file(page, tokill, &tk, force_early);
482 kfree(tk); 515 kfree(tk);
483} 516}
484 517
@@ -963,7 +996,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
963 * there's nothing that can be done. 996 * there's nothing that can be done.
964 */ 997 */
965 if (kill) 998 if (kill)
966 collect_procs(ppage, &tokill); 999 collect_procs(ppage, &tokill, flags & MF_ACTION_REQUIRED);
967 1000
968 ret = try_to_unmap(ppage, ttu); 1001 ret = try_to_unmap(ppage, ttu);
969 if (ret != SWAP_SUCCESS) 1002 if (ret != SWAP_SUCCESS)
@@ -1132,11 +1165,6 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1132 } 1165 }
1133 } 1166 }
1134 1167
1135 /*
1136 * Lock the page and wait for writeback to finish.
1137 * It's very difficult to mess with pages currently under IO
1138 * and in many cases impossible, so we just avoid it here.
1139 */
1140 lock_page(hpage); 1168 lock_page(hpage);
1141 1169
1142 /* 1170 /*
@@ -1186,6 +1214,10 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1186 if (PageHuge(p)) 1214 if (PageHuge(p))
1187 set_page_hwpoison_huge_page(hpage); 1215 set_page_hwpoison_huge_page(hpage);
1188 1216
1217 /*
1218 * It's very difficult to mess with pages currently under IO
1219 * and in many cases impossible, so we just avoid it here.
1220 */
1189 wait_on_page_writeback(p); 1221 wait_on_page_writeback(p);
1190 1222
1191 /* 1223 /*
@@ -1298,7 +1330,7 @@ static void memory_failure_work_func(struct work_struct *work)
1298 unsigned long proc_flags; 1330 unsigned long proc_flags;
1299 int gotten; 1331 int gotten;
1300 1332
1301 mf_cpu = &__get_cpu_var(memory_failure_cpu); 1333 mf_cpu = this_cpu_ptr(&memory_failure_cpu);
1302 for (;;) { 1334 for (;;) {
1303 spin_lock_irqsave(&mf_cpu->lock, proc_flags); 1335 spin_lock_irqsave(&mf_cpu->lock, proc_flags);
1304 gotten = kfifo_get(&mf_cpu->fifo, &entry); 1336 gotten = kfifo_get(&mf_cpu->fifo, &entry);
@@ -1503,7 +1535,7 @@ static int soft_offline_huge_page(struct page *page, int flags)
1503 1535
1504 /* Keep page count to indicate a given hugepage is isolated. */ 1536 /* Keep page count to indicate a given hugepage is isolated. */
1505 list_move(&hpage->lru, &pagelist); 1537 list_move(&hpage->lru, &pagelist);
1506 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 1538 ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
1507 MIGRATE_SYNC, MR_MEMORY_FAILURE); 1539 MIGRATE_SYNC, MR_MEMORY_FAILURE);
1508 if (ret) { 1540 if (ret) {
1509 pr_info("soft offline: %#lx: migration failed %d, type %lx\n", 1541 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
@@ -1584,7 +1616,7 @@ static int __soft_offline_page(struct page *page, int flags)
1584 inc_zone_page_state(page, NR_ISOLATED_ANON + 1616 inc_zone_page_state(page, NR_ISOLATED_ANON +
1585 page_is_file_cache(page)); 1617 page_is_file_cache(page));
1586 list_add(&page->lru, &pagelist); 1618 list_add(&page->lru, &pagelist);
1587 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 1619 ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
1588 MIGRATE_SYNC, MR_MEMORY_FAILURE); 1620 MIGRATE_SYNC, MR_MEMORY_FAILURE);
1589 if (ret) { 1621 if (ret) {
1590 if (!list_empty(&pagelist)) { 1622 if (!list_empty(&pagelist)) {
@@ -1664,11 +1696,7 @@ int soft_offline_page(struct page *page, int flags)
1664 } 1696 }
1665 } 1697 }
1666 1698
1667 /* 1699 get_online_mems();
1668 * The lock_memory_hotplug prevents a race with memory hotplug.
1669 * This is a big hammer, a better would be nicer.
1670 */
1671 lock_memory_hotplug();
1672 1700
1673 /* 1701 /*
1674 * Isolate the page, so that it doesn't get reallocated if it 1702 * Isolate the page, so that it doesn't get reallocated if it
@@ -1679,7 +1707,7 @@ int soft_offline_page(struct page *page, int flags)
1679 set_migratetype_isolate(page, true); 1707 set_migratetype_isolate(page, true);
1680 1708
1681 ret = get_any_page(page, pfn, flags); 1709 ret = get_any_page(page, pfn, flags);
1682 unlock_memory_hotplug(); 1710 put_online_mems();
1683 if (ret > 0) { /* for in-use pages */ 1711 if (ret > 0) { /* for in-use pages */
1684 if (PageHuge(page)) 1712 if (PageHuge(page))
1685 ret = soft_offline_huge_page(page, flags); 1713 ret = soft_offline_huge_page(page, flags);
diff --git a/mm/memory.c b/mm/memory.c
index e302ae1dcce0..d67fd9fcf1f2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -698,11 +698,6 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
698 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); 698 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
699} 699}
700 700
701static inline bool is_cow_mapping(vm_flags_t flags)
702{
703 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
704}
705
706/* 701/*
707 * vm_normal_page -- This function gets the "struct page" associated with a pte. 702 * vm_normal_page -- This function gets the "struct page" associated with a pte.
708 * 703 *
@@ -756,7 +751,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
756 unsigned long pfn = pte_pfn(pte); 751 unsigned long pfn = pte_pfn(pte);
757 752
758 if (HAVE_PTE_SPECIAL) { 753 if (HAVE_PTE_SPECIAL) {
759 if (likely(!pte_special(pte))) 754 if (likely(!pte_special(pte) || pte_numa(pte)))
760 goto check_pfn; 755 goto check_pfn;
761 if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) 756 if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
762 return NULL; 757 return NULL;
@@ -782,14 +777,15 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
782 } 777 }
783 } 778 }
784 779
785 if (is_zero_pfn(pfn))
786 return NULL;
787check_pfn: 780check_pfn:
788 if (unlikely(pfn > highest_memmap_pfn)) { 781 if (unlikely(pfn > highest_memmap_pfn)) {
789 print_bad_pte(vma, addr, pte, NULL); 782 print_bad_pte(vma, addr, pte, NULL);
790 return NULL; 783 return NULL;
791 } 784 }
792 785
786 if (is_zero_pfn(pfn))
787 return NULL;
788
793 /* 789 /*
794 * NOTE! We still have PageReserved() pages in the page tables. 790 * NOTE! We still have PageReserved() pages in the page tables.
795 * eg. VDSO mappings can cause them to exist. 791 * eg. VDSO mappings can cause them to exist.
@@ -1457,646 +1453,6 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
1457} 1453}
1458EXPORT_SYMBOL_GPL(zap_vma_ptes); 1454EXPORT_SYMBOL_GPL(zap_vma_ptes);
1459 1455
1460/**
1461 * follow_page_mask - look up a page descriptor from a user-virtual address
1462 * @vma: vm_area_struct mapping @address
1463 * @address: virtual address to look up
1464 * @flags: flags modifying lookup behaviour
1465 * @page_mask: on output, *page_mask is set according to the size of the page
1466 *
1467 * @flags can have FOLL_ flags set, defined in <linux/mm.h>
1468 *
1469 * Returns the mapped (struct page *), %NULL if no mapping exists, or
1470 * an error pointer if there is a mapping to something not represented
1471 * by a page descriptor (see also vm_normal_page()).
1472 */
1473struct page *follow_page_mask(struct vm_area_struct *vma,
1474 unsigned long address, unsigned int flags,
1475 unsigned int *page_mask)
1476{
1477 pgd_t *pgd;
1478 pud_t *pud;
1479 pmd_t *pmd;
1480 pte_t *ptep, pte;
1481 spinlock_t *ptl;
1482 struct page *page;
1483 struct mm_struct *mm = vma->vm_mm;
1484
1485 *page_mask = 0;
1486
1487 page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
1488 if (!IS_ERR(page)) {
1489 BUG_ON(flags & FOLL_GET);
1490 goto out;
1491 }
1492
1493 page = NULL;
1494 pgd = pgd_offset(mm, address);
1495 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
1496 goto no_page_table;
1497
1498 pud = pud_offset(pgd, address);
1499 if (pud_none(*pud))
1500 goto no_page_table;
1501 if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
1502 if (flags & FOLL_GET)
1503 goto out;
1504 page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
1505 goto out;
1506 }
1507 if (unlikely(pud_bad(*pud)))
1508 goto no_page_table;
1509
1510 pmd = pmd_offset(pud, address);
1511 if (pmd_none(*pmd))
1512 goto no_page_table;
1513 if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
1514 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
1515 if (flags & FOLL_GET) {
1516 /*
1517 * Refcount on tail pages are not well-defined and
1518 * shouldn't be taken. The caller should handle a NULL
1519 * return when trying to follow tail pages.
1520 */
1521 if (PageHead(page))
1522 get_page(page);
1523 else {
1524 page = NULL;
1525 goto out;
1526 }
1527 }
1528 goto out;
1529 }
1530 if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
1531 goto no_page_table;
1532 if (pmd_trans_huge(*pmd)) {
1533 if (flags & FOLL_SPLIT) {
1534 split_huge_page_pmd(vma, address, pmd);
1535 goto split_fallthrough;
1536 }
1537 ptl = pmd_lock(mm, pmd);
1538 if (likely(pmd_trans_huge(*pmd))) {
1539 if (unlikely(pmd_trans_splitting(*pmd))) {
1540 spin_unlock(ptl);
1541 wait_split_huge_page(vma->anon_vma, pmd);
1542 } else {
1543 page = follow_trans_huge_pmd(vma, address,
1544 pmd, flags);
1545 spin_unlock(ptl);
1546 *page_mask = HPAGE_PMD_NR - 1;
1547 goto out;
1548 }
1549 } else
1550 spin_unlock(ptl);
1551 /* fall through */
1552 }
1553split_fallthrough:
1554 if (unlikely(pmd_bad(*pmd)))
1555 goto no_page_table;
1556
1557 ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
1558
1559 pte = *ptep;
1560 if (!pte_present(pte)) {
1561 swp_entry_t entry;
1562 /*
1563 * KSM's break_ksm() relies upon recognizing a ksm page
1564 * even while it is being migrated, so for that case we
1565 * need migration_entry_wait().
1566 */
1567 if (likely(!(flags & FOLL_MIGRATION)))
1568 goto no_page;
1569 if (pte_none(pte) || pte_file(pte))
1570 goto no_page;
1571 entry = pte_to_swp_entry(pte);
1572 if (!is_migration_entry(entry))
1573 goto no_page;
1574 pte_unmap_unlock(ptep, ptl);
1575 migration_entry_wait(mm, pmd, address);
1576 goto split_fallthrough;
1577 }
1578 if ((flags & FOLL_NUMA) && pte_numa(pte))
1579 goto no_page;
1580 if ((flags & FOLL_WRITE) && !pte_write(pte))
1581 goto unlock;
1582
1583 page = vm_normal_page(vma, address, pte);
1584 if (unlikely(!page)) {
1585 if ((flags & FOLL_DUMP) ||
1586 !is_zero_pfn(pte_pfn(pte)))
1587 goto bad_page;
1588 page = pte_page(pte);
1589 }
1590
1591 if (flags & FOLL_GET)
1592 get_page_foll(page);
1593 if (flags & FOLL_TOUCH) {
1594 if ((flags & FOLL_WRITE) &&
1595 !pte_dirty(pte) && !PageDirty(page))
1596 set_page_dirty(page);
1597 /*
1598 * pte_mkyoung() would be more correct here, but atomic care
1599 * is needed to avoid losing the dirty bit: it is easier to use
1600 * mark_page_accessed().
1601 */
1602 mark_page_accessed(page);
1603 }
1604 if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
1605 /*
1606 * The preliminary mapping check is mainly to avoid the
1607 * pointless overhead of lock_page on the ZERO_PAGE
1608 * which might bounce very badly if there is contention.
1609 *
1610 * If the page is already locked, we don't need to
1611 * handle it now - vmscan will handle it later if and
1612 * when it attempts to reclaim the page.
1613 */
1614 if (page->mapping && trylock_page(page)) {
1615 lru_add_drain(); /* push cached pages to LRU */
1616 /*
1617 * Because we lock page here, and migration is
1618 * blocked by the pte's page reference, and we
1619 * know the page is still mapped, we don't even
1620 * need to check for file-cache page truncation.
1621 */
1622 mlock_vma_page(page);
1623 unlock_page(page);
1624 }
1625 }
1626unlock:
1627 pte_unmap_unlock(ptep, ptl);
1628out:
1629 return page;
1630
1631bad_page:
1632 pte_unmap_unlock(ptep, ptl);
1633 return ERR_PTR(-EFAULT);
1634
1635no_page:
1636 pte_unmap_unlock(ptep, ptl);
1637 if (!pte_none(pte))
1638 return page;
1639
1640no_page_table:
1641 /*
1642 * When core dumping an enormous anonymous area that nobody
1643 * has touched so far, we don't want to allocate unnecessary pages or
1644 * page tables. Return error instead of NULL to skip handle_mm_fault,
1645 * then get_dump_page() will return NULL to leave a hole in the dump.
1646 * But we can only make this optimization where a hole would surely
1647 * be zero-filled if handle_mm_fault() actually did handle it.
1648 */
1649 if ((flags & FOLL_DUMP) &&
1650 (!vma->vm_ops || !vma->vm_ops->fault))
1651 return ERR_PTR(-EFAULT);
1652 return page;
1653}
1654
1655static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr)
1656{
1657 return stack_guard_page_start(vma, addr) ||
1658 stack_guard_page_end(vma, addr+PAGE_SIZE);
1659}
1660
1661/**
1662 * __get_user_pages() - pin user pages in memory
1663 * @tsk: task_struct of target task
1664 * @mm: mm_struct of target mm
1665 * @start: starting user address
1666 * @nr_pages: number of pages from start to pin
1667 * @gup_flags: flags modifying pin behaviour
1668 * @pages: array that receives pointers to the pages pinned.
1669 * Should be at least nr_pages long. Or NULL, if caller
1670 * only intends to ensure the pages are faulted in.
1671 * @vmas: array of pointers to vmas corresponding to each page.
1672 * Or NULL if the caller does not require them.
1673 * @nonblocking: whether waiting for disk IO or mmap_sem contention
1674 *
1675 * Returns number of pages pinned. This may be fewer than the number
1676 * requested. If nr_pages is 0 or negative, returns 0. If no pages
1677 * were pinned, returns -errno. Each page returned must be released
1678 * with a put_page() call when it is finished with. vmas will only
1679 * remain valid while mmap_sem is held.
1680 *
1681 * Must be called with mmap_sem held for read or write.
1682 *
1683 * __get_user_pages walks a process's page tables and takes a reference to
1684 * each struct page that each user address corresponds to at a given
1685 * instant. That is, it takes the page that would be accessed if a user
1686 * thread accesses the given user virtual address at that instant.
1687 *
1688 * This does not guarantee that the page exists in the user mappings when
1689 * __get_user_pages returns, and there may even be a completely different
1690 * page there in some cases (eg. if mmapped pagecache has been invalidated
1691 * and subsequently re faulted). However it does guarantee that the page
1692 * won't be freed completely. And mostly callers simply care that the page
1693 * contains data that was valid *at some point in time*. Typically, an IO
1694 * or similar operation cannot guarantee anything stronger anyway because
1695 * locks can't be held over the syscall boundary.
1696 *
1697 * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
1698 * the page is written to, set_page_dirty (or set_page_dirty_lock, as
1699 * appropriate) must be called after the page is finished with, and
1700 * before put_page is called.
1701 *
1702 * If @nonblocking != NULL, __get_user_pages will not wait for disk IO
1703 * or mmap_sem contention, and if waiting is needed to pin all pages,
1704 * *@nonblocking will be set to 0.
1705 *
1706 * In most cases, get_user_pages or get_user_pages_fast should be used
1707 * instead of __get_user_pages. __get_user_pages should be used only if
1708 * you need some special @gup_flags.
1709 */
1710long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1711 unsigned long start, unsigned long nr_pages,
1712 unsigned int gup_flags, struct page **pages,
1713 struct vm_area_struct **vmas, int *nonblocking)
1714{
1715 long i;
1716 unsigned long vm_flags;
1717 unsigned int page_mask;
1718
1719 if (!nr_pages)
1720 return 0;
1721
1722 VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
1723
1724 /*
1725 * If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault
1726 * would be called on PROT_NONE ranges. We must never invoke
1727 * handle_mm_fault on PROT_NONE ranges or the NUMA hinting
1728 * page faults would unprotect the PROT_NONE ranges if
1729 * _PAGE_NUMA and _PAGE_PROTNONE are sharing the same pte/pmd
1730 * bitflag. So to avoid that, don't set FOLL_NUMA if
1731 * FOLL_FORCE is set.
1732 */
1733 if (!(gup_flags & FOLL_FORCE))
1734 gup_flags |= FOLL_NUMA;
1735
1736 i = 0;
1737
1738 do {
1739 struct vm_area_struct *vma;
1740
1741 vma = find_extend_vma(mm, start);
1742 if (!vma && in_gate_area(mm, start)) {
1743 unsigned long pg = start & PAGE_MASK;
1744 pgd_t *pgd;
1745 pud_t *pud;
1746 pmd_t *pmd;
1747 pte_t *pte;
1748
1749 /* user gate pages are read-only */
1750 if (gup_flags & FOLL_WRITE)
1751 goto efault;
1752 if (pg > TASK_SIZE)
1753 pgd = pgd_offset_k(pg);
1754 else
1755 pgd = pgd_offset_gate(mm, pg);
1756 BUG_ON(pgd_none(*pgd));
1757 pud = pud_offset(pgd, pg);
1758 BUG_ON(pud_none(*pud));
1759 pmd = pmd_offset(pud, pg);
1760 if (pmd_none(*pmd))
1761 goto efault;
1762 VM_BUG_ON(pmd_trans_huge(*pmd));
1763 pte = pte_offset_map(pmd, pg);
1764 if (pte_none(*pte)) {
1765 pte_unmap(pte);
1766 goto efault;
1767 }
1768 vma = get_gate_vma(mm);
1769 if (pages) {
1770 struct page *page;
1771
1772 page = vm_normal_page(vma, start, *pte);
1773 if (!page) {
1774 if (!(gup_flags & FOLL_DUMP) &&
1775 is_zero_pfn(pte_pfn(*pte)))
1776 page = pte_page(*pte);
1777 else {
1778 pte_unmap(pte);
1779 goto efault;
1780 }
1781 }
1782 pages[i] = page;
1783 get_page(page);
1784 }
1785 pte_unmap(pte);
1786 page_mask = 0;
1787 goto next_page;
1788 }
1789
1790 if (!vma)
1791 goto efault;
1792 vm_flags = vma->vm_flags;
1793 if (vm_flags & (VM_IO | VM_PFNMAP))
1794 goto efault;
1795
1796 if (gup_flags & FOLL_WRITE) {
1797 if (!(vm_flags & VM_WRITE)) {
1798 if (!(gup_flags & FOLL_FORCE))
1799 goto efault;
1800 /*
1801 * We used to let the write,force case do COW
1802 * in a VM_MAYWRITE VM_SHARED !VM_WRITE vma, so
1803 * ptrace could set a breakpoint in a read-only
1804 * mapping of an executable, without corrupting
1805 * the file (yet only when that file had been
1806 * opened for writing!). Anon pages in shared
1807 * mappings are surprising: now just reject it.
1808 */
1809 if (!is_cow_mapping(vm_flags)) {
1810 WARN_ON_ONCE(vm_flags & VM_MAYWRITE);
1811 goto efault;
1812 }
1813 }
1814 } else {
1815 if (!(vm_flags & VM_READ)) {
1816 if (!(gup_flags & FOLL_FORCE))
1817 goto efault;
1818 /*
1819 * Is there actually any vma we can reach here
1820 * which does not have VM_MAYREAD set?
1821 */
1822 if (!(vm_flags & VM_MAYREAD))
1823 goto efault;
1824 }
1825 }
1826
1827 if (is_vm_hugetlb_page(vma)) {
1828 i = follow_hugetlb_page(mm, vma, pages, vmas,
1829 &start, &nr_pages, i, gup_flags);
1830 continue;
1831 }
1832
1833 do {
1834 struct page *page;
1835 unsigned int foll_flags = gup_flags;
1836 unsigned int page_increm;
1837
1838 /*
1839 * If we have a pending SIGKILL, don't keep faulting
1840 * pages and potentially allocating memory.
1841 */
1842 if (unlikely(fatal_signal_pending(current)))
1843 return i ? i : -ERESTARTSYS;
1844
1845 cond_resched();
1846 while (!(page = follow_page_mask(vma, start,
1847 foll_flags, &page_mask))) {
1848 int ret;
1849 unsigned int fault_flags = 0;
1850
1851 /* For mlock, just skip the stack guard page. */
1852 if (foll_flags & FOLL_MLOCK) {
1853 if (stack_guard_page(vma, start))
1854 goto next_page;
1855 }
1856 if (foll_flags & FOLL_WRITE)
1857 fault_flags |= FAULT_FLAG_WRITE;
1858 if (nonblocking)
1859 fault_flags |= FAULT_FLAG_ALLOW_RETRY;
1860 if (foll_flags & FOLL_NOWAIT)
1861 fault_flags |= (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT);
1862
1863 ret = handle_mm_fault(mm, vma, start,
1864 fault_flags);
1865
1866 if (ret & VM_FAULT_ERROR) {
1867 if (ret & VM_FAULT_OOM)
1868 return i ? i : -ENOMEM;
1869 if (ret & (VM_FAULT_HWPOISON |
1870 VM_FAULT_HWPOISON_LARGE)) {
1871 if (i)
1872 return i;
1873 else if (gup_flags & FOLL_HWPOISON)
1874 return -EHWPOISON;
1875 else
1876 return -EFAULT;
1877 }
1878 if (ret & VM_FAULT_SIGBUS)
1879 goto efault;
1880 BUG();
1881 }
1882
1883 if (tsk) {
1884 if (ret & VM_FAULT_MAJOR)
1885 tsk->maj_flt++;
1886 else
1887 tsk->min_flt++;
1888 }
1889
1890 if (ret & VM_FAULT_RETRY) {
1891 if (nonblocking)
1892 *nonblocking = 0;
1893 return i;
1894 }
1895
1896 /*
1897 * The VM_FAULT_WRITE bit tells us that
1898 * do_wp_page has broken COW when necessary,
1899 * even if maybe_mkwrite decided not to set
1900 * pte_write. We can thus safely do subsequent
1901 * page lookups as if they were reads. But only
1902 * do so when looping for pte_write is futile:
1903 * in some cases userspace may also be wanting
1904 * to write to the gotten user page, which a
1905 * read fault here might prevent (a readonly
1906 * page might get reCOWed by userspace write).
1907 */
1908 if ((ret & VM_FAULT_WRITE) &&
1909 !(vma->vm_flags & VM_WRITE))
1910 foll_flags &= ~FOLL_WRITE;
1911
1912 cond_resched();
1913 }
1914 if (IS_ERR(page))
1915 return i ? i : PTR_ERR(page);
1916 if (pages) {
1917 pages[i] = page;
1918
1919 flush_anon_page(vma, page, start);
1920 flush_dcache_page(page);
1921 page_mask = 0;
1922 }
1923next_page:
1924 if (vmas) {
1925 vmas[i] = vma;
1926 page_mask = 0;
1927 }
1928 page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
1929 if (page_increm > nr_pages)
1930 page_increm = nr_pages;
1931 i += page_increm;
1932 start += page_increm * PAGE_SIZE;
1933 nr_pages -= page_increm;
1934 } while (nr_pages && start < vma->vm_end);
1935 } while (nr_pages);
1936 return i;
1937efault:
1938 return i ? : -EFAULT;
1939}
1940EXPORT_SYMBOL(__get_user_pages);
1941
1942/*
1943 * fixup_user_fault() - manually resolve a user page fault
1944 * @tsk: the task_struct to use for page fault accounting, or
1945 * NULL if faults are not to be recorded.
1946 * @mm: mm_struct of target mm
1947 * @address: user address
1948 * @fault_flags:flags to pass down to handle_mm_fault()
1949 *
1950 * This is meant to be called in the specific scenario where for locking reasons
1951 * we try to access user memory in atomic context (within a pagefault_disable()
1952 * section), this returns -EFAULT, and we want to resolve the user fault before
1953 * trying again.
1954 *
1955 * Typically this is meant to be used by the futex code.
1956 *
1957 * The main difference with get_user_pages() is that this function will
1958 * unconditionally call handle_mm_fault() which will in turn perform all the
1959 * necessary SW fixup of the dirty and young bits in the PTE, while
1960 * handle_mm_fault() only guarantees to update these in the struct page.
1961 *
1962 * This is important for some architectures where those bits also gate the
1963 * access permission to the page because they are maintained in software. On
1964 * such architectures, gup() will not be enough to make a subsequent access
1965 * succeed.
1966 *
1967 * This should be called with the mm_sem held for read.
1968 */
1969int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
1970 unsigned long address, unsigned int fault_flags)
1971{
1972 struct vm_area_struct *vma;
1973 vm_flags_t vm_flags;
1974 int ret;
1975
1976 vma = find_extend_vma(mm, address);
1977 if (!vma || address < vma->vm_start)
1978 return -EFAULT;
1979
1980 vm_flags = (fault_flags & FAULT_FLAG_WRITE) ? VM_WRITE : VM_READ;
1981 if (!(vm_flags & vma->vm_flags))
1982 return -EFAULT;
1983
1984 ret = handle_mm_fault(mm, vma, address, fault_flags);
1985 if (ret & VM_FAULT_ERROR) {
1986 if (ret & VM_FAULT_OOM)
1987 return -ENOMEM;
1988 if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
1989 return -EHWPOISON;
1990 if (ret & VM_FAULT_SIGBUS)
1991 return -EFAULT;
1992 BUG();
1993 }
1994 if (tsk) {
1995 if (ret & VM_FAULT_MAJOR)
1996 tsk->maj_flt++;
1997 else
1998 tsk->min_flt++;
1999 }
2000 return 0;
2001}
2002
2003/*
2004 * get_user_pages() - pin user pages in memory
2005 * @tsk: the task_struct to use for page fault accounting, or
2006 * NULL if faults are not to be recorded.
2007 * @mm: mm_struct of target mm
2008 * @start: starting user address
2009 * @nr_pages: number of pages from start to pin
2010 * @write: whether pages will be written to by the caller
2011 * @force: whether to force access even when user mapping is currently
2012 * protected (but never forces write access to shared mapping).
2013 * @pages: array that receives pointers to the pages pinned.
2014 * Should be at least nr_pages long. Or NULL, if caller
2015 * only intends to ensure the pages are faulted in.
2016 * @vmas: array of pointers to vmas corresponding to each page.
2017 * Or NULL if the caller does not require them.
2018 *
2019 * Returns number of pages pinned. This may be fewer than the number
2020 * requested. If nr_pages is 0 or negative, returns 0. If no pages
2021 * were pinned, returns -errno. Each page returned must be released
2022 * with a put_page() call when it is finished with. vmas will only
2023 * remain valid while mmap_sem is held.
2024 *
2025 * Must be called with mmap_sem held for read or write.
2026 *
2027 * get_user_pages walks a process's page tables and takes a reference to
2028 * each struct page that each user address corresponds to at a given
2029 * instant. That is, it takes the page that would be accessed if a user
2030 * thread accesses the given user virtual address at that instant.
2031 *
2032 * This does not guarantee that the page exists in the user mappings when
2033 * get_user_pages returns, and there may even be a completely different
2034 * page there in some cases (eg. if mmapped pagecache has been invalidated
2035 * and subsequently re faulted). However it does guarantee that the page
2036 * won't be freed completely. And mostly callers simply care that the page
2037 * contains data that was valid *at some point in time*. Typically, an IO
2038 * or similar operation cannot guarantee anything stronger anyway because
2039 * locks can't be held over the syscall boundary.
2040 *
2041 * If write=0, the page must not be written to. If the page is written to,
2042 * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called
2043 * after the page is finished with, and before put_page is called.
2044 *
2045 * get_user_pages is typically used for fewer-copy IO operations, to get a
2046 * handle on the memory by some means other than accesses via the user virtual
2047 * addresses. The pages may be submitted for DMA to devices or accessed via
2048 * their kernel linear mapping (via the kmap APIs). Care should be taken to
2049 * use the correct cache flushing APIs.
2050 *
2051 * See also get_user_pages_fast, for performance critical applications.
2052 */
2053long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
2054 unsigned long start, unsigned long nr_pages, int write,
2055 int force, struct page **pages, struct vm_area_struct **vmas)
2056{
2057 int flags = FOLL_TOUCH;
2058
2059 if (pages)
2060 flags |= FOLL_GET;
2061 if (write)
2062 flags |= FOLL_WRITE;
2063 if (force)
2064 flags |= FOLL_FORCE;
2065
2066 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
2067 NULL);
2068}
2069EXPORT_SYMBOL(get_user_pages);
2070
2071/**
2072 * get_dump_page() - pin user page in memory while writing it to core dump
2073 * @addr: user address
2074 *
2075 * Returns struct page pointer of user page pinned for dump,
2076 * to be freed afterwards by page_cache_release() or put_page().
2077 *
2078 * Returns NULL on any kind of failure - a hole must then be inserted into
2079 * the corefile, to preserve alignment with its headers; and also returns
2080 * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
2081 * allowing a hole to be left in the corefile to save diskspace.
2082 *
2083 * Called without mmap_sem, but after all other threads have been killed.
2084 */
2085#ifdef CONFIG_ELF_CORE
2086struct page *get_dump_page(unsigned long addr)
2087{
2088 struct vm_area_struct *vma;
2089 struct page *page;
2090
2091 if (__get_user_pages(current, current->mm, addr, 1,
2092 FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
2093 NULL) < 1)
2094 return NULL;
2095 flush_cache_page(vma, addr, page_to_pfn(page));
2096 return page;
2097}
2098#endif /* CONFIG_ELF_CORE */
2099
2100pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, 1456pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
2101 spinlock_t **ptl) 1457 spinlock_t **ptl)
2102{ 1458{
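As a rough illustration of the contract spelled out in the kernel-doc removed above (this series moves the GUP helpers out of mm/memory.c), a hypothetical driver-style caller would look roughly like the sketch below; pin_user_buffer()/release_user_buffer() are invented names, not part of the patch.

/*
 * Hypothetical caller, for illustration only: the documented contract is
 * that mmap_sem is held for read across get_user_pages(), every pinned
 * page is released with put_page(), and pages that were written to get
 * set_page_dirty_lock() before release.
 */
static long pin_user_buffer(unsigned long uaddr, unsigned long nr_pages,
                            int write, struct page **pages)
{
        long pinned;

        down_read(&current->mm->mmap_sem);
        pinned = get_user_pages(current, current->mm, uaddr, nr_pages,
                                write, 0, pages, NULL);
        up_read(&current->mm->mmap_sem);

        return pinned;  /* number of pages actually pinned, or -errno */
}

static void release_user_buffer(struct page **pages, long pinned, int dirty)
{
        long i;

        for (i = 0; i < pinned; i++) {
                if (dirty)
                        set_page_dirty_lock(pages[i]);
                put_page(pages[i]);
        }
}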
@@ -3402,65 +2758,76 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address,
3402 update_mmu_cache(vma, address, pte); 2758 update_mmu_cache(vma, address, pte);
3403} 2759}
3404 2760
3405#define FAULT_AROUND_ORDER 4 2761static unsigned long fault_around_bytes = 65536;
2762
2763/*
2764 * fault_around_pages() and fault_around_mask() round down fault_around_bytes
2765 * to nearest page order. It's what do_fault_around() expects to see.
2766 */
2767static inline unsigned long fault_around_pages(void)
2768{
2769 return rounddown_pow_of_two(fault_around_bytes) / PAGE_SIZE;
2770}
2771
2772static inline unsigned long fault_around_mask(void)
2773{
2774 return ~(rounddown_pow_of_two(fault_around_bytes) - 1) & PAGE_MASK;
2775}
3406 2776
3407#ifdef CONFIG_DEBUG_FS
3408static unsigned int fault_around_order = FAULT_AROUND_ORDER;
3409 2777
3410static int fault_around_order_get(void *data, u64 *val) 2778#ifdef CONFIG_DEBUG_FS
2779static int fault_around_bytes_get(void *data, u64 *val)
3411{ 2780{
3412 *val = fault_around_order; 2781 *val = fault_around_bytes;
3413 return 0; 2782 return 0;
3414} 2783}
3415 2784
3416static int fault_around_order_set(void *data, u64 val) 2785static int fault_around_bytes_set(void *data, u64 val)
3417{ 2786{
3418 BUILD_BUG_ON((1UL << FAULT_AROUND_ORDER) > PTRS_PER_PTE); 2787 if (val / PAGE_SIZE > PTRS_PER_PTE)
3419 if (1UL << val > PTRS_PER_PTE)
3420 return -EINVAL; 2788 return -EINVAL;
3421 fault_around_order = val; 2789 fault_around_bytes = val;
3422 return 0; 2790 return 0;
3423} 2791}
3424DEFINE_SIMPLE_ATTRIBUTE(fault_around_order_fops, 2792DEFINE_SIMPLE_ATTRIBUTE(fault_around_bytes_fops,
3425 fault_around_order_get, fault_around_order_set, "%llu\n"); 2793 fault_around_bytes_get, fault_around_bytes_set, "%llu\n");
3426 2794
3427static int __init fault_around_debugfs(void) 2795static int __init fault_around_debugfs(void)
3428{ 2796{
3429 void *ret; 2797 void *ret;
3430 2798
3431 ret = debugfs_create_file("fault_around_order", 0644, NULL, NULL, 2799 ret = debugfs_create_file("fault_around_bytes", 0644, NULL, NULL,
3432 &fault_around_order_fops); 2800 &fault_around_bytes_fops);
3433 if (!ret) 2801 if (!ret)
3434 pr_warn("Failed to create fault_around_order in debugfs"); 2802 pr_warn("Failed to create fault_around_bytes in debugfs");
3435 return 0; 2803 return 0;
3436} 2804}
3437late_initcall(fault_around_debugfs); 2805late_initcall(fault_around_debugfs);
3438
3439static inline unsigned long fault_around_pages(void)
3440{
3441 return 1UL << fault_around_order;
3442}
3443
3444static inline unsigned long fault_around_mask(void)
3445{
3446 return ~((1UL << (PAGE_SHIFT + fault_around_order)) - 1);
3447}
3448#else
3449static inline unsigned long fault_around_pages(void)
3450{
3451 unsigned long nr_pages;
3452
3453 nr_pages = 1UL << FAULT_AROUND_ORDER;
3454 BUILD_BUG_ON(nr_pages > PTRS_PER_PTE);
3455 return nr_pages;
3456}
3457
3458static inline unsigned long fault_around_mask(void)
3459{
3460 return ~((1UL << (PAGE_SHIFT + FAULT_AROUND_ORDER)) - 1);
3461}
3462#endif 2806#endif
3463 2807
2808/*
 2809 * do_fault_around() tries to map a few pages around the fault address. The hope
2810 * is that the pages will be needed soon and this will lower the number of
2811 * faults to handle.
2812 *
2813 * It uses vm_ops->map_pages() to map the pages, which skips the page if it's
2814 * not ready to be mapped: not up-to-date, locked, etc.
2815 *
2816 * This function is called with the page table lock taken. In the split ptlock
 2817 * case the page table lock protects only those entries which belong to
2818 * the page table corresponding to the fault address.
2819 *
2820 * This function doesn't cross the VMA boundaries, in order to call map_pages()
2821 * only once.
2822 *
2823 * fault_around_pages() defines how many pages we'll try to map.
2824 * do_fault_around() expects it to return a power of two less than or equal to
2825 * PTRS_PER_PTE.
2826 *
2827 * The virtual address of the area that we map is naturally aligned to the
2828 * fault_around_pages() value (and therefore to page order). This way it's
2829 * easier to guarantee that we don't cross page table boundaries.
2830 */
3464static void do_fault_around(struct vm_area_struct *vma, unsigned long address, 2831static void do_fault_around(struct vm_area_struct *vma, unsigned long address,
3465 pte_t *pte, pgoff_t pgoff, unsigned int flags) 2832 pte_t *pte, pgoff_t pgoff, unsigned int flags)
3466{ 2833{
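To see what the new byte-based knob works out to, here is a throwaway userspace check (rounddown_pow_of_two() is re-coded here only so the sketch compiles on its own): with 4K pages the default of 65536 bytes gives a 16-page, 64K-aligned window, and a debugfs value that is not a power of two is effectively rounded down.

#include <stdio.h>

#define PAGE_SHIFT      12
#define PAGE_SIZE       (1UL << PAGE_SHIFT)
#define PAGE_MASK       (~(PAGE_SIZE - 1))

/* userspace stand-in for the kernel helper of the same name */
static unsigned long rounddown_pow_of_two(unsigned long n)
{
        while (n & (n - 1))     /* clear low bits until a power of two remains */
                n &= n - 1;
        return n;
}

int main(void)
{
        unsigned long fault_around_bytes = 65536;       /* the new default */
        unsigned long pages = rounddown_pow_of_two(fault_around_bytes) / PAGE_SIZE;
        unsigned long mask  = ~(rounddown_pow_of_two(fault_around_bytes) - 1) & PAGE_MASK;

        /* prints: pages=16 mask=0xffffffffffff0000 on 64-bit */
        printf("pages=%lu mask=%#lx\n", pages, mask);
        return 0;
}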
@@ -3476,7 +2843,7 @@ static void do_fault_around(struct vm_area_struct *vma, unsigned long address,
3476 2843
3477 /* 2844 /*
3478 * max_pgoff is either end of page table or end of vma 2845 * max_pgoff is either end of page table or end of vma
3479 * or fault_around_pages() from pgoff, depending what is neast. 2846 * or fault_around_pages() from pgoff, depending what is nearest.
3480 */ 2847 */
3481 max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + 2848 max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
3482 PTRS_PER_PTE - 1; 2849 PTRS_PER_PTE - 1;
@@ -3515,7 +2882,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3515 * if page by the offset is not ready to be mapped (cold cache or 2882 * if page by the offset is not ready to be mapped (cold cache or
3516 * something). 2883 * something).
3517 */ 2884 */
3518 if (vma->vm_ops->map_pages) { 2885 if (vma->vm_ops->map_pages && fault_around_pages() > 1) {
3519 pte = pte_offset_map_lock(mm, pmd, address, &ptl); 2886 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
3520 do_fault_around(vma, address, pte, pgoff, flags); 2887 do_fault_around(vma, address, pte, pgoff, flags);
3521 if (!pte_same(*pte, orig_pte)) 2888 if (!pte_same(*pte, orig_pte))
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index a650db29606f..469bbf505f85 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -46,19 +46,84 @@
46static void generic_online_page(struct page *page); 46static void generic_online_page(struct page *page);
47 47
48static online_page_callback_t online_page_callback = generic_online_page; 48static online_page_callback_t online_page_callback = generic_online_page;
49static DEFINE_MUTEX(online_page_callback_lock);
49 50
50DEFINE_MUTEX(mem_hotplug_mutex); 51/* The same as the cpu_hotplug lock, but for memory hotplug. */
52static struct {
53 struct task_struct *active_writer;
54 struct mutex lock; /* Synchronizes accesses to refcount, */
55 /*
56 * Also blocks the new readers during
57 * an ongoing mem hotplug operation.
58 */
59 int refcount;
60
61#ifdef CONFIG_DEBUG_LOCK_ALLOC
62 struct lockdep_map dep_map;
63#endif
64} mem_hotplug = {
65 .active_writer = NULL,
66 .lock = __MUTEX_INITIALIZER(mem_hotplug.lock),
67 .refcount = 0,
68#ifdef CONFIG_DEBUG_LOCK_ALLOC
69 .dep_map = {.name = "mem_hotplug.lock" },
70#endif
71};
72
73/* Lockdep annotations for get/put_online_mems() and mem_hotplug_begin/end() */
74#define memhp_lock_acquire_read() lock_map_acquire_read(&mem_hotplug.dep_map)
75#define memhp_lock_acquire() lock_map_acquire(&mem_hotplug.dep_map)
76#define memhp_lock_release() lock_map_release(&mem_hotplug.dep_map)
77
78void get_online_mems(void)
79{
80 might_sleep();
81 if (mem_hotplug.active_writer == current)
82 return;
83 memhp_lock_acquire_read();
84 mutex_lock(&mem_hotplug.lock);
85 mem_hotplug.refcount++;
86 mutex_unlock(&mem_hotplug.lock);
87
88}
51 89
52void lock_memory_hotplug(void) 90void put_online_mems(void)
53{ 91{
54 mutex_lock(&mem_hotplug_mutex); 92 if (mem_hotplug.active_writer == current)
93 return;
94 mutex_lock(&mem_hotplug.lock);
95
96 if (WARN_ON(!mem_hotplug.refcount))
97 mem_hotplug.refcount++; /* try to fix things up */
98
99 if (!--mem_hotplug.refcount && unlikely(mem_hotplug.active_writer))
100 wake_up_process(mem_hotplug.active_writer);
101 mutex_unlock(&mem_hotplug.lock);
102 memhp_lock_release();
103
55} 104}
56 105
57void unlock_memory_hotplug(void) 106static void mem_hotplug_begin(void)
58{ 107{
59 mutex_unlock(&mem_hotplug_mutex); 108 mem_hotplug.active_writer = current;
109
110 memhp_lock_acquire();
111 for (;;) {
112 mutex_lock(&mem_hotplug.lock);
113 if (likely(!mem_hotplug.refcount))
114 break;
115 __set_current_state(TASK_UNINTERRUPTIBLE);
116 mutex_unlock(&mem_hotplug.lock);
117 schedule();
118 }
60} 119}
61 120
121static void mem_hotplug_done(void)
122{
123 mem_hotplug.active_writer = NULL;
124 mutex_unlock(&mem_hotplug.lock);
125 memhp_lock_release();
126}
62 127
63/* add this memory to iomem resource */ 128/* add this memory to iomem resource */
64static struct resource *register_memory_resource(u64 start, u64 size) 129static struct resource *register_memory_resource(u64 start, u64 size)
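A minimal reader-side sketch of the new pair (hypothetical caller, not part of the patch): code that must not race with hot-add/remove brackets itself with get_online_mems()/put_online_mems(), and mem_hotplug_begin() waits for the reader count to drain, mirroring get_online_cpus().

/*
 * Hypothetical reader: while the section is held, the memory layout cannot
 * change underneath us, because hotplug writers sleep in mem_hotplug_begin()
 * until the refcount drops to zero.
 */
static void inspect_pfn_range(unsigned long start_pfn, unsigned long nr_pages)
{
        unsigned long pfn;

        get_online_mems();
        for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
                if (!pfn_valid(pfn))
                        continue;
                /* safe to look at pfn_to_page(pfn) here */
        }
        put_online_mems();
}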
@@ -727,14 +792,16 @@ int set_online_page_callback(online_page_callback_t callback)
727{ 792{
728 int rc = -EINVAL; 793 int rc = -EINVAL;
729 794
730 lock_memory_hotplug(); 795 get_online_mems();
796 mutex_lock(&online_page_callback_lock);
731 797
732 if (online_page_callback == generic_online_page) { 798 if (online_page_callback == generic_online_page) {
733 online_page_callback = callback; 799 online_page_callback = callback;
734 rc = 0; 800 rc = 0;
735 } 801 }
736 802
737 unlock_memory_hotplug(); 803 mutex_unlock(&online_page_callback_lock);
804 put_online_mems();
738 805
739 return rc; 806 return rc;
740} 807}
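The callers this path serves are ballooning drivers that override the default online-page handler; a hedged sketch with invented demo_* names:

/*
 * Hypothetical balloon-style driver: set_online_page_callback() swaps in a
 * custom handler for freshly onlined pages. The callback pointer itself is
 * now serialized by online_page_callback_lock, while get_online_mems() only
 * keeps the hotplug state stable around the update.
 */
static void demo_online_page(struct page *page)
{
        __online_page_set_limits(page);
        __online_page_increment_counters(page);
        __online_page_free(page);
}

static int __init demo_init(void)
{
        return set_online_page_callback(&demo_online_page);
}

static void __exit demo_exit(void)
{
        restore_online_page_callback(&demo_online_page);
}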
@@ -744,14 +811,16 @@ int restore_online_page_callback(online_page_callback_t callback)
744{ 811{
745 int rc = -EINVAL; 812 int rc = -EINVAL;
746 813
747 lock_memory_hotplug(); 814 get_online_mems();
815 mutex_lock(&online_page_callback_lock);
748 816
749 if (online_page_callback == callback) { 817 if (online_page_callback == callback) {
750 online_page_callback = generic_online_page; 818 online_page_callback = generic_online_page;
751 rc = 0; 819 rc = 0;
752 } 820 }
753 821
754 unlock_memory_hotplug(); 822 mutex_unlock(&online_page_callback_lock);
823 put_online_mems();
755 824
756 return rc; 825 return rc;
757} 826}
@@ -899,7 +968,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
899 int ret; 968 int ret;
900 struct memory_notify arg; 969 struct memory_notify arg;
901 970
902 lock_memory_hotplug(); 971 mem_hotplug_begin();
903 /* 972 /*
904 * This doesn't need a lock to do pfn_to_page(). 973 * This doesn't need a lock to do pfn_to_page().
905 * The section can't be removed here because of the 974 * The section can't be removed here because of the
@@ -907,23 +976,18 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
907 */ 976 */
908 zone = page_zone(pfn_to_page(pfn)); 977 zone = page_zone(pfn_to_page(pfn));
909 978
979 ret = -EINVAL;
910 if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) && 980 if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) &&
911 !can_online_high_movable(zone)) { 981 !can_online_high_movable(zone))
912 unlock_memory_hotplug(); 982 goto out;
913 return -EINVAL;
914 }
915 983
916 if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) { 984 if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) {
917 if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) { 985 if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages))
918 unlock_memory_hotplug(); 986 goto out;
919 return -EINVAL;
920 }
921 } 987 }
922 if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) { 988 if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) {
923 if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) { 989 if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages))
924 unlock_memory_hotplug(); 990 goto out;
925 return -EINVAL;
926 }
927 } 991 }
928 992
 929 /* Previous code may have changed the zone of the pfn range */ 993 /* Previous code may have changed the zone of the pfn range */
@@ -939,8 +1003,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
939 ret = notifier_to_errno(ret); 1003 ret = notifier_to_errno(ret);
940 if (ret) { 1004 if (ret) {
941 memory_notify(MEM_CANCEL_ONLINE, &arg); 1005 memory_notify(MEM_CANCEL_ONLINE, &arg);
942 unlock_memory_hotplug(); 1006 goto out;
943 return ret;
944 } 1007 }
945 /* 1008 /*
946 * If this zone is not populated, then it is not in zonelist. 1009 * If this zone is not populated, then it is not in zonelist.
@@ -964,8 +1027,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
964 (((unsigned long long) pfn + nr_pages) 1027 (((unsigned long long) pfn + nr_pages)
965 << PAGE_SHIFT) - 1); 1028 << PAGE_SHIFT) - 1);
966 memory_notify(MEM_CANCEL_ONLINE, &arg); 1029 memory_notify(MEM_CANCEL_ONLINE, &arg);
967 unlock_memory_hotplug(); 1030 goto out;
968 return ret;
969 } 1031 }
970 1032
971 zone->present_pages += onlined_pages; 1033 zone->present_pages += onlined_pages;
@@ -995,9 +1057,9 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
995 1057
996 if (onlined_pages) 1058 if (onlined_pages)
997 memory_notify(MEM_ONLINE, &arg); 1059 memory_notify(MEM_ONLINE, &arg);
998 unlock_memory_hotplug(); 1060out:
999 1061 mem_hotplug_done();
1000 return 0; 1062 return ret;
1001} 1063}
1002#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ 1064#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
1003 1065
@@ -1007,7 +1069,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
1007 struct pglist_data *pgdat; 1069 struct pglist_data *pgdat;
1008 unsigned long zones_size[MAX_NR_ZONES] = {0}; 1070 unsigned long zones_size[MAX_NR_ZONES] = {0};
1009 unsigned long zholes_size[MAX_NR_ZONES] = {0}; 1071 unsigned long zholes_size[MAX_NR_ZONES] = {0};
1010 unsigned long start_pfn = start >> PAGE_SHIFT; 1072 unsigned long start_pfn = PFN_DOWN(start);
1011 1073
1012 pgdat = NODE_DATA(nid); 1074 pgdat = NODE_DATA(nid);
1013 if (!pgdat) { 1075 if (!pgdat) {
@@ -1055,7 +1117,7 @@ int try_online_node(int nid)
1055 if (node_online(nid)) 1117 if (node_online(nid))
1056 return 0; 1118 return 0;
1057 1119
1058 lock_memory_hotplug(); 1120 mem_hotplug_begin();
1059 pgdat = hotadd_new_pgdat(nid, 0); 1121 pgdat = hotadd_new_pgdat(nid, 0);
1060 if (!pgdat) { 1122 if (!pgdat) {
1061 pr_err("Cannot online node %d due to NULL pgdat\n", nid); 1123 pr_err("Cannot online node %d due to NULL pgdat\n", nid);
@@ -1073,13 +1135,13 @@ int try_online_node(int nid)
1073 } 1135 }
1074 1136
1075out: 1137out:
1076 unlock_memory_hotplug(); 1138 mem_hotplug_done();
1077 return ret; 1139 return ret;
1078} 1140}
1079 1141
1080static int check_hotplug_memory_range(u64 start, u64 size) 1142static int check_hotplug_memory_range(u64 start, u64 size)
1081{ 1143{
1082 u64 start_pfn = start >> PAGE_SHIFT; 1144 u64 start_pfn = PFN_DOWN(start);
1083 u64 nr_pages = size >> PAGE_SHIFT; 1145 u64 nr_pages = size >> PAGE_SHIFT;
1084 1146
1085 /* Memory range must be aligned with section */ 1147 /* Memory range must be aligned with section */
@@ -1117,7 +1179,7 @@ int __ref add_memory(int nid, u64 start, u64 size)
1117 new_pgdat = !p; 1179 new_pgdat = !p;
1118 } 1180 }
1119 1181
1120 lock_memory_hotplug(); 1182 mem_hotplug_begin();
1121 1183
1122 new_node = !node_online(nid); 1184 new_node = !node_online(nid);
1123 if (new_node) { 1185 if (new_node) {
@@ -1158,7 +1220,7 @@ error:
1158 release_memory_resource(res); 1220 release_memory_resource(res);
1159 1221
1160out: 1222out:
1161 unlock_memory_hotplug(); 1223 mem_hotplug_done();
1162 return ret; 1224 return ret;
1163} 1225}
1164EXPORT_SYMBOL_GPL(add_memory); 1226EXPORT_SYMBOL_GPL(add_memory);
@@ -1332,7 +1394,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
1332 * alloc_migrate_target should be improooooved!! 1394 * alloc_migrate_target should be improooooved!!
1333 * migrate_pages returns # of failed pages. 1395 * migrate_pages returns # of failed pages.
1334 */ 1396 */
1335 ret = migrate_pages(&source, alloc_migrate_target, 0, 1397 ret = migrate_pages(&source, alloc_migrate_target, NULL, 0,
1336 MIGRATE_SYNC, MR_MEMORY_HOTPLUG); 1398 MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
1337 if (ret) 1399 if (ret)
1338 putback_movable_pages(&source); 1400 putback_movable_pages(&source);
@@ -1565,7 +1627,7 @@ static int __ref __offline_pages(unsigned long start_pfn,
1565 if (!test_pages_in_a_zone(start_pfn, end_pfn)) 1627 if (!test_pages_in_a_zone(start_pfn, end_pfn))
1566 return -EINVAL; 1628 return -EINVAL;
1567 1629
1568 lock_memory_hotplug(); 1630 mem_hotplug_begin();
1569 1631
1570 zone = page_zone(pfn_to_page(start_pfn)); 1632 zone = page_zone(pfn_to_page(start_pfn));
1571 node = zone_to_nid(zone); 1633 node = zone_to_nid(zone);
@@ -1672,7 +1734,7 @@ repeat:
1672 writeback_set_ratelimit(); 1734 writeback_set_ratelimit();
1673 1735
1674 memory_notify(MEM_OFFLINE, &arg); 1736 memory_notify(MEM_OFFLINE, &arg);
1675 unlock_memory_hotplug(); 1737 mem_hotplug_done();
1676 return 0; 1738 return 0;
1677 1739
1678failed_removal: 1740failed_removal:
@@ -1684,7 +1746,7 @@ failed_removal:
1684 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); 1746 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
1685 1747
1686out: 1748out:
1687 unlock_memory_hotplug(); 1749 mem_hotplug_done();
1688 return ret; 1750 return ret;
1689} 1751}
1690 1752
@@ -1888,7 +1950,7 @@ void __ref remove_memory(int nid, u64 start, u64 size)
1888 1950
1889 BUG_ON(check_hotplug_memory_range(start, size)); 1951 BUG_ON(check_hotplug_memory_range(start, size));
1890 1952
1891 lock_memory_hotplug(); 1953 mem_hotplug_begin();
1892 1954
1893 /* 1955 /*
1894 * All memory blocks must be offlined before removing memory. Check 1956 * All memory blocks must be offlined before removing memory. Check
@@ -1897,10 +1959,8 @@ void __ref remove_memory(int nid, u64 start, u64 size)
1897 */ 1959 */
1898 ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL, 1960 ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL,
1899 check_memblock_offlined_cb); 1961 check_memblock_offlined_cb);
1900 if (ret) { 1962 if (ret)
1901 unlock_memory_hotplug();
1902 BUG(); 1963 BUG();
1903 }
1904 1964
1905 /* remove memmap entry */ 1965 /* remove memmap entry */
1906 firmware_map_remove(start, start + size, "System RAM"); 1966 firmware_map_remove(start, start + size, "System RAM");
@@ -1909,7 +1969,7 @@ void __ref remove_memory(int nid, u64 start, u64 size)
1909 1969
1910 try_offline_node(nid); 1970 try_offline_node(nid);
1911 1971
1912 unlock_memory_hotplug(); 1972 mem_hotplug_done();
1913} 1973}
1914EXPORT_SYMBOL_GPL(remove_memory); 1974EXPORT_SYMBOL_GPL(remove_memory);
1915#endif /* CONFIG_MEMORY_HOTREMOVE */ 1975#endif /* CONFIG_MEMORY_HOTREMOVE */
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 78e1472933ea..16bc9fa42998 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1028,7 +1028,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1028 flags | MPOL_MF_DISCONTIG_OK, &pagelist); 1028 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1029 1029
1030 if (!list_empty(&pagelist)) { 1030 if (!list_empty(&pagelist)) {
1031 err = migrate_pages(&pagelist, new_node_page, dest, 1031 err = migrate_pages(&pagelist, new_node_page, NULL, dest,
1032 MIGRATE_SYNC, MR_SYSCALL); 1032 MIGRATE_SYNC, MR_SYSCALL);
1033 if (err) 1033 if (err)
1034 putback_movable_pages(&pagelist); 1034 putback_movable_pages(&pagelist);
@@ -1277,7 +1277,7 @@ static long do_mbind(unsigned long start, unsigned long len,
1277 if (!list_empty(&pagelist)) { 1277 if (!list_empty(&pagelist)) {
1278 WARN_ON_ONCE(flags & MPOL_MF_LAZY); 1278 WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1279 nr_failed = migrate_pages(&pagelist, new_vma_page, 1279 nr_failed = migrate_pages(&pagelist, new_vma_page,
1280 (unsigned long)vma, 1280 NULL, (unsigned long)vma,
1281 MIGRATE_SYNC, MR_MEMPOLICY_MBIND); 1281 MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1282 if (nr_failed) 1282 if (nr_failed)
1283 putback_movable_pages(&pagelist); 1283 putback_movable_pages(&pagelist);
@@ -1362,7 +1362,7 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1362} 1362}
1363 1363
1364SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, 1364SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1365 unsigned long, mode, unsigned long __user *, nmask, 1365 unsigned long, mode, const unsigned long __user *, nmask,
1366 unsigned long, maxnode, unsigned, flags) 1366 unsigned long, maxnode, unsigned, flags)
1367{ 1367{
1368 nodemask_t nodes; 1368 nodemask_t nodes;
@@ -1383,7 +1383,7 @@ SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1383} 1383}
1384 1384
1385/* Set the process memory policy */ 1385/* Set the process memory policy */
1386SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask, 1386SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1387 unsigned long, maxnode) 1387 unsigned long, maxnode)
1388{ 1388{
1389 int err; 1389 int err;
@@ -1606,9 +1606,9 @@ COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1606 1606
1607/* 1607/*
1608 * get_vma_policy(@task, @vma, @addr) 1608 * get_vma_policy(@task, @vma, @addr)
1609 * @task - task for fallback if vma policy == default 1609 * @task: task for fallback if vma policy == default
1610 * @vma - virtual memory area whose policy is sought 1610 * @vma: virtual memory area whose policy is sought
1611 * @addr - address in @vma for shared policy lookup 1611 * @addr: address in @vma for shared policy lookup
1612 * 1612 *
1613 * Returns effective policy for a VMA at specified address. 1613 * Returns effective policy for a VMA at specified address.
1614 * Falls back to @task or system default policy, as necessary. 1614 * Falls back to @task or system default policy, as necessary.
@@ -1854,11 +1854,11 @@ int node_random(const nodemask_t *maskp)
1854#ifdef CONFIG_HUGETLBFS 1854#ifdef CONFIG_HUGETLBFS
1855/* 1855/*
1856 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol) 1856 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1857 * @vma = virtual memory area whose policy is sought 1857 * @vma: virtual memory area whose policy is sought
1858 * @addr = address in @vma for shared policy lookup and interleave policy 1858 * @addr: address in @vma for shared policy lookup and interleave policy
1859 * @gfp_flags = for requested zone 1859 * @gfp_flags: for requested zone
1860 * @mpol = pointer to mempolicy pointer for reference counted mempolicy 1860 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
1861 * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask 1861 * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
1862 * 1862 *
1863 * Returns a zonelist suitable for a huge page allocation and a pointer 1863 * Returns a zonelist suitable for a huge page allocation and a pointer
1864 * to the struct mempolicy for conditional unref after allocation. 1864 * to the struct mempolicy for conditional unref after allocation.
@@ -2270,9 +2270,9 @@ static void sp_free(struct sp_node *n)
2270/** 2270/**
2271 * mpol_misplaced - check whether current page node is valid in policy 2271 * mpol_misplaced - check whether current page node is valid in policy
2272 * 2272 *
2273 * @page - page to be checked 2273 * @page: page to be checked
2274 * @vma - vm area where page mapped 2274 * @vma: vm area where page mapped
2275 * @addr - virtual address where page mapped 2275 * @addr: virtual address where page mapped
2276 * 2276 *
2277 * Lookup current policy node id for vma,addr and "compare to" page's 2277 * Lookup current policy node id for vma,addr and "compare to" page's
2278 * node id. 2278 * node id.
diff --git a/mm/mempool.c b/mm/mempool.c
index 905434f18c97..455d468c3a5d 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -192,6 +192,7 @@ EXPORT_SYMBOL(mempool_resize);
192 * returns NULL. Note that due to preallocation, this function 192 * returns NULL. Note that due to preallocation, this function
193 * *never* fails when called from process contexts. (it might 193 * *never* fails when called from process contexts. (it might
194 * fail if called from an IRQ context.) 194 * fail if called from an IRQ context.)
195 * Note: using __GFP_ZERO is not supported.
195 */ 196 */
196void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask) 197void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
197{ 198{
@@ -200,6 +201,7 @@ void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
200 wait_queue_t wait; 201 wait_queue_t wait;
201 gfp_t gfp_temp; 202 gfp_t gfp_temp;
202 203
204 VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO);
203 might_sleep_if(gfp_mask & __GFP_WAIT); 205 might_sleep_if(gfp_mask & __GFP_WAIT);
204 206
205 gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */ 207 gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */
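A preallocated element handed back by the pool never passes through the underlying allocator, so __GFP_ZERO cannot be honoured; that is what the new warning catches. A hypothetical caller that needs zeroed elements would clear them itself, roughly:

/*
 * Illustrative only (struct and function names invented): strip __GFP_ZERO
 * before calling mempool_alloc() and zero the element explicitly.
 */
struct io_ctx {
        int state;
        void *buf;
};

static struct io_ctx *io_ctx_alloc(mempool_t *pool, gfp_t gfp)
{
        struct io_ctx *ctx = mempool_alloc(pool, gfp & ~__GFP_ZERO);

        if (ctx)
                memset(ctx, 0, sizeof(*ctx));   /* zero by hand instead */
        return ctx;
}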
diff --git a/mm/migrate.c b/mm/migrate.c
index bed48809e5d0..63f0cd559999 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -938,8 +938,9 @@ out:
938 * Obtain the lock on page, remove all ptes and migrate the page 938 * Obtain the lock on page, remove all ptes and migrate the page
939 * to the newly allocated page in newpage. 939 * to the newly allocated page in newpage.
940 */ 940 */
941static int unmap_and_move(new_page_t get_new_page, unsigned long private, 941static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page,
942 struct page *page, int force, enum migrate_mode mode) 942 unsigned long private, struct page *page, int force,
943 enum migrate_mode mode)
943{ 944{
944 int rc = 0; 945 int rc = 0;
945 int *result = NULL; 946 int *result = NULL;
@@ -983,11 +984,17 @@ out:
983 page_is_file_cache(page)); 984 page_is_file_cache(page));
984 putback_lru_page(page); 985 putback_lru_page(page);
985 } 986 }
987
986 /* 988 /*
987 * Move the new page to the LRU. If migration was not successful 989 * If migration was not successful and there's a freeing callback, use
988 * then this will free the page. 990 * it. Otherwise, putback_lru_page() will drop the reference grabbed
991 * during isolation.
989 */ 992 */
990 putback_lru_page(newpage); 993 if (rc != MIGRATEPAGE_SUCCESS && put_new_page)
994 put_new_page(newpage, private);
995 else
996 putback_lru_page(newpage);
997
991 if (result) { 998 if (result) {
992 if (rc) 999 if (rc)
993 *result = rc; 1000 *result = rc;
@@ -1016,8 +1023,9 @@ out:
1016 * will wait in the page fault for migration to complete. 1023 * will wait in the page fault for migration to complete.
1017 */ 1024 */
1018static int unmap_and_move_huge_page(new_page_t get_new_page, 1025static int unmap_and_move_huge_page(new_page_t get_new_page,
1019 unsigned long private, struct page *hpage, 1026 free_page_t put_new_page, unsigned long private,
1020 int force, enum migrate_mode mode) 1027 struct page *hpage, int force,
1028 enum migrate_mode mode)
1021{ 1029{
1022 int rc = 0; 1030 int rc = 0;
1023 int *result = NULL; 1031 int *result = NULL;
@@ -1031,7 +1039,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
1031 * tables or check whether the hugepage is pmd-based or not before 1039 * tables or check whether the hugepage is pmd-based or not before
1032 * kicking migration. 1040 * kicking migration.
1033 */ 1041 */
1034 if (!hugepage_migration_support(page_hstate(hpage))) { 1042 if (!hugepage_migration_supported(page_hstate(hpage))) {
1035 putback_active_hugepage(hpage); 1043 putback_active_hugepage(hpage);
1036 return -ENOSYS; 1044 return -ENOSYS;
1037 } 1045 }
@@ -1056,20 +1064,30 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
1056 if (!page_mapped(hpage)) 1064 if (!page_mapped(hpage))
1057 rc = move_to_new_page(new_hpage, hpage, 1, mode); 1065 rc = move_to_new_page(new_hpage, hpage, 1, mode);
1058 1066
1059 if (rc) 1067 if (rc != MIGRATEPAGE_SUCCESS)
1060 remove_migration_ptes(hpage, hpage); 1068 remove_migration_ptes(hpage, hpage);
1061 1069
1062 if (anon_vma) 1070 if (anon_vma)
1063 put_anon_vma(anon_vma); 1071 put_anon_vma(anon_vma);
1064 1072
1065 if (!rc) 1073 if (rc == MIGRATEPAGE_SUCCESS)
1066 hugetlb_cgroup_migrate(hpage, new_hpage); 1074 hugetlb_cgroup_migrate(hpage, new_hpage);
1067 1075
1068 unlock_page(hpage); 1076 unlock_page(hpage);
1069out: 1077out:
1070 if (rc != -EAGAIN) 1078 if (rc != -EAGAIN)
1071 putback_active_hugepage(hpage); 1079 putback_active_hugepage(hpage);
1072 put_page(new_hpage); 1080
1081 /*
1082 * If migration was not successful and there's a freeing callback, use
1083 * it. Otherwise, put_page() will drop the reference grabbed during
1084 * isolation.
1085 */
1086 if (rc != MIGRATEPAGE_SUCCESS && put_new_page)
1087 put_new_page(new_hpage, private);
1088 else
1089 put_page(new_hpage);
1090
1073 if (result) { 1091 if (result) {
1074 if (rc) 1092 if (rc)
1075 *result = rc; 1093 *result = rc;
@@ -1086,6 +1104,8 @@ out:
1086 * @from: The list of pages to be migrated. 1104 * @from: The list of pages to be migrated.
1087 * @get_new_page: The function used to allocate free pages to be used 1105 * @get_new_page: The function used to allocate free pages to be used
1088 * as the target of the page migration. 1106 * as the target of the page migration.
1107 * @put_new_page: The function used to free target pages if migration
1108 * fails, or NULL if no special handling is necessary.
1089 * @private: Private data to be passed on to get_new_page() 1109 * @private: Private data to be passed on to get_new_page()
1090 * @mode: The migration mode that specifies the constraints for 1110 * @mode: The migration mode that specifies the constraints for
1091 * page migration, if any. 1111 * page migration, if any.
@@ -1099,7 +1119,8 @@ out:
1099 * Returns the number of pages that were not migrated, or an error code. 1119 * Returns the number of pages that were not migrated, or an error code.
1100 */ 1120 */
1101int migrate_pages(struct list_head *from, new_page_t get_new_page, 1121int migrate_pages(struct list_head *from, new_page_t get_new_page,
1102 unsigned long private, enum migrate_mode mode, int reason) 1122 free_page_t put_new_page, unsigned long private,
1123 enum migrate_mode mode, int reason)
1103{ 1124{
1104 int retry = 1; 1125 int retry = 1;
1105 int nr_failed = 0; 1126 int nr_failed = 0;
@@ -1121,10 +1142,11 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
1121 1142
1122 if (PageHuge(page)) 1143 if (PageHuge(page))
1123 rc = unmap_and_move_huge_page(get_new_page, 1144 rc = unmap_and_move_huge_page(get_new_page,
1124 private, page, pass > 2, mode); 1145 put_new_page, private, page,
1146 pass > 2, mode);
1125 else 1147 else
1126 rc = unmap_and_move(get_new_page, private, 1148 rc = unmap_and_move(get_new_page, put_new_page,
1127 page, pass > 2, mode); 1149 private, page, pass > 2, mode);
1128 1150
1129 switch(rc) { 1151 switch(rc) {
1130 case -ENOMEM: 1152 case -ENOMEM:
@@ -1273,7 +1295,7 @@ set_status:
1273 1295
1274 err = 0; 1296 err = 0;
1275 if (!list_empty(&pagelist)) { 1297 if (!list_empty(&pagelist)) {
1276 err = migrate_pages(&pagelist, new_page_node, 1298 err = migrate_pages(&pagelist, new_page_node, NULL,
1277 (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL); 1299 (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL);
1278 if (err) 1300 if (err)
1279 putback_movable_pages(&pagelist); 1301 putback_movable_pages(&pagelist);
@@ -1729,7 +1751,8 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
1729 1751
1730 list_add(&page->lru, &migratepages); 1752 list_add(&page->lru, &migratepages);
1731 nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page, 1753 nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
1732 node, MIGRATE_ASYNC, MR_NUMA_MISPLACED); 1754 NULL, node, MIGRATE_ASYNC,
1755 MR_NUMA_MISPLACED);
1733 if (nr_remaining) { 1756 if (nr_remaining) {
1734 if (!list_empty(&migratepages)) { 1757 if (!list_empty(&migratepages)) {
1735 list_del(&page->lru); 1758 list_del(&page->lru);
@@ -1852,7 +1875,7 @@ fail_putback:
1852 * guarantee the copy is visible before the pagetable update. 1875 * guarantee the copy is visible before the pagetable update.
1853 */ 1876 */
1854 flush_cache_range(vma, mmun_start, mmun_end); 1877 flush_cache_range(vma, mmun_start, mmun_end);
1855 page_add_new_anon_rmap(new_page, vma, mmun_start); 1878 page_add_anon_rmap(new_page, vma, mmun_start);
1856 pmdp_clear_flush(vma, mmun_start, pmd); 1879 pmdp_clear_flush(vma, mmun_start, pmd);
1857 set_pmd_at(mm, mmun_start, pmd, entry); 1880 set_pmd_at(mm, mmun_start, pmd, entry);
1858 flush_tlb_range(vma, mmun_start, mmun_end); 1881 flush_tlb_range(vma, mmun_start, mmun_end);
@@ -1877,6 +1900,10 @@ fail_putback:
1877 spin_unlock(ptl); 1900 spin_unlock(ptl);
1878 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 1901 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1879 1902
1903 /* Take an "isolate" reference and put new page on the LRU. */
1904 get_page(new_page);
1905 putback_lru_page(new_page);
1906
1880 unlock_page(new_page); 1907 unlock_page(new_page);
1881 unlock_page(page); 1908 unlock_page(page);
1882 put_page(page); /* Drop the rmap reference */ 1909 put_page(page); /* Drop the rmap reference */
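To illustrate the new free_page_t hook (demo_* names are invented for this sketch), a caller that allocates destination pages itself can now have failed targets handed back instead of losing them to putback_lru_page()/put_page():

/*
 * Hypothetical caller: get_new_page() hands out destination pages and the
 * new put_new_page() callback takes back any destination whose migration
 * failed, so the caller keeps ownership of its own allocations.
 */
static struct page *demo_new_page(struct page *page, unsigned long private,
                                  int **result)
{
        int nid = (int)private;

        return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
}

static void demo_free_page(struct page *page, unsigned long private)
{
        __free_page(page);      /* reached only for pages that failed to migrate */
}

static int demo_migrate_list(struct list_head *pagelist, int nid)
{
        return migrate_pages(pagelist, demo_new_page, demo_free_page,
                             (unsigned long)nid, MIGRATE_SYNC, MR_SYSCALL);
}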
diff --git a/mm/mmap.c b/mm/mmap.c
index b1202cf81f4b..8a56d39df4ed 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -640,11 +640,10 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
640{ 640{
641 struct address_space *mapping = NULL; 641 struct address_space *mapping = NULL;
642 642
643 if (vma->vm_file) 643 if (vma->vm_file) {
644 mapping = vma->vm_file->f_mapping; 644 mapping = vma->vm_file->f_mapping;
645
646 if (mapping)
647 mutex_lock(&mapping->i_mmap_mutex); 645 mutex_lock(&mapping->i_mmap_mutex);
646 }
648 647
649 __vma_link(mm, vma, prev, rb_link, rb_parent); 648 __vma_link(mm, vma, prev, rb_link, rb_parent);
650 __vma_link_file(vma); 649 __vma_link_file(vma);
@@ -2965,9 +2964,7 @@ int install_special_mapping(struct mm_struct *mm,
2965 struct vm_area_struct *vma = _install_special_mapping(mm, 2964 struct vm_area_struct *vma = _install_special_mapping(mm,
2966 addr, len, vm_flags, pages); 2965 addr, len, vm_flags, pages);
2967 2966
2968 if (IS_ERR(vma)) 2967 return PTR_ERR_OR_ZERO(vma);
2969 return PTR_ERR(vma);
2970 return 0;
2971} 2968}
2972 2969
2973static DEFINE_MUTEX(mm_all_locks_mutex); 2970static DEFINE_MUTEX(mm_all_locks_mutex);
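The install_special_mapping() change above is the usual shorthand from <linux/err.h>; the two forms are equivalent (demo function invented here):

/*
 * Illustrative only: PTR_ERR_OR_ZERO() collapses the open-coded
 * "error pointer or success" epilogue removed above.
 */
static int demo_ptr_err_or_zero(void *ptr)
{
        /*
         * Open-coded form:
         *      if (IS_ERR(ptr))
         *              return PTR_ERR(ptr);
         *      return 0;
         */
        return PTR_ERR_OR_ZERO(ptr);
}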
diff --git a/mm/msync.c b/mm/msync.c
index 632df4527c01..a5c673669ca6 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -58,6 +58,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
58 vma = find_vma(mm, start); 58 vma = find_vma(mm, start);
59 for (;;) { 59 for (;;) {
60 struct file *file; 60 struct file *file;
61 loff_t fstart, fend;
61 62
62 /* Still start < end. */ 63 /* Still start < end. */
63 error = -ENOMEM; 64 error = -ENOMEM;
@@ -77,12 +78,17 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
77 goto out_unlock; 78 goto out_unlock;
78 } 79 }
79 file = vma->vm_file; 80 file = vma->vm_file;
81 fstart = start + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
82 fend = fstart + (min(end, vma->vm_end) - start) - 1;
80 start = vma->vm_end; 83 start = vma->vm_end;
81 if ((flags & MS_SYNC) && file && 84 if ((flags & MS_SYNC) && file &&
82 (vma->vm_flags & VM_SHARED)) { 85 (vma->vm_flags & VM_SHARED)) {
83 get_file(file); 86 get_file(file);
84 up_read(&mm->mmap_sem); 87 up_read(&mm->mmap_sem);
85 error = vfs_fsync(file, 0); 88 if (vma->vm_flags & VM_NONLINEAR)
89 error = vfs_fsync(file, 1);
90 else
91 error = vfs_fsync_range(file, fstart, fend, 1);
86 fput(file); 92 fput(file);
87 if (error || start >= end) 93 if (error || start >= end)
88 goto out; 94 goto out;
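The primitive msync() now leans on takes an inclusive byte range plus a datasync flag; a minimal, hypothetical wrapper, just to show the signature:

/*
 * Illustrative helper (not in the patch): flush one window of a file with
 * datasync semantics, which is what MS_SYNC now issues per VMA instead of a
 * whole-file vfs_fsync().
 */
static int sync_file_window(struct file *file, loff_t pos, loff_t len)
{
        return vfs_fsync_range(file, pos, pos + len - 1, 1);
}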
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index a4317da60532..533fa60c9ac1 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -156,24 +156,6 @@ static unsigned long writeout_period_time = 0;
156#define VM_COMPLETIONS_PERIOD_LEN (3*HZ) 156#define VM_COMPLETIONS_PERIOD_LEN (3*HZ)
157 157
158/* 158/*
159 * Work out the current dirty-memory clamping and background writeout
160 * thresholds.
161 *
162 * The main aim here is to lower them aggressively if there is a lot of mapped
163 * memory around. To avoid stressing page reclaim with lots of unreclaimable
164 * pages. It is better to clamp down on writers than to start swapping, and
165 * performing lots of scanning.
166 *
167 * We only allow 1/2 of the currently-unmapped memory to be dirtied.
168 *
169 * We don't permit the clamping level to fall below 5% - that is getting rather
170 * excessive.
171 *
172 * We make sure that the background writeout level is below the adjusted
173 * clamping level.
174 */
175
176/*
177 * In a memory zone, there is a certain amount of pages we consider 159 * In a memory zone, there is a certain amount of pages we consider
178 * available for the page cache, which is essentially the number of 160 * available for the page cache, which is essentially the number of
179 * free and reclaimable pages, minus some zone reserves to protect 161 * free and reclaimable pages, minus some zone reserves to protect
@@ -1623,7 +1605,7 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping)
1623 * 1000+ tasks, all of them start dirtying pages at exactly the same 1605 * 1000+ tasks, all of them start dirtying pages at exactly the same
1624 * time, hence all honoured too large initial task->nr_dirtied_pause. 1606 * time, hence all honoured too large initial task->nr_dirtied_pause.
1625 */ 1607 */
1626 p = &__get_cpu_var(bdp_ratelimits); 1608 p = this_cpu_ptr(&bdp_ratelimits);
1627 if (unlikely(current->nr_dirtied >= ratelimit)) 1609 if (unlikely(current->nr_dirtied >= ratelimit))
1628 *p = 0; 1610 *p = 0;
1629 else if (unlikely(*p >= ratelimit_pages)) { 1611 else if (unlikely(*p >= ratelimit_pages)) {
@@ -1635,7 +1617,7 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping)
1635 * short-lived tasks (eg. gcc invocations in a kernel build) escaping 1617 * short-lived tasks (eg. gcc invocations in a kernel build) escaping
1636 * the dirty throttling and livelock other long-run dirtiers. 1618 * the dirty throttling and livelock other long-run dirtiers.
1637 */ 1619 */
1638 p = &__get_cpu_var(dirty_throttle_leaks); 1620 p = this_cpu_ptr(&dirty_throttle_leaks);
1639 if (*p > 0 && current->nr_dirtied < ratelimit) { 1621 if (*p > 0 && current->nr_dirtied < ratelimit) {
1640 unsigned long nr_pages_dirtied; 1622 unsigned long nr_pages_dirtied;
1641 nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied); 1623 nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied);
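
Both hunks in this file only swap __get_cpu_var() for this_cpu_ptr() when reaching the per-CPU counters; the surrounding ratelimit logic (count pages dirtied by the current task, call into the balancing path once a threshold is crossed) is unchanged. A loose user-space analogy with a thread-local counter standing in for the per-CPU one; every name below is made up for the sketch:

#include <stdio.h>

/*
 * User-space analogy only: the kernel's per-CPU bdp_ratelimits counter is
 * modelled here with a per-thread counter.
 */
static _Thread_local unsigned long dirtied_since_balance;

static void do_balance(void)
{
	printf("would call balance_dirty_pages() here\n");
	dirtied_since_balance = 0;
}

static void account_dirtied_page(unsigned long ratelimit)
{
	if (++dirtied_since_balance >= ratelimit)
		do_balance();
}

int main(void)
{
	for (int i = 0; i < 100; i++)
		account_dirtied_page(32);	/* fires every 32 dirtied pages */
	return 0;
}
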
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5dba2933c9c0..a59bdb653958 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -261,8 +261,9 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
261 } while (zone_span_seqretry(zone, seq)); 261 } while (zone_span_seqretry(zone, seq));
262 262
263 if (ret) 263 if (ret)
264 pr_err("page %lu outside zone [ %lu - %lu ]\n", 264 pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n",
265 pfn, start_pfn, start_pfn + sp); 265 pfn, zone_to_nid(zone), zone->name,
266 start_pfn, start_pfn + sp);
266 267
267 return ret; 268 return ret;
268} 269}
@@ -408,7 +409,8 @@ static int destroy_compound_page(struct page *page, unsigned long order)
408 return bad; 409 return bad;
409} 410}
410 411
411static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) 412static inline void prep_zero_page(struct page *page, unsigned int order,
413 gfp_t gfp_flags)
412{ 414{
413 int i; 415 int i;
414 416
@@ -452,7 +454,7 @@ static inline void set_page_guard_flag(struct page *page) { }
452static inline void clear_page_guard_flag(struct page *page) { } 454static inline void clear_page_guard_flag(struct page *page) { }
453#endif 455#endif
454 456
455static inline void set_page_order(struct page *page, int order) 457static inline void set_page_order(struct page *page, unsigned int order)
456{ 458{
457 set_page_private(page, order); 459 set_page_private(page, order);
458 __SetPageBuddy(page); 460 __SetPageBuddy(page);
@@ -503,21 +505,31 @@ __find_buddy_index(unsigned long page_idx, unsigned int order)
503 * For recording page's order, we use page_private(page). 505 * For recording page's order, we use page_private(page).
504 */ 506 */
505static inline int page_is_buddy(struct page *page, struct page *buddy, 507static inline int page_is_buddy(struct page *page, struct page *buddy,
506 int order) 508 unsigned int order)
507{ 509{
508 if (!pfn_valid_within(page_to_pfn(buddy))) 510 if (!pfn_valid_within(page_to_pfn(buddy)))
509 return 0; 511 return 0;
510 512
511 if (page_zone_id(page) != page_zone_id(buddy))
512 return 0;
513
514 if (page_is_guard(buddy) && page_order(buddy) == order) { 513 if (page_is_guard(buddy) && page_order(buddy) == order) {
515 VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); 514 VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
515
516 if (page_zone_id(page) != page_zone_id(buddy))
517 return 0;
518
516 return 1; 519 return 1;
517 } 520 }
518 521
519 if (PageBuddy(buddy) && page_order(buddy) == order) { 522 if (PageBuddy(buddy) && page_order(buddy) == order) {
520 VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); 523 VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
524
525 /*
526 * zone check is done late to avoid uselessly
527 * calculating zone/node ids for pages that could
528 * never merge.
529 */
530 if (page_zone_id(page) != page_zone_id(buddy))
531 return 0;
532
521 return 1; 533 return 1;
522 } 534 }
523 return 0; 535 return 0;
@@ -549,6 +561,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
549 */ 561 */
550 562
551static inline void __free_one_page(struct page *page, 563static inline void __free_one_page(struct page *page,
564 unsigned long pfn,
552 struct zone *zone, unsigned int order, 565 struct zone *zone, unsigned int order,
553 int migratetype) 566 int migratetype)
554{ 567{
@@ -565,7 +578,7 @@ static inline void __free_one_page(struct page *page,
565 578
566 VM_BUG_ON(migratetype == -1); 579 VM_BUG_ON(migratetype == -1);
567 580
568 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 581 page_idx = pfn & ((1 << MAX_ORDER) - 1);
569 582
570 VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page); 583 VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page);
571 VM_BUG_ON_PAGE(bad_range(zone, page), page); 584 VM_BUG_ON_PAGE(bad_range(zone, page), page);
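
This hunk derives the page's index inside its MAX_ORDER-aligned chunk from the pfn that is now passed in, instead of calling page_to_pfn() again. The buddy bookkeeping around it, including the __find_buddy_index() helper this code uses, is plain bit arithmetic; a self-contained sketch, assuming the common MAX_ORDER of 11:

#include <stdio.h>

#define MAX_ORDER 11	/* common kernel default; an assumption of this sketch */

/* Index of a page within its MAX_ORDER-aligned chunk, as in the hunk above. */
static unsigned long page_index(unsigned long pfn)
{
	return pfn & ((1UL << MAX_ORDER) - 1);
}

/* Classic buddy arithmetic: flip the bit that selects which half we are in. */
static unsigned long buddy_index(unsigned long page_idx, unsigned int order)
{
	return page_idx ^ (1UL << order);
}

/* After merging, the combined block starts at the lower of the two halves. */
static unsigned long merged_index(unsigned long page_idx, unsigned int order)
{
	return page_idx & ~(1UL << order);
}

int main(void)
{
	unsigned long pfn = 0x12345;		/* arbitrary example frame */
	unsigned long idx = page_index(pfn);	/* 0x345 */

	printf("idx=0x%lx buddy(order 2)=0x%lx merged=0x%lx\n",
	       idx, buddy_index(idx, 2), merged_index(idx, 2));
	return 0;
}

Flipping bit "order" of the index jumps to the buddy block, and clearing it gives the start of the merged block, which is why freeing can climb upward order by order.
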
@@ -700,7 +713,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
700 list_del(&page->lru); 713 list_del(&page->lru);
701 mt = get_freepage_migratetype(page); 714 mt = get_freepage_migratetype(page);
702 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ 715 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
703 __free_one_page(page, zone, 0, mt); 716 __free_one_page(page, page_to_pfn(page), zone, 0, mt);
704 trace_mm_page_pcpu_drain(page, 0, mt); 717 trace_mm_page_pcpu_drain(page, 0, mt);
705 if (likely(!is_migrate_isolate_page(page))) { 718 if (likely(!is_migrate_isolate_page(page))) {
706 __mod_zone_page_state(zone, NR_FREE_PAGES, 1); 719 __mod_zone_page_state(zone, NR_FREE_PAGES, 1);
@@ -712,13 +725,15 @@ static void free_pcppages_bulk(struct zone *zone, int count,
712 spin_unlock(&zone->lock); 725 spin_unlock(&zone->lock);
713} 726}
714 727
715static void free_one_page(struct zone *zone, struct page *page, int order, 728static void free_one_page(struct zone *zone,
729 struct page *page, unsigned long pfn,
730 unsigned int order,
716 int migratetype) 731 int migratetype)
717{ 732{
718 spin_lock(&zone->lock); 733 spin_lock(&zone->lock);
719 zone->pages_scanned = 0; 734 zone->pages_scanned = 0;
720 735
721 __free_one_page(page, zone, order, migratetype); 736 __free_one_page(page, pfn, zone, order, migratetype);
722 if (unlikely(!is_migrate_isolate(migratetype))) 737 if (unlikely(!is_migrate_isolate(migratetype)))
723 __mod_zone_freepage_state(zone, 1 << order, migratetype); 738 __mod_zone_freepage_state(zone, 1 << order, migratetype);
724 spin_unlock(&zone->lock); 739 spin_unlock(&zone->lock);
@@ -755,15 +770,16 @@ static void __free_pages_ok(struct page *page, unsigned int order)
755{ 770{
756 unsigned long flags; 771 unsigned long flags;
757 int migratetype; 772 int migratetype;
773 unsigned long pfn = page_to_pfn(page);
758 774
759 if (!free_pages_prepare(page, order)) 775 if (!free_pages_prepare(page, order))
760 return; 776 return;
761 777
778 migratetype = get_pfnblock_migratetype(page, pfn);
762 local_irq_save(flags); 779 local_irq_save(flags);
763 __count_vm_events(PGFREE, 1 << order); 780 __count_vm_events(PGFREE, 1 << order);
764 migratetype = get_pageblock_migratetype(page);
765 set_freepage_migratetype(page, migratetype); 781 set_freepage_migratetype(page, migratetype);
766 free_one_page(page_zone(page), page, order, migratetype); 782 free_one_page(page_zone(page), page, pfn, order, migratetype);
767 local_irq_restore(flags); 783 local_irq_restore(flags);
768} 784}
769 785
@@ -882,7 +898,7 @@ static inline int check_new_page(struct page *page)
882 return 0; 898 return 0;
883} 899}
884 900
885static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) 901static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags)
886{ 902{
887 int i; 903 int i;
888 904
@@ -931,6 +947,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
931 rmv_page_order(page); 947 rmv_page_order(page);
932 area->nr_free--; 948 area->nr_free--;
933 expand(zone, page, order, current_order, area, migratetype); 949 expand(zone, page, order, current_order, area, migratetype);
950 set_freepage_migratetype(page, migratetype);
934 return page; 951 return page;
935 } 952 }
936 953
@@ -1057,7 +1074,9 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page,
1057 1074
1058 /* 1075 /*
1059 * When borrowing from MIGRATE_CMA, we need to release the excess 1076 * When borrowing from MIGRATE_CMA, we need to release the excess
1060 * buddy pages to CMA itself. 1077 * buddy pages to CMA itself. We also ensure the freepage_migratetype
1078 * is set to CMA so it is returned to the correct freelist in case
1079 * the page ends up being not actually allocated from the pcp lists.
1061 */ 1080 */
1062 if (is_migrate_cma(fallback_type)) 1081 if (is_migrate_cma(fallback_type))
1063 return fallback_type; 1082 return fallback_type;
@@ -1090,16 +1109,17 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page,
1090 1109
1091/* Remove an element from the buddy allocator from the fallback list */ 1110/* Remove an element from the buddy allocator from the fallback list */
1092static inline struct page * 1111static inline struct page *
1093__rmqueue_fallback(struct zone *zone, int order, int start_migratetype) 1112__rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
1094{ 1113{
1095 struct free_area *area; 1114 struct free_area *area;
1096 int current_order; 1115 unsigned int current_order;
1097 struct page *page; 1116 struct page *page;
1098 int migratetype, new_type, i; 1117 int migratetype, new_type, i;
1099 1118
1100 /* Find the largest possible block of pages in the other list */ 1119 /* Find the largest possible block of pages in the other list */
1101 for (current_order = MAX_ORDER-1; current_order >= order; 1120 for (current_order = MAX_ORDER-1;
1102 --current_order) { 1121 current_order >= order && current_order <= MAX_ORDER-1;
1122 --current_order) {
1103 for (i = 0;; i++) { 1123 for (i = 0;; i++) {
1104 migratetype = fallbacks[start_migratetype][i]; 1124 migratetype = fallbacks[start_migratetype][i];
1105 1125
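
Because current_order is now an unsigned int (see the declaration change above), the loop gains an upper-bound test: when the counter is decremented past zero it wraps to a huge value rather than going negative, so "current_order >= order" alone would never terminate the loop when order is 0. A standalone demonstration of the guard:

#include <stdio.h>

#define MAX_ORDER 11

/*
 * With an unsigned counter, decrementing 0 wraps to UINT_MAX, so the extra
 * "current_order <= MAX_ORDER - 1" test is what actually ends the loop when
 * order == 0, matching the hunk above.
 */
int main(void)
{
	unsigned int order = 0;
	unsigned int iterations = 0;

	for (unsigned int current_order = MAX_ORDER - 1;
	     current_order >= order && current_order <= MAX_ORDER - 1;
	     --current_order)
		iterations++;

	printf("loop body ran %u times\n", iterations);	/* 11, not forever */
	return 0;
}
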
@@ -1125,6 +1145,12 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
1125 1145
1126 expand(zone, page, order, current_order, area, 1146 expand(zone, page, order, current_order, area,
1127 new_type); 1147 new_type);
1148 /* The freepage_migratetype may differ from pageblock's
1149 * migratetype depending on the decisions in
1150 * try_to_steal_freepages. This is OK as long as it does
1151 * not differ for MIGRATE_CMA type.
1152 */
1153 set_freepage_migratetype(page, new_type);
1128 1154
1129 trace_mm_page_alloc_extfrag(page, order, current_order, 1155 trace_mm_page_alloc_extfrag(page, order, current_order,
1130 start_migratetype, migratetype, new_type); 1156 start_migratetype, migratetype, new_type);
@@ -1173,9 +1199,9 @@ retry_reserve:
1173 */ 1199 */
1174static int rmqueue_bulk(struct zone *zone, unsigned int order, 1200static int rmqueue_bulk(struct zone *zone, unsigned int order,
1175 unsigned long count, struct list_head *list, 1201 unsigned long count, struct list_head *list,
1176 int migratetype, int cold) 1202 int migratetype, bool cold)
1177{ 1203{
1178 int mt = migratetype, i; 1204 int i;
1179 1205
1180 spin_lock(&zone->lock); 1206 spin_lock(&zone->lock);
1181 for (i = 0; i < count; ++i) { 1207 for (i = 0; i < count; ++i) {
@@ -1192,18 +1218,12 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
1192 * merge IO requests if the physical pages are ordered 1218 * merge IO requests if the physical pages are ordered
1193 * properly. 1219 * properly.
1194 */ 1220 */
1195 if (likely(cold == 0)) 1221 if (likely(!cold))
1196 list_add(&page->lru, list); 1222 list_add(&page->lru, list);
1197 else 1223 else
1198 list_add_tail(&page->lru, list); 1224 list_add_tail(&page->lru, list);
1199 if (IS_ENABLED(CONFIG_CMA)) {
1200 mt = get_pageblock_migratetype(page);
1201 if (!is_migrate_cma(mt) && !is_migrate_isolate(mt))
1202 mt = migratetype;
1203 }
1204 set_freepage_migratetype(page, mt);
1205 list = &page->lru; 1225 list = &page->lru;
1206 if (is_migrate_cma(mt)) 1226 if (is_migrate_cma(get_freepage_migratetype(page)))
1207 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1227 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
1208 -(1 << order)); 1228 -(1 << order));
1209 } 1229 }
@@ -1327,7 +1347,7 @@ void mark_free_pages(struct zone *zone)
1327{ 1347{
1328 unsigned long pfn, max_zone_pfn; 1348 unsigned long pfn, max_zone_pfn;
1329 unsigned long flags; 1349 unsigned long flags;
1330 int order, t; 1350 unsigned int order, t;
1331 struct list_head *curr; 1351 struct list_head *curr;
1332 1352
1333 if (zone_is_empty(zone)) 1353 if (zone_is_empty(zone))
@@ -1359,19 +1379,20 @@ void mark_free_pages(struct zone *zone)
1359 1379
1360/* 1380/*
1361 * Free a 0-order page 1381 * Free a 0-order page
1362 * cold == 1 ? free a cold page : free a hot page 1382 * cold == true ? free a cold page : free a hot page
1363 */ 1383 */
1364void free_hot_cold_page(struct page *page, int cold) 1384void free_hot_cold_page(struct page *page, bool cold)
1365{ 1385{
1366 struct zone *zone = page_zone(page); 1386 struct zone *zone = page_zone(page);
1367 struct per_cpu_pages *pcp; 1387 struct per_cpu_pages *pcp;
1368 unsigned long flags; 1388 unsigned long flags;
1389 unsigned long pfn = page_to_pfn(page);
1369 int migratetype; 1390 int migratetype;
1370 1391
1371 if (!free_pages_prepare(page, 0)) 1392 if (!free_pages_prepare(page, 0))
1372 return; 1393 return;
1373 1394
1374 migratetype = get_pageblock_migratetype(page); 1395 migratetype = get_pfnblock_migratetype(page, pfn);
1375 set_freepage_migratetype(page, migratetype); 1396 set_freepage_migratetype(page, migratetype);
1376 local_irq_save(flags); 1397 local_irq_save(flags);
1377 __count_vm_event(PGFREE); 1398 __count_vm_event(PGFREE);
@@ -1385,17 +1406,17 @@ void free_hot_cold_page(struct page *page, int cold)
1385 */ 1406 */
1386 if (migratetype >= MIGRATE_PCPTYPES) { 1407 if (migratetype >= MIGRATE_PCPTYPES) {
1387 if (unlikely(is_migrate_isolate(migratetype))) { 1408 if (unlikely(is_migrate_isolate(migratetype))) {
1388 free_one_page(zone, page, 0, migratetype); 1409 free_one_page(zone, page, pfn, 0, migratetype);
1389 goto out; 1410 goto out;
1390 } 1411 }
1391 migratetype = MIGRATE_MOVABLE; 1412 migratetype = MIGRATE_MOVABLE;
1392 } 1413 }
1393 1414
1394 pcp = &this_cpu_ptr(zone->pageset)->pcp; 1415 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1395 if (cold) 1416 if (!cold)
1396 list_add_tail(&page->lru, &pcp->lists[migratetype]);
1397 else
1398 list_add(&page->lru, &pcp->lists[migratetype]); 1417 list_add(&page->lru, &pcp->lists[migratetype]);
1418 else
1419 list_add_tail(&page->lru, &pcp->lists[migratetype]);
1399 pcp->count++; 1420 pcp->count++;
1400 if (pcp->count >= pcp->high) { 1421 if (pcp->count >= pcp->high) {
1401 unsigned long batch = ACCESS_ONCE(pcp->batch); 1422 unsigned long batch = ACCESS_ONCE(pcp->batch);
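
free_hot_cold_page() now takes a bool, but the placement rule is unchanged: pages assumed to be cache-hot go to the head of the per-CPU list and cold pages to the tail, so allocations that take from the head tend to receive a hot page. A toy doubly linked list (a cut-down stand-in for the kernel's list_head, with invented helper names) showing the effect:

#include <stdio.h>
#include <stdbool.h>

struct node {
	struct node *prev, *next;
	int id;
};

static void list_init(struct node *head)
{
	head->prev = head->next = head;
}

static void list_add_head(struct node *head, struct node *n)	/* "hot" */
{
	n->next = head->next;
	n->prev = head;
	head->next->prev = n;
	head->next = n;
}

static void list_add_tail(struct node *head, struct node *n)	/* "cold" */
{
	n->prev = head->prev;
	n->next = head;
	head->prev->next = n;
	head->prev = n;
}

static struct node *list_pop_head(struct node *head)	/* next allocation */
{
	struct node *n = head->next;

	if (n == head)
		return NULL;
	n->prev->next = n->next;
	n->next->prev = n->prev;
	return n;
}

static void free_page_sketch(struct node *pcp, struct node *page, bool cold)
{
	if (!cold)
		list_add_head(pcp, page);	/* hot: reused first */
	else
		list_add_tail(pcp, page);	/* cold: reused last */
}

int main(void)
{
	struct node pcp, a = { .id = 1 }, b = { .id = 2 }, c = { .id = 3 };

	list_init(&pcp);
	free_page_sketch(&pcp, &a, true);	/* cold */
	free_page_sketch(&pcp, &b, false);	/* hot */
	free_page_sketch(&pcp, &c, false);	/* hot */

	for (struct node *n; (n = list_pop_head(&pcp)); )
		printf("allocate page %d\n", n->id);	/* 3, 2, then 1 */
	return 0;
}

Pages 3 and 2 (freed hot) come back out before page 1 (freed cold).
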
@@ -1410,7 +1431,7 @@ out:
1410/* 1431/*
1411 * Free a list of 0-order pages 1432 * Free a list of 0-order pages
1412 */ 1433 */
1413void free_hot_cold_page_list(struct list_head *list, int cold) 1434void free_hot_cold_page_list(struct list_head *list, bool cold)
1414{ 1435{
1415 struct page *page, *next; 1436 struct page *page, *next;
1416 1437
@@ -1522,12 +1543,12 @@ int split_free_page(struct page *page)
1522 */ 1543 */
1523static inline 1544static inline
1524struct page *buffered_rmqueue(struct zone *preferred_zone, 1545struct page *buffered_rmqueue(struct zone *preferred_zone,
1525 struct zone *zone, int order, gfp_t gfp_flags, 1546 struct zone *zone, unsigned int order,
1526 int migratetype) 1547 gfp_t gfp_flags, int migratetype)
1527{ 1548{
1528 unsigned long flags; 1549 unsigned long flags;
1529 struct page *page; 1550 struct page *page;
1530 int cold = !!(gfp_flags & __GFP_COLD); 1551 bool cold = ((gfp_flags & __GFP_COLD) != 0);
1531 1552
1532again: 1553again:
1533 if (likely(order == 0)) { 1554 if (likely(order == 0)) {
@@ -1572,7 +1593,7 @@ again:
1572 if (!page) 1593 if (!page)
1573 goto failed; 1594 goto failed;
1574 __mod_zone_freepage_state(zone, -(1 << order), 1595 __mod_zone_freepage_state(zone, -(1 << order),
1575 get_pageblock_migratetype(page)); 1596 get_freepage_migratetype(page));
1576 } 1597 }
1577 1598
1578 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); 1599 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
@@ -1672,8 +1693,9 @@ static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1672 * Return true if free pages are above 'mark'. This takes into account the order 1693 * Return true if free pages are above 'mark'. This takes into account the order
1673 * of the allocation. 1694 * of the allocation.
1674 */ 1695 */
1675static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1696static bool __zone_watermark_ok(struct zone *z, unsigned int order,
1676 int classzone_idx, int alloc_flags, long free_pages) 1697 unsigned long mark, int classzone_idx, int alloc_flags,
1698 long free_pages)
1677{ 1699{
1678 /* free_pages my go negative - that's OK */ 1700 /* free_pages my go negative - that's OK */
1679 long min = mark; 1701 long min = mark;
@@ -1707,15 +1729,15 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1707 return true; 1729 return true;
1708} 1730}
1709 1731
1710bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1732bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
1711 int classzone_idx, int alloc_flags) 1733 int classzone_idx, int alloc_flags)
1712{ 1734{
1713 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 1735 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1714 zone_page_state(z, NR_FREE_PAGES)); 1736 zone_page_state(z, NR_FREE_PAGES));
1715} 1737}
1716 1738
1717bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, 1739bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
1718 int classzone_idx, int alloc_flags) 1740 unsigned long mark, int classzone_idx, int alloc_flags)
1719{ 1741{
1720 long free_pages = zone_page_state(z, NR_FREE_PAGES); 1742 long free_pages = zone_page_state(z, NR_FREE_PAGES);
1721 1743
@@ -1850,18 +1872,8 @@ static bool zone_local(struct zone *local_zone, struct zone *zone)
1850 1872
1851static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 1873static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
1852{ 1874{
1853 return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes); 1875 return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <
1854} 1876 RECLAIM_DISTANCE;
1855
1856static void __paginginit init_zone_allows_reclaim(int nid)
1857{
1858 int i;
1859
1860 for_each_node_state(i, N_MEMORY)
1861 if (node_distance(nid, i) <= RECLAIM_DISTANCE)
1862 node_set(i, NODE_DATA(nid)->reclaim_nodes);
1863 else
1864 zone_reclaim_mode = 1;
1865} 1877}
1866 1878
1867#else /* CONFIG_NUMA */ 1879#else /* CONFIG_NUMA */
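
With this change zone_allows_reclaim() simply compares the NUMA distance between the two nodes against RECLAIM_DISTANCE at call time, replacing the reclaim_nodes mask that used to be precomputed per node. A sketch with a made-up four-node distance table; RECLAIM_DISTANCE is 30 here, the common default, though architectures may define their own:

#include <stdio.h>
#include <stdbool.h>

#define RECLAIM_DISTANCE 30	/* common default; architectures may override */

/* A made-up 4-node SLIT-style distance table for the example. */
static const int node_distance_tab[4][4] = {
	{ 10, 20, 40, 40 },
	{ 20, 10, 40, 40 },
	{ 40, 40, 10, 20 },
	{ 40, 40, 20, 10 },
};

static bool zone_allows_reclaim(int local_nid, int nid)
{
	return node_distance_tab[local_nid][nid] < RECLAIM_DISTANCE;
}

int main(void)
{
	for (int nid = 0; nid < 4; nid++)
		printf("node 0 -> node %d: %s\n", nid,
		       zone_allows_reclaim(0, nid) ? "reclaim" : "skip");
	return 0;
}
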
@@ -1895,9 +1907,6 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
1895 return true; 1907 return true;
1896} 1908}
1897 1909
1898static inline void init_zone_allows_reclaim(int nid)
1899{
1900}
1901#endif /* CONFIG_NUMA */ 1910#endif /* CONFIG_NUMA */
1902 1911
1903/* 1912/*
@@ -1907,17 +1916,17 @@ static inline void init_zone_allows_reclaim(int nid)
1907static struct page * 1916static struct page *
1908get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, 1917get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
1909 struct zonelist *zonelist, int high_zoneidx, int alloc_flags, 1918 struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
1910 struct zone *preferred_zone, int migratetype) 1919 struct zone *preferred_zone, int classzone_idx, int migratetype)
1911{ 1920{
1912 struct zoneref *z; 1921 struct zoneref *z;
1913 struct page *page = NULL; 1922 struct page *page = NULL;
1914 int classzone_idx;
1915 struct zone *zone; 1923 struct zone *zone;
1916 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ 1924 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
1917 int zlc_active = 0; /* set if using zonelist_cache */ 1925 int zlc_active = 0; /* set if using zonelist_cache */
1918 int did_zlc_setup = 0; /* just call zlc_setup() one time */ 1926 int did_zlc_setup = 0; /* just call zlc_setup() one time */
1927 bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) &&
1928 (gfp_mask & __GFP_WRITE);
1919 1929
1920 classzone_idx = zone_idx(preferred_zone);
1921zonelist_scan: 1930zonelist_scan:
1922 /* 1931 /*
1923 * Scan zonelist, looking for a zone with enough free. 1932 * Scan zonelist, looking for a zone with enough free.
@@ -1930,12 +1939,10 @@ zonelist_scan:
1930 if (IS_ENABLED(CONFIG_NUMA) && zlc_active && 1939 if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
1931 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1940 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1932 continue; 1941 continue;
1933 if ((alloc_flags & ALLOC_CPUSET) && 1942 if (cpusets_enabled() &&
1943 (alloc_flags & ALLOC_CPUSET) &&
1934 !cpuset_zone_allowed_softwall(zone, gfp_mask)) 1944 !cpuset_zone_allowed_softwall(zone, gfp_mask))
1935 continue; 1945 continue;
1936 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
1937 if (unlikely(alloc_flags & ALLOC_NO_WATERMARKS))
1938 goto try_this_zone;
1939 /* 1946 /*
1940 * Distribute pages in proportion to the individual 1947 * Distribute pages in proportion to the individual
1941 * zone size to ensure fair page aging. The zone a 1948 * zone size to ensure fair page aging. The zone a
@@ -1974,15 +1981,19 @@ zonelist_scan:
1974 * will require awareness of zones in the 1981 * will require awareness of zones in the
1975 * dirty-throttling and the flusher threads. 1982 * dirty-throttling and the flusher threads.
1976 */ 1983 */
1977 if ((alloc_flags & ALLOC_WMARK_LOW) && 1984 if (consider_zone_dirty && !zone_dirty_ok(zone))
1978 (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone)) 1985 continue;
1979 goto this_zone_full;
1980 1986
1981 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; 1987 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
1982 if (!zone_watermark_ok(zone, order, mark, 1988 if (!zone_watermark_ok(zone, order, mark,
1983 classzone_idx, alloc_flags)) { 1989 classzone_idx, alloc_flags)) {
1984 int ret; 1990 int ret;
1985 1991
1992 /* Checked here to keep the fast path fast */
1993 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
1994 if (alloc_flags & ALLOC_NO_WATERMARKS)
1995 goto try_this_zone;
1996
1986 if (IS_ENABLED(CONFIG_NUMA) && 1997 if (IS_ENABLED(CONFIG_NUMA) &&
1987 !did_zlc_setup && nr_online_nodes > 1) { 1998 !did_zlc_setup && nr_online_nodes > 1) {
1988 /* 1999 /*
@@ -2044,7 +2055,7 @@ try_this_zone:
2044 if (page) 2055 if (page)
2045 break; 2056 break;
2046this_zone_full: 2057this_zone_full:
2047 if (IS_ENABLED(CONFIG_NUMA)) 2058 if (IS_ENABLED(CONFIG_NUMA) && zlc_active)
2048 zlc_mark_zone_full(zonelist, z); 2059 zlc_mark_zone_full(zonelist, z);
2049 } 2060 }
2050 2061
@@ -2173,7 +2184,7 @@ static inline struct page *
2173__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, 2184__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2174 struct zonelist *zonelist, enum zone_type high_zoneidx, 2185 struct zonelist *zonelist, enum zone_type high_zoneidx,
2175 nodemask_t *nodemask, struct zone *preferred_zone, 2186 nodemask_t *nodemask, struct zone *preferred_zone,
2176 int migratetype) 2187 int classzone_idx, int migratetype)
2177{ 2188{
2178 struct page *page; 2189 struct page *page;
2179 2190
@@ -2191,7 +2202,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2191 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, 2202 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
2192 order, zonelist, high_zoneidx, 2203 order, zonelist, high_zoneidx,
2193 ALLOC_WMARK_HIGH|ALLOC_CPUSET, 2204 ALLOC_WMARK_HIGH|ALLOC_CPUSET,
2194 preferred_zone, migratetype); 2205 preferred_zone, classzone_idx, migratetype);
2195 if (page) 2206 if (page)
2196 goto out; 2207 goto out;
2197 2208
@@ -2226,7 +2237,7 @@ static struct page *
2226__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2237__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2227 struct zonelist *zonelist, enum zone_type high_zoneidx, 2238 struct zonelist *zonelist, enum zone_type high_zoneidx,
2228 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2239 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2229 int migratetype, bool sync_migration, 2240 int classzone_idx, int migratetype, enum migrate_mode mode,
2230 bool *contended_compaction, bool *deferred_compaction, 2241 bool *contended_compaction, bool *deferred_compaction,
2231 unsigned long *did_some_progress) 2242 unsigned long *did_some_progress)
2232{ 2243{
@@ -2240,7 +2251,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2240 2251
2241 current->flags |= PF_MEMALLOC; 2252 current->flags |= PF_MEMALLOC;
2242 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, 2253 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
2243 nodemask, sync_migration, 2254 nodemask, mode,
2244 contended_compaction); 2255 contended_compaction);
2245 current->flags &= ~PF_MEMALLOC; 2256 current->flags &= ~PF_MEMALLOC;
2246 2257
@@ -2254,7 +2265,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2254 page = get_page_from_freelist(gfp_mask, nodemask, 2265 page = get_page_from_freelist(gfp_mask, nodemask,
2255 order, zonelist, high_zoneidx, 2266 order, zonelist, high_zoneidx,
2256 alloc_flags & ~ALLOC_NO_WATERMARKS, 2267 alloc_flags & ~ALLOC_NO_WATERMARKS,
2257 preferred_zone, migratetype); 2268 preferred_zone, classzone_idx, migratetype);
2258 if (page) { 2269 if (page) {
2259 preferred_zone->compact_blockskip_flush = false; 2270 preferred_zone->compact_blockskip_flush = false;
2260 compaction_defer_reset(preferred_zone, order, true); 2271 compaction_defer_reset(preferred_zone, order, true);
@@ -2273,7 +2284,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2273 * As async compaction considers a subset of pageblocks, only 2284 * As async compaction considers a subset of pageblocks, only
2274 * defer if the failure was a sync compaction failure. 2285 * defer if the failure was a sync compaction failure.
2275 */ 2286 */
2276 if (sync_migration) 2287 if (mode != MIGRATE_ASYNC)
2277 defer_compaction(preferred_zone, order); 2288 defer_compaction(preferred_zone, order);
2278 2289
2279 cond_resched(); 2290 cond_resched();
@@ -2286,9 +2297,9 @@ static inline struct page *
2286__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2297__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2287 struct zonelist *zonelist, enum zone_type high_zoneidx, 2298 struct zonelist *zonelist, enum zone_type high_zoneidx,
2288 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2299 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2289 int migratetype, bool sync_migration, 2300 int classzone_idx, int migratetype,
2290 bool *contended_compaction, bool *deferred_compaction, 2301 enum migrate_mode mode, bool *contended_compaction,
2291 unsigned long *did_some_progress) 2302 bool *deferred_compaction, unsigned long *did_some_progress)
2292{ 2303{
2293 return NULL; 2304 return NULL;
2294} 2305}
@@ -2327,7 +2338,7 @@ static inline struct page *
2327__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, 2338__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
2328 struct zonelist *zonelist, enum zone_type high_zoneidx, 2339 struct zonelist *zonelist, enum zone_type high_zoneidx,
2329 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2340 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2330 int migratetype, unsigned long *did_some_progress) 2341 int classzone_idx, int migratetype, unsigned long *did_some_progress)
2331{ 2342{
2332 struct page *page = NULL; 2343 struct page *page = NULL;
2333 bool drained = false; 2344 bool drained = false;
@@ -2345,7 +2356,8 @@ retry:
2345 page = get_page_from_freelist(gfp_mask, nodemask, order, 2356 page = get_page_from_freelist(gfp_mask, nodemask, order,
2346 zonelist, high_zoneidx, 2357 zonelist, high_zoneidx,
2347 alloc_flags & ~ALLOC_NO_WATERMARKS, 2358 alloc_flags & ~ALLOC_NO_WATERMARKS,
2348 preferred_zone, migratetype); 2359 preferred_zone, classzone_idx,
2360 migratetype);
2349 2361
2350 /* 2362 /*
2351 * If an allocation failed after direct reclaim, it could be because 2363 * If an allocation failed after direct reclaim, it could be because
@@ -2368,14 +2380,14 @@ static inline struct page *
2368__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, 2380__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
2369 struct zonelist *zonelist, enum zone_type high_zoneidx, 2381 struct zonelist *zonelist, enum zone_type high_zoneidx,
2370 nodemask_t *nodemask, struct zone *preferred_zone, 2382 nodemask_t *nodemask, struct zone *preferred_zone,
2371 int migratetype) 2383 int classzone_idx, int migratetype)
2372{ 2384{
2373 struct page *page; 2385 struct page *page;
2374 2386
2375 do { 2387 do {
2376 page = get_page_from_freelist(gfp_mask, nodemask, order, 2388 page = get_page_from_freelist(gfp_mask, nodemask, order,
2377 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, 2389 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
2378 preferred_zone, migratetype); 2390 preferred_zone, classzone_idx, migratetype);
2379 2391
2380 if (!page && gfp_mask & __GFP_NOFAIL) 2392 if (!page && gfp_mask & __GFP_NOFAIL)
2381 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2393 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
@@ -2476,14 +2488,14 @@ static inline struct page *
2476__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, 2488__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2477 struct zonelist *zonelist, enum zone_type high_zoneidx, 2489 struct zonelist *zonelist, enum zone_type high_zoneidx,
2478 nodemask_t *nodemask, struct zone *preferred_zone, 2490 nodemask_t *nodemask, struct zone *preferred_zone,
2479 int migratetype) 2491 int classzone_idx, int migratetype)
2480{ 2492{
2481 const gfp_t wait = gfp_mask & __GFP_WAIT; 2493 const gfp_t wait = gfp_mask & __GFP_WAIT;
2482 struct page *page = NULL; 2494 struct page *page = NULL;
2483 int alloc_flags; 2495 int alloc_flags;
2484 unsigned long pages_reclaimed = 0; 2496 unsigned long pages_reclaimed = 0;
2485 unsigned long did_some_progress; 2497 unsigned long did_some_progress;
2486 bool sync_migration = false; 2498 enum migrate_mode migration_mode = MIGRATE_ASYNC;
2487 bool deferred_compaction = false; 2499 bool deferred_compaction = false;
2488 bool contended_compaction = false; 2500 bool contended_compaction = false;
2489 2501
@@ -2525,15 +2537,18 @@ restart:
2525 * Find the true preferred zone if the allocation is unconstrained by 2537 * Find the true preferred zone if the allocation is unconstrained by
2526 * cpusets. 2538 * cpusets.
2527 */ 2539 */
2528 if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) 2540 if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) {
2529 first_zones_zonelist(zonelist, high_zoneidx, NULL, 2541 struct zoneref *preferred_zoneref;
2530 &preferred_zone); 2542 preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx,
2543 NULL, &preferred_zone);
2544 classzone_idx = zonelist_zone_idx(preferred_zoneref);
2545 }
2531 2546
2532rebalance: 2547rebalance:
2533 /* This is the last chance, in general, before the goto nopage. */ 2548 /* This is the last chance, in general, before the goto nopage. */
2534 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, 2549 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
2535 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, 2550 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
2536 preferred_zone, migratetype); 2551 preferred_zone, classzone_idx, migratetype);
2537 if (page) 2552 if (page)
2538 goto got_pg; 2553 goto got_pg;
2539 2554
@@ -2548,7 +2563,7 @@ rebalance:
2548 2563
2549 page = __alloc_pages_high_priority(gfp_mask, order, 2564 page = __alloc_pages_high_priority(gfp_mask, order,
2550 zonelist, high_zoneidx, nodemask, 2565 zonelist, high_zoneidx, nodemask,
2551 preferred_zone, migratetype); 2566 preferred_zone, classzone_idx, migratetype);
2552 if (page) { 2567 if (page) {
2553 goto got_pg; 2568 goto got_pg;
2554 } 2569 }
@@ -2577,17 +2592,23 @@ rebalance:
2577 * Try direct compaction. The first pass is asynchronous. Subsequent 2592 * Try direct compaction. The first pass is asynchronous. Subsequent
2578 * attempts after direct reclaim are synchronous 2593 * attempts after direct reclaim are synchronous
2579 */ 2594 */
2580 page = __alloc_pages_direct_compact(gfp_mask, order, 2595 page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
2581 zonelist, high_zoneidx, 2596 high_zoneidx, nodemask, alloc_flags,
2582 nodemask, 2597 preferred_zone,
2583 alloc_flags, preferred_zone, 2598 classzone_idx, migratetype,
2584 migratetype, sync_migration, 2599 migration_mode, &contended_compaction,
2585 &contended_compaction,
2586 &deferred_compaction, 2600 &deferred_compaction,
2587 &did_some_progress); 2601 &did_some_progress);
2588 if (page) 2602 if (page)
2589 goto got_pg; 2603 goto got_pg;
2590 sync_migration = true; 2604
2605 /*
2606 * It can become very expensive to allocate transparent hugepages at
2607 * fault, so use asynchronous memory compaction for THP unless it is
2608 * khugepaged trying to collapse.
2609 */
2610 if (!(gfp_mask & __GFP_NO_KSWAPD) || (current->flags & PF_KTHREAD))
2611 migration_mode = MIGRATE_SYNC_LIGHT;
2591 2612
2592 /* 2613 /*
2593 * If compaction is deferred for high-order allocations, it is because 2614 * If compaction is deferred for high-order allocations, it is because
@@ -2604,7 +2625,8 @@ rebalance:
2604 zonelist, high_zoneidx, 2625 zonelist, high_zoneidx,
2605 nodemask, 2626 nodemask,
2606 alloc_flags, preferred_zone, 2627 alloc_flags, preferred_zone,
2607 migratetype, &did_some_progress); 2628 classzone_idx, migratetype,
2629 &did_some_progress);
2608 if (page) 2630 if (page)
2609 goto got_pg; 2631 goto got_pg;
2610 2632
@@ -2623,7 +2645,7 @@ rebalance:
2623 page = __alloc_pages_may_oom(gfp_mask, order, 2645 page = __alloc_pages_may_oom(gfp_mask, order,
2624 zonelist, high_zoneidx, 2646 zonelist, high_zoneidx,
2625 nodemask, preferred_zone, 2647 nodemask, preferred_zone,
2626 migratetype); 2648 classzone_idx, migratetype);
2627 if (page) 2649 if (page)
2628 goto got_pg; 2650 goto got_pg;
2629 2651
@@ -2662,12 +2684,11 @@ rebalance:
2662 * direct reclaim and reclaim/compaction depends on compaction 2684 * direct reclaim and reclaim/compaction depends on compaction
2663 * being called after reclaim so call directly if necessary 2685 * being called after reclaim so call directly if necessary
2664 */ 2686 */
2665 page = __alloc_pages_direct_compact(gfp_mask, order, 2687 page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
2666 zonelist, high_zoneidx, 2688 high_zoneidx, nodemask, alloc_flags,
2667 nodemask, 2689 preferred_zone,
2668 alloc_flags, preferred_zone, 2690 classzone_idx, migratetype,
2669 migratetype, sync_migration, 2691 migration_mode, &contended_compaction,
2670 &contended_compaction,
2671 &deferred_compaction, 2692 &deferred_compaction,
2672 &did_some_progress); 2693 &did_some_progress);
2673 if (page) 2694 if (page)
@@ -2693,11 +2714,12 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2693{ 2714{
2694 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 2715 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
2695 struct zone *preferred_zone; 2716 struct zone *preferred_zone;
2717 struct zoneref *preferred_zoneref;
2696 struct page *page = NULL; 2718 struct page *page = NULL;
2697 int migratetype = allocflags_to_migratetype(gfp_mask); 2719 int migratetype = allocflags_to_migratetype(gfp_mask);
2698 unsigned int cpuset_mems_cookie; 2720 unsigned int cpuset_mems_cookie;
2699 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; 2721 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
2700 struct mem_cgroup *memcg = NULL; 2722 int classzone_idx;
2701 2723
2702 gfp_mask &= gfp_allowed_mask; 2724 gfp_mask &= gfp_allowed_mask;
2703 2725
@@ -2716,22 +2738,16 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2716 if (unlikely(!zonelist->_zonerefs->zone)) 2738 if (unlikely(!zonelist->_zonerefs->zone))
2717 return NULL; 2739 return NULL;
2718 2740
2719 /*
2720 * Will only have any effect when __GFP_KMEMCG is set. This is
2721 * verified in the (always inline) callee
2722 */
2723 if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
2724 return NULL;
2725
2726retry_cpuset: 2741retry_cpuset:
2727 cpuset_mems_cookie = read_mems_allowed_begin(); 2742 cpuset_mems_cookie = read_mems_allowed_begin();
2728 2743
2729 /* The preferred zone is used for statistics later */ 2744 /* The preferred zone is used for statistics later */
2730 first_zones_zonelist(zonelist, high_zoneidx, 2745 preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx,
2731 nodemask ? : &cpuset_current_mems_allowed, 2746 nodemask ? : &cpuset_current_mems_allowed,
2732 &preferred_zone); 2747 &preferred_zone);
2733 if (!preferred_zone) 2748 if (!preferred_zone)
2734 goto out; 2749 goto out;
2750 classzone_idx = zonelist_zone_idx(preferred_zoneref);
2735 2751
2736#ifdef CONFIG_CMA 2752#ifdef CONFIG_CMA
2737 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) 2753 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
@@ -2741,7 +2757,7 @@ retry:
2741 /* First allocation attempt */ 2757 /* First allocation attempt */
2742 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 2758 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
2743 zonelist, high_zoneidx, alloc_flags, 2759 zonelist, high_zoneidx, alloc_flags,
2744 preferred_zone, migratetype); 2760 preferred_zone, classzone_idx, migratetype);
2745 if (unlikely(!page)) { 2761 if (unlikely(!page)) {
2746 /* 2762 /*
2747 * The first pass makes sure allocations are spread 2763 * The first pass makes sure allocations are spread
@@ -2767,7 +2783,7 @@ retry:
2767 gfp_mask = memalloc_noio_flags(gfp_mask); 2783 gfp_mask = memalloc_noio_flags(gfp_mask);
2768 page = __alloc_pages_slowpath(gfp_mask, order, 2784 page = __alloc_pages_slowpath(gfp_mask, order,
2769 zonelist, high_zoneidx, nodemask, 2785 zonelist, high_zoneidx, nodemask,
2770 preferred_zone, migratetype); 2786 preferred_zone, classzone_idx, migratetype);
2771 } 2787 }
2772 2788
2773 trace_mm_page_alloc(page, order, gfp_mask, migratetype); 2789 trace_mm_page_alloc(page, order, gfp_mask, migratetype);
@@ -2782,8 +2798,6 @@ out:
2782 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) 2798 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2783 goto retry_cpuset; 2799 goto retry_cpuset;
2784 2800
2785 memcg_kmem_commit_charge(page, memcg, order);
2786
2787 return page; 2801 return page;
2788} 2802}
2789EXPORT_SYMBOL(__alloc_pages_nodemask); 2803EXPORT_SYMBOL(__alloc_pages_nodemask);
@@ -2818,7 +2832,7 @@ void __free_pages(struct page *page, unsigned int order)
2818{ 2832{
2819 if (put_page_testzero(page)) { 2833 if (put_page_testzero(page)) {
2820 if (order == 0) 2834 if (order == 0)
2821 free_hot_cold_page(page, 0); 2835 free_hot_cold_page(page, false);
2822 else 2836 else
2823 __free_pages_ok(page, order); 2837 __free_pages_ok(page, order);
2824 } 2838 }
@@ -2837,27 +2851,51 @@ void free_pages(unsigned long addr, unsigned int order)
2837EXPORT_SYMBOL(free_pages); 2851EXPORT_SYMBOL(free_pages);
2838 2852
2839/* 2853/*
2840 * __free_memcg_kmem_pages and free_memcg_kmem_pages will free 2854 * alloc_kmem_pages charges newly allocated pages to the kmem resource counter
2841 * pages allocated with __GFP_KMEMCG. 2855 * of the current memory cgroup.
2842 *
2843 * Those pages are accounted to a particular memcg, embedded in the
2844 * corresponding page_cgroup. To avoid adding a hit in the allocator to search
2845 * for that information only to find out that it is NULL for users who have no
2846 * interest in that whatsoever, we provide these functions.
2847 * 2856 *
2848 * The caller knows better which flags it relies on. 2857 * It should be used when the caller would like to use kmalloc, but since the
2858 * allocation is large, it has to fall back to the page allocator.
2849 */ 2859 */
2850void __free_memcg_kmem_pages(struct page *page, unsigned int order) 2860struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order)
2861{
2862 struct page *page;
2863 struct mem_cgroup *memcg = NULL;
2864
2865 if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
2866 return NULL;
2867 page = alloc_pages(gfp_mask, order);
2868 memcg_kmem_commit_charge(page, memcg, order);
2869 return page;
2870}
2871
2872struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
2873{
2874 struct page *page;
2875 struct mem_cgroup *memcg = NULL;
2876
2877 if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
2878 return NULL;
2879 page = alloc_pages_node(nid, gfp_mask, order);
2880 memcg_kmem_commit_charge(page, memcg, order);
2881 return page;
2882}
2883
2884/*
2885 * __free_kmem_pages and free_kmem_pages will free pages allocated with
2886 * alloc_kmem_pages.
2887 */
2888void __free_kmem_pages(struct page *page, unsigned int order)
2851{ 2889{
2852 memcg_kmem_uncharge_pages(page, order); 2890 memcg_kmem_uncharge_pages(page, order);
2853 __free_pages(page, order); 2891 __free_pages(page, order);
2854} 2892}
2855 2893
2856void free_memcg_kmem_pages(unsigned long addr, unsigned int order) 2894void free_kmem_pages(unsigned long addr, unsigned int order)
2857{ 2895{
2858 if (addr != 0) { 2896 if (addr != 0) {
2859 VM_BUG_ON(!virt_addr_valid((void *)addr)); 2897 VM_BUG_ON(!virt_addr_valid((void *)addr));
2860 __free_memcg_kmem_pages(virt_to_page((void *)addr), order); 2898 __free_kmem_pages(virt_to_page((void *)addr), order);
2861 } 2899 }
2862} 2900}
2863 2901
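
alloc_kmem_pages() and alloc_kmem_pages_node() charge the memcg kmem counter first, then allocate, then commit the charge against the page, which is what lets the earlier hunk drop that accounting from the generic __alloc_pages_nodemask() path. The general shape is "reserve, allocate, then keep or undo the reservation"; in the sketch below a plain global stands in for the per-cgroup counter and every name is invented:

#include <stdio.h>
#include <stdlib.h>

static long quota_left = 4;	/* pretend the cgroup may take 4 more pages */

static int charge(unsigned int nr_pages)
{
	if (quota_left < (long)nr_pages)
		return 0;	/* over limit: refuse before allocating */
	quota_left -= nr_pages;
	return 1;
}

static void cancel_charge(unsigned int nr_pages)
{
	quota_left += nr_pages;	/* allocation failed, give the quota back */
}

static void *alloc_accounted(size_t size, unsigned int nr_pages)
{
	void *p;

	if (!charge(nr_pages))
		return NULL;
	p = malloc(size);
	if (!p)
		cancel_charge(nr_pages);	/* commit step: keep or undo */
	return p;
}

int main(void)
{
	void *a = alloc_accounted(4096, 1);
	void *b = alloc_accounted(4 * 4096, 4);	/* refused: only 3 pages left */

	printf("a=%p b=%p quota_left=%ld\n", a, b, quota_left);
	free(a);
	return 0;
}
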
@@ -4095,7 +4133,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
4095 4133
4096static void __meminit zone_init_free_lists(struct zone *zone) 4134static void __meminit zone_init_free_lists(struct zone *zone)
4097{ 4135{
4098 int order, t; 4136 unsigned int order, t;
4099 for_each_migratetype_order(order, t) { 4137 for_each_migratetype_order(order, t) {
4100 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); 4138 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
4101 zone->free_area[order].nr_free = 0; 4139 zone->free_area[order].nr_free = 0;
@@ -4349,9 +4387,6 @@ int __meminit init_currently_empty_zone(struct zone *zone,
4349#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID 4387#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
4350/* 4388/*
4351 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. 4389 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
4352 * Architectures may implement their own version but if add_active_range()
4353 * was used and there are no special requirements, this is a convenient
4354 * alternative
4355 */ 4390 */
4356int __meminit __early_pfn_to_nid(unsigned long pfn) 4391int __meminit __early_pfn_to_nid(unsigned long pfn)
4357{ 4392{
@@ -4406,10 +4441,9 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
4406 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. 4441 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
4407 * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid 4442 * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid
4408 * 4443 *
4409 * If an architecture guarantees that all ranges registered with 4444 * If an architecture guarantees that all ranges registered contain no holes
4410 * add_active_ranges() contain no holes and may be freed, this 4445 * and may be freed, this function may be used instead of calling
4411 * this function may be used instead of calling memblock_free_early_nid() 4446 * memblock_free_early_nid() manually.
4412 * manually.
4413 */ 4447 */
4414void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) 4448void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
4415{ 4449{
@@ -4431,9 +4465,8 @@ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
4431 * sparse_memory_present_with_active_regions - Call memory_present for each active range 4465 * sparse_memory_present_with_active_regions - Call memory_present for each active range
4432 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. 4466 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
4433 * 4467 *
4434 * If an architecture guarantees that all ranges registered with 4468 * If an architecture guarantees that all ranges registered contain no holes and may
4435 * add_active_ranges() contain no holes and may be freed, this 4469 * be freed, this function may be used instead of calling memory_present() manually.
4436 * function may be used instead of calling memory_present() manually.
4437 */ 4470 */
4438void __init sparse_memory_present_with_active_regions(int nid) 4471void __init sparse_memory_present_with_active_regions(int nid)
4439{ 4472{
@@ -4451,7 +4484,7 @@ void __init sparse_memory_present_with_active_regions(int nid)
4451 * @end_pfn: Passed by reference. On return, it will have the node end_pfn. 4484 * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
4452 * 4485 *
4453 * It returns the start and end page frame of a node based on information 4486 * It returns the start and end page frame of a node based on information
4454 * provided by an arch calling add_active_range(). If called for a node 4487 * provided by memblock_set_node(). If called for a node
4455 * with no available memory, a warning is printed and the start and end 4488 * with no available memory, a warning is printed and the start and end
4456 * PFNs will be 0. 4489 * PFNs will be 0.
4457 */ 4490 */
@@ -4921,8 +4954,6 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
4921 4954
4922 pgdat->node_id = nid; 4955 pgdat->node_id = nid;
4923 pgdat->node_start_pfn = node_start_pfn; 4956 pgdat->node_start_pfn = node_start_pfn;
4924 if (node_state(nid, N_MEMORY))
4925 init_zone_allows_reclaim(nid);
4926#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4957#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4927 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); 4958 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
4928#endif 4959#endif
@@ -5030,7 +5061,7 @@ static unsigned long __init find_min_pfn_for_node(int nid)
5030 * find_min_pfn_with_active_regions - Find the minimum PFN registered 5061 * find_min_pfn_with_active_regions - Find the minimum PFN registered
5031 * 5062 *
5032 * It returns the minimum PFN based on information provided via 5063 * It returns the minimum PFN based on information provided via
5033 * add_active_range(). 5064 * memblock_set_node().
5034 */ 5065 */
5035unsigned long __init find_min_pfn_with_active_regions(void) 5066unsigned long __init find_min_pfn_with_active_regions(void)
5036{ 5067{
@@ -5251,7 +5282,7 @@ static void check_for_memory(pg_data_t *pgdat, int nid)
5251 * @max_zone_pfn: an array of max PFNs for each zone 5282 * @max_zone_pfn: an array of max PFNs for each zone
5252 * 5283 *
5253 * This will call free_area_init_node() for each active node in the system. 5284 * This will call free_area_init_node() for each active node in the system.
5254 * Using the page ranges provided by add_active_range(), the size of each 5285 * Using the page ranges provided by memblock_set_node(), the size of each
5255 * zone in each node and their holes is calculated. If the maximum PFN 5286 * zone in each node and their holes is calculated. If the maximum PFN
5256 * between two adjacent zones match, it is assumed that the zone is empty. 5287 * between two adjacent zones match, it is assumed that the zone is empty.
5257 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed 5288 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
@@ -6009,53 +6040,64 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
6009 * @end_bitidx: The last bit of interest 6040 * @end_bitidx: The last bit of interest
6010 * returns pageblock_bits flags 6041 * returns pageblock_bits flags
6011 */ 6042 */
6012unsigned long get_pageblock_flags_group(struct page *page, 6043unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
6013 int start_bitidx, int end_bitidx) 6044 unsigned long end_bitidx,
6045 unsigned long mask)
6014{ 6046{
6015 struct zone *zone; 6047 struct zone *zone;
6016 unsigned long *bitmap; 6048 unsigned long *bitmap;
6017 unsigned long pfn, bitidx; 6049 unsigned long bitidx, word_bitidx;
6018 unsigned long flags = 0; 6050 unsigned long word;
6019 unsigned long value = 1;
6020 6051
6021 zone = page_zone(page); 6052 zone = page_zone(page);
6022 pfn = page_to_pfn(page);
6023 bitmap = get_pageblock_bitmap(zone, pfn); 6053 bitmap = get_pageblock_bitmap(zone, pfn);
6024 bitidx = pfn_to_bitidx(zone, pfn); 6054 bitidx = pfn_to_bitidx(zone, pfn);
6055 word_bitidx = bitidx / BITS_PER_LONG;
6056 bitidx &= (BITS_PER_LONG-1);
6025 6057
6026 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) 6058 word = bitmap[word_bitidx];
6027 if (test_bit(bitidx + start_bitidx, bitmap)) 6059 bitidx += end_bitidx;
6028 flags |= value; 6060 return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
6029
6030 return flags;
6031} 6061}
6032 6062
6033/** 6063/**
6034 * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages 6064 * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
6035 * @page: The page within the block of interest 6065 * @page: The page within the block of interest
6036 * @start_bitidx: The first bit of interest 6066 * @start_bitidx: The first bit of interest
6037 * @end_bitidx: The last bit of interest 6067 * @end_bitidx: The last bit of interest
6038 * @flags: The flags to set 6068 * @flags: The flags to set
6039 */ 6069 */
6040void set_pageblock_flags_group(struct page *page, unsigned long flags, 6070void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
6041 int start_bitidx, int end_bitidx) 6071 unsigned long pfn,
6072 unsigned long end_bitidx,
6073 unsigned long mask)
6042{ 6074{
6043 struct zone *zone; 6075 struct zone *zone;
6044 unsigned long *bitmap; 6076 unsigned long *bitmap;
6045 unsigned long pfn, bitidx; 6077 unsigned long bitidx, word_bitidx;
6046 unsigned long value = 1; 6078 unsigned long old_word, word;
6079
6080 BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
6047 6081
6048 zone = page_zone(page); 6082 zone = page_zone(page);
6049 pfn = page_to_pfn(page);
6050 bitmap = get_pageblock_bitmap(zone, pfn); 6083 bitmap = get_pageblock_bitmap(zone, pfn);
6051 bitidx = pfn_to_bitidx(zone, pfn); 6084 bitidx = pfn_to_bitidx(zone, pfn);
6085 word_bitidx = bitidx / BITS_PER_LONG;
6086 bitidx &= (BITS_PER_LONG-1);
6087
6052 VM_BUG_ON_PAGE(!zone_spans_pfn(zone, pfn), page); 6088 VM_BUG_ON_PAGE(!zone_spans_pfn(zone, pfn), page);
6053 6089
6054 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) 6090 bitidx += end_bitidx;
6055 if (flags & value) 6091 mask <<= (BITS_PER_LONG - bitidx - 1);
6056 __set_bit(bitidx + start_bitidx, bitmap); 6092 flags <<= (BITS_PER_LONG - bitidx - 1);
6057 else 6093
6058 __clear_bit(bitidx + start_bitidx, bitmap); 6094 word = ACCESS_ONCE(bitmap[word_bitidx]);
6095 for (;;) {
6096 old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
6097 if (word == old_word)
6098 break;
6099 word = old_word;
6100 }
6059} 6101}
6060 6102
6061/* 6103/*
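
get_pfnblock_flags_mask() and set_pfnblock_flags_mask() now treat the pageblock bits as a small field packed into one unsigned long: the getter is a shift-and-mask, and the setter retries cmpxchg() until its updated word lands without overwriting a concurrent change. A self-contained sketch of that lock-free read-modify-write on a packed field, using C11 atomics and a simplified bit layout rather than the kernel's:

#include <stdio.h>
#include <stdatomic.h>

/* A 4-bit field inside a shared 64-bit word, updated with a CAS retry loop. */
static _Atomic unsigned long word;

static unsigned long get_field(unsigned int shift, unsigned long mask)
{
	return (atomic_load(&word) >> shift) & mask;
}

static void set_field(unsigned int shift, unsigned long mask, unsigned long val)
{
	unsigned long old = atomic_load(&word);
	unsigned long newval;

	do {
		newval = (old & ~(mask << shift)) | ((val & mask) << shift);
		/* on failure, "old" is refreshed with the current value */
	} while (!atomic_compare_exchange_weak(&word, &old, newval));
}

int main(void)
{
	set_field(8, 0xf, 0x5);		/* bits 8..11 := 0101 */
	set_field(12, 0xf, 0xa);	/* bits 12..15 := 1010, leaves 8..11 alone */
	printf("word=0x%lx field@8=0x%lx field@12=0x%lx\n",
	       atomic_load(&word), get_field(8, 0xf), get_field(12, 0xf));
	return 0;
}

atomic_compare_exchange_weak() may fail spuriously, which the retry loop absorbs, just as the kernel loop retries when the bitmap word changed underneath it.
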
@@ -6215,7 +6257,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
6215 cc->nr_migratepages -= nr_reclaimed; 6257 cc->nr_migratepages -= nr_reclaimed;
6216 6258
6217 ret = migrate_pages(&cc->migratepages, alloc_migrate_target, 6259 ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
6218 0, MIGRATE_SYNC, MR_CMA); 6260 NULL, 0, cc->mode, MR_CMA);
6219 } 6261 }
6220 if (ret < 0) { 6262 if (ret < 0) {
6221 putback_movable_pages(&cc->migratepages); 6263 putback_movable_pages(&cc->migratepages);
@@ -6254,7 +6296,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
6254 .nr_migratepages = 0, 6296 .nr_migratepages = 0,
6255 .order = -1, 6297 .order = -1,
6256 .zone = page_zone(pfn_to_page(start)), 6298 .zone = page_zone(pfn_to_page(start)),
6257 .sync = true, 6299 .mode = MIGRATE_SYNC,
6258 .ignore_skip_hint = true, 6300 .ignore_skip_hint = true,
6259 }; 6301 };
6260 INIT_LIST_HEAD(&cc.migratepages); 6302 INIT_LIST_HEAD(&cc.migratepages);
@@ -6409,7 +6451,7 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
6409{ 6451{
6410 struct page *page; 6452 struct page *page;
6411 struct zone *zone; 6453 struct zone *zone;
6412 int order, i; 6454 unsigned int order, i;
6413 unsigned long pfn; 6455 unsigned long pfn;
6414 unsigned long flags; 6456 unsigned long flags;
6415 /* find the first valid pfn */ 6457 /* find the first valid pfn */
@@ -6461,7 +6503,7 @@ bool is_free_buddy_page(struct page *page)
6461 struct zone *zone = page_zone(page); 6503 struct zone *zone = page_zone(page);
6462 unsigned long pfn = page_to_pfn(page); 6504 unsigned long pfn = page_to_pfn(page);
6463 unsigned long flags; 6505 unsigned long flags;
6464 int order; 6506 unsigned int order;
6465 6507
6466 spin_lock_irqsave(&zone->lock, flags); 6508 spin_lock_irqsave(&zone->lock, flags);
6467 for (order = 0; order < MAX_ORDER; order++) { 6509 for (order = 0; order < MAX_ORDER; order++) {
diff --git a/mm/page_io.c b/mm/page_io.c
index 7c59ef681381..58b50d2901fe 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -248,11 +248,16 @@ out:
248 return ret; 248 return ret;
249} 249}
250 250
251static sector_t swap_page_sector(struct page *page)
252{
253 return (sector_t)__page_file_index(page) << (PAGE_CACHE_SHIFT - 9);
254}
255
251int __swap_writepage(struct page *page, struct writeback_control *wbc, 256int __swap_writepage(struct page *page, struct writeback_control *wbc,
252 void (*end_write_func)(struct bio *, int)) 257 void (*end_write_func)(struct bio *, int))
253{ 258{
254 struct bio *bio; 259 struct bio *bio;
255 int ret = 0, rw = WRITE; 260 int ret, rw = WRITE;
256 struct swap_info_struct *sis = page_swap_info(page); 261 struct swap_info_struct *sis = page_swap_info(page);
257 262
258 if (sis->flags & SWP_FILE) { 263 if (sis->flags & SWP_FILE) {
@@ -297,6 +302,13 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
297 return ret; 302 return ret;
298 } 303 }
299 304
305 ret = bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc);
306 if (!ret) {
307 count_vm_event(PSWPOUT);
308 return 0;
309 }
310
311 ret = 0;
300 bio = get_swap_bio(GFP_NOIO, page, end_write_func); 312 bio = get_swap_bio(GFP_NOIO, page, end_write_func);
301 if (bio == NULL) { 313 if (bio == NULL) {
302 set_page_dirty(page); 314 set_page_dirty(page);
@@ -338,6 +350,13 @@ int swap_readpage(struct page *page)
338 return ret; 350 return ret;
339 } 351 }
340 352
353 ret = bdev_read_page(sis->bdev, swap_page_sector(page), page);
354 if (!ret) {
355 count_vm_event(PSWPIN);
356 return 0;
357 }
358
359 ret = 0;
341 bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); 360 bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read);
342 if (bio == NULL) { 361 if (bio == NULL) {
343 unlock_page(page); 362 unlock_page(page);
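
Both new fast paths above hand the swap page directly to the block driver through bdev_write_page()/bdev_read_page(), counting PSWPOUT/PSWPIN on success and falling back to the bio path otherwise. The sector they pass comes from swap_page_sector(), which only rescales the page's index from page units to 512-byte sectors. A stand-alone illustration of that conversion, assuming 4 KiB pages (the function name here is invented for the example):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT	12	/* assume 4 KiB pages */
#define SECTOR_SHIFT	9	/* 512-byte sectors */

/* Page index within the swap device -> starting 512-byte sector,
 * mirroring swap_page_sector()'s "index << (PAGE_CACHE_SHIFT - 9)". */
static uint64_t page_index_to_sector(uint64_t index)
{
	return index << (PAGE_SHIFT - SECTOR_SHIFT);
}

int main(void)
{
	uint64_t idx = 3;

	printf("page %llu starts at sector %llu (byte offset %llu)\n",
	       (unsigned long long)idx,
	       (unsigned long long)page_index_to_sector(idx),
	       (unsigned long long)(page_index_to_sector(idx) << SECTOR_SHIFT));
	return 0;
}
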
diff --git a/mm/rmap.c b/mm/rmap.c
index 9c3e77396d1a..ea8e20d75b29 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -103,6 +103,7 @@ static inline void anon_vma_free(struct anon_vma *anon_vma)
103 * LOCK should suffice since the actual taking of the lock must 103 * LOCK should suffice since the actual taking of the lock must
104 * happen _before_ what follows. 104 * happen _before_ what follows.
105 */ 105 */
106 might_sleep();
106 if (rwsem_is_locked(&anon_vma->root->rwsem)) { 107 if (rwsem_is_locked(&anon_vma->root->rwsem)) {
107 anon_vma_lock_write(anon_vma); 108 anon_vma_lock_write(anon_vma);
108 anon_vma_unlock_write(anon_vma); 109 anon_vma_unlock_write(anon_vma);
@@ -426,8 +427,9 @@ struct anon_vma *page_get_anon_vma(struct page *page)
426 * above cannot corrupt). 427 * above cannot corrupt).
427 */ 428 */
428 if (!page_mapped(page)) { 429 if (!page_mapped(page)) {
430 rcu_read_unlock();
429 put_anon_vma(anon_vma); 431 put_anon_vma(anon_vma);
430 anon_vma = NULL; 432 return NULL;
431 } 433 }
432out: 434out:
433 rcu_read_unlock(); 435 rcu_read_unlock();
@@ -477,9 +479,9 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page)
477 } 479 }
478 480
479 if (!page_mapped(page)) { 481 if (!page_mapped(page)) {
482 rcu_read_unlock();
480 put_anon_vma(anon_vma); 483 put_anon_vma(anon_vma);
481 anon_vma = NULL; 484 return NULL;
482 goto out;
483 } 485 }
484 486
485 /* we pinned the anon_vma, it's safe to sleep */ 487
@@ -669,7 +671,7 @@ struct page_referenced_arg {
669/* 671/*
670 * arg: page_referenced_arg will be passed 672 * arg: page_referenced_arg will be passed
671 */ 673 */
672int page_referenced_one(struct page *page, struct vm_area_struct *vma, 674static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
673 unsigned long address, void *arg) 675 unsigned long address, void *arg)
674{ 676{
675 struct mm_struct *mm = vma->vm_mm; 677 struct mm_struct *mm = vma->vm_mm;
@@ -986,6 +988,12 @@ void do_page_add_anon_rmap(struct page *page,
986{ 988{
987 int first = atomic_inc_and_test(&page->_mapcount); 989 int first = atomic_inc_and_test(&page->_mapcount);
988 if (first) { 990 if (first) {
991 /*
992 * We use the irq-unsafe __{inc|mod}_zone_page_stat because
993 * these counters are not modified in interrupt context, and
994 * pte lock(a spinlock) is held, which implies preemption
995 * disabled.
996 */
989 if (PageTransHuge(page)) 997 if (PageTransHuge(page))
990 __inc_zone_page_state(page, 998 __inc_zone_page_state(page,
991 NR_ANON_TRANSPARENT_HUGEPAGES); 999 NR_ANON_TRANSPARENT_HUGEPAGES);
@@ -1024,11 +1032,25 @@ void page_add_new_anon_rmap(struct page *page,
1024 __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, 1032 __mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
1025 hpage_nr_pages(page)); 1033 hpage_nr_pages(page));
1026 __page_set_anon_rmap(page, vma, address, 1); 1034 __page_set_anon_rmap(page, vma, address, 1);
1027 if (!mlocked_vma_newpage(vma, page)) { 1035
1036 VM_BUG_ON_PAGE(PageLRU(page), page);
1037 if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) {
1028 SetPageActive(page); 1038 SetPageActive(page);
1029 lru_cache_add(page); 1039 lru_cache_add(page);
1030 } else 1040 return;
1031 add_page_to_unevictable_list(page); 1041 }
1042
1043 if (!TestSetPageMlocked(page)) {
1044 /*
1045 * We use the irq-unsafe __mod_zone_page_stat because this
1046 * counter is not modified from interrupt context, and the pte
1047 * lock is held(spinlock), which implies preemption disabled.
1048 */
1049 __mod_zone_page_state(page_zone(page), NR_MLOCK,
1050 hpage_nr_pages(page));
1051 count_vm_event(UNEVICTABLE_PGMLOCKED);
1052 }
1053 add_page_to_unevictable_list(page);
1032} 1054}
1033 1055
1034/** 1056/**
@@ -1077,6 +1099,11 @@ void page_remove_rmap(struct page *page)
1077 /* 1099 /*
1078 * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED 1100 * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED
1079 * and not charged by memcg for now. 1101 * and not charged by memcg for now.
1102 *
1103 * We use the irq-unsafe __{inc|mod}_zone_page_stat because
1104 * these counters are not modified in interrupt context, and
1106 * pte lock(a spinlock) is held, which implies preemption disabled.
1080 */ 1107 */
1081 if (unlikely(PageHuge(page))) 1108 if (unlikely(PageHuge(page)))
1082 goto out; 1109 goto out;
@@ -1112,7 +1139,7 @@ out:
1112/* 1139/*
1113 * @arg: enum ttu_flags will be passed to this argument 1140 * @arg: enum ttu_flags will be passed to this argument
1114 */ 1141 */
1115int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, 1142static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1116 unsigned long address, void *arg) 1143 unsigned long address, void *arg)
1117{ 1144{
1118 struct mm_struct *mm = vma->vm_mm; 1145 struct mm_struct *mm = vma->vm_mm;
@@ -1135,7 +1162,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1135 if (vma->vm_flags & VM_LOCKED) 1162 if (vma->vm_flags & VM_LOCKED)
1136 goto out_mlock; 1163 goto out_mlock;
1137 1164
1138 if (TTU_ACTION(flags) == TTU_MUNLOCK) 1165 if (flags & TTU_MUNLOCK)
1139 goto out_unmap; 1166 goto out_unmap;
1140 } 1167 }
1141 if (!(flags & TTU_IGNORE_ACCESS)) { 1168 if (!(flags & TTU_IGNORE_ACCESS)) {
@@ -1203,7 +1230,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1203 * pte. do_swap_page() will wait until the migration 1230 * pte. do_swap_page() will wait until the migration
1204 * pte is removed and then restart fault handling. 1231 * pte is removed and then restart fault handling.
1205 */ 1232 */
1206 BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION); 1233 BUG_ON(!(flags & TTU_MIGRATION));
1207 entry = make_migration_entry(page, pte_write(pteval)); 1234 entry = make_migration_entry(page, pte_write(pteval));
1208 } 1235 }
1209 swp_pte = swp_entry_to_pte(entry); 1236 swp_pte = swp_entry_to_pte(entry);
@@ -1212,7 +1239,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1212 set_pte_at(mm, address, pte, swp_pte); 1239 set_pte_at(mm, address, pte, swp_pte);
1213 BUG_ON(pte_file(*pte)); 1240 BUG_ON(pte_file(*pte));
1214 } else if (IS_ENABLED(CONFIG_MIGRATION) && 1241 } else if (IS_ENABLED(CONFIG_MIGRATION) &&
1215 (TTU_ACTION(flags) == TTU_MIGRATION)) { 1242 (flags & TTU_MIGRATION)) {
1216 /* Establish migration entry for a file page */ 1243 /* Establish migration entry for a file page */
1217 swp_entry_t entry; 1244 swp_entry_t entry;
1218 entry = make_migration_entry(page, pte_write(pteval)); 1245 entry = make_migration_entry(page, pte_write(pteval));
@@ -1225,7 +1252,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1225 1252
1226out_unmap: 1253out_unmap:
1227 pte_unmap_unlock(pte, ptl); 1254 pte_unmap_unlock(pte, ptl);
1228 if (ret != SWAP_FAIL) 1255 if (ret != SWAP_FAIL && !(flags & TTU_MUNLOCK))
1229 mmu_notifier_invalidate_page(mm, address); 1256 mmu_notifier_invalidate_page(mm, address);
1230out: 1257out:
1231 return ret; 1258 return ret;
@@ -1359,7 +1386,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1359 if (page->index != linear_page_index(vma, address)) { 1386 if (page->index != linear_page_index(vma, address)) {
1360 pte_t ptfile = pgoff_to_pte(page->index); 1387 pte_t ptfile = pgoff_to_pte(page->index);
1361 if (pte_soft_dirty(pteval)) 1388 if (pte_soft_dirty(pteval))
1362 pte_file_mksoft_dirty(ptfile); 1389 ptfile = pte_file_mksoft_dirty(ptfile);
1363 set_pte_at(mm, address, pte, ptfile); 1390 set_pte_at(mm, address, pte, ptfile);
1364 } 1391 }
1365 1392
@@ -1512,7 +1539,7 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
1512 * locking requirements of exec(), migration skips 1539 * locking requirements of exec(), migration skips
1513 * temporary VMAs until after exec() completes. 1540 * temporary VMAs until after exec() completes.
1514 */ 1541 */
1515 if (flags & TTU_MIGRATION && !PageKsm(page) && PageAnon(page)) 1542 if ((flags & TTU_MIGRATION) && !PageKsm(page) && PageAnon(page))
1516 rwc.invalid_vma = invalid_migration_vma; 1543 rwc.invalid_vma = invalid_migration_vma;
1517 1544
1518 ret = rmap_walk(page, &rwc); 1545 ret = rmap_walk(page, &rwc);
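
Several hunks in try_to_unmap_one() and try_to_unmap() above replace TTU_ACTION(flags) equality checks with plain bit tests, treating enum ttu_flags as an ordinary bitmask so that an action bit can be combined freely with modifier bits. Reduced to a self-contained example (flag values made up, only the names reused for illustration):

#include <stdio.h>

enum ttu_flags {
	TTU_MIGRATION     = 1 << 0,
	TTU_MUNLOCK       = 1 << 1,
	TTU_IGNORE_ACCESS = 1 << 2,
};

static void unmap_one(unsigned int flags)
{
	/* Bitmask tests: each action/modifier is checked independently,
	 * so combinations like TTU_MIGRATION | TTU_IGNORE_ACCESS work. */
	if (flags & TTU_MUNLOCK)
		printf("munlock-only pass\n");
	if (flags & TTU_MIGRATION)
		printf("install migration entry\n");
	if (!(flags & TTU_IGNORE_ACCESS))
		printf("honour pte young bit\n");
}

int main(void)
{
	unmap_one(TTU_MIGRATION | TTU_IGNORE_ACCESS);
	return 0;
}
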
diff --git a/mm/shmem.c b/mm/shmem.c
index 9f70e02111c6..5402481c28d1 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1132,7 +1132,7 @@ repeat:
1132 goto decused; 1132 goto decused;
1133 } 1133 }
1134 1134
1135 SetPageSwapBacked(page); 1135 __SetPageSwapBacked(page);
1136 __set_page_locked(page); 1136 __set_page_locked(page);
1137 error = mem_cgroup_charge_file(page, current->mm, 1137 error = mem_cgroup_charge_file(page, current->mm,
1138 gfp & GFP_RECLAIM_MASK); 1138 gfp & GFP_RECLAIM_MASK);
@@ -1372,9 +1372,13 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
1372 loff_t pos, unsigned len, unsigned flags, 1372 loff_t pos, unsigned len, unsigned flags,
1373 struct page **pagep, void **fsdata) 1373 struct page **pagep, void **fsdata)
1374{ 1374{
1375 int ret;
1375 struct inode *inode = mapping->host; 1376 struct inode *inode = mapping->host;
1376 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 1377 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1377 return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); 1378 ret = shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
1379 if (ret == 0 && *pagep)
1380 init_page_accessed(*pagep);
1381 return ret;
1378} 1382}
1379 1383
1380static int 1384static int
diff --git a/mm/slab.c b/mm/slab.c
index 19d92181ce24..9ca3b87edabc 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1621,10 +1621,16 @@ __initcall(cpucache_init);
1621static noinline void 1621static noinline void
1622slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) 1622slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
1623{ 1623{
1624#if DEBUG
1624 struct kmem_cache_node *n; 1625 struct kmem_cache_node *n;
1625 struct page *page; 1626 struct page *page;
1626 unsigned long flags; 1627 unsigned long flags;
1627 int node; 1628 int node;
1629 static DEFINE_RATELIMIT_STATE(slab_oom_rs, DEFAULT_RATELIMIT_INTERVAL,
1630 DEFAULT_RATELIMIT_BURST);
1631
1632 if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slab_oom_rs))
1633 return;
1628 1634
1629 printk(KERN_WARNING 1635 printk(KERN_WARNING
1630 "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n", 1636 "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n",
@@ -1662,6 +1668,7 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
1662 node, active_slabs, num_slabs, active_objs, num_objs, 1668 node, active_slabs, num_slabs, active_objs, num_objs,
1663 free_objects); 1669 free_objects);
1664 } 1670 }
1671#endif
1665} 1672}
1666 1673
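
With this change SLAB's slab_out_of_memory() compiles away entirely unless DEBUG is set, and when it is kept it rate-limits itself with a static DEFINE_RATELIMIT_STATE instead of relying on the caller to test printk_ratelimit(). The interval-plus-burst idea behind __ratelimit() can be sketched in user space roughly as follows (a deliberate simplification of the kernel helper, with arbitrary numbers):

#include <stdio.h>
#include <time.h>

struct ratelimit {
	time_t	interval;	/* window length in seconds */
	int	burst;		/* messages allowed per window */
	time_t	window_start;
	int	emitted;
};

/* Return 1 if a message may be emitted, 0 if it should be suppressed. */
static int ratelimit_ok(struct ratelimit *rl)
{
	time_t now = time(NULL);

	if (now - rl->window_start >= rl->interval) {
		rl->window_start = now;	/* new window: reset the budget */
		rl->emitted = 0;
	}
	if (rl->emitted >= rl->burst)
		return 0;
	rl->emitted++;
	return 1;
}

int main(void)
{
	struct ratelimit rl = { .interval = 5, .burst = 10 };

	for (int i = 0; i < 100; i++)
		if (ratelimit_ok(&rl))
			printf("warning %d\n", i);	/* only the first 10 print */
	return 0;
}
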
1667/* 1674/*
@@ -1681,10 +1688,13 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
1681 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1688 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1682 flags |= __GFP_RECLAIMABLE; 1689 flags |= __GFP_RECLAIMABLE;
1683 1690
1691 if (memcg_charge_slab(cachep, flags, cachep->gfporder))
1692 return NULL;
1693
1684 page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); 1694 page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
1685 if (!page) { 1695 if (!page) {
1686 if (!(flags & __GFP_NOWARN) && printk_ratelimit()) 1696 memcg_uncharge_slab(cachep, cachep->gfporder);
1687 slab_out_of_memory(cachep, flags, nodeid); 1697 slab_out_of_memory(cachep, flags, nodeid);
1688 return NULL; 1698 return NULL;
1689 } 1699 }
1690 1700
@@ -1702,7 +1712,6 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
1702 __SetPageSlab(page); 1712 __SetPageSlab(page);
1703 if (page->pfmemalloc) 1713 if (page->pfmemalloc)
1704 SetPageSlabPfmemalloc(page); 1714 SetPageSlabPfmemalloc(page);
1705 memcg_bind_pages(cachep, cachep->gfporder);
1706 1715
1707 if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { 1716 if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
1708 kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); 1717 kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid);
@@ -1738,10 +1747,10 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page)
1738 page_mapcount_reset(page); 1747 page_mapcount_reset(page);
1739 page->mapping = NULL; 1748 page->mapping = NULL;
1740 1749
1741 memcg_release_pages(cachep, cachep->gfporder);
1742 if (current->reclaim_state) 1750 if (current->reclaim_state)
1743 current->reclaim_state->reclaimed_slab += nr_freed; 1751 current->reclaim_state->reclaimed_slab += nr_freed;
1744 __free_memcg_kmem_pages(page, cachep->gfporder); 1752 __free_pages(page, cachep->gfporder);
1753 memcg_uncharge_slab(cachep, cachep->gfporder);
1745} 1754}
1746 1755
1747static void kmem_rcu_free(struct rcu_head *head) 1756static void kmem_rcu_free(struct rcu_head *head)
@@ -2469,8 +2478,7 @@ out:
2469 return nr_freed; 2478 return nr_freed;
2470} 2479}
2471 2480
2472/* Called with slab_mutex held to protect against cpu hotplug */ 2481int __kmem_cache_shrink(struct kmem_cache *cachep)
2473static int __cache_shrink(struct kmem_cache *cachep)
2474{ 2482{
2475 int ret = 0, i = 0; 2483 int ret = 0, i = 0;
2476 struct kmem_cache_node *n; 2484 struct kmem_cache_node *n;
@@ -2491,32 +2499,11 @@ static int __cache_shrink(struct kmem_cache *cachep)
2491 return (ret ? 1 : 0); 2499 return (ret ? 1 : 0);
2492} 2500}
2493 2501
2494/**
2495 * kmem_cache_shrink - Shrink a cache.
2496 * @cachep: The cache to shrink.
2497 *
2498 * Releases as many slabs as possible for a cache.
2499 * To help debugging, a zero exit status indicates all slabs were released.
2500 */
2501int kmem_cache_shrink(struct kmem_cache *cachep)
2502{
2503 int ret;
2504 BUG_ON(!cachep || in_interrupt());
2505
2506 get_online_cpus();
2507 mutex_lock(&slab_mutex);
2508 ret = __cache_shrink(cachep);
2509 mutex_unlock(&slab_mutex);
2510 put_online_cpus();
2511 return ret;
2512}
2513EXPORT_SYMBOL(kmem_cache_shrink);
2514
2515int __kmem_cache_shutdown(struct kmem_cache *cachep) 2502int __kmem_cache_shutdown(struct kmem_cache *cachep)
2516{ 2503{
2517 int i; 2504 int i;
2518 struct kmem_cache_node *n; 2505 struct kmem_cache_node *n;
2519 int rc = __cache_shrink(cachep); 2506 int rc = __kmem_cache_shrink(cachep);
2520 2507
2521 if (rc) 2508 if (rc)
2522 return rc; 2509 return rc;
diff --git a/mm/slab.h b/mm/slab.h
index 6bd4c353704f..961a3fb1f5a2 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -91,6 +91,7 @@ __kmem_cache_alias(const char *name, size_t size, size_t align,
91#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) 91#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS)
92 92
93int __kmem_cache_shutdown(struct kmem_cache *); 93int __kmem_cache_shutdown(struct kmem_cache *);
94int __kmem_cache_shrink(struct kmem_cache *);
94void slab_kmem_cache_release(struct kmem_cache *); 95void slab_kmem_cache_release(struct kmem_cache *);
95 96
96struct seq_file; 97struct seq_file;
@@ -120,21 +121,6 @@ static inline bool is_root_cache(struct kmem_cache *s)
120 return !s->memcg_params || s->memcg_params->is_root_cache; 121 return !s->memcg_params || s->memcg_params->is_root_cache;
121} 122}
122 123
123static inline void memcg_bind_pages(struct kmem_cache *s, int order)
124{
125 if (!is_root_cache(s))
126 atomic_add(1 << order, &s->memcg_params->nr_pages);
127}
128
129static inline void memcg_release_pages(struct kmem_cache *s, int order)
130{
131 if (is_root_cache(s))
132 return;
133
134 if (atomic_sub_and_test((1 << order), &s->memcg_params->nr_pages))
135 mem_cgroup_destroy_cache(s);
136}
137
138static inline bool slab_equal_or_root(struct kmem_cache *s, 124static inline bool slab_equal_or_root(struct kmem_cache *s,
139 struct kmem_cache *p) 125 struct kmem_cache *p)
140{ 126{
@@ -192,18 +178,29 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
192 return s; 178 return s;
193 return s->memcg_params->root_cache; 179 return s->memcg_params->root_cache;
194} 180}
195#else 181
196static inline bool is_root_cache(struct kmem_cache *s) 182static __always_inline int memcg_charge_slab(struct kmem_cache *s,
183 gfp_t gfp, int order)
197{ 184{
198 return true; 185 if (!memcg_kmem_enabled())
186 return 0;
187 if (is_root_cache(s))
188 return 0;
189 return __memcg_charge_slab(s, gfp, order);
199} 190}
200 191
201static inline void memcg_bind_pages(struct kmem_cache *s, int order) 192static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order)
202{ 193{
194 if (!memcg_kmem_enabled())
195 return;
196 if (is_root_cache(s))
197 return;
198 __memcg_uncharge_slab(s, order);
203} 199}
204 200#else
205static inline void memcg_release_pages(struct kmem_cache *s, int order) 201static inline bool is_root_cache(struct kmem_cache *s)
206{ 202{
203 return true;
207} 204}
208 205
209static inline bool slab_equal_or_root(struct kmem_cache *s, 206static inline bool slab_equal_or_root(struct kmem_cache *s,
@@ -227,6 +224,15 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
227{ 224{
228 return s; 225 return s;
229} 226}
227
228static inline int memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, int order)
229{
230 return 0;
231}
232
233static inline void memcg_uncharge_slab(struct kmem_cache *s, int order)
234{
235}
230#endif 236#endif
231 237
232static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) 238static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
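
The new memcg_charge_slab()/memcg_uncharge_slab() helpers above wrap per-cgroup accounting around the slab page allocation itself: charge before allocating, roll the charge back if the allocation fails, and uncharge only after the pages have been freed. A toy model of that charge-then-allocate discipline, with an invented in-memory budget standing in for the memcg machinery:

#include <stdio.h>
#include <stdlib.h>

static long budget = 4;	/* pretend quota, in "pages" */

static int charge(int pages)
{
	if (budget < pages)
		return -1;	/* over quota: fail before allocating */
	budget -= pages;
	return 0;
}

static void uncharge(int pages)
{
	budget += pages;
}

static void *alloc_accounted(int pages)
{
	void *p;

	if (charge(pages))
		return NULL;
	p = calloc(pages, 4096);
	if (!p)
		uncharge(pages);	/* allocation failed: give the charge back */
	return p;
}

static void free_accounted(void *p, int pages)
{
	free(p);
	uncharge(pages);	/* uncharge only once the memory is really gone */
}

int main(void)
{
	void *a = alloc_accounted(2);
	void *b = alloc_accounted(3);	/* NULL: only 2 pages of budget left */

	printf("a=%p b=%p budget=%ld\n", a, b, budget);
	free_accounted(a, 2);
	printf("budget after free: %ld\n", budget);
	return 0;
}
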
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 102cc6fca3d3..735e01a0db6f 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -160,7 +160,6 @@ do_kmem_cache_create(char *name, size_t object_size, size_t size, size_t align,
160 160
161 s->refcount = 1; 161 s->refcount = 1;
162 list_add(&s->list, &slab_caches); 162 list_add(&s->list, &slab_caches);
163 memcg_register_cache(s);
164out: 163out:
165 if (err) 164 if (err)
166 return ERR_PTR(err); 165 return ERR_PTR(err);
@@ -205,6 +204,8 @@ kmem_cache_create(const char *name, size_t size, size_t align,
205 int err; 204 int err;
206 205
207 get_online_cpus(); 206 get_online_cpus();
207 get_online_mems();
208
208 mutex_lock(&slab_mutex); 209 mutex_lock(&slab_mutex);
209 210
210 err = kmem_cache_sanity_check(name, size); 211 err = kmem_cache_sanity_check(name, size);
@@ -239,6 +240,8 @@ kmem_cache_create(const char *name, size_t size, size_t align,
239 240
240out_unlock: 241out_unlock:
241 mutex_unlock(&slab_mutex); 242 mutex_unlock(&slab_mutex);
243
244 put_online_mems();
242 put_online_cpus(); 245 put_online_cpus();
243 246
244 if (err) { 247 if (err) {
@@ -258,31 +261,29 @@ EXPORT_SYMBOL(kmem_cache_create);
258 261
259#ifdef CONFIG_MEMCG_KMEM 262#ifdef CONFIG_MEMCG_KMEM
260/* 263/*
261 * kmem_cache_create_memcg - Create a cache for a memory cgroup. 264 * memcg_create_kmem_cache - Create a cache for a memory cgroup.
262 * @memcg: The memory cgroup the new cache is for. 265 * @memcg: The memory cgroup the new cache is for.
263 * @root_cache: The parent of the new cache. 266 * @root_cache: The parent of the new cache.
267 * @memcg_name: The name of the memory cgroup (used for naming the new cache).
264 * 268 *
265 * This function attempts to create a kmem cache that will serve allocation 269 * This function attempts to create a kmem cache that will serve allocation
266 * requests going from @memcg to @root_cache. The new cache inherits properties 270 * requests going from @memcg to @root_cache. The new cache inherits properties
267 * from its parent. 271 * from its parent.
268 */ 272 */
269void kmem_cache_create_memcg(struct mem_cgroup *memcg, struct kmem_cache *root_cache) 273struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
274 struct kmem_cache *root_cache,
275 const char *memcg_name)
270{ 276{
271 struct kmem_cache *s; 277 struct kmem_cache *s = NULL;
272 char *cache_name; 278 char *cache_name;
273 279
274 get_online_cpus(); 280 get_online_cpus();
275 mutex_lock(&slab_mutex); 281 get_online_mems();
276 282
277 /* 283 mutex_lock(&slab_mutex);
278 * Since per-memcg caches are created asynchronously on first
279 * allocation (see memcg_kmem_get_cache()), several threads can try to
280 * create the same cache, but only one of them may succeed.
281 */
282 if (cache_from_memcg_idx(root_cache, memcg_cache_id(memcg)))
283 goto out_unlock;
284 284
285 cache_name = memcg_create_cache_name(memcg, root_cache); 285 cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name,
286 memcg_cache_id(memcg), memcg_name);
286 if (!cache_name) 287 if (!cache_name)
287 goto out_unlock; 288 goto out_unlock;
288 289
@@ -292,17 +293,19 @@ void kmem_cache_create_memcg(struct mem_cgroup *memcg, struct kmem_cache *root_c
292 memcg, root_cache); 293 memcg, root_cache);
293 if (IS_ERR(s)) { 294 if (IS_ERR(s)) {
294 kfree(cache_name); 295 kfree(cache_name);
295 goto out_unlock; 296 s = NULL;
296 } 297 }
297 298
298 s->allocflags |= __GFP_KMEMCG;
299
300out_unlock: 299out_unlock:
301 mutex_unlock(&slab_mutex); 300 mutex_unlock(&slab_mutex);
301
302 put_online_mems();
302 put_online_cpus(); 303 put_online_cpus();
304
305 return s;
303} 306}
304 307
305static int kmem_cache_destroy_memcg_children(struct kmem_cache *s) 308static int memcg_cleanup_cache_params(struct kmem_cache *s)
306{ 309{
307 int rc; 310 int rc;
308 311
@@ -311,13 +314,13 @@ static int kmem_cache_destroy_memcg_children(struct kmem_cache *s)
311 return 0; 314 return 0;
312 315
313 mutex_unlock(&slab_mutex); 316 mutex_unlock(&slab_mutex);
314 rc = __kmem_cache_destroy_memcg_children(s); 317 rc = __memcg_cleanup_cache_params(s);
315 mutex_lock(&slab_mutex); 318 mutex_lock(&slab_mutex);
316 319
317 return rc; 320 return rc;
318} 321}
319#else 322#else
320static int kmem_cache_destroy_memcg_children(struct kmem_cache *s) 323static int memcg_cleanup_cache_params(struct kmem_cache *s)
321{ 324{
322 return 0; 325 return 0;
323} 326}
@@ -332,27 +335,26 @@ void slab_kmem_cache_release(struct kmem_cache *s)
332void kmem_cache_destroy(struct kmem_cache *s) 335void kmem_cache_destroy(struct kmem_cache *s)
333{ 336{
334 get_online_cpus(); 337 get_online_cpus();
338 get_online_mems();
339
335 mutex_lock(&slab_mutex); 340 mutex_lock(&slab_mutex);
336 341
337 s->refcount--; 342 s->refcount--;
338 if (s->refcount) 343 if (s->refcount)
339 goto out_unlock; 344 goto out_unlock;
340 345
341 if (kmem_cache_destroy_memcg_children(s) != 0) 346 if (memcg_cleanup_cache_params(s) != 0)
342 goto out_unlock; 347 goto out_unlock;
343 348
344 list_del(&s->list);
345 memcg_unregister_cache(s);
346
347 if (__kmem_cache_shutdown(s) != 0) { 349 if (__kmem_cache_shutdown(s) != 0) {
348 list_add(&s->list, &slab_caches);
349 memcg_register_cache(s);
350 printk(KERN_ERR "kmem_cache_destroy %s: " 350 printk(KERN_ERR "kmem_cache_destroy %s: "
351 "Slab cache still has objects\n", s->name); 351 "Slab cache still has objects\n", s->name);
352 dump_stack(); 352 dump_stack();
353 goto out_unlock; 353 goto out_unlock;
354 } 354 }
355 355
356 list_del(&s->list);
357
356 mutex_unlock(&slab_mutex); 358 mutex_unlock(&slab_mutex);
357 if (s->flags & SLAB_DESTROY_BY_RCU) 359 if (s->flags & SLAB_DESTROY_BY_RCU)
358 rcu_barrier(); 360 rcu_barrier();
@@ -363,15 +365,36 @@ void kmem_cache_destroy(struct kmem_cache *s)
363#else 365#else
364 slab_kmem_cache_release(s); 366 slab_kmem_cache_release(s);
365#endif 367#endif
366 goto out_put_cpus; 368 goto out;
367 369
368out_unlock: 370out_unlock:
369 mutex_unlock(&slab_mutex); 371 mutex_unlock(&slab_mutex);
370out_put_cpus: 372out:
373 put_online_mems();
371 put_online_cpus(); 374 put_online_cpus();
372} 375}
373EXPORT_SYMBOL(kmem_cache_destroy); 376EXPORT_SYMBOL(kmem_cache_destroy);
374 377
378/**
379 * kmem_cache_shrink - Shrink a cache.
380 * @cachep: The cache to shrink.
381 *
382 * Releases as many slabs as possible for a cache.
383 * To help debugging, a zero exit status indicates all slabs were released.
384 */
385int kmem_cache_shrink(struct kmem_cache *cachep)
386{
387 int ret;
388
389 get_online_cpus();
390 get_online_mems();
391 ret = __kmem_cache_shrink(cachep);
392 put_online_mems();
393 put_online_cpus();
394 return ret;
395}
396EXPORT_SYMBOL(kmem_cache_shrink);
397
375int slab_is_available(void) 398int slab_is_available(void)
376{ 399{
377 return slab_state >= UP; 400 return slab_state >= UP;
@@ -586,6 +609,24 @@ void __init create_kmalloc_caches(unsigned long flags)
586} 609}
587#endif /* !CONFIG_SLOB */ 610#endif /* !CONFIG_SLOB */
588 611
612/*
613 * To avoid unnecessary overhead, we pass through large allocation requests
614 * directly to the page allocator. We use __GFP_COMP, because we will need to
615 * know the allocation order to free the pages properly in kfree.
616 */
617void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
618{
619 void *ret;
620 struct page *page;
621
622 flags |= __GFP_COMP;
623 page = alloc_kmem_pages(flags, order);
624 ret = page ? page_address(page) : NULL;
625 kmemleak_alloc(ret, size, 1, flags);
626 return ret;
627}
628EXPORT_SYMBOL(kmalloc_order);
629
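
kmalloc_order() above routes large requests past the slab layer straight to the page allocator as a __GFP_COMP compound page, so kfree() can later read the allocation order back off the page. The order itself is just the smallest power-of-two number of pages covering the request; a stand-alone version of that calculation, assuming 4 KiB pages (equivalent in spirit to the kernel's get_order()):

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)

/* Smallest 'order' such that (PAGE_SIZE << order) >= size. */
static unsigned int size_to_order(unsigned long size)
{
	unsigned int order = 0;
	unsigned long span = PAGE_SIZE;

	while (span < size) {
		span <<= 1;
		order++;
	}
	return order;
}

int main(void)
{
	unsigned long sizes[] = { 4096, 8192, 8193, 65536, 100000 };

	for (unsigned int i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
		printf("size %7lu -> order %u\n",
		       sizes[i], size_to_order(sizes[i]));
	return 0;
}
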
589#ifdef CONFIG_TRACING 630#ifdef CONFIG_TRACING
590void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) 631void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
591{ 632{
diff --git a/mm/slob.c b/mm/slob.c
index 730cad45d4be..21980e0f39a8 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -620,11 +620,10 @@ int __kmem_cache_shutdown(struct kmem_cache *c)
620 return 0; 620 return 0;
621} 621}
622 622
623int kmem_cache_shrink(struct kmem_cache *d) 623int __kmem_cache_shrink(struct kmem_cache *d)
624{ 624{
625 return 0; 625 return 0;
626} 626}
627EXPORT_SYMBOL(kmem_cache_shrink);
628 627
629struct kmem_cache kmem_cache_boot = { 628struct kmem_cache kmem_cache_boot = {
630 .name = "kmem_cache", 629 .name = "kmem_cache",
diff --git a/mm/slub.c b/mm/slub.c
index 2b1ce697fc4b..fdf0fe4da9a9 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -403,7 +403,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page
403 stat(s, CMPXCHG_DOUBLE_FAIL); 403 stat(s, CMPXCHG_DOUBLE_FAIL);
404 404
405#ifdef SLUB_DEBUG_CMPXCHG 405#ifdef SLUB_DEBUG_CMPXCHG
406 printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name); 406 pr_info("%s %s: cmpxchg double redo ", n, s->name);
407#endif 407#endif
408 408
409 return 0; 409 return 0;
@@ -444,7 +444,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
444 stat(s, CMPXCHG_DOUBLE_FAIL); 444 stat(s, CMPXCHG_DOUBLE_FAIL);
445 445
446#ifdef SLUB_DEBUG_CMPXCHG 446#ifdef SLUB_DEBUG_CMPXCHG
447 printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name); 447 pr_info("%s %s: cmpxchg double redo ", n, s->name);
448#endif 448#endif
449 449
450 return 0; 450 return 0;
@@ -546,14 +546,14 @@ static void print_track(const char *s, struct track *t)
546 if (!t->addr) 546 if (!t->addr)
547 return; 547 return;
548 548
549 printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n", 549 pr_err("INFO: %s in %pS age=%lu cpu=%u pid=%d\n",
550 s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid); 550 s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid);
551#ifdef CONFIG_STACKTRACE 551#ifdef CONFIG_STACKTRACE
552 { 552 {
553 int i; 553 int i;
554 for (i = 0; i < TRACK_ADDRS_COUNT; i++) 554 for (i = 0; i < TRACK_ADDRS_COUNT; i++)
555 if (t->addrs[i]) 555 if (t->addrs[i])
556 printk(KERN_ERR "\t%pS\n", (void *)t->addrs[i]); 556 pr_err("\t%pS\n", (void *)t->addrs[i]);
557 else 557 else
558 break; 558 break;
559 } 559 }
@@ -571,38 +571,37 @@ static void print_tracking(struct kmem_cache *s, void *object)
571 571
572static void print_page_info(struct page *page) 572static void print_page_info(struct page *page)
573{ 573{
574 printk(KERN_ERR 574 pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n",
575 "INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n",
576 page, page->objects, page->inuse, page->freelist, page->flags); 575 page, page->objects, page->inuse, page->freelist, page->flags);
577 576
578} 577}
579 578
580static void slab_bug(struct kmem_cache *s, char *fmt, ...) 579static void slab_bug(struct kmem_cache *s, char *fmt, ...)
581{ 580{
581 struct va_format vaf;
582 va_list args; 582 va_list args;
583 char buf[100];
584 583
585 va_start(args, fmt); 584 va_start(args, fmt);
586 vsnprintf(buf, sizeof(buf), fmt, args); 585 vaf.fmt = fmt;
587 va_end(args); 586 vaf.va = &args;
588 printk(KERN_ERR "========================================" 587 pr_err("=============================================================================\n");
589 "=====================================\n"); 588 pr_err("BUG %s (%s): %pV\n", s->name, print_tainted(), &vaf);
590 printk(KERN_ERR "BUG %s (%s): %s\n", s->name, print_tainted(), buf); 589 pr_err("-----------------------------------------------------------------------------\n\n");
591 printk(KERN_ERR "----------------------------------------"
592 "-------------------------------------\n\n");
593 590
594 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); 591 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
592 va_end(args);
595} 593}
596 594
597static void slab_fix(struct kmem_cache *s, char *fmt, ...) 595static void slab_fix(struct kmem_cache *s, char *fmt, ...)
598{ 596{
597 struct va_format vaf;
599 va_list args; 598 va_list args;
600 char buf[100];
601 599
602 va_start(args, fmt); 600 va_start(args, fmt);
603 vsnprintf(buf, sizeof(buf), fmt, args); 601 vaf.fmt = fmt;
602 vaf.va = &args;
603 pr_err("FIX %s: %pV\n", s->name, &vaf);
604 va_end(args); 604 va_end(args);
605 printk(KERN_ERR "FIX %s: %s\n", s->name, buf);
606} 605}
607 606
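
slab_bug() and slab_fix() above stop pre-formatting into a 100-byte stack buffer and instead pass a struct va_format through the kernel's %pV specifier, so the final printk() formats the caller's fmt/args directly and long messages are no longer truncated. User space has no %pV, but the same idea of forwarding the va_list rather than snprintf-ing into a fixed buffer looks like this:

#include <stdarg.h>
#include <stdio.h>

/* Print "FIX <cache>: <formatted message>" without an intermediate
 * fixed-size buffer: hand the va_list straight to vfprintf(). */
static void slab_fix_msg(const char *cache, const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	fprintf(stderr, "FIX %s: ", cache);
	vfprintf(stderr, fmt, args);
	fputc('\n', stderr);
	va_end(args);
}

int main(void)
{
	slab_fix_msg("kmalloc-64", "restoring freelist pointer at %p",
		     (void *)0x1234);
	return 0;
}
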
608static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) 607static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
@@ -614,8 +613,8 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
614 613
615 print_page_info(page); 614 print_page_info(page);
616 615
617 printk(KERN_ERR "INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", 616 pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n",
618 p, p - addr, get_freepointer(s, p)); 617 p, p - addr, get_freepointer(s, p));
619 618
620 if (p > addr + 16) 619 if (p > addr + 16)
621 print_section("Bytes b4 ", p - 16, 16); 620 print_section("Bytes b4 ", p - 16, 16);
@@ -698,7 +697,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
698 end--; 697 end--;
699 698
700 slab_bug(s, "%s overwritten", what); 699 slab_bug(s, "%s overwritten", what);
701 printk(KERN_ERR "INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n", 700 pr_err("INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n",
702 fault, end - 1, fault[0], value); 701 fault, end - 1, fault[0], value);
703 print_trailer(s, page, object); 702 print_trailer(s, page, object);
704 703
@@ -931,7 +930,7 @@ static void trace(struct kmem_cache *s, struct page *page, void *object,
931 int alloc) 930 int alloc)
932{ 931{
933 if (s->flags & SLAB_TRACE) { 932 if (s->flags & SLAB_TRACE) {
934 printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n", 933 pr_info("TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
935 s->name, 934 s->name,
936 alloc ? "alloc" : "free", 935 alloc ? "alloc" : "free",
937 object, page->inuse, 936 object, page->inuse,
@@ -1134,9 +1133,8 @@ static noinline struct kmem_cache_node *free_debug_processing(
1134 slab_err(s, page, "Attempt to free object(0x%p) " 1133 slab_err(s, page, "Attempt to free object(0x%p) "
1135 "outside of slab", object); 1134 "outside of slab", object);
1136 } else if (!page->slab_cache) { 1135 } else if (!page->slab_cache) {
1137 printk(KERN_ERR 1136 pr_err("SLUB <none>: no slab for object 0x%p.\n",
1138 "SLUB <none>: no slab for object 0x%p.\n", 1137 object);
1139 object);
1140 dump_stack(); 1138 dump_stack();
1141 } else 1139 } else
1142 object_err(s, page, object, 1140 object_err(s, page, object,
@@ -1219,8 +1217,8 @@ static int __init setup_slub_debug(char *str)
1219 slub_debug |= SLAB_FAILSLAB; 1217 slub_debug |= SLAB_FAILSLAB;
1220 break; 1218 break;
1221 default: 1219 default:
1222 printk(KERN_ERR "slub_debug option '%c' " 1220 pr_err("slub_debug option '%c' unknown. skipped\n",
1223 "unknown. skipped\n", *str); 1221 *str);
1224 } 1222 }
1225 } 1223 }
1226 1224
@@ -1314,17 +1312,26 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x)
1314/* 1312/*
1315 * Slab allocation and freeing 1313 * Slab allocation and freeing
1316 */ 1314 */
1317static inline struct page *alloc_slab_page(gfp_t flags, int node, 1315static inline struct page *alloc_slab_page(struct kmem_cache *s,
1318 struct kmem_cache_order_objects oo) 1316 gfp_t flags, int node, struct kmem_cache_order_objects oo)
1319{ 1317{
1318 struct page *page;
1320 int order = oo_order(oo); 1319 int order = oo_order(oo);
1321 1320
1322 flags |= __GFP_NOTRACK; 1321 flags |= __GFP_NOTRACK;
1323 1322
1323 if (memcg_charge_slab(s, flags, order))
1324 return NULL;
1325
1324 if (node == NUMA_NO_NODE) 1326 if (node == NUMA_NO_NODE)
1325 return alloc_pages(flags, order); 1327 page = alloc_pages(flags, order);
1326 else 1328 else
1327 return alloc_pages_exact_node(node, flags, order); 1329 page = alloc_pages_exact_node(node, flags, order);
1330
1331 if (!page)
1332 memcg_uncharge_slab(s, order);
1333
1334 return page;
1328} 1335}
1329 1336
1330static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) 1337static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
@@ -1346,7 +1353,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1346 */ 1353 */
1347 alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; 1354 alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
1348 1355
1349 page = alloc_slab_page(alloc_gfp, node, oo); 1356 page = alloc_slab_page(s, alloc_gfp, node, oo);
1350 if (unlikely(!page)) { 1357 if (unlikely(!page)) {
1351 oo = s->min; 1358 oo = s->min;
1352 alloc_gfp = flags; 1359 alloc_gfp = flags;
@@ -1354,7 +1361,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1354 * Allocation may have failed due to fragmentation. 1361 * Allocation may have failed due to fragmentation.
1355 * Try a lower order alloc if possible 1362 * Try a lower order alloc if possible
1356 */ 1363 */
1357 page = alloc_slab_page(alloc_gfp, node, oo); 1364 page = alloc_slab_page(s, alloc_gfp, node, oo);
1358 1365
1359 if (page) 1366 if (page)
1360 stat(s, ORDER_FALLBACK); 1367 stat(s, ORDER_FALLBACK);
@@ -1415,7 +1422,6 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1415 1422
1416 order = compound_order(page); 1423 order = compound_order(page);
1417 inc_slabs_node(s, page_to_nid(page), page->objects); 1424 inc_slabs_node(s, page_to_nid(page), page->objects);
1418 memcg_bind_pages(s, order);
1419 page->slab_cache = s; 1425 page->slab_cache = s;
1420 __SetPageSlab(page); 1426 __SetPageSlab(page);
1421 if (page->pfmemalloc) 1427 if (page->pfmemalloc)
@@ -1466,11 +1472,11 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1466 __ClearPageSlabPfmemalloc(page); 1472 __ClearPageSlabPfmemalloc(page);
1467 __ClearPageSlab(page); 1473 __ClearPageSlab(page);
1468 1474
1469 memcg_release_pages(s, order);
1470 page_mapcount_reset(page); 1475 page_mapcount_reset(page);
1471 if (current->reclaim_state) 1476 if (current->reclaim_state)
1472 current->reclaim_state->reclaimed_slab += pages; 1477 current->reclaim_state->reclaimed_slab += pages;
1473 __free_memcg_kmem_pages(page, order); 1478 __free_pages(page, order);
1479 memcg_uncharge_slab(s, order);
1474} 1480}
1475 1481
1476#define need_reserve_slab_rcu \ 1482#define need_reserve_slab_rcu \
@@ -1770,19 +1776,19 @@ static inline void note_cmpxchg_failure(const char *n,
1770#ifdef SLUB_DEBUG_CMPXCHG 1776#ifdef SLUB_DEBUG_CMPXCHG
1771 unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid); 1777 unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid);
1772 1778
1773 printk(KERN_INFO "%s %s: cmpxchg redo ", n, s->name); 1779 pr_info("%s %s: cmpxchg redo ", n, s->name);
1774 1780
1775#ifdef CONFIG_PREEMPT 1781#ifdef CONFIG_PREEMPT
1776 if (tid_to_cpu(tid) != tid_to_cpu(actual_tid)) 1782 if (tid_to_cpu(tid) != tid_to_cpu(actual_tid))
1777 printk("due to cpu change %d -> %d\n", 1783 pr_warn("due to cpu change %d -> %d\n",
1778 tid_to_cpu(tid), tid_to_cpu(actual_tid)); 1784 tid_to_cpu(tid), tid_to_cpu(actual_tid));
1779 else 1785 else
1780#endif 1786#endif
1781 if (tid_to_event(tid) != tid_to_event(actual_tid)) 1787 if (tid_to_event(tid) != tid_to_event(actual_tid))
1782 printk("due to cpu running other code. Event %ld->%ld\n", 1788 pr_warn("due to cpu running other code. Event %ld->%ld\n",
1783 tid_to_event(tid), tid_to_event(actual_tid)); 1789 tid_to_event(tid), tid_to_event(actual_tid));
1784 else 1790 else
1785 printk("for unknown reason: actual=%lx was=%lx target=%lx\n", 1791 pr_warn("for unknown reason: actual=%lx was=%lx target=%lx\n",
1786 actual_tid, tid, next_tid(tid)); 1792 actual_tid, tid, next_tid(tid));
1787#endif 1793#endif
1788 stat(s, CMPXCHG_DOUBLE_CPU_FAIL); 1794 stat(s, CMPXCHG_DOUBLE_CPU_FAIL);
@@ -2121,11 +2127,19 @@ static inline int node_match(struct page *page, int node)
2121 return 1; 2127 return 1;
2122} 2128}
2123 2129
2130#ifdef CONFIG_SLUB_DEBUG
2124static int count_free(struct page *page) 2131static int count_free(struct page *page)
2125{ 2132{
2126 return page->objects - page->inuse; 2133 return page->objects - page->inuse;
2127} 2134}
2128 2135
2136static inline unsigned long node_nr_objs(struct kmem_cache_node *n)
2137{
2138 return atomic_long_read(&n->total_objects);
2139}
2140#endif /* CONFIG_SLUB_DEBUG */
2141
2142#if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS)
2129static unsigned long count_partial(struct kmem_cache_node *n, 2143static unsigned long count_partial(struct kmem_cache_node *n,
2130 int (*get_count)(struct page *)) 2144 int (*get_count)(struct page *))
2131{ 2145{
@@ -2139,31 +2153,28 @@ static unsigned long count_partial(struct kmem_cache_node *n,
2139 spin_unlock_irqrestore(&n->list_lock, flags); 2153 spin_unlock_irqrestore(&n->list_lock, flags);
2140 return x; 2154 return x;
2141} 2155}
2142 2156#endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */
2143static inline unsigned long node_nr_objs(struct kmem_cache_node *n)
2144{
2145#ifdef CONFIG_SLUB_DEBUG
2146 return atomic_long_read(&n->total_objects);
2147#else
2148 return 0;
2149#endif
2150}
2151 2157
2152static noinline void 2158static noinline void
2153slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) 2159slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
2154{ 2160{
2161#ifdef CONFIG_SLUB_DEBUG
2162 static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL,
2163 DEFAULT_RATELIMIT_BURST);
2155 int node; 2164 int node;
2156 2165
2157 printk(KERN_WARNING 2166 if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs))
2158 "SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", 2167 return;
2168
2169 pr_warn("SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n",
2159 nid, gfpflags); 2170 nid, gfpflags);
2160 printk(KERN_WARNING " cache: %s, object size: %d, buffer size: %d, " 2171 pr_warn(" cache: %s, object size: %d, buffer size: %d, default order: %d, min order: %d\n",
2161 "default order: %d, min order: %d\n", s->name, s->object_size, 2172 s->name, s->object_size, s->size, oo_order(s->oo),
2162 s->size, oo_order(s->oo), oo_order(s->min)); 2173 oo_order(s->min));
2163 2174
2164 if (oo_order(s->min) > get_order(s->object_size)) 2175 if (oo_order(s->min) > get_order(s->object_size))
2165 printk(KERN_WARNING " %s debugging increased min order, use " 2176 pr_warn(" %s debugging increased min order, use slub_debug=O to disable.\n",
2166 "slub_debug=O to disable.\n", s->name); 2177 s->name);
2167 2178
2168 for_each_online_node(node) { 2179 for_each_online_node(node) {
2169 struct kmem_cache_node *n = get_node(s, node); 2180 struct kmem_cache_node *n = get_node(s, node);
@@ -2178,10 +2189,10 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
2178 nr_slabs = node_nr_slabs(n); 2189 nr_slabs = node_nr_slabs(n);
2179 nr_objs = node_nr_objs(n); 2190 nr_objs = node_nr_objs(n);
2180 2191
2181 printk(KERN_WARNING 2192 pr_warn(" node %d: slabs: %ld, objs: %ld, free: %ld\n",
2182 " node %d: slabs: %ld, objs: %ld, free: %ld\n",
2183 node, nr_slabs, nr_objs, nr_free); 2193 node, nr_slabs, nr_objs, nr_free);
2184 } 2194 }
2195#endif
2185} 2196}
2186 2197
2187static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, 2198static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
@@ -2198,7 +2209,7 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
2198 2209
2199 page = new_slab(s, flags, node); 2210 page = new_slab(s, flags, node);
2200 if (page) { 2211 if (page) {
2201 c = __this_cpu_ptr(s->cpu_slab); 2212 c = raw_cpu_ptr(s->cpu_slab);
2202 if (c->page) 2213 if (c->page)
2203 flush_slab(s, c); 2214 flush_slab(s, c);
2204 2215
@@ -2323,8 +2334,6 @@ redo:
2323 if (freelist) 2334 if (freelist)
2324 goto load_freelist; 2335 goto load_freelist;
2325 2336
2326 stat(s, ALLOC_SLOWPATH);
2327
2328 freelist = get_freelist(s, page); 2337 freelist = get_freelist(s, page);
2329 2338
2330 if (!freelist) { 2339 if (!freelist) {
@@ -2360,9 +2369,7 @@ new_slab:
2360 freelist = new_slab_objects(s, gfpflags, node, &c); 2369 freelist = new_slab_objects(s, gfpflags, node, &c);
2361 2370
2362 if (unlikely(!freelist)) { 2371 if (unlikely(!freelist)) {
2363 if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) 2372 slab_out_of_memory(s, gfpflags, node);
2364 slab_out_of_memory(s, gfpflags, node);
2365
2366 local_irq_restore(flags); 2373 local_irq_restore(flags);
2367 return NULL; 2374 return NULL;
2368 } 2375 }
@@ -2418,7 +2425,7 @@ redo:
2418 * and the retrieval of the tid. 2425 * and the retrieval of the tid.
2419 */ 2426 */
2420 preempt_disable(); 2427 preempt_disable();
2421 c = __this_cpu_ptr(s->cpu_slab); 2428 c = this_cpu_ptr(s->cpu_slab);
2422 2429
2423 /* 2430 /*
2424 * The transaction ids are globally unique per cpu and per operation on 2431 * The transaction ids are globally unique per cpu and per operation on
@@ -2431,10 +2438,10 @@ redo:
2431 2438
2432 object = c->freelist; 2439 object = c->freelist;
2433 page = c->page; 2440 page = c->page;
2434 if (unlikely(!object || !node_match(page, node))) 2441 if (unlikely(!object || !node_match(page, node))) {
2435 object = __slab_alloc(s, gfpflags, node, addr, c); 2442 object = __slab_alloc(s, gfpflags, node, addr, c);
2436 2443 stat(s, ALLOC_SLOWPATH);
2437 else { 2444 } else {
2438 void *next_object = get_freepointer_safe(s, object); 2445 void *next_object = get_freepointer_safe(s, object);
2439 2446
2440 /* 2447 /*
@@ -2674,7 +2681,7 @@ redo:
2674 * during the cmpxchg then the free will succeed. 2681 */
2675 */ 2682 */
2676 preempt_disable(); 2683 preempt_disable();
2677 c = __this_cpu_ptr(s->cpu_slab); 2684 c = this_cpu_ptr(s->cpu_slab);
2678 2685
2679 tid = c->tid; 2686 tid = c->tid;
2680 preempt_enable(); 2687 preempt_enable();
@@ -2894,10 +2901,8 @@ static void early_kmem_cache_node_alloc(int node)
2894 2901
2895 BUG_ON(!page); 2902 BUG_ON(!page);
2896 if (page_to_nid(page) != node) { 2903 if (page_to_nid(page) != node) {
2897 printk(KERN_ERR "SLUB: Unable to allocate memory from " 2904 pr_err("SLUB: Unable to allocate memory from node %d\n", node);
2898 "node %d\n", node); 2905 pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n");
2899 printk(KERN_ERR "SLUB: Allocating a useless per node structure "
2900 "in order to be able to continue\n");
2901 } 2906 }
2902 2907
2903 n = page->freelist; 2908 n = page->freelist;
@@ -3182,8 +3187,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
3182 for_each_object(p, s, addr, page->objects) { 3187 for_each_object(p, s, addr, page->objects) {
3183 3188
3184 if (!test_bit(slab_index(p, s, addr), map)) { 3189 if (!test_bit(slab_index(p, s, addr), map)) {
3185 printk(KERN_ERR "INFO: Object 0x%p @offset=%tu\n", 3190 pr_err("INFO: Object 0x%p @offset=%tu\n", p, p - addr);
3186 p, p - addr);
3187 print_tracking(s, p); 3191 print_tracking(s, p);
3188 } 3192 }
3189 } 3193 }
@@ -3305,8 +3309,8 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
3305 struct page *page; 3309 struct page *page;
3306 void *ptr = NULL; 3310 void *ptr = NULL;
3307 3311
3308 flags |= __GFP_COMP | __GFP_NOTRACK | __GFP_KMEMCG; 3312 flags |= __GFP_COMP | __GFP_NOTRACK;
3309 page = alloc_pages_node(node, flags, get_order(size)); 3313 page = alloc_kmem_pages_node(node, flags, get_order(size));
3310 if (page) 3314 if (page)
3311 ptr = page_address(page); 3315 ptr = page_address(page);
3312 3316
@@ -3375,7 +3379,7 @@ void kfree(const void *x)
3375 if (unlikely(!PageSlab(page))) { 3379 if (unlikely(!PageSlab(page))) {
3376 BUG_ON(!PageCompound(page)); 3380 BUG_ON(!PageCompound(page));
3377 kfree_hook(x); 3381 kfree_hook(x);
3378 __free_memcg_kmem_pages(page, compound_order(page)); 3382 __free_kmem_pages(page, compound_order(page));
3379 return; 3383 return;
3380 } 3384 }
3381 slab_free(page->slab_cache, page, object, _RET_IP_); 3385 slab_free(page->slab_cache, page, object, _RET_IP_);
@@ -3392,7 +3396,7 @@ EXPORT_SYMBOL(kfree);
3392 * being allocated from last increasing the chance that the last objects 3396 * being allocated from last increasing the chance that the last objects
3393 * are freed in them. 3397 * are freed in them.
3394 */ 3398 */
3395int kmem_cache_shrink(struct kmem_cache *s) 3399int __kmem_cache_shrink(struct kmem_cache *s)
3396{ 3400{
3397 int node; 3401 int node;
3398 int i; 3402 int i;
@@ -3448,7 +3452,6 @@ int kmem_cache_shrink(struct kmem_cache *s)
3448 kfree(slabs_by_inuse); 3452 kfree(slabs_by_inuse);
3449 return 0; 3453 return 0;
3450} 3454}
3451EXPORT_SYMBOL(kmem_cache_shrink);
3452 3455
3453static int slab_mem_going_offline_callback(void *arg) 3456static int slab_mem_going_offline_callback(void *arg)
3454{ 3457{
@@ -3456,7 +3459,7 @@ static int slab_mem_going_offline_callback(void *arg)
3456 3459
3457 mutex_lock(&slab_mutex); 3460 mutex_lock(&slab_mutex);
3458 list_for_each_entry(s, &slab_caches, list) 3461 list_for_each_entry(s, &slab_caches, list)
3459 kmem_cache_shrink(s); 3462 __kmem_cache_shrink(s);
3460 mutex_unlock(&slab_mutex); 3463 mutex_unlock(&slab_mutex);
3461 3464
3462 return 0; 3465 return 0;
@@ -3650,9 +3653,7 @@ void __init kmem_cache_init(void)
3650 register_cpu_notifier(&slab_notifier); 3653 register_cpu_notifier(&slab_notifier);
3651#endif 3654#endif
3652 3655
3653 printk(KERN_INFO 3656 pr_info("SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d, CPUs=%d, Nodes=%d\n",
3654 "SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d,"
3655 " CPUs=%d, Nodes=%d\n",
3656 cache_line_size(), 3657 cache_line_size(),
3657 slub_min_order, slub_max_order, slub_min_objects, 3658 slub_min_order, slub_max_order, slub_min_objects,
3658 nr_cpu_ids, nr_node_ids); 3659 nr_cpu_ids, nr_node_ids);
@@ -3934,8 +3935,8 @@ static int validate_slab_node(struct kmem_cache *s,
3934 count++; 3935 count++;
3935 } 3936 }
3936 if (count != n->nr_partial) 3937 if (count != n->nr_partial)
3937 printk(KERN_ERR "SLUB %s: %ld partial slabs counted but " 3938 pr_err("SLUB %s: %ld partial slabs counted but counter=%ld\n",
3938 "counter=%ld\n", s->name, count, n->nr_partial); 3939 s->name, count, n->nr_partial);
3939 3940
3940 if (!(s->flags & SLAB_STORE_USER)) 3941 if (!(s->flags & SLAB_STORE_USER))
3941 goto out; 3942 goto out;
@@ -3945,9 +3946,8 @@ static int validate_slab_node(struct kmem_cache *s,
3945 count++; 3946 count++;
3946 } 3947 }
3947 if (count != atomic_long_read(&n->nr_slabs)) 3948 if (count != atomic_long_read(&n->nr_slabs))
3948 printk(KERN_ERR "SLUB: %s %ld slabs counted but " 3949 pr_err("SLUB: %s %ld slabs counted but counter=%ld\n",
3949 "counter=%ld\n", s->name, count, 3950 s->name, count, atomic_long_read(&n->nr_slabs));
3950 atomic_long_read(&n->nr_slabs));
3951 3951
3952out: 3952out:
3953 spin_unlock_irqrestore(&n->list_lock, flags); 3953 spin_unlock_irqrestore(&n->list_lock, flags);
@@ -4211,53 +4211,50 @@ static void resiliency_test(void)
4211 4211
4212 BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10); 4212 BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10);
4213 4213
4214 printk(KERN_ERR "SLUB resiliency testing\n"); 4214 pr_err("SLUB resiliency testing\n");
4215 printk(KERN_ERR "-----------------------\n"); 4215 pr_err("-----------------------\n");
4216 printk(KERN_ERR "A. Corruption after allocation\n"); 4216 pr_err("A. Corruption after allocation\n");
4217 4217
4218 p = kzalloc(16, GFP_KERNEL); 4218 p = kzalloc(16, GFP_KERNEL);
4219 p[16] = 0x12; 4219 p[16] = 0x12;
4220 printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer" 4220 pr_err("\n1. kmalloc-16: Clobber Redzone/next pointer 0x12->0x%p\n\n",
4221 " 0x12->0x%p\n\n", p + 16); 4221 p + 16);
4222 4222
4223 validate_slab_cache(kmalloc_caches[4]); 4223 validate_slab_cache(kmalloc_caches[4]);
4224 4224
4225 /* Hmmm... The next two are dangerous */ 4225 /* Hmmm... The next two are dangerous */
4226 p = kzalloc(32, GFP_KERNEL); 4226 p = kzalloc(32, GFP_KERNEL);
4227 p[32 + sizeof(void *)] = 0x34; 4227 p[32 + sizeof(void *)] = 0x34;
4228 printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab" 4228 pr_err("\n2. kmalloc-32: Clobber next pointer/next slab 0x34 -> -0x%p\n",
4229 " 0x34 -> -0x%p\n", p); 4229 p);
4230 printk(KERN_ERR 4230 pr_err("If allocated object is overwritten then not detectable\n\n");
4231 "If allocated object is overwritten then not detectable\n\n");
4232 4231
4233 validate_slab_cache(kmalloc_caches[5]); 4232 validate_slab_cache(kmalloc_caches[5]);
4234 p = kzalloc(64, GFP_KERNEL); 4233 p = kzalloc(64, GFP_KERNEL);
4235 p += 64 + (get_cycles() & 0xff) * sizeof(void *); 4234 p += 64 + (get_cycles() & 0xff) * sizeof(void *);
4236 *p = 0x56; 4235 *p = 0x56;
4237 printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", 4236 pr_err("\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
4238 p); 4237 p);
4239 printk(KERN_ERR 4238 pr_err("If allocated object is overwritten then not detectable\n\n");
4240 "If allocated object is overwritten then not detectable\n\n");
4241 validate_slab_cache(kmalloc_caches[6]); 4239 validate_slab_cache(kmalloc_caches[6]);
4242 4240
4243 printk(KERN_ERR "\nB. Corruption after free\n"); 4241 pr_err("\nB. Corruption after free\n");
4244 p = kzalloc(128, GFP_KERNEL); 4242 p = kzalloc(128, GFP_KERNEL);
4245 kfree(p); 4243 kfree(p);
4246 *p = 0x78; 4244 *p = 0x78;
4247 printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); 4245 pr_err("1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
4248 validate_slab_cache(kmalloc_caches[7]); 4246 validate_slab_cache(kmalloc_caches[7]);
4249 4247
4250 p = kzalloc(256, GFP_KERNEL); 4248 p = kzalloc(256, GFP_KERNEL);
4251 kfree(p); 4249 kfree(p);
4252 p[50] = 0x9a; 4250 p[50] = 0x9a;
4253 printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", 4251 pr_err("\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p);
4254 p);
4255 validate_slab_cache(kmalloc_caches[8]); 4252 validate_slab_cache(kmalloc_caches[8]);
4256 4253
4257 p = kzalloc(512, GFP_KERNEL); 4254 p = kzalloc(512, GFP_KERNEL);
4258 kfree(p); 4255 kfree(p);
4259 p[512] = 0xab; 4256 p[512] = 0xab;
4260 printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); 4257 pr_err("\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
4261 validate_slab_cache(kmalloc_caches[9]); 4258 validate_slab_cache(kmalloc_caches[9]);
4262} 4259}
4263#else 4260#else
@@ -4332,7 +4329,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
4332 } 4329 }
4333 } 4330 }
4334 4331
4335 lock_memory_hotplug(); 4332 get_online_mems();
4336#ifdef CONFIG_SLUB_DEBUG 4333#ifdef CONFIG_SLUB_DEBUG
4337 if (flags & SO_ALL) { 4334 if (flags & SO_ALL) {
4338 for_each_node_state(node, N_NORMAL_MEMORY) { 4335 for_each_node_state(node, N_NORMAL_MEMORY) {
@@ -4372,7 +4369,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
4372 x += sprintf(buf + x, " N%d=%lu", 4369 x += sprintf(buf + x, " N%d=%lu",
4373 node, nodes[node]); 4370 node, nodes[node]);
4374#endif 4371#endif
4375 unlock_memory_hotplug(); 4372 put_online_mems();
4376 kfree(nodes); 4373 kfree(nodes);
4377 return x + sprintf(buf + x, "\n"); 4374 return x + sprintf(buf + x, "\n");
4378} 4375}
@@ -5303,7 +5300,7 @@ static int __init slab_sysfs_init(void)
5303 slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj); 5300 slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj);
5304 if (!slab_kset) { 5301 if (!slab_kset) {
5305 mutex_unlock(&slab_mutex); 5302 mutex_unlock(&slab_mutex);
5306 printk(KERN_ERR "Cannot register slab subsystem.\n"); 5303 pr_err("Cannot register slab subsystem.\n");
5307 return -ENOSYS; 5304 return -ENOSYS;
5308 } 5305 }
5309 5306
@@ -5312,8 +5309,8 @@ static int __init slab_sysfs_init(void)
5312 list_for_each_entry(s, &slab_caches, list) { 5309 list_for_each_entry(s, &slab_caches, list) {
5313 err = sysfs_slab_add(s); 5310 err = sysfs_slab_add(s);
5314 if (err) 5311 if (err)
5315 printk(KERN_ERR "SLUB: Unable to add boot slab %s" 5312 pr_err("SLUB: Unable to add boot slab %s to sysfs\n",
5316 " to sysfs\n", s->name); 5313 s->name);
5317 } 5314 }
5318 5315
5319 while (alias_list) { 5316 while (alias_list) {
@@ -5322,8 +5319,8 @@ static int __init slab_sysfs_init(void)
5322 alias_list = alias_list->next; 5319 alias_list = alias_list->next;
5323 err = sysfs_slab_alias(al->s, al->name); 5320 err = sysfs_slab_alias(al->s, al->name);
5324 if (err) 5321 if (err)
5325 printk(KERN_ERR "SLUB: Unable to add boot slab alias" 5322 pr_err("SLUB: Unable to add boot slab alias %s to sysfs\n",
5326 " %s to sysfs\n", al->name); 5323 al->name);
5327 kfree(al); 5324 kfree(al);
5328 } 5325 }
5329 5326
diff --git a/mm/swap.c b/mm/swap.c
index 9ce43ba4498b..9e8e3472248b 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -67,7 +67,7 @@ static void __page_cache_release(struct page *page)
67static void __put_single_page(struct page *page) 67static void __put_single_page(struct page *page)
68{ 68{
69 __page_cache_release(page); 69 __page_cache_release(page);
70 free_hot_cold_page(page, 0); 70 free_hot_cold_page(page, false);
71} 71}
72 72
73static void __put_compound_page(struct page *page) 73static void __put_compound_page(struct page *page)
@@ -79,95 +79,88 @@ static void __put_compound_page(struct page *page)
79 (*dtor)(page); 79 (*dtor)(page);
80} 80}
81 81
82static void put_compound_page(struct page *page) 82/**
83 * Two special cases here: we could avoid taking compound_lock_irqsave
84 * and could skip the tail refcounting(in _mapcount).
85 *
86 * 1. Hugetlbfs page:
87 *
88 * PageHeadHuge will remain true until the compound page
89 * is released and enters the buddy allocator, and it could
90 * not be split by __split_huge_page_refcount().
91 *
92 * So if we see PageHeadHuge set, and we have the tail page pin,
93 * then we could safely put head page.
94 *
95 * 2. Slab THP page:
96 *
97 * PG_slab is cleared before the slab frees the head page, and
98 * tail pin cannot be the last reference left on the head page,
99 * because the slab code is free to reuse the compound page
100 * after a kfree/kmem_cache_free without having to check if
101 * there's any tail pin left. In turn all tail pinsmust be always
102 * released while the head is still pinned by the slab code
103 * and so we know PG_slab will be still set too.
104 *
105 * So if we see PageSlab set, and we have the tail page pin,
106 * then we could safely put head page.
107 */
108static __always_inline
109void put_unrefcounted_compound_page(struct page *page_head, struct page *page)
83{ 110{
84 struct page *page_head;
85
86 if (likely(!PageTail(page))) {
87 if (put_page_testzero(page)) {
88 /*
89 * By the time all refcounts have been released
90 * split_huge_page cannot run anymore from under us.
91 */
92 if (PageHead(page))
93 __put_compound_page(page);
94 else
95 __put_single_page(page);
96 }
97 return;
98 }
99
100 /* __split_huge_page_refcount can run under us */
101 page_head = compound_head(page);
102
103 /* 111 /*
104 * THP can not break up slab pages so avoid taking 112 * If @page is a THP tail, we must read the tail page
105 * compound_lock() and skip the tail page refcounting (in 113 * flags after the head page flags. The
106 * _mapcount) too. Slab performs non-atomic bit ops on 114 * __split_huge_page_refcount side enforces write memory barriers
107 * page->flags for better performance. In particular 115 * between clearing PageTail and before the head page
108 * slab_unlock() in slub used to be a hot path. It is still 116 * can be freed and reallocated.
109 * hot on arches that do not support
110 * this_cpu_cmpxchg_double().
111 *
112 * If "page" is part of a slab or hugetlbfs page it cannot be
113 * split and the head page cannot change from under us. And
114 * if "page" is part of a THP page under splitting, if the
115 * head page pointed by the THP tail isn't a THP head anymore,
116 * we'll find PageTail clear after smp_rmb() and we'll treat
117 * it as a single page.
118 */ 117 */
119 if (!__compound_tail_refcounted(page_head)) { 118 smp_rmb();
119 if (likely(PageTail(page))) {
120 /* 120 /*
121 * If "page" is a THP tail, we must read the tail page 121 * __split_huge_page_refcount cannot race
122 * flags after the head page flags. The 122 * here, see the comment above this function.
123 * split_huge_page side enforces write memory barriers
124 * between clearing PageTail and before the head page
125 * can be freed and reallocated.
126 */ 123 */
127 smp_rmb(); 124 VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
128 if (likely(PageTail(page))) { 125 VM_BUG_ON_PAGE(page_mapcount(page) != 0, page);
129 /* 126 if (put_page_testzero(page_head)) {
130 * __split_huge_page_refcount cannot race
131 * here.
132 */
133 VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
134 VM_BUG_ON_PAGE(page_mapcount(page) != 0, page);
135 if (put_page_testzero(page_head)) {
136 /*
137 * If this is the tail of a slab
138 * compound page, the tail pin must
139 * not be the last reference held on
140 * the page, because the PG_slab
141 * cannot be cleared before all tail
142 * pins (which skips the _mapcount
143 * tail refcounting) have been
144 * released. For hugetlbfs the tail
145 * pin may be the last reference on
146 * the page instead, because
147 * PageHeadHuge will not go away until
148 * the compound page enters the buddy
149 * allocator.
150 */
151 VM_BUG_ON_PAGE(PageSlab(page_head), page_head);
152 __put_compound_page(page_head);
153 }
154 return;
155 } else
156 /* 127 /*
157 * __split_huge_page_refcount run before us, 128 * If this is the tail of a slab THP page,
158 * "page" was a THP tail. The split page_head 129 * the tail pin must not be the last reference
159 * has been freed and reallocated as slab or 130 * held on the page, because the PG_slab cannot
160 * hugetlbfs page of smaller order (only 131 * be cleared before all tail pins (which skips
161 * possible if reallocated as slab on x86). 132 * the _mapcount tail refcounting) have been
133 * released.
134 *
135 * If this is the tail of a hugetlbfs page,
136 * the tail pin may be the last reference on
137 * the page instead, because PageHeadHuge will
138 * not go away until the compound page enters
139 * the buddy allocator.
162 */ 140 */
163 goto out_put_single; 141 VM_BUG_ON_PAGE(PageSlab(page_head), page_head);
164 } 142 __put_compound_page(page_head);
143 }
144 } else
145 /*
146 * __split_huge_page_refcount run before us,
147 * @page was a THP tail. The split @page_head
148 * has been freed and reallocated as slab or
149 * hugetlbfs page of smaller order (only
150 * possible if reallocated as slab on x86).
151 */
152 if (put_page_testzero(page))
153 __put_single_page(page);
154}
165 155
156static __always_inline
157void put_refcounted_compound_page(struct page *page_head, struct page *page)
158{
166 if (likely(page != page_head && get_page_unless_zero(page_head))) { 159 if (likely(page != page_head && get_page_unless_zero(page_head))) {
167 unsigned long flags; 160 unsigned long flags;
168 161
169 /* 162 /*
170 * page_head wasn't a dangling pointer but it may not 163 * @page_head wasn't a dangling pointer but it may not
171 * be a head page anymore by the time we obtain the 164 * be a head page anymore by the time we obtain the
172 * lock. That is ok as long as it can't be freed from 165 * lock. That is ok as long as it can't be freed from
173 * under us. 166 * under us.
@@ -178,7 +171,7 @@ static void put_compound_page(struct page *page)
178 compound_unlock_irqrestore(page_head, flags); 171 compound_unlock_irqrestore(page_head, flags);
179 if (put_page_testzero(page_head)) { 172 if (put_page_testzero(page_head)) {
180 /* 173 /*
181 * The head page may have been freed 174 * The @page_head may have been freed
182 * and reallocated as a compound page 175 * and reallocated as a compound page
183 * of smaller order and then freed 176 * of smaller order and then freed
184 * again. All we know is that it 177 * again. All we know is that it
@@ -222,12 +215,51 @@ out_put_single:
222 __put_single_page(page_head); 215 __put_single_page(page_head);
223 } 216 }
224 } else { 217 } else {
225 /* page_head is a dangling pointer */ 218 /* @page_head is a dangling pointer */
226 VM_BUG_ON_PAGE(PageTail(page), page); 219 VM_BUG_ON_PAGE(PageTail(page), page);
227 goto out_put_single; 220 goto out_put_single;
228 } 221 }
229} 222}
230 223
224static void put_compound_page(struct page *page)
225{
226 struct page *page_head;
227
228 /*
229 * We see the PageCompound set and PageTail not set, so @page may be:
230 * 1. hugetlbfs head page, or
231 * 2. THP head page.
232 */
233 if (likely(!PageTail(page))) {
234 if (put_page_testzero(page)) {
235 /*
236 * By the time all refcounts have been released
237 * split_huge_page cannot run anymore from under us.
238 */
239 if (PageHead(page))
240 __put_compound_page(page);
241 else
242 __put_single_page(page);
243 }
244 return;
245 }
246
247 /*
248 * We see the PageCompound set and PageTail set, so @page may be:
249 * 1. a tail hugetlbfs page, or
250 * 2. a tail THP page, or
251 * 3. a split THP page.
252 *
253 * Case 3 is possible, as we may race with
254 * __split_huge_page_refcount tearing down a THP page.
255 */
256 page_head = compound_head_by_tail(page);
257 if (!__compound_tail_refcounted(page_head))
258 put_unrefcounted_compound_page(page_head, page);
259 else
260 put_refcounted_compound_page(page_head, page);
261}
262
231void put_page(struct page *page) 263void put_page(struct page *page)
232{ 264{
233 if (unlikely(PageCompound(page))) 265 if (unlikely(PageCompound(page)))
@@ -441,7 +473,7 @@ void rotate_reclaimable_page(struct page *page)
441 473
442 page_cache_get(page); 474 page_cache_get(page);
443 local_irq_save(flags); 475 local_irq_save(flags);
444 pvec = &__get_cpu_var(lru_rotate_pvecs); 476 pvec = this_cpu_ptr(&lru_rotate_pvecs);
445 if (!pagevec_add(pvec, page)) 477 if (!pagevec_add(pvec, page))
446 pagevec_move_tail(pvec); 478 pagevec_move_tail(pvec);
447 local_irq_restore(flags); 479 local_irq_restore(flags);
@@ -583,12 +615,17 @@ void mark_page_accessed(struct page *page)
583EXPORT_SYMBOL(mark_page_accessed); 615EXPORT_SYMBOL(mark_page_accessed);
584 616
585/* 617/*
586 * Queue the page for addition to the LRU via pagevec. The decision on whether 618 * Used to mark_page_accessed(page) that is not visible yet and when it is
587 * to add the page to the [in]active [file|anon] list is deferred until the 619 * still safe to use non-atomic ops
588 * pagevec is drained. This gives a chance for the caller of __lru_cache_add()
589 * to have the page added to the active list using mark_page_accessed().
590 */ 620 */
591void __lru_cache_add(struct page *page) 621void init_page_accessed(struct page *page)
622{
623 if (!PageReferenced(page))
624 __SetPageReferenced(page);
625}
626EXPORT_SYMBOL(init_page_accessed);
627
628static void __lru_cache_add(struct page *page)
592{ 629{
593 struct pagevec *pvec = &get_cpu_var(lru_add_pvec); 630 struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
594 631
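
The new init_page_accessed() helper sets PG_referenced with a non-atomic __SetPageReferenced(), which is only safe because the page is not yet visible to other CPUs. Below is a minimal userspace sketch of that initialise-before-publish rule using C11 atomics; all names are invented and this is not the kernel's implementation.

/*
 * Plain (non-atomic) stores are fine as long as the object is not yet
 * reachable by other threads; the release store that publishes the
 * pointer orders the earlier writes before it.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_page {
        bool referenced;                        /* stand-in for PG_referenced */
};

static _Atomic(struct fake_page *) published;

static void publish(struct fake_page *page)
{
        page->referenced = true;                /* non-atomic: page not visible yet */
        atomic_store_explicit(&published, page,
                              memory_order_release);    /* now others may see it */
}

int main(void)
{
        static struct fake_page page;

        publish(&page);
        printf("referenced=%d\n",
               atomic_load_explicit(&published, memory_order_acquire)->referenced);
        return 0;
}
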
@@ -598,11 +635,34 @@ void __lru_cache_add(struct page *page)
598 pagevec_add(pvec, page); 635 pagevec_add(pvec, page);
599 put_cpu_var(lru_add_pvec); 636 put_cpu_var(lru_add_pvec);
600} 637}
601EXPORT_SYMBOL(__lru_cache_add); 638
639/**
640 * lru_cache_add: add a page to the page lists
641 * @page: the page to add
642 */
643void lru_cache_add_anon(struct page *page)
644{
645 if (PageActive(page))
646 ClearPageActive(page);
647 __lru_cache_add(page);
648}
649
650void lru_cache_add_file(struct page *page)
651{
652 if (PageActive(page))
653 ClearPageActive(page);
654 __lru_cache_add(page);
655}
656EXPORT_SYMBOL(lru_cache_add_file);
602 657
603/** 658/**
604 * lru_cache_add - add a page to a page list 659 * lru_cache_add - add a page to a page list
605 * @page: the page to be added to the LRU. 660 * @page: the page to be added to the LRU.
661 *
662 * Queue the page for addition to the LRU via pagevec. The decision on whether
663 * to add the page to the [in]active [file|anon] list is deferred until the
664 * pagevec is drained. This gives a chance for the caller of lru_cache_add()
665 * to have the page added to the active list using mark_page_accessed().
606 */ 666 */
607void lru_cache_add(struct page *page) 667void lru_cache_add(struct page *page)
608{ 668{
@@ -813,7 +873,7 @@ void lru_add_drain_all(void)
813 * grabbed the page via the LRU. If it did, give up: shrink_inactive_list() 873 * grabbed the page via the LRU. If it did, give up: shrink_inactive_list()
814 * will free it. 874 * will free it.
815 */ 875 */
816void release_pages(struct page **pages, int nr, int cold) 876void release_pages(struct page **pages, int nr, bool cold)
817{ 877{
818 int i; 878 int i;
819 LIST_HEAD(pages_to_free); 879 LIST_HEAD(pages_to_free);
@@ -854,7 +914,7 @@ void release_pages(struct page **pages, int nr, int cold)
854 } 914 }
855 915
856 /* Clear Active bit in case of parallel mark_page_accessed */ 916 /* Clear Active bit in case of parallel mark_page_accessed */
857 ClearPageActive(page); 917 __ClearPageActive(page);
858 918
859 list_add(&page->lru, &pages_to_free); 919 list_add(&page->lru, &pages_to_free);
860 } 920 }
diff --git a/mm/swap_state.c b/mm/swap_state.c
index e76ace30d436..2972eee184a4 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -270,7 +270,7 @@ void free_pages_and_swap_cache(struct page **pages, int nr)
270 270
271 for (i = 0; i < todo; i++) 271 for (i = 0; i < todo; i++)
272 free_swap_cache(pagep[i]); 272 free_swap_cache(pagep[i]);
273 release_pages(pagep, todo, 0); 273 release_pages(pagep, todo, false);
274 pagep += todo; 274 pagep += todo;
275 nr -= todo; 275 nr -= todo;
276 } 276 }
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 4a7f7e6992b6..4c524f7bd0bf 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -51,14 +51,32 @@ atomic_long_t nr_swap_pages;
51/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ 51/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
52long total_swap_pages; 52long total_swap_pages;
53static int least_priority; 53static int least_priority;
54static atomic_t highest_priority_index = ATOMIC_INIT(-1);
55 54
56static const char Bad_file[] = "Bad swap file entry "; 55static const char Bad_file[] = "Bad swap file entry ";
57static const char Unused_file[] = "Unused swap file entry "; 56static const char Unused_file[] = "Unused swap file entry ";
58static const char Bad_offset[] = "Bad swap offset entry "; 57static const char Bad_offset[] = "Bad swap offset entry ";
59static const char Unused_offset[] = "Unused swap offset entry "; 58static const char Unused_offset[] = "Unused swap offset entry ";
60 59
61struct swap_list_t swap_list = {-1, -1}; 60/*
61 * all active swap_info_structs
62 * protected with swap_lock, and ordered by priority.
63 */
64PLIST_HEAD(swap_active_head);
65
66/*
67 * all available (active, not full) swap_info_structs
68 * protected with swap_avail_lock, ordered by priority.
69 * This is used by get_swap_page() instead of swap_active_head
70 * because swap_active_head includes all swap_info_structs,
71 * but get_swap_page() doesn't need to look at full ones.
72 * This uses its own lock instead of swap_lock because when a
73 * swap_info_struct changes between not-full/full, it needs to
74 * add/remove itself to/from this list, but the swap_info_struct->lock
75 * is held and the locking order requires swap_lock to be taken
76 * before any swap_info_struct->lock.
77 */
78static PLIST_HEAD(swap_avail_head);
79static DEFINE_SPINLOCK(swap_avail_lock);
62 80
63struct swap_info_struct *swap_info[MAX_SWAPFILES]; 81struct swap_info_struct *swap_info[MAX_SWAPFILES];
64 82
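
The comment block above introduces two priority-ordered plists: swap_active_head holds every swap device, while swap_avail_head holds only the non-full ones, so get_swap_page() never has to skip full devices. The toy userspace model below illustrates that split; it assumes the plist convention (noted in the _enable_swap_info() hunk further down) of storing the negated swap priority so the lowest key is the preferred device. It is a sketch, not kernel code.

#include <stdbool.h>
#include <stdio.h>

struct toy_swap_dev {
        const char *name;
        int prio;                       /* swap priority, higher = preferred */
        bool full;                      /* full devices are off the "avail" list */
};

/* pick the preferred device: lowest negated priority that is not full */
static struct toy_swap_dev *toy_get_swap_dev(struct toy_swap_dev *devs, int n)
{
        struct toy_swap_dev *best = NULL;

        for (int i = 0; i < n; i++) {
                if (devs[i].full)
                        continue;               /* not on swap_avail_head */
                if (!best || -devs[i].prio < -best->prio)
                        best = &devs[i];        /* lower plist key wins */
        }
        return best;
}

int main(void)
{
        struct toy_swap_dev devs[] = {
                { "fast-ssd",  5, true  },      /* full: allocation skips it */
                { "slow-disk", 1, false },
        };
        struct toy_swap_dev *d = toy_get_swap_dev(devs, 2);

        printf("allocating from %s\n", d ? d->name : "nothing");
        return 0;
}
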
@@ -505,13 +523,10 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
505 /* 523 /*
506 * If seek is expensive, start searching for new cluster from 524 * If seek is expensive, start searching for new cluster from
507 * start of partition, to minimize the span of allocated swap. 525 * start of partition, to minimize the span of allocated swap.
508 * But if seek is cheap, search from our current position, so 526 * If seek is cheap, that is the SWP_SOLIDSTATE si->cluster_info
509 * that swap is allocated from all over the partition: if the 527 * case, just handled by scan_swap_map_try_ssd_cluster() above.
510 * Flash Translation Layer only remaps within limited zones,
511 * we don't want to wear out the first zone too quickly.
512 */ 528 */
513 if (!(si->flags & SWP_SOLIDSTATE)) 529 scan_base = offset = si->lowest_bit;
514 scan_base = offset = si->lowest_bit;
515 last_in_cluster = offset + SWAPFILE_CLUSTER - 1; 530 last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
516 531
517 /* Locate the first empty (unaligned) cluster */ 532 /* Locate the first empty (unaligned) cluster */
@@ -531,26 +546,6 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
531 } 546 }
532 } 547 }
533 548
534 offset = si->lowest_bit;
535 last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
536
537 /* Locate the first empty (unaligned) cluster */
538 for (; last_in_cluster < scan_base; offset++) {
539 if (si->swap_map[offset])
540 last_in_cluster = offset + SWAPFILE_CLUSTER;
541 else if (offset == last_in_cluster) {
542 spin_lock(&si->lock);
543 offset -= SWAPFILE_CLUSTER - 1;
544 si->cluster_next = offset;
545 si->cluster_nr = SWAPFILE_CLUSTER - 1;
546 goto checks;
547 }
548 if (unlikely(--latency_ration < 0)) {
549 cond_resched();
550 latency_ration = LATENCY_LIMIT;
551 }
552 }
553
554 offset = scan_base; 549 offset = scan_base;
555 spin_lock(&si->lock); 550 spin_lock(&si->lock);
556 si->cluster_nr = SWAPFILE_CLUSTER - 1; 551 si->cluster_nr = SWAPFILE_CLUSTER - 1;
@@ -591,6 +586,9 @@ checks:
591 if (si->inuse_pages == si->pages) { 586 if (si->inuse_pages == si->pages) {
592 si->lowest_bit = si->max; 587 si->lowest_bit = si->max;
593 si->highest_bit = 0; 588 si->highest_bit = 0;
589 spin_lock(&swap_avail_lock);
590 plist_del(&si->avail_list, &swap_avail_head);
591 spin_unlock(&swap_avail_lock);
594 } 592 }
595 si->swap_map[offset] = usage; 593 si->swap_map[offset] = usage;
596 inc_cluster_info_page(si, si->cluster_info, offset); 594 inc_cluster_info_page(si, si->cluster_info, offset);
@@ -640,71 +638,65 @@ no_page:
640 638
641swp_entry_t get_swap_page(void) 639swp_entry_t get_swap_page(void)
642{ 640{
643 struct swap_info_struct *si; 641 struct swap_info_struct *si, *next;
644 pgoff_t offset; 642 pgoff_t offset;
645 int type, next;
646 int wrapped = 0;
647 int hp_index;
648 643
649 spin_lock(&swap_lock);
650 if (atomic_long_read(&nr_swap_pages) <= 0) 644 if (atomic_long_read(&nr_swap_pages) <= 0)
651 goto noswap; 645 goto noswap;
652 atomic_long_dec(&nr_swap_pages); 646 atomic_long_dec(&nr_swap_pages);
653 647
654 for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { 648 spin_lock(&swap_avail_lock);
655 hp_index = atomic_xchg(&highest_priority_index, -1);
656 /*
657 * highest_priority_index records current highest priority swap
658 * type which just frees swap entries. If its priority is
659 * higher than that of swap_list.next swap type, we use it. It
660 * isn't protected by swap_lock, so it can be an invalid value
661 * if the corresponding swap type is swapoff. We double check
662 * the flags here. It's even possible the swap type is swapoff
663 * and swapon again and its priority is changed. In such rare
664 * case, low prority swap type might be used, but eventually
665 * high priority swap will be used after several rounds of
666 * swap.
667 */
668 if (hp_index != -1 && hp_index != type &&
669 swap_info[type]->prio < swap_info[hp_index]->prio &&
670 (swap_info[hp_index]->flags & SWP_WRITEOK)) {
671 type = hp_index;
672 swap_list.next = type;
673 }
674
675 si = swap_info[type];
676 next = si->next;
677 if (next < 0 ||
678 (!wrapped && si->prio != swap_info[next]->prio)) {
679 next = swap_list.head;
680 wrapped++;
681 }
682 649
650start_over:
651 plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) {
652 /* requeue si to after same-priority siblings */
653 plist_requeue(&si->avail_list, &swap_avail_head);
654 spin_unlock(&swap_avail_lock);
683 spin_lock(&si->lock); 655 spin_lock(&si->lock);
684 if (!si->highest_bit) { 656 if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
685 spin_unlock(&si->lock); 657 spin_lock(&swap_avail_lock);
686 continue; 658 if (plist_node_empty(&si->avail_list)) {
687 } 659 spin_unlock(&si->lock);
688 if (!(si->flags & SWP_WRITEOK)) { 660 goto nextsi;
661 }
662 WARN(!si->highest_bit,
663 "swap_info %d in list but !highest_bit\n",
664 si->type);
665 WARN(!(si->flags & SWP_WRITEOK),
666 "swap_info %d in list but !SWP_WRITEOK\n",
667 si->type);
668 plist_del(&si->avail_list, &swap_avail_head);
689 spin_unlock(&si->lock); 669 spin_unlock(&si->lock);
690 continue; 670 goto nextsi;
691 } 671 }
692 672
693 swap_list.next = next;
694
695 spin_unlock(&swap_lock);
696 /* This is called for allocating swap entry for cache */ 673 /* This is called for allocating swap entry for cache */
697 offset = scan_swap_map(si, SWAP_HAS_CACHE); 674 offset = scan_swap_map(si, SWAP_HAS_CACHE);
698 spin_unlock(&si->lock); 675 spin_unlock(&si->lock);
699 if (offset) 676 if (offset)
700 return swp_entry(type, offset); 677 return swp_entry(si->type, offset);
701 spin_lock(&swap_lock); 678 pr_debug("scan_swap_map of si %d failed to find offset\n",
702 next = swap_list.next; 679 si->type);
680 spin_lock(&swap_avail_lock);
681nextsi:
682 /*
683 * if we got here, it's likely that si was almost full before,
684 * and since scan_swap_map() can drop the si->lock, multiple
685 * callers probably all tried to get a page from the same si
686 * and it filled up before we could get one; or, the si filled
687 * up between us dropping swap_avail_lock and taking si->lock.
688 * Since we dropped the swap_avail_lock, the swap_avail_head
689 * list may have been modified; so if next is still in the
690 * swap_avail_head list then try it, otherwise start over.
691 */
692 if (plist_node_empty(&next->avail_list))
693 goto start_over;
703 } 694 }
704 695
696 spin_unlock(&swap_avail_lock);
697
705 atomic_long_inc(&nr_swap_pages); 698 atomic_long_inc(&nr_swap_pages);
706noswap: 699noswap:
707 spin_unlock(&swap_lock);
708 return (swp_entry_t) {0}; 700 return (swp_entry_t) {0};
709} 701}
710 702
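
The rewritten get_swap_page() requeues the chosen swap_info_struct behind its same-priority siblings with plist_requeue(), so devices that share a priority are used round-robin. The sketch below models only that rotation with an array standing in for the plist; it is not kernel code.

#include <stdio.h>

#define NDEV 3

struct toy_dev { const char *name; int prio; };

/* rotate devs[0] behind the run of devices that share its priority */
static void toy_requeue(struct toy_dev *devs, int n)
{
        struct toy_dev first = devs[0];
        int i = 1;

        while (i < n && devs[i].prio == first.prio) {
                devs[i - 1] = devs[i];
                i++;
        }
        devs[i - 1] = first;
}

int main(void)
{
        struct toy_dev devs[NDEV] = {
                { "swap-a", 3 }, { "swap-b", 3 }, { "swap-c", 1 },
        };

        /* prints a, b, a, b: equal priorities alternate, swap-c is untouched */
        for (int round = 0; round < 4; round++) {
                printf("round %d: allocate from %s\n", round, devs[0].name);
                toy_requeue(devs, NDEV);
        }
        return 0;
}
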
@@ -766,27 +758,6 @@ out:
766 return NULL; 758 return NULL;
767} 759}
768 760
769/*
770 * This swap type frees swap entry, check if it is the highest priority swap
771 * type which just frees swap entry. get_swap_page() uses
772 * highest_priority_index to search highest priority swap type. The
773 * swap_info_struct.lock can't protect us if there are multiple swap types
774 * active, so we use atomic_cmpxchg.
775 */
776static void set_highest_priority_index(int type)
777{
778 int old_hp_index, new_hp_index;
779
780 do {
781 old_hp_index = atomic_read(&highest_priority_index);
782 if (old_hp_index != -1 &&
783 swap_info[old_hp_index]->prio >= swap_info[type]->prio)
784 break;
785 new_hp_index = type;
786 } while (atomic_cmpxchg(&highest_priority_index,
787 old_hp_index, new_hp_index) != old_hp_index);
788}
789
790static unsigned char swap_entry_free(struct swap_info_struct *p, 761static unsigned char swap_entry_free(struct swap_info_struct *p,
791 swp_entry_t entry, unsigned char usage) 762 swp_entry_t entry, unsigned char usage)
792{ 763{
@@ -828,9 +799,18 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
828 dec_cluster_info_page(p, p->cluster_info, offset); 799 dec_cluster_info_page(p, p->cluster_info, offset);
829 if (offset < p->lowest_bit) 800 if (offset < p->lowest_bit)
830 p->lowest_bit = offset; 801 p->lowest_bit = offset;
831 if (offset > p->highest_bit) 802 if (offset > p->highest_bit) {
803 bool was_full = !p->highest_bit;
832 p->highest_bit = offset; 804 p->highest_bit = offset;
833 set_highest_priority_index(p->type); 805 if (was_full && (p->flags & SWP_WRITEOK)) {
806 spin_lock(&swap_avail_lock);
807 WARN_ON(!plist_node_empty(&p->avail_list));
808 if (plist_node_empty(&p->avail_list))
809 plist_add(&p->avail_list,
810 &swap_avail_head);
811 spin_unlock(&swap_avail_lock);
812 }
813 }
834 atomic_long_inc(&nr_swap_pages); 814 atomic_long_inc(&nr_swap_pages);
835 p->inuse_pages--; 815 p->inuse_pages--;
836 frontswap_invalidate_page(p->type, offset); 816 frontswap_invalidate_page(p->type, offset);
@@ -1765,30 +1745,37 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
1765 unsigned char *swap_map, 1745 unsigned char *swap_map,
1766 struct swap_cluster_info *cluster_info) 1746 struct swap_cluster_info *cluster_info)
1767{ 1747{
1768 int i, prev;
1769
1770 if (prio >= 0) 1748 if (prio >= 0)
1771 p->prio = prio; 1749 p->prio = prio;
1772 else 1750 else
1773 p->prio = --least_priority; 1751 p->prio = --least_priority;
1752 /*
1753 * the plist prio is negated because plist ordering is
1754 * low-to-high, while swap ordering is high-to-low
1755 */
1756 p->list.prio = -p->prio;
1757 p->avail_list.prio = -p->prio;
1774 p->swap_map = swap_map; 1758 p->swap_map = swap_map;
1775 p->cluster_info = cluster_info; 1759 p->cluster_info = cluster_info;
1776 p->flags |= SWP_WRITEOK; 1760 p->flags |= SWP_WRITEOK;
1777 atomic_long_add(p->pages, &nr_swap_pages); 1761 atomic_long_add(p->pages, &nr_swap_pages);
1778 total_swap_pages += p->pages; 1762 total_swap_pages += p->pages;
1779 1763
1780 /* insert swap space into swap_list: */ 1764 assert_spin_locked(&swap_lock);
1781 prev = -1; 1765 /*
1782 for (i = swap_list.head; i >= 0; i = swap_info[i]->next) { 1766 * both lists are plists, and thus priority ordered.
1783 if (p->prio >= swap_info[i]->prio) 1767 * swap_active_head needs to be priority ordered for swapoff(),
1784 break; 1768 * which on removal of any swap_info_struct with an auto-assigned
1785 prev = i; 1769 * (i.e. negative) priority increments the auto-assigned priority
1786 } 1770 * of any lower-priority swap_info_structs.
1787 p->next = i; 1771 * swap_avail_head needs to be priority ordered for get_swap_page(),
1788 if (prev < 0) 1772 * which allocates swap pages from the highest available priority
1789 swap_list.head = swap_list.next = p->type; 1773 * swap_info_struct.
1790 else 1774 */
1791 swap_info[prev]->next = p->type; 1775 plist_add(&p->list, &swap_active_head);
1776 spin_lock(&swap_avail_lock);
1777 plist_add(&p->avail_list, &swap_avail_head);
1778 spin_unlock(&swap_avail_lock);
1792} 1779}
1793 1780
1794static void enable_swap_info(struct swap_info_struct *p, int prio, 1781static void enable_swap_info(struct swap_info_struct *p, int prio,
@@ -1823,8 +1810,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1823 struct address_space *mapping; 1810 struct address_space *mapping;
1824 struct inode *inode; 1811 struct inode *inode;
1825 struct filename *pathname; 1812 struct filename *pathname;
1826 int i, type, prev; 1813 int err, found = 0;
1827 int err;
1828 unsigned int old_block_size; 1814 unsigned int old_block_size;
1829 1815
1830 if (!capable(CAP_SYS_ADMIN)) 1816 if (!capable(CAP_SYS_ADMIN))
@@ -1842,17 +1828,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1842 goto out; 1828 goto out;
1843 1829
1844 mapping = victim->f_mapping; 1830 mapping = victim->f_mapping;
1845 prev = -1;
1846 spin_lock(&swap_lock); 1831 spin_lock(&swap_lock);
1847 for (type = swap_list.head; type >= 0; type = swap_info[type]->next) { 1832 plist_for_each_entry(p, &swap_active_head, list) {
1848 p = swap_info[type];
1849 if (p->flags & SWP_WRITEOK) { 1833 if (p->flags & SWP_WRITEOK) {
1850 if (p->swap_file->f_mapping == mapping) 1834 if (p->swap_file->f_mapping == mapping) {
1835 found = 1;
1851 break; 1836 break;
1837 }
1852 } 1838 }
1853 prev = type;
1854 } 1839 }
1855 if (type < 0) { 1840 if (!found) {
1856 err = -EINVAL; 1841 err = -EINVAL;
1857 spin_unlock(&swap_lock); 1842 spin_unlock(&swap_lock);
1858 goto out_dput; 1843 goto out_dput;
@@ -1864,20 +1849,21 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1864 spin_unlock(&swap_lock); 1849 spin_unlock(&swap_lock);
1865 goto out_dput; 1850 goto out_dput;
1866 } 1851 }
1867 if (prev < 0) 1852 spin_lock(&swap_avail_lock);
1868 swap_list.head = p->next; 1853 plist_del(&p->avail_list, &swap_avail_head);
1869 else 1854 spin_unlock(&swap_avail_lock);
1870 swap_info[prev]->next = p->next;
1871 if (type == swap_list.next) {
1872 /* just pick something that's safe... */
1873 swap_list.next = swap_list.head;
1874 }
1875 spin_lock(&p->lock); 1855 spin_lock(&p->lock);
1876 if (p->prio < 0) { 1856 if (p->prio < 0) {
1877 for (i = p->next; i >= 0; i = swap_info[i]->next) 1857 struct swap_info_struct *si = p;
1878 swap_info[i]->prio = p->prio--; 1858
1859 plist_for_each_entry_continue(si, &swap_active_head, list) {
1860 si->prio++;
1861 si->list.prio--;
1862 si->avail_list.prio--;
1863 }
1879 least_priority++; 1864 least_priority++;
1880 } 1865 }
1866 plist_del(&p->list, &swap_active_head);
1881 atomic_long_sub(p->pages, &nr_swap_pages); 1867 atomic_long_sub(p->pages, &nr_swap_pages);
1882 total_swap_pages -= p->pages; 1868 total_swap_pages -= p->pages;
1883 p->flags &= ~SWP_WRITEOK; 1869 p->flags &= ~SWP_WRITEOK;
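
When swapoff removes a device with an auto-assigned (negative) priority, the hunk above walks the rest of swap_active_head and increments the priorities of the lower-priority devices so the auto-assigned values stay packed, and least_priority is handed back. The toy illustration below models that renumbering in userspace, with the devices kept in priority order.

#include <stdio.h>

int main(void)
{
        int prio[] = { -1, -2, -3 };    /* three auto-prioritised devices, ordered */
        int least_priority = -3;        /* next auto priority would be -4 */
        int remove = 1;                 /* swapoff the device with prio -2 */

        for (int i = remove + 1; i < 3; i++)
                prio[i]++;              /* -3 becomes -2 */
        least_priority++;               /* back to -2 */

        printf("remaining: %d %d, least_priority=%d\n",
               prio[0], prio[2], least_priority);
        return 0;
}
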
@@ -1885,7 +1871,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1885 spin_unlock(&swap_lock); 1871 spin_unlock(&swap_lock);
1886 1872
1887 set_current_oom_origin(); 1873 set_current_oom_origin();
1888 err = try_to_unuse(type, false, 0); /* force all pages to be unused */ 1874 err = try_to_unuse(p->type, false, 0); /* force unuse all pages */
1889 clear_current_oom_origin(); 1875 clear_current_oom_origin();
1890 1876
1891 if (err) { 1877 if (err) {
@@ -1926,7 +1912,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1926 frontswap_map = frontswap_map_get(p); 1912 frontswap_map = frontswap_map_get(p);
1927 spin_unlock(&p->lock); 1913 spin_unlock(&p->lock);
1928 spin_unlock(&swap_lock); 1914 spin_unlock(&swap_lock);
1929 frontswap_invalidate_area(type); 1915 frontswap_invalidate_area(p->type);
1930 frontswap_map_set(p, NULL); 1916 frontswap_map_set(p, NULL);
1931 mutex_unlock(&swapon_mutex); 1917 mutex_unlock(&swapon_mutex);
1932 free_percpu(p->percpu_cluster); 1918 free_percpu(p->percpu_cluster);
@@ -1935,7 +1921,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1935 vfree(cluster_info); 1921 vfree(cluster_info);
1936 vfree(frontswap_map); 1922 vfree(frontswap_map);
1937 /* Destroy swap account information */ 1923 /* Destroy swap account information */
1938 swap_cgroup_swapoff(type); 1924 swap_cgroup_swapoff(p->type);
1939 1925
1940 inode = mapping->host; 1926 inode = mapping->host;
1941 if (S_ISBLK(inode->i_mode)) { 1927 if (S_ISBLK(inode->i_mode)) {
@@ -2142,8 +2128,9 @@ static struct swap_info_struct *alloc_swap_info(void)
2142 */ 2128 */
2143 } 2129 }
2144 INIT_LIST_HEAD(&p->first_swap_extent.list); 2130 INIT_LIST_HEAD(&p->first_swap_extent.list);
2131 plist_node_init(&p->list, 0);
2132 plist_node_init(&p->avail_list, 0);
2145 p->flags = SWP_USED; 2133 p->flags = SWP_USED;
2146 p->next = -1;
2147 spin_unlock(&swap_lock); 2134 spin_unlock(&swap_lock);
2148 spin_lock_init(&p->lock); 2135 spin_lock_init(&p->lock);
2149 2136
diff --git a/mm/vmacache.c b/mm/vmacache.c
index 1037a3bab505..9f25af825dec 100644
--- a/mm/vmacache.c
+++ b/mm/vmacache.c
@@ -17,6 +17,16 @@ void vmacache_flush_all(struct mm_struct *mm)
17{ 17{
18 struct task_struct *g, *p; 18 struct task_struct *g, *p;
19 19
20 /*
21 * Single threaded tasks need not iterate the entire
22 * list of processes. We can avoid the flushing as well
23 * since the mm's seqnum was increased and we don't have
24 * to worry about other threads' seqnum. Current's
25 * flush will occur upon the next lookup.
26 */
27 if (atomic_read(&mm->mm_users) == 1)
28 return;
29
20 rcu_read_lock(); 30 rcu_read_lock();
21 for_each_process_thread(g, p) { 31 for_each_process_thread(g, p) {
22 /* 32 /*
@@ -78,6 +88,8 @@ struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr)
78 if (!vmacache_valid(mm)) 88 if (!vmacache_valid(mm))
79 return NULL; 89 return NULL;
80 90
91 count_vm_vmacache_event(VMACACHE_FIND_CALLS);
92
81 for (i = 0; i < VMACACHE_SIZE; i++) { 93 for (i = 0; i < VMACACHE_SIZE; i++) {
82 struct vm_area_struct *vma = current->vmacache[i]; 94 struct vm_area_struct *vma = current->vmacache[i];
83 95
@@ -85,8 +97,10 @@ struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr)
85 continue; 97 continue;
86 if (WARN_ON_ONCE(vma->vm_mm != mm)) 98 if (WARN_ON_ONCE(vma->vm_mm != mm))
87 break; 99 break;
88 if (vma->vm_start <= addr && vma->vm_end > addr) 100 if (vma->vm_start <= addr && vma->vm_end > addr) {
101 count_vm_vmacache_event(VMACACHE_FIND_HITS);
89 return vma; 102 return vma;
103 }
90 } 104 }
91 105
92 return NULL; 106 return NULL;
@@ -102,11 +116,15 @@ struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm,
102 if (!vmacache_valid(mm)) 116 if (!vmacache_valid(mm))
103 return NULL; 117 return NULL;
104 118
119 count_vm_vmacache_event(VMACACHE_FIND_CALLS);
120
105 for (i = 0; i < VMACACHE_SIZE; i++) { 121 for (i = 0; i < VMACACHE_SIZE; i++) {
106 struct vm_area_struct *vma = current->vmacache[i]; 122 struct vm_area_struct *vma = current->vmacache[i];
107 123
108 if (vma && vma->vm_start == start && vma->vm_end == end) 124 if (vma && vma->vm_start == start && vma->vm_end == end) {
125 count_vm_vmacache_event(VMACACHE_FIND_HITS);
109 return vma; 126 return vma;
127 }
110 } 128 }
111 129
112 return NULL; 130 return NULL;
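
The mm/vmacache.c hunks add VMACACHE_FIND_CALLS/VMACACHE_FIND_HITS accounting around the small per-thread cache lookup. The userspace sketch below shows only the lookup-plus-counters shape with invented names; it leaves out the seqnum validation and everything else the kernel does.

#include <stddef.h>
#include <stdio.h>

#define CACHE_SIZE 4

struct toy_vma { unsigned long start, end; };

static struct toy_vma *cache[CACHE_SIZE];
static unsigned long find_calls, find_hits;

static struct toy_vma *toy_cache_find(unsigned long addr)
{
        find_calls++;                           /* every lookup counts as a call */
        for (int i = 0; i < CACHE_SIZE; i++) {
                struct toy_vma *vma = cache[i];

                if (vma && vma->start <= addr && vma->end > addr) {
                        find_hits++;            /* only matches count as hits */
                        return vma;
                }
        }
        return NULL;                            /* caller falls back to the full lookup */
}

int main(void)
{
        static struct toy_vma vma = { 0x1000, 0x2000 };

        cache[0] = &vma;
        toy_cache_find(0x1800);                 /* hit */
        toy_cache_find(0x9000);                 /* miss */
        printf("calls=%lu hits=%lu\n", find_calls, find_hits);
        return 0;
}
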
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index bf233b283319..f64632b67196 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1268,6 +1268,7 @@ void unmap_kernel_range(unsigned long addr, unsigned long size)
1268 vunmap_page_range(addr, end); 1268 vunmap_page_range(addr, end);
1269 flush_tlb_kernel_range(addr, end); 1269 flush_tlb_kernel_range(addr, end);
1270} 1270}
1271EXPORT_SYMBOL_GPL(unmap_kernel_range);
1271 1272
1272int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) 1273int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
1273{ 1274{
@@ -1496,7 +1497,7 @@ void vfree(const void *addr)
1496 if (!addr) 1497 if (!addr)
1497 return; 1498 return;
1498 if (unlikely(in_interrupt())) { 1499 if (unlikely(in_interrupt())) {
1499 struct vfree_deferred *p = &__get_cpu_var(vfree_deferred); 1500 struct vfree_deferred *p = this_cpu_ptr(&vfree_deferred);
1500 if (llist_add((struct llist_node *)addr, &p->list)) 1501 if (llist_add((struct llist_node *)addr, &p->list))
1501 schedule_work(&p->wq); 1502 schedule_work(&p->wq);
1502 } else 1503 } else
@@ -2619,19 +2620,19 @@ static int s_show(struct seq_file *m, void *p)
2619 seq_printf(m, " phys=%llx", (unsigned long long)v->phys_addr); 2620 seq_printf(m, " phys=%llx", (unsigned long long)v->phys_addr);
2620 2621
2621 if (v->flags & VM_IOREMAP) 2622 if (v->flags & VM_IOREMAP)
2622 seq_printf(m, " ioremap"); 2623 seq_puts(m, " ioremap");
2623 2624
2624 if (v->flags & VM_ALLOC) 2625 if (v->flags & VM_ALLOC)
2625 seq_printf(m, " vmalloc"); 2626 seq_puts(m, " vmalloc");
2626 2627
2627 if (v->flags & VM_MAP) 2628 if (v->flags & VM_MAP)
2628 seq_printf(m, " vmap"); 2629 seq_puts(m, " vmap");
2629 2630
2630 if (v->flags & VM_USERMAP) 2631 if (v->flags & VM_USERMAP)
2631 seq_printf(m, " user"); 2632 seq_puts(m, " user");
2632 2633
2633 if (v->flags & VM_VPAGES) 2634 if (v->flags & VM_VPAGES)
2634 seq_printf(m, " vpages"); 2635 seq_puts(m, " vpages");
2635 2636
2636 show_numa_info(m, v); 2637 show_numa_info(m, v);
2637 seq_putc(m, '\n'); 2638 seq_putc(m, '\n');
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 32c661d66a45..9149444f947d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -324,7 +324,7 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
324 else 324 else
325 new_nr = atomic_long_read(&shrinker->nr_deferred[nid]); 325 new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
326 326
327 trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr); 327 trace_mm_shrink_slab_end(shrinker, nid, freed, nr, new_nr, total_scan);
328 return freed; 328 return freed;
329} 329}
330 330
@@ -1121,7 +1121,7 @@ keep:
1121 VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page); 1121 VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
1122 } 1122 }
1123 1123
1124 free_hot_cold_page_list(&free_pages, 1); 1124 free_hot_cold_page_list(&free_pages, true);
1125 1125
1126 list_splice(&ret_pages, page_list); 1126 list_splice(&ret_pages, page_list);
1127 count_vm_events(PGACTIVATE, pgactivate); 1127 count_vm_events(PGACTIVATE, pgactivate);
@@ -1439,6 +1439,19 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
1439} 1439}
1440 1440
1441/* 1441/*
1442 * If a kernel thread (such as nfsd for loop-back mounts) services
1443 * a backing device by writing to the page cache it sets PF_LESS_THROTTLE.
1444 * In that case we should only throttle if the backing device it is
1445 * writing to is congested. In other cases it is safe to throttle.
1446 */
1447static int current_may_throttle(void)
1448{
1449 return !(current->flags & PF_LESS_THROTTLE) ||
1450 current->backing_dev_info == NULL ||
1451 bdi_write_congested(current->backing_dev_info);
1452}
1453
1454/*
1442 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number 1455 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
1443 * of reclaimed pages 1456 * of reclaimed pages
1444 */ 1457 */
@@ -1519,7 +1532,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1519 1532
1520 spin_unlock_irq(&zone->lru_lock); 1533 spin_unlock_irq(&zone->lru_lock);
1521 1534
1522 free_hot_cold_page_list(&page_list, 1); 1535 free_hot_cold_page_list(&page_list, true);
1523 1536
1524 /* 1537 /*
1525 * If reclaim is isolating dirty pages under writeback, it implies 1538 * If reclaim is isolating dirty pages under writeback, it implies
@@ -1566,7 +1579,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1566 * implies that pages are cycling through the LRU faster than 1579 * implies that pages are cycling through the LRU faster than
1567 * they are written so also forcibly stall. 1580 * they are written so also forcibly stall.
1568 */ 1581 */
1569 if (nr_unqueued_dirty == nr_taken || nr_immediate) 1582 if ((nr_unqueued_dirty == nr_taken || nr_immediate) &&
1583 current_may_throttle())
1570 congestion_wait(BLK_RW_ASYNC, HZ/10); 1584 congestion_wait(BLK_RW_ASYNC, HZ/10);
1571 } 1585 }
1572 1586
@@ -1575,7 +1589,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1575 * is congested. Allow kswapd to continue until it starts encountering 1589 * is congested. Allow kswapd to continue until it starts encountering
1576 * unqueued dirty pages or cycling through the LRU too quickly. 1590 * unqueued dirty pages or cycling through the LRU too quickly.
1577 */ 1591 */
1578 if (!sc->hibernation_mode && !current_is_kswapd()) 1592 if (!sc->hibernation_mode && !current_is_kswapd() &&
1593 current_may_throttle())
1579 wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); 1594 wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
1580 1595
1581 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, 1596 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
@@ -1740,7 +1755,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
1740 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); 1755 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
1741 spin_unlock_irq(&zone->lru_lock); 1756 spin_unlock_irq(&zone->lru_lock);
1742 1757
1743 free_hot_cold_page_list(&l_hold, 1); 1758 free_hot_cold_page_list(&l_hold, true);
1744} 1759}
1745 1760
1746#ifdef CONFIG_SWAP 1761#ifdef CONFIG_SWAP
@@ -1866,6 +1881,8 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
1866 bool force_scan = false; 1881 bool force_scan = false;
1867 unsigned long ap, fp; 1882 unsigned long ap, fp;
1868 enum lru_list lru; 1883 enum lru_list lru;
1884 bool some_scanned;
1885 int pass;
1869 1886
1870 /* 1887 /*
1871 * If the zone or memcg is small, nr[l] can be 0. This 1888 * If the zone or memcg is small, nr[l] can be 0. This
@@ -1989,39 +2006,49 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
1989 fraction[1] = fp; 2006 fraction[1] = fp;
1990 denominator = ap + fp + 1; 2007 denominator = ap + fp + 1;
1991out: 2008out:
1992 for_each_evictable_lru(lru) { 2009 some_scanned = false;
1993 int file = is_file_lru(lru); 2010 /* Only use force_scan on second pass. */
1994 unsigned long size; 2011 for (pass = 0; !some_scanned && pass < 2; pass++) {
1995 unsigned long scan; 2012 for_each_evictable_lru(lru) {
2013 int file = is_file_lru(lru);
2014 unsigned long size;
2015 unsigned long scan;
1996 2016
1997 size = get_lru_size(lruvec, lru); 2017 size = get_lru_size(lruvec, lru);
1998 scan = size >> sc->priority; 2018 scan = size >> sc->priority;
1999 2019
2000 if (!scan && force_scan) 2020 if (!scan && pass && force_scan)
2001 scan = min(size, SWAP_CLUSTER_MAX); 2021 scan = min(size, SWAP_CLUSTER_MAX);
2002 2022
2003 switch (scan_balance) { 2023 switch (scan_balance) {
2004 case SCAN_EQUAL: 2024 case SCAN_EQUAL:
2005 /* Scan lists relative to size */ 2025 /* Scan lists relative to size */
2006 break; 2026 break;
2007 case SCAN_FRACT: 2027 case SCAN_FRACT:
2028 /*
2029 * Scan types proportional to swappiness and
2030 * their relative recent reclaim efficiency.
2031 */
2032 scan = div64_u64(scan * fraction[file],
2033 denominator);
2034 break;
2035 case SCAN_FILE:
2036 case SCAN_ANON:
2037 /* Scan one type exclusively */
2038 if ((scan_balance == SCAN_FILE) != file)
2039 scan = 0;
2040 break;
2041 default:
2042 /* Look ma, no brain */
2043 BUG();
2044 }
2045 nr[lru] = scan;
2008 /* 2046 /*
2009 * Scan types proportional to swappiness and 2047 * Skip the second pass and don't force_scan,
2010 * their relative recent reclaim efficiency. 2048 * if we found something to scan.
2011 */ 2049 */
2012 scan = div64_u64(scan * fraction[file], denominator); 2050 some_scanned |= !!scan;
2013 break;
2014 case SCAN_FILE:
2015 case SCAN_ANON:
2016 /* Scan one type exclusively */
2017 if ((scan_balance == SCAN_FILE) != file)
2018 scan = 0;
2019 break;
2020 default:
2021 /* Look ma, no brain */
2022 BUG();
2023 } 2051 }
2024 nr[lru] = scan;
2025 } 2052 }
2026} 2053}
2027 2054
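
The reworked loop above computes each LRU's scan target as size >> priority, scales it by fraction/denominator in the SCAN_FRACT case, and applies the force_scan minimum only on a second pass when the first pass selected nothing. The simplified userspace model below reproduces just that arithmetic; it ignores the SCAN_FILE/SCAN_ANON cases and uses made-up numbers.

#include <stdio.h>

#define SWAP_CLUSTER_MAX 32UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

int main(void)
{
        unsigned long size[2] = { 100, 4000 };  /* anon, file LRU sizes */
        unsigned long fraction[2] = { 1, 3 };
        unsigned long denominator = 4;
        int priority = 12;                      /* DEF_PRIORITY */
        int force_scan = 1;
        unsigned long nr[2];
        int some_scanned = 0;

        for (int pass = 0; !some_scanned && pass < 2; pass++) {
                for (int lru = 0; lru < 2; lru++) {
                        unsigned long scan = size[lru] >> priority;

                        if (!scan && pass && force_scan)        /* only on pass 2 */
                                scan = min_ul(size[lru], SWAP_CLUSTER_MAX);
                        scan = scan * fraction[lru] / denominator;
                        nr[lru] = scan;
                        some_scanned |= !!scan;
                }
        }
        printf("scan targets: anon=%lu file=%lu\n", nr[0], nr[1]);
        return 0;
}
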
@@ -2037,13 +2064,27 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
2037 unsigned long nr_reclaimed = 0; 2064 unsigned long nr_reclaimed = 0;
2038 unsigned long nr_to_reclaim = sc->nr_to_reclaim; 2065 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
2039 struct blk_plug plug; 2066 struct blk_plug plug;
2040 bool scan_adjusted = false; 2067 bool scan_adjusted;
2041 2068
2042 get_scan_count(lruvec, sc, nr); 2069 get_scan_count(lruvec, sc, nr);
2043 2070
2044 /* Record the original scan target for proportional adjustments later */ 2071 /* Record the original scan target for proportional adjustments later */
2045 memcpy(targets, nr, sizeof(nr)); 2072 memcpy(targets, nr, sizeof(nr));
2046 2073
2074 /*
2075 * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal
2076 * event that can occur when there is little memory pressure e.g.
2077 * multiple streaming readers/writers. Hence, we do not abort scanning
2078 * when the requested number of pages are reclaimed when scanning at
2079 * DEF_PRIORITY on the assumption that the fact we are direct
2080 * reclaiming implies that kswapd is not keeping up and it is best to
2081 * do a batch of work at once. For memcg reclaim one check is made to
2082 * abort proportional reclaim if either the file or anon lru has already
2083 * dropped to zero at the first pass.
2084 */
2085 scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() &&
2086 sc->priority == DEF_PRIORITY);
2087
2047 blk_start_plug(&plug); 2088 blk_start_plug(&plug);
2048 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 2089 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
2049 nr[LRU_INACTIVE_FILE]) { 2090 nr[LRU_INACTIVE_FILE]) {
@@ -2064,17 +2105,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
2064 continue; 2105 continue;
2065 2106
2066 /* 2107 /*
2067 * For global direct reclaim, reclaim only the number of pages
2068 * requested. Less care is taken to scan proportionally as it
2069 * is more important to minimise direct reclaim stall latency
2070 * than it is to properly age the LRU lists.
2071 */
2072 if (global_reclaim(sc) && !current_is_kswapd())
2073 break;
2074
2075 /*
2076 * For kswapd and memcg, reclaim at least the number of pages 2108 * For kswapd and memcg, reclaim at least the number of pages
2077 * requested. Ensure that the anon and file LRUs shrink 2109 * requested. Ensure that the anon and file LRUs are scanned
2078 * proportionally to what was requested by get_scan_count(). We 2110
2079 * stop reclaiming one LRU and reduce the amount scanning 2111 * stop reclaiming one LRU and reduce the amount scanning
2080 * proportional to the original scan target. 2112 * proportional to the original scan target.
@@ -2082,6 +2114,15 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
2082 nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE]; 2114 nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
2083 nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON]; 2115 nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
2084 2116
2117 /*
2118 * It's just vindictive to attack the larger once the smaller
2119 * has gone to zero. And given the way we stop scanning the
2120 * smaller below, this makes sure that we only make one nudge
2121 * towards proportionality once we've got nr_to_reclaim.
2122 */
2123 if (!nr_file || !nr_anon)
2124 break;
2125
2085 if (nr_file > nr_anon) { 2126 if (nr_file > nr_anon) {
2086 unsigned long scan_target = targets[LRU_INACTIVE_ANON] + 2127 unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
2087 targets[LRU_ACTIVE_ANON] + 1; 2128 targets[LRU_ACTIVE_ANON] + 1;
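
Once nr_to_reclaim has been met and scanning was not aborted earlier, the code above stops scanning the smaller LRU and trims the larger one back towards the proportions recorded in targets[]. The userspace sketch below is a rough model of that idea with made-up numbers; the kernel does this per active/inactive list and with slightly different rounding.

#include <stdio.h>

int main(void)
{
        unsigned long targets_anon = 100, targets_file = 400;   /* original targets */
        unsigned long nr_anon = 60, nr_file = 330;              /* left when target met */

        /* percentage of the smaller (anon) target already completed */
        unsigned long done = (targets_anon - nr_anon) * 100 / targets_anon;

        nr_anon = 0;                    /* stop scanning the smaller LRU entirely */

        /* let the file LRU reach roughly the same completed percentage */
        unsigned long file_allowed = targets_file * done / 100;
        unsigned long file_scanned = targets_file - nr_file;

        nr_file = file_allowed > file_scanned ? file_allowed - file_scanned : 0;

        printf("remaining to scan: anon=%lu file=%lu\n", nr_anon, nr_file);
        return 0;
}
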
@@ -2268,9 +2309,8 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
2268 * there is a buffer of free pages available to give compaction 2309 * there is a buffer of free pages available to give compaction
2269 * a reasonable chance of completing and allocating the page 2310 * a reasonable chance of completing and allocating the page
2270 */ 2311 */
2271 balance_gap = min(low_wmark_pages(zone), 2312 balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
2272 (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / 2313 zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));
2273 KSWAPD_ZONE_BALANCE_GAP_RATIO);
2274 watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order); 2314 watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
2275 watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); 2315 watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
2276 2316
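
The balance_gap change above is purely cosmetic: DIV_ROUND_UP(n, d) expands to (n + d - 1) / d, so the new expression computes the same value as the removed open-coded version. A quick userspace check follows, assuming KSWAPD_ZONE_BALANCE_GAP_RATIO is 100 and using made-up zone numbers.

#include <stdio.h>

#define DIV_ROUND_UP(n, d)              (((n) + (d) - 1) / (d))
#define KSWAPD_ZONE_BALANCE_GAP_RATIO   100     /* value assumed for illustration */

int main(void)
{
        unsigned long managed_pages = 262144;   /* ~1GB of 4K pages */
        unsigned long low_wmark = 1024;         /* made-up watermark */
        unsigned long old_gap = (managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO - 1) /
                                KSWAPD_ZONE_BALANCE_GAP_RATIO;
        unsigned long new_gap = DIV_ROUND_UP(managed_pages,
                                             KSWAPD_ZONE_BALANCE_GAP_RATIO);
        unsigned long balance_gap = low_wmark < new_gap ? low_wmark : new_gap;

        printf("old=%lu new=%lu balance_gap=%lu\n", old_gap, new_gap, balance_gap);
        return 0;
}
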
@@ -2525,10 +2565,17 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
2525 2565
2526 for (i = 0; i <= ZONE_NORMAL; i++) { 2566 for (i = 0; i <= ZONE_NORMAL; i++) {
2527 zone = &pgdat->node_zones[i]; 2567 zone = &pgdat->node_zones[i];
2568 if (!populated_zone(zone))
2569 continue;
2570
2528 pfmemalloc_reserve += min_wmark_pages(zone); 2571 pfmemalloc_reserve += min_wmark_pages(zone);
2529 free_pages += zone_page_state(zone, NR_FREE_PAGES); 2572 free_pages += zone_page_state(zone, NR_FREE_PAGES);
2530 } 2573 }
2531 2574
2575 /* If there are no reserves (unexpected config) then do not throttle */
2576 if (!pfmemalloc_reserve)
2577 return true;
2578
2532 wmark_ok = free_pages > pfmemalloc_reserve / 2; 2579 wmark_ok = free_pages > pfmemalloc_reserve / 2;
2533 2580
2534 /* kswapd must be awake if processes are being throttled */ 2581 /* kswapd must be awake if processes are being throttled */
@@ -2553,9 +2600,9 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
2553static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, 2600static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
2554 nodemask_t *nodemask) 2601 nodemask_t *nodemask)
2555{ 2602{
2603 struct zoneref *z;
2556 struct zone *zone; 2604 struct zone *zone;
2557 int high_zoneidx = gfp_zone(gfp_mask); 2605 pg_data_t *pgdat = NULL;
2558 pg_data_t *pgdat;
2559 2606
2560 /* 2607 /*
2561 * Kernel threads should not be throttled as they may be indirectly 2608 * Kernel threads should not be throttled as they may be indirectly
@@ -2574,10 +2621,34 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
2574 if (fatal_signal_pending(current)) 2621 if (fatal_signal_pending(current))
2575 goto out; 2622 goto out;
2576 2623
2577 /* Check if the pfmemalloc reserves are ok */ 2624 /*
2578 first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone); 2625 * Check if the pfmemalloc reserves are ok by finding the first node
2579 pgdat = zone->zone_pgdat; 2626 * with a usable ZONE_NORMAL or lower zone. The expectation is that
2580 if (pfmemalloc_watermark_ok(pgdat)) 2627 * GFP_KERNEL will be required for allocating network buffers when
2628 * swapping over the network so ZONE_HIGHMEM is unusable.
2629 *
2630 * Throttling is based on the first usable node and throttled processes
2631 * wait on a queue until kswapd makes progress and wakes them. There
2632 * is an affinity then between processes waking up and where reclaim
2633 * progress has been made assuming the process wakes on the same node.
2634 * More importantly, processes running on remote nodes will not compete
2635 * for remote pfmemalloc reserves and processes on different nodes
2636 * should make reasonable progress.
2637 */
2638 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2639 gfp_mask, nodemask) {
2640 if (zone_idx(zone) > ZONE_NORMAL)
2641 continue;
2642
2643 /* Throttle based on the first usable node */
2644 pgdat = zone->zone_pgdat;
2645 if (pfmemalloc_watermark_ok(pgdat))
2646 goto out;
2647 break;
2648 }
2649
2650 /* If no zone was usable by the allocation flags then do not throttle */
2651 if (!pgdat)
2581 goto out; 2652 goto out;
2582 2653
2583 /* Account for the throttling */ 2654 /* Account for the throttling */
@@ -2891,9 +2962,8 @@ static bool kswapd_shrink_zone(struct zone *zone,
2891 * high wmark plus a "gap" where the gap is either the low 2962 * high wmark plus a "gap" where the gap is either the low
2892 * watermark or 1% of the zone, whichever is smaller. 2963 * watermark or 1% of the zone, whichever is smaller.
2893 */ 2964 */
2894 balance_gap = min(low_wmark_pages(zone), 2965 balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
2895 (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / 2966 zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));
2896 KSWAPD_ZONE_BALANCE_GAP_RATIO);
2897 2967
2898 /* 2968 /*
2899 * If there is no low memory pressure or the zone is balanced then no 2969 * If there is no low memory pressure or the zone is balanced then no
@@ -3422,7 +3492,7 @@ int kswapd_run(int nid)
3422 3492
3423/* 3493/*
3424 * Called by memory hotplug when all memory in a node is offlined. Caller must 3494 * Called by memory hotplug when all memory in a node is offlined. Caller must
3425 * hold lock_memory_hotplug(). 3495 * hold mem_hotplug_begin/end().
3426 */ 3496 */
3427void kswapd_stop(int nid) 3497void kswapd_stop(int nid)
3428{ 3498{
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 302dd076b8bf..b37bd49bfd55 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -207,7 +207,9 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat,
207} 207}
208 208
209/* 209/*
210 * For use when we know that interrupts are disabled. 210 * For use when we know that interrupts are disabled,
211 * or when we know that preemption is disabled and that
212 * particular counter cannot be updated from interrupt context.
211 */ 213 */
212void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, 214void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
213 int delta) 215 int delta)
@@ -489,7 +491,7 @@ static void refresh_cpu_vm_stats(void)
489 continue; 491 continue;
490 492
491 if (__this_cpu_read(p->pcp.count)) 493 if (__this_cpu_read(p->pcp.count))
492 drain_zone_pages(zone, __this_cpu_ptr(&p->pcp)); 494 drain_zone_pages(zone, this_cpu_ptr(&p->pcp));
493#endif 495#endif
494 } 496 }
495 fold_diff(global_diff); 497 fold_diff(global_diff);
@@ -866,6 +868,10 @@ const char * const vmstat_text[] = {
866 "nr_tlb_local_flush_one", 868 "nr_tlb_local_flush_one",
867#endif /* CONFIG_DEBUG_TLBFLUSH */ 869#endif /* CONFIG_DEBUG_TLBFLUSH */
868 870
871#ifdef CONFIG_DEBUG_VM_VMACACHE
872 "vmacache_find_calls",
873 "vmacache_find_hits",
874#endif
869#endif /* CONFIG_VM_EVENTS_COUNTERS */ 875#endif /* CONFIG_VM_EVENTS_COUNTERS */
870}; 876};
871#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */ 877#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */
@@ -1226,7 +1232,7 @@ int sysctl_stat_interval __read_mostly = HZ;
1226static void vmstat_update(struct work_struct *w) 1232static void vmstat_update(struct work_struct *w)
1227{ 1233{
1228 refresh_cpu_vm_stats(); 1234 refresh_cpu_vm_stats();
1229 schedule_delayed_work(&__get_cpu_var(vmstat_work), 1235 schedule_delayed_work(this_cpu_ptr(&vmstat_work),
1230 round_jiffies_relative(sysctl_stat_interval)); 1236 round_jiffies_relative(sysctl_stat_interval));
1231} 1237}
1232 1238
diff --git a/mm/zbud.c b/mm/zbud.c
index 9451361e6aa7..01df13a7e2e1 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -247,7 +247,7 @@ void zbud_destroy_pool(struct zbud_pool *pool)
247 * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate 247 * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate
248 * a new page. 248 * a new page.
249 */ 249 */
250int zbud_alloc(struct zbud_pool *pool, int size, gfp_t gfp, 250int zbud_alloc(struct zbud_pool *pool, unsigned int size, gfp_t gfp,
251 unsigned long *handle) 251 unsigned long *handle)
252{ 252{
253 int chunks, i, freechunks; 253 int chunks, i, freechunks;
@@ -255,7 +255,7 @@ int zbud_alloc(struct zbud_pool *pool, int size, gfp_t gfp,
255 enum buddy bud; 255 enum buddy bud;
256 struct page *page; 256 struct page *page;
257 257
258 if (size <= 0 || gfp & __GFP_HIGHMEM) 258 if (!size || (gfp & __GFP_HIGHMEM))
259 return -EINVAL; 259 return -EINVAL;
260 if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE) 260 if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE)
261 return -ENOSPC; 261 return -ENOSPC;
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 36b4591a7a2d..fe78189624cf 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -141,7 +141,7 @@
141#define ZS_MAX_ALLOC_SIZE PAGE_SIZE 141#define ZS_MAX_ALLOC_SIZE PAGE_SIZE
142 142
143/* 143/*
144 * On systems with 4K page size, this gives 254 size classes! There is a 144 * On systems with 4K page size, this gives 255 size classes! There is a
145 * trade-off here: 145
146 * - Large number of size classes is potentially wasteful as free pages are 146
147 * spread across these classes 147 * spread across these classes
@@ -1082,7 +1082,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
1082 class = &pool->size_class[class_idx]; 1082 class = &pool->size_class[class_idx];
1083 off = obj_idx_to_offset(page, obj_idx, class->size); 1083 off = obj_idx_to_offset(page, obj_idx, class->size);
1084 1084
1085 area = &__get_cpu_var(zs_map_area); 1085 area = this_cpu_ptr(&zs_map_area);
1086 if (off + class->size <= PAGE_SIZE) 1086 if (off + class->size <= PAGE_SIZE)
1087 kunmap_atomic(area->vm_addr); 1087 kunmap_atomic(area->vm_addr);
1088 else { 1088 else {
diff --git a/mm/zswap.c b/mm/zswap.c
index aeaef0fb5624..008388fe7b0f 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -347,7 +347,7 @@ static int __zswap_cpu_notifier(unsigned long action, unsigned long cpu)
347 return NOTIFY_BAD; 347 return NOTIFY_BAD;
348 } 348 }
349 *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = tfm; 349 *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = tfm;
350 dst = kmalloc(PAGE_SIZE * 2, GFP_KERNEL); 350 dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
351 if (!dst) { 351 if (!dst) {
352 pr_err("can't allocate compressor buffer\n"); 352 pr_err("can't allocate compressor buffer\n");
353 crypto_free_comp(tfm); 353 crypto_free_comp(tfm);
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 34eb2160489d..010b18ef4ea0 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -24,6 +24,7 @@ my $emacs = 0;
24my $terse = 0; 24my $terse = 0;
25my $file = 0; 25my $file = 0;
26my $check = 0; 26my $check = 0;
27my $check_orig = 0;
27my $summary = 1; 28my $summary = 1;
28my $mailback = 0; 29my $mailback = 0;
29my $summary_file = 0; 30my $summary_file = 0;
@@ -146,6 +147,7 @@ GetOptions(
146help(0) if ($help); 147help(0) if ($help);
147 148
148$fix = 1 if ($fix_inplace); 149$fix = 1 if ($fix_inplace);
150$check_orig = $check;
149 151
150my $exit = 0; 152my $exit = 0;
151 153
@@ -397,6 +399,11 @@ foreach my $entry (@mode_permission_funcs) {
397 $mode_perms_search .= $entry->[0]; 399 $mode_perms_search .= $entry->[0];
398} 400}
399 401
402our $declaration_macros = qr{(?x:
403 (?:$Storage\s+)?(?:DECLARE|DEFINE)_[A-Z]+\s*\(|
404 (?:$Storage\s+)?LIST_HEAD\s*\(
405)};
406
400our $allowed_asm_includes = qr{(?x: 407our $allowed_asm_includes = qr{(?x:
401 irq| 408 irq|
402 memory 409 memory
@@ -1808,11 +1815,13 @@ sub process {
1808 $here = "#$linenr: " if (!$file); 1815 $here = "#$linenr: " if (!$file);
1809 $here = "#$realline: " if ($file); 1816 $here = "#$realline: " if ($file);
1810 1817
1818 my $found_file = 0;
1811 # extract the filename as it passes 1819 # extract the filename as it passes
1812 if ($line =~ /^diff --git.*?(\S+)$/) { 1820 if ($line =~ /^diff --git.*?(\S+)$/) {
1813 $realfile = $1; 1821 $realfile = $1;
1814 $realfile =~ s@^([^/]*)/@@ if (!$file); 1822 $realfile =~ s@^([^/]*)/@@ if (!$file);
1815 $in_commit_log = 0; 1823 $in_commit_log = 0;
1824 $found_file = 1;
1816 } elsif ($line =~ /^\+\+\+\s+(\S+)/) { 1825 } elsif ($line =~ /^\+\+\+\s+(\S+)/) {
1817 $realfile = $1; 1826 $realfile = $1;
1818 $realfile =~ s@^([^/]*)/@@ if (!$file); 1827 $realfile =~ s@^([^/]*)/@@ if (!$file);
@@ -1829,6 +1838,15 @@ sub process {
1829 ERROR("MODIFIED_INCLUDE_ASM", 1838 ERROR("MODIFIED_INCLUDE_ASM",
1830 "do not modify files in include/asm, change architecture specific files in include/asm-<architecture>\n" . "$here$rawline\n"); 1839 "do not modify files in include/asm, change architecture specific files in include/asm-<architecture>\n" . "$here$rawline\n");
1831 } 1840 }
1841 $found_file = 1;
1842 }
1843
1844 if ($found_file) {
1845 if ($realfile =~ m@^(drivers/net/|net/)@) {
1846 $check = 1;
1847 } else {
1848 $check = $check_orig;
1849 }
1832 next; 1850 next;
1833 } 1851 }
1834 1852
@@ -1926,6 +1944,12 @@ sub process {
1926 } 1944 }
1927 } 1945 }
1928 1946
1947# Check for old stable address
1948 if ($line =~ /^\s*cc:\s*.*<?\bstable\@kernel\.org\b>?.*$/i) {
1949 ERROR("STABLE_ADDRESS",
1950 "The 'stable' address should be 'stable\@vger.kernel.org'\n" . $herecurr);
1951 }
1952
1929# Check for unwanted Gerrit info 1953# Check for unwanted Gerrit info
1930 if ($in_commit_log && $line =~ /^\s*change-id:/i) { 1954 if ($in_commit_log && $line =~ /^\s*change-id:/i) {
1931 ERROR("GERRIT_CHANGE_ID", 1955 ERROR("GERRIT_CHANGE_ID",
@@ -2093,8 +2117,10 @@ sub process {
 
 		foreach my $compat (@compats) {
 			my $compat2 = $compat;
-			$compat2 =~ s/\,[a-z]*\-/\,<\.\*>\-/;
-			`grep -Erq "$compat|$compat2" $dt_path`;
+			$compat2 =~ s/\,[a-zA-Z0-9]*\-/\,<\.\*>\-/;
+			my $compat3 = $compat;
+			$compat3 =~ s/\,([a-z]*)[0-9]*\-/\,$1<\.\*>\-/;
+			`grep -Erq "$compat|$compat2|$compat3" $dt_path`;
 			if ( $? >> 8 ) {
 				WARN("UNDOCUMENTED_DT_STRING",
 				     "DT compatible string \"$compat\" appears un-documented -- check $dt_path\n" . $herecurr);
@@ -2266,18 +2292,37 @@ sub process {
 		}
 
 # check for missing blank lines after declarations
-		if ($realfile =~ m@^(drivers/net/|net/)@ &&
-		    $prevline =~ /^\+\s+$Declare\s+$Ident/ &&
-		    !($prevline =~ /(?:$Compare|$Assignment|$Operators)\s*$/ ||
-		      $prevline =~ /(?:\{\s*|\\)$/) &&		#extended lines
-		    $sline =~ /^\+\s+/ &&			#Not at char 1
-		    !($sline =~ /^\+\s+$Declare/ ||
-		      $sline =~ /^\+\s+$Ident\s+$Ident/ ||	#eg: typedef foo
+		if ($sline =~ /^\+\s+\S/ &&			#Not at char 1
+			# actual declarations
+		    ($prevline =~ /^\+\s+$Declare\s*$Ident\s*[=,;:\[]/ ||
+			# foo bar; where foo is some local typedef or #define
+		     $prevline =~ /^\+\s+$Ident(?:\s+|\s*\*\s*)$Ident\s*[=,;\[]/ ||
+			# known declaration macros
+		     $prevline =~ /^\+\s+$declaration_macros/) &&
+			# for "else if" which can look like "$Ident $Ident"
+		    !($prevline =~ /^\+\s+$c90_Keywords\b/ ||
+			# other possible extensions of declaration lines
+		      $prevline =~ /(?:$Compare|$Assignment|$Operators)\s*$/ ||
+			# not starting a section or a macro "\" extended line
+		      $prevline =~ /(?:\{\s*|\\)$/) &&
+			# looks like a declaration
+		    !($sline =~ /^\+\s+$Declare\s*$Ident\s*[=,;:\[]/ ||
+			# foo bar; where foo is some local typedef or #define
+		      $sline =~ /^\+\s+$Ident(?:\s+|\s*\*\s*)$Ident\s*[=,;\[]/ ||
+			# known declaration macros
+		      $sline =~ /^\+\s+$declaration_macros/ ||
+			# start of struct or union or enum
 		      $sline =~ /^\+\s+(?:union|struct|enum|typedef)\b/ ||
-		      $sline =~ /^\+\s+(?:$|[\{\}\.\#\"\?\:\(])/ ||
-		      $sline =~ /^\+\s+\(?\s*(?:$Compare|$Assignment|$Operators)/)) {
+			# start or end of block or continuation of declaration
+		      $sline =~ /^\+\s+(?:$|[\{\}\.\#\"\?\:\(\[])/ ||
+			# bitfield continuation
+		      $sline =~ /^\+\s+$Ident\s*:\s*\d+\s*[,;]/ ||
+			# other possible extensions of declaration lines
+		      $sline =~ /^\+\s+\(?\s*(?:$Compare|$Assignment|$Operators)/) &&
+			# indentation of previous and current line are the same
+		    (($prevline =~ /\+(\s+)\S/) && $sline =~ /^\+$1\S/)) {
 			WARN("SPACING",
-			     "networking uses a blank line after declarations\n" . $hereprev);
+			     "Missing a blank line after declarations\n" . $hereprev);
 		}
 
 # check for spaces at the beginning of a line.
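
(Illustration, not from the patch: the kind of C the reworked SPACING test now warns about in any file, where it previously applied only under drivers/net/ and net/. The declaration block runs straight into the first statement at the same indentation, so checkpatch asks for a blank line in between.)

    /* hypothetical example that triggers "Missing a blank line after declarations" */
    static int example_sum(const int *v, int n)
    {
            int i;
            int sum = 0;            /* last declaration... */
            for (i = 0; i < n; i++) /* ...statement follows with no blank line */
                    sum += v[i];
            return sum;
    }
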
@@ -3431,6 +3476,13 @@ sub process {
 			}
 		}
 
+# unnecessary return in a void function? (a single leading tab, then return;)
+		if ($sline =~ /^\+\treturn\s*;\s*$/ &&
+		    $prevline =~ /^\+/) {
+			WARN("RETURN_VOID",
+			     "void function return statements are not generally useful\n" . $herecurr);
+		}
+
 # if statements using unnecessary parentheses - ie: if ((foo == bar))
 		if ($^V && $^V ge 5.10.0 &&
 		    $line =~ /\bif\s*((?:\(\s*){2,})/) {
@@ -3782,6 +3834,17 @@ sub process {
 				WARN("DO_WHILE_MACRO_WITH_TRAILING_SEMICOLON",
 				     "do {} while (0) macros should not be semicolon terminated\n" . "$herectx");
 			}
+		} elsif ($dstat =~ /^\+\s*#\s*define\s+$Ident.*;\s*$/) {
+			$ctx =~ s/\n*$//;
+			my $cnt = statement_rawlines($ctx);
+			my $herectx = $here . "\n";
+
+			for (my $n = 0; $n < $cnt; $n++) {
+				$herectx .= raw_line($linenr, $n) . "\n";
+			}
+
+			WARN("TRAILING_SEMICOLON",
+			     "macros should not use a trailing semicolon\n" . "$herectx");
 		}
 	}
 
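
(Illustration, not from the patch: why the new TRAILING_SEMICOLON test exists. A #define whose body ends in ';' expands to an extra empty statement, which silently changes how a braceless if/else parses; the hypothetical macro below is the flagged form.)

    #include <stdio.h>

    #define RESET_COUNT(x)  ((x) = 0);      /* flagged: body ends in ';' */

    int main(void)
    {
            int count = 5;

            /*
             * With the trailing ';' in the macro, the natural
             *         if (count > 0)
             *                 RESET_COUNT(count);
             *         else
             *                 count = -1;
             * does not compile: the expansion "((count) = 0);;" ends the if
             * before the else. Dropping the ';' from the #define avoids that.
             */
            if (count > 0)
                    RESET_COUNT(count)      /* no ';' here, or it doubles up */
            printf("count = %d\n", count);
            return 0;
    }
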
@@ -4264,6 +4327,27 @@ sub process {
 			     "unchecked sscanf return value\n" . "$here\n$stat_real\n");
 		}
 
+# check for simple sscanf that should be kstrto<foo>
+		if ($^V && $^V ge 5.10.0 &&
+		    defined $stat &&
+		    $line =~ /\bsscanf\b/) {
+			my $lc = $stat =~ tr@\n@@;
+			$lc = $lc + $linenr;
+			my $stat_real = raw_line($linenr, 0);
+			for (my $count = $linenr + 1; $count <= $lc; $count++) {
+				$stat_real = $stat_real . "\n" . raw_line($count, 0);
+			}
+			if ($stat_real =~ /\bsscanf\b\s*\(\s*$FuncArg\s*,\s*("[^"]+")/) {
+				my $format = $6;
+				my $count = $format =~ tr@%@%@;
+				if ($count == 1 &&
+				    $format =~ /^"\%(?i:ll[udxi]|[udxi]ll|ll|[hl]h?[udxi]|[udxi][hl]h?|[hl]h?|[udxi])"$/) {
+					WARN("SSCANF_TO_KSTRTO",
+					     "Prefer kstrto<type> to single variable sscanf\n" . "$here\n$stat_real\n");
+				}
+			}
+		}
+
 # check for new externs in .h files.
 		if ($realfile =~ /\.h$/ &&
 		    $line =~ /^\+\s*(extern\s+)$Type\s*$Ident\s*\(/s) {
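
(Illustration, not from the patch: the conversion SSCANF_TO_KSTRTO points at, using a hypothetical parsing helper and assuming kernel code. A single-conversion sscanf("%u") does not reject overflow or trailing garbage, while kstrtouint() returns -errno for both.)

    #include <linux/errno.h>
    #include <linux/kernel.h>

    static int example_parse_threshold(const char *buf, unsigned int *val)
    {
            /* was: if (sscanf(buf, "%u", val) != 1) return -EINVAL; */
            return kstrtouint(buf, 10, val);        /* 0 on success, -errno on error */
    }
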
@@ -4328,6 +4412,30 @@ sub process {
 			    "Prefer $3(sizeof(*$1)...) over $3($4...)\n" . $herecurr);
 		}
 
+# check for k[mz]alloc with multiplies that could be kmalloc_array/kcalloc
+		if ($^V && $^V ge 5.10.0 &&
+		    $line =~ /\b($Lval)\s*\=\s*(?:$balanced_parens)?\s*(k[mz]alloc)\s*\(\s*($FuncArg)\s*\*\s*($FuncArg)/) {
+			my $oldfunc = $3;
+			my $a1 = $4;
+			my $a2 = $10;
+			my $newfunc = "kmalloc_array";
+			$newfunc = "kcalloc" if ($oldfunc eq "kzalloc");
+			if ($a1 =~ /^sizeof\s*\S/ || $a2 =~ /^sizeof\s*\S/) {
+				if (WARN("ALLOC_WITH_MULTIPLY",
+					 "Prefer $newfunc over $oldfunc with multiply\n" . $herecurr) &&
+				    $fix) {
+					my $r1 = $a1;
+					my $r2 = $a2;
+					if ($a1 =~ /^sizeof\s*\S/) {
+						$r1 = $a2;
+						$r2 = $a1;
+					}
+					$fixed[$linenr - 1] =~ s/\b($Lval)\s*\=\s*(?:$balanced_parens)?\s*(k[mz]alloc)\s*\(\s*($FuncArg)\s*\*\s*($FuncArg)/$1 . ' = ' . "$newfunc(" . trim($r1) . ', ' . trim($r2)/e;
+
+				}
+			}
+		}
+
 # check for krealloc arg reuse
 		if ($^V && $^V ge 5.10.0 &&
 		    $line =~ /\b($Lval)\s*\=\s*(?:$balanced_parens)?\s*krealloc\s*\(\s*\1\s*,/) {
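
(Illustration, not from the patch: the open-coded multiply ALLOC_WITH_MULTIPLY flags and the replacement it suggests, with a hypothetical structure and kernel code assumed. kcalloc() and kmalloc_array() check the n * size multiplication for overflow; the open-coded multiply does not.)

    #include <linux/slab.h>
    #include <linux/types.h>

    struct example_item {
            u64 a;
            u64 b;
    };

    static struct example_item *example_alloc_items(size_t n)
    {
            /* flagged:   kzalloc(n * sizeof(struct example_item), GFP_KERNEL) */
            /* suggested: */
            return kcalloc(n, sizeof(struct example_item), GFP_KERNEL);
    }
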
@@ -4443,10 +4551,10 @@ sub process {
 			     "$1 is obsolete, use k$3 instead\n" . $herecurr);
 		}
 
-# check for __initcall(), use device_initcall() explicitly please
+# check for __initcall(), use device_initcall() explicitly or more appropriate function please
 		if ($line =~ /^.\s*__initcall\s*\(/) {
 			WARN("USE_DEVICE_INITCALL",
-			     "please use device_initcall() instead of __initcall()\n" . $herecurr);
+			     "please use device_initcall() or more appropriate function instead of __initcall() (see include/linux/init.h)\n" . $herecurr);
 		}
 
 # check for various ops structs, ensure they are const.
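
(Illustration, not from the patch: what the reworded USE_DEVICE_INITCALL message asks for, with a hypothetical init function and kernel code assumed. Spelling out the level from include/linux/init.h documents when the initcall runs instead of hiding it behind the bare __initcall() alias.)

    #include <linux/init.h>

    static int __init example_init(void)
    {
            return 0;
    }
    /* was: __initcall(example_init); */
    device_initcall(example_init);
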
diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c
index 05654f5e48d5..c4d6d2e20e0d 100644
--- a/tools/vm/page-types.c
+++ b/tools/vm/page-types.c
@@ -32,6 +32,8 @@
 #include <assert.h>
 #include <ftw.h>
 #include <time.h>
+#include <setjmp.h>
+#include <signal.h>
 #include <sys/types.h>
 #include <sys/errno.h>
 #include <sys/fcntl.h>
@@ -824,21 +826,38 @@ static void show_file(const char *name, const struct stat *st)
 			atime, now - st->st_atime);
 }
 
+static sigjmp_buf sigbus_jmp;
+
+static void * volatile sigbus_addr;
+
+static void sigbus_handler(int sig, siginfo_t *info, void *ucontex)
+{
+	(void)sig;
+	(void)ucontex;
+	sigbus_addr = info ? info->si_addr : NULL;
+	siglongjmp(sigbus_jmp, 1);
+}
+
+static struct sigaction sigbus_action = {
+	.sa_sigaction = sigbus_handler,
+	.sa_flags = SA_SIGINFO,
+};
+
 static void walk_file(const char *name, const struct stat *st)
 {
 	uint8_t vec[PAGEMAP_BATCH];
 	uint64_t buf[PAGEMAP_BATCH], flags;
 	unsigned long nr_pages, pfn, i;
+	off_t off, end = st->st_size;
 	int fd;
-	off_t off;
 	ssize_t len;
 	void *ptr;
 	int first = 1;
 
 	fd = checked_open(name, O_RDONLY|O_NOATIME|O_NOFOLLOW);
 
-	for (off = 0; off < st->st_size; off += len) {
-		nr_pages = (st->st_size - off + page_size - 1) / page_size;
+	for (off = 0; off < end; off += len) {
+		nr_pages = (end - off + page_size - 1) / page_size;
 		if (nr_pages > PAGEMAP_BATCH)
 			nr_pages = PAGEMAP_BATCH;
 		len = nr_pages * page_size;
@@ -855,11 +874,19 @@ static void walk_file(const char *name, const struct stat *st)
 		if (madvise(ptr, len, MADV_RANDOM))
 			fatal("madvice failed: %s", name);
 
+		if (sigsetjmp(sigbus_jmp, 1)) {
+			end = off + (sigbus_addr ? sigbus_addr - ptr : 0);
+			fprintf(stderr, "got sigbus at offset %lld: %s\n",
+					(long long)end, name);
+			goto got_sigbus;
+		}
+
 		/* populate ptes */
 		for (i = 0; i < nr_pages ; i++) {
 			if (vec[i] & 1)
 				(void)*(volatile int *)(ptr + i * page_size);
 		}
+got_sigbus:
 
 		/* turn off harvesting reference bits */
 		if (madvise(ptr, len, MADV_SEQUENTIAL))
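
(Illustration, not from the patch: a self-contained userspace demo of the sigsetjmp()/SIGBUS pattern walk_file() now uses, with a hypothetical temp-file name. Touching a mapped page that lies beyond the end of the file raises SIGBUS; the handler longjmps back so the caller can stop the walk cleanly instead of crashing.)

    #include <setjmp.h>
    #include <signal.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/mman.h>
    #include <unistd.h>

    static sigjmp_buf sigbus_jmp;

    static void sigbus_handler(int sig, siginfo_t *info, void *ctx)
    {
            (void)sig; (void)info; (void)ctx;
            siglongjmp(sigbus_jmp, 1);
    }

    int main(void)
    {
            struct sigaction sa = {
                    .sa_sigaction = sigbus_handler,
                    .sa_flags = SA_SIGINFO,
            };
            long page = sysconf(_SC_PAGESIZE);
            char path[] = "/tmp/sigbus-demo-XXXXXX";
            int fd = mkstemp(path);
            char *p;

            if (fd < 0)
                    return 1;
            sigaction(SIGBUS, &sa, NULL);

            /* the file is empty, but map one whole page of it anyway */
            p = mmap(NULL, page, PROT_READ, MAP_SHARED, fd, 0);
            if (p == MAP_FAILED)
                    return 1;

            if (sigsetjmp(sigbus_jmp, 1)) {
                    printf("got SIGBUS, stopping the walk early\n");
            } else {
                    (void)*(volatile char *)p;      /* faults: page is past EOF */
                    printf("no SIGBUS (unexpected)\n");
            }

            munmap(p, page);
            close(fd);
            unlink(path);
            return 0;
    }
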
@@ -910,6 +937,7 @@ static void walk_page_cache(void)
 
 	kpageflags_fd = checked_open(PROC_KPAGEFLAGS, O_RDONLY);
 	pagemap_fd = checked_open("/proc/self/pagemap", O_RDONLY);
+	sigaction(SIGBUS, &sigbus_action, NULL);
 
 	if (stat(opt_file, &st))
 		fatal("stat failed: %s\n", opt_file);
@@ -925,6 +953,7 @@ static void walk_page_cache(void)
 
 	close(kpageflags_fd);
 	close(pagemap_fd);
+	signal(SIGBUS, SIG_DFL);
 }
 
 static void parse_file(const char *name)