aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/ABI/stable/sysfs-devices-node8
-rw-r--r--Documentation/ABI/testing/sysfs-block-zram32
-rw-r--r--Documentation/ABI/testing/sysfs-devices-memory8
-rw-r--r--Documentation/blockdev/zram.txt25
-rw-r--r--Documentation/kernel-parameters.txt17
-rw-r--r--Documentation/memory-hotplug.txt11
-rw-r--r--arch/alpha/include/asm/Kbuild1
-rw-r--r--arch/alpha/include/asm/sections.h7
-rw-r--r--arch/arm/Kconfig6
-rw-r--r--arch/arm/include/asm/pgtable-2level.h2
-rw-r--r--arch/arm/include/asm/pgtable-3level.h15
-rw-r--r--arch/arm/include/asm/pgtable.h6
-rw-r--r--arch/arm/include/asm/tlb.h38
-rw-r--r--arch/arm/kernel/hibernate.c3
-rw-r--r--arch/arm/mm/dma-mapping.c210
-rw-r--r--arch/arm/mm/flush.c15
-rw-r--r--arch/arm/mm/init.c2
-rw-r--r--arch/arm64/Kconfig5
-rw-r--r--arch/arm64/include/asm/pgtable.h21
-rw-r--r--arch/arm64/include/asm/tlb.h20
-rw-r--r--arch/arm64/mm/dma-mapping.c164
-rw-r--r--arch/arm64/mm/flush.c16
-rw-r--r--arch/cris/include/asm/Kbuild1
-rw-r--r--arch/cris/include/asm/sections.h7
-rw-r--r--arch/frv/include/asm/processor.h16
-rw-r--r--arch/frv/kernel/irq-mb93091.c8
-rw-r--r--arch/frv/kernel/irq-mb93093.c1
-rw-r--r--arch/frv/kernel/irq-mb93493.c4
-rw-r--r--arch/frv/kernel/setup.c2
-rw-r--r--arch/frv/kernel/time.c1
-rw-r--r--arch/m32r/include/asm/Kbuild1
-rw-r--r--arch/m32r/include/asm/sections.h7
-rw-r--r--arch/m32r/kernel/time.c1
-rw-r--r--arch/m68k/kernel/sys_m68k.c21
-rw-r--r--arch/mips/include/asm/suspend.h7
-rw-r--r--arch/mips/power/cpu.c2
-rw-r--r--arch/mn10300/include/asm/Kbuild1
-rw-r--r--arch/mn10300/include/asm/sections.h1
-rw-r--r--arch/powerpc/include/asm/pgtable.h57
-rw-r--r--arch/powerpc/include/asm/pte-common.h5
-rw-r--r--arch/powerpc/kernel/suspend.c4
-rw-r--r--arch/s390/kernel/suspend.c6
-rw-r--r--arch/score/include/asm/Kbuild1
-rw-r--r--arch/score/include/asm/sections.h6
-rw-r--r--arch/sh/include/asm/sections.h1
-rw-r--r--arch/sparc/power/hibernate.c4
-rw-r--r--arch/unicore32/include/mach/pm.h3
-rw-r--r--arch/unicore32/kernel/hibernate.c1
-rw-r--r--arch/x86/Kconfig1
-rw-r--r--arch/x86/include/asm/pgtable_types.h14
-rw-r--r--arch/x86/power/hibernate_32.c4
-rw-r--r--arch/x86/power/hibernate_64.c4
-rw-r--r--drivers/base/Kconfig3
-rw-r--r--drivers/base/dma-mapping.c72
-rw-r--r--drivers/base/memory.c42
-rw-r--r--drivers/base/node.c3
-rw-r--r--drivers/block/zram/zram_drv.c106
-rw-r--r--drivers/block/zram/zram_drv.h6
-rw-r--r--drivers/firmware/memmap.c3
-rw-r--r--drivers/virtio/Kconfig1
-rw-r--r--drivers/virtio/virtio_balloon.c76
-rw-r--r--fs/block_dev.c7
-rw-r--r--fs/buffer.c28
-rw-r--r--fs/internal.h5
-rw-r--r--fs/mpage.c2
-rw-r--r--fs/notify/fanotify/fanotify_user.c2
-rw-r--r--fs/notify/fsnotify.h3
-rw-r--r--fs/notify/group.c2
-rw-r--r--fs/notify/inotify/inotify_fsnotify.c6
-rw-r--r--fs/ntfs/debug.c2
-rw-r--r--fs/ntfs/file.c5
-rw-r--r--fs/ntfs/super.c2
-rw-r--r--fs/ocfs2/aops.c15
-rw-r--r--fs/ocfs2/cluster/heartbeat.c19
-rw-r--r--fs/ocfs2/cluster/heartbeat.h1
-rw-r--r--fs/ocfs2/cluster/netdebug.c78
-rw-r--r--fs/ocfs2/cluster/tcp.c43
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c39
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c44
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c3
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c7
-rw-r--r--fs/ocfs2/dlmglue.c23
-rw-r--r--fs/ocfs2/file.c47
-rw-r--r--fs/ocfs2/inode.h2
-rw-r--r--fs/ocfs2/move_extents.c2
-rw-r--r--fs/ocfs2/stack_user.c2
-rw-r--r--fs/proc/base.c36
-rw-r--r--fs/proc/internal.h5
-rw-r--r--fs/proc/kcore.c4
-rw-r--r--fs/proc/page.c3
-rw-r--r--fs/proc/task_mmu.c332
-rw-r--r--fs/proc/task_nommu.c88
-rw-r--r--include/asm-generic/dma-mapping-common.h9
-rw-r--r--include/asm-generic/pgtable.h27
-rw-r--r--include/asm-generic/sections.h4
-rw-r--r--include/linux/balloon_compaction.h169
-rw-r--r--include/linux/blkdev.h2
-rw-r--r--include/linux/compaction.h24
-rw-r--r--include/linux/genalloc.h7
-rw-r--r--include/linux/gfp.h2
-rw-r--r--include/linux/huge_mm.h2
-rw-r--r--include/linux/kernel.h56
-rw-r--r--include/linux/memcontrol.h15
-rw-r--r--include/linux/memory_hotplug.h1
-rw-r--r--include/linux/mempolicy.h7
-rw-r--r--include/linux/migrate.h14
-rw-r--r--include/linux/mm.h38
-rw-r--r--include/linux/mmdebug.h20
-rw-r--r--include/linux/mmzone.h51
-rw-r--r--include/linux/pagemap.h18
-rw-r--r--include/linux/rmap.h2
-rw-r--r--include/linux/sched.h6
-rw-r--r--include/linux/screen_info.h8
-rw-r--r--include/linux/slab.h64
-rw-r--r--include/linux/slab_def.h20
-rw-r--r--include/linux/swap.h22
-rw-r--r--include/linux/topology.h17
-rw-r--r--include/linux/vm_event_item.h7
-rw-r--r--include/linux/zsmalloc.h2
-rw-r--r--include/uapi/linux/kernel-page-flags.h1
-rw-r--r--include/uapi/linux/prctl.h27
-rw-r--r--init/Kconfig11
-rw-r--r--kernel/acct.c14
-rw-r--r--kernel/async.c8
-rw-r--r--kernel/fork.c3
-rw-r--r--kernel/kthread.c2
-rw-r--r--kernel/sched/fair.c2
-rw-r--r--kernel/sys.c489
-rw-r--r--kernel/sysctl.c7
-rw-r--r--kernel/watchdog.c18
-rw-r--r--lib/genalloc.c49
-rw-r--r--mm/Kconfig10
-rw-r--r--mm/Makefile5
-rw-r--r--mm/backing-dev.c2
-rw-r--r--mm/balloon_compaction.c123
-rw-r--r--mm/bootmem.c4
-rw-r--r--mm/cma.c21
-rw-r--r--mm/compaction.c674
-rw-r--r--mm/debug.c237
-rw-r--r--mm/dmapool.c58
-rw-r--r--mm/filemap.c4
-rw-r--r--mm/gup.c354
-rw-r--r--mm/huge_memory.c28
-rw-r--r--mm/hugetlb.c14
-rw-r--r--mm/internal.h26
-rw-r--r--mm/interval_tree.c2
-rw-r--r--mm/kmemcheck.c1
-rw-r--r--mm/ksm.c4
-rw-r--r--mm/memcontrol.c282
-rw-r--r--mm/memory_hotplug.c2
-rw-r--r--mm/mempolicy.c134
-rw-r--r--mm/migrate.c16
-rw-r--r--mm/mlock.c6
-rw-r--r--mm/mmap.c74
-rw-r--r--mm/mremap.c5
-rw-r--r--mm/oom_kill.c6
-rw-r--r--mm/page-writeback.c8
-rw-r--r--mm/page_alloc.c348
-rw-r--r--mm/pagewalk.c2
-rw-r--r--mm/rmap.c8
-rw-r--r--mm/shmem.c2
-rw-r--r--mm/slab.c349
-rw-r--r--mm/slab.h57
-rw-r--r--mm/slab_common.c178
-rw-r--r--mm/slob.c2
-rw-r--r--mm/slub.c126
-rw-r--r--mm/swap.c30
-rw-r--r--mm/swap_state.c16
-rw-r--r--mm/util.c23
-rw-r--r--mm/vmalloc.c20
-rw-r--r--mm/vmscan.c112
-rw-r--r--mm/vmstat.c153
-rw-r--r--mm/zbud.c13
-rw-r--r--mm/zsmalloc.c46
-rw-r--r--tools/testing/selftests/vm/Makefile1
-rw-r--r--tools/testing/selftests/vm/transhuge-stress.c144
-rw-r--r--tools/vm/page-types.c1
177 files changed, 4103 insertions, 2866 deletions
diff --git a/Documentation/ABI/stable/sysfs-devices-node b/Documentation/ABI/stable/sysfs-devices-node
index ce259c13c36a..5b2d0f08867c 100644
--- a/Documentation/ABI/stable/sysfs-devices-node
+++ b/Documentation/ABI/stable/sysfs-devices-node
@@ -85,14 +85,6 @@ Description:
85 will be compacted. When it completes, memory will be freed 85 will be compacted. When it completes, memory will be freed
86 into blocks which have as many contiguous pages as possible 86 into blocks which have as many contiguous pages as possible
87 87
88What: /sys/devices/system/node/nodeX/scan_unevictable_pages
89Date: October 2008
90Contact: Lee Schermerhorn <lee.schermerhorn@hp.com>
91Description:
92 When set, it triggers scanning the node's unevictable lists
93 and move any pages that have become evictable onto the respective
94 zone's inactive list. See mm/vmscan.c
95
96What: /sys/devices/system/node/nodeX/hugepages/hugepages-<size>/ 88What: /sys/devices/system/node/nodeX/hugepages/hugepages-<size>/
97Date: December 2009 89Date: December 2009
98Contact: Lee Schermerhorn <lee.schermerhorn@hp.com> 90Contact: Lee Schermerhorn <lee.schermerhorn@hp.com>
diff --git a/Documentation/ABI/testing/sysfs-block-zram b/Documentation/ABI/testing/sysfs-block-zram
index 70ec992514d0..a6148eaf91e5 100644
--- a/Documentation/ABI/testing/sysfs-block-zram
+++ b/Documentation/ABI/testing/sysfs-block-zram
@@ -77,11 +77,14 @@ What: /sys/block/zram<id>/notify_free
77Date: August 2010 77Date: August 2010
78Contact: Nitin Gupta <ngupta@vflare.org> 78Contact: Nitin Gupta <ngupta@vflare.org>
79Description: 79Description:
80 The notify_free file is read-only and specifies the number of 80 The notify_free file is read-only. Depending on device usage
81 swap slot free notifications received by this device. These 81 scenario it may account a) the number of pages freed because
82 notifications are sent to a swap block device when a swap slot 82 of swap slot free notifications or b) the number of pages freed
83 is freed. This statistic is applicable only when this disk is 83 because of REQ_DISCARD requests sent by bio. The former ones
84 being used as a swap disk. 84 are sent to a swap block device when a swap slot is freed, which
85 implies that this disk is being used as a swap disk. The latter
86 ones are sent by filesystem mounted with discard option,
87 whenever some data blocks are getting discarded.
85 88
86What: /sys/block/zram<id>/zero_pages 89What: /sys/block/zram<id>/zero_pages
87Date: August 2010 90Date: August 2010
@@ -119,3 +122,22 @@ Description:
119 efficiency can be calculated using compr_data_size and this 122 efficiency can be calculated using compr_data_size and this
120 statistic. 123 statistic.
121 Unit: bytes 124 Unit: bytes
125
126What: /sys/block/zram<id>/mem_used_max
127Date: August 2014
128Contact: Minchan Kim <minchan@kernel.org>
129Description:
130 The mem_used_max file is read/write and specifies the amount
131 of maximum memory zram have consumed to store compressed data.
132 For resetting the value, you should write "0". Otherwise,
133 you could see -EINVAL.
134 Unit: bytes
135
136What: /sys/block/zram<id>/mem_limit
137Date: August 2014
138Contact: Minchan Kim <minchan@kernel.org>
139Description:
140 The mem_limit file is read/write and specifies the maximum
141 amount of memory ZRAM can use to store the compressed data. The
142 limit could be changed in run time and "0" means disable the
143 limit. No limit is the initial state. Unit: bytes
diff --git a/Documentation/ABI/testing/sysfs-devices-memory b/Documentation/ABI/testing/sysfs-devices-memory
index 7405de26ee60..deef3b5723cf 100644
--- a/Documentation/ABI/testing/sysfs-devices-memory
+++ b/Documentation/ABI/testing/sysfs-devices-memory
@@ -61,6 +61,14 @@ Users: hotplug memory remove tools
61 http://www.ibm.com/developerworks/wikis/display/LinuxP/powerpc-utils 61 http://www.ibm.com/developerworks/wikis/display/LinuxP/powerpc-utils
62 62
63 63
64What: /sys/devices/system/memory/memoryX/valid_zones
65Date: July 2014
66Contact: Zhang Zhen <zhenzhang.zhang@huawei.com>
67Description:
68 The file /sys/devices/system/memory/memoryX/valid_zones is
69 read-only and is designed to show which zone this memory
70 block can be onlined to.
71
64What: /sys/devices/system/memoryX/nodeY 72What: /sys/devices/system/memoryX/nodeY
65Date: October 2009 73Date: October 2009
66Contact: Linux Memory Management list <linux-mm@kvack.org> 74Contact: Linux Memory Management list <linux-mm@kvack.org>
diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt
index 0595c3f56ccf..7fcf9c6592ec 100644
--- a/Documentation/blockdev/zram.txt
+++ b/Documentation/blockdev/zram.txt
@@ -74,14 +74,30 @@ There is little point creating a zram of greater than twice the size of memory
74since we expect a 2:1 compression ratio. Note that zram uses about 0.1% of the 74since we expect a 2:1 compression ratio. Note that zram uses about 0.1% of the
75size of the disk when not in use so a huge zram is wasteful. 75size of the disk when not in use so a huge zram is wasteful.
76 76
775) Activate: 775) Set memory limit: Optional
78 Set memory limit by writing the value to sysfs node 'mem_limit'.
79 The value can be either in bytes or you can use mem suffixes.
80 In addition, you could change the value in runtime.
81 Examples:
82 # limit /dev/zram0 with 50MB memory
83 echo $((50*1024*1024)) > /sys/block/zram0/mem_limit
84
85 # Using mem suffixes
86 echo 256K > /sys/block/zram0/mem_limit
87 echo 512M > /sys/block/zram0/mem_limit
88 echo 1G > /sys/block/zram0/mem_limit
89
90 # To disable memory limit
91 echo 0 > /sys/block/zram0/mem_limit
92
936) Activate:
78 mkswap /dev/zram0 94 mkswap /dev/zram0
79 swapon /dev/zram0 95 swapon /dev/zram0
80 96
81 mkfs.ext4 /dev/zram1 97 mkfs.ext4 /dev/zram1
82 mount /dev/zram1 /tmp 98 mount /dev/zram1 /tmp
83 99
846) Stats: 1007) Stats:
85 Per-device statistics are exported as various nodes under 101 Per-device statistics are exported as various nodes under
86 /sys/block/zram<id>/ 102 /sys/block/zram<id>/
87 disksize 103 disksize
@@ -95,12 +111,13 @@ size of the disk when not in use so a huge zram is wasteful.
95 orig_data_size 111 orig_data_size
96 compr_data_size 112 compr_data_size
97 mem_used_total 113 mem_used_total
114 mem_used_max
98 115
997) Deactivate: 1168) Deactivate:
100 swapoff /dev/zram0 117 swapoff /dev/zram0
101 umount /dev/zram1 118 umount /dev/zram1
102 119
1038) Reset: 1209) Reset:
104 Write any positive value to 'reset' sysfs node 121 Write any positive value to 'reset' sysfs node
105 echo 1 > /sys/block/zram0/reset 122 echo 1 > /sys/block/zram0/reset
106 echo 1 > /sys/block/zram1/reset 123 echo 1 > /sys/block/zram1/reset
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index cc4ab2517abc..41f7ec1fcf61 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -656,7 +656,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
656 Sets the size of kernel global memory area for 656 Sets the size of kernel global memory area for
657 contiguous memory allocations and optionally the 657 contiguous memory allocations and optionally the
658 placement constraint by the physical address range of 658 placement constraint by the physical address range of
659 memory allocations. For more information, see 659 memory allocations. A value of 0 disables CMA
660 altogether. For more information, see
660 include/linux/dma-contiguous.h 661 include/linux/dma-contiguous.h
661 662
662 cmo_free_hint= [PPC] Format: { yes | no } 663 cmo_free_hint= [PPC] Format: { yes | no }
@@ -3158,6 +3159,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
3158 3159
3159 slram= [HW,MTD] 3160 slram= [HW,MTD]
3160 3161
3162 slab_nomerge [MM]
3163 Disable merging of slabs with similar size. May be
3164 necessary if there is some reason to distinguish
3165 allocs to different slabs. Debug options disable
3166 merging on their own.
3167 For more information see Documentation/vm/slub.txt.
3168
3161 slab_max_order= [MM, SLAB] 3169 slab_max_order= [MM, SLAB]
3162 Determines the maximum allowed order for slabs. 3170 Determines the maximum allowed order for slabs.
3163 A high setting may cause OOMs due to memory 3171 A high setting may cause OOMs due to memory
@@ -3193,11 +3201,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
3193 For more information see Documentation/vm/slub.txt. 3201 For more information see Documentation/vm/slub.txt.
3194 3202
3195 slub_nomerge [MM, SLUB] 3203 slub_nomerge [MM, SLUB]
3196 Disable merging of slabs with similar size. May be 3204 Same with slab_nomerge. This is supported for legacy.
3197 necessary if there is some reason to distinguish 3205 See slab_nomerge for more information.
3198 allocs to different slabs. Debug options disable
3199 merging on their own.
3200 For more information see Documentation/vm/slub.txt.
3201 3206
3202 smart2= [HW] 3207 smart2= [HW]
3203 Format: <io1>[,<io2>[,...,<io8>]] 3208 Format: <io1>[,<io2>[,...,<io8>]]
diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt
index 45134dc23854..ea03abfc97e9 100644
--- a/Documentation/memory-hotplug.txt
+++ b/Documentation/memory-hotplug.txt
@@ -155,6 +155,7 @@ Under each memory block, you can see 4 files:
155/sys/devices/system/memory/memoryXXX/phys_device 155/sys/devices/system/memory/memoryXXX/phys_device
156/sys/devices/system/memory/memoryXXX/state 156/sys/devices/system/memory/memoryXXX/state
157/sys/devices/system/memory/memoryXXX/removable 157/sys/devices/system/memory/memoryXXX/removable
158/sys/devices/system/memory/memoryXXX/valid_zones
158 159
159'phys_index' : read-only and contains memory block id, same as XXX. 160'phys_index' : read-only and contains memory block id, same as XXX.
160'state' : read-write 161'state' : read-write
@@ -170,6 +171,15 @@ Under each memory block, you can see 4 files:
170 block is removable and a value of 0 indicates that 171 block is removable and a value of 0 indicates that
171 it is not removable. A memory block is removable only if 172 it is not removable. A memory block is removable only if
172 every section in the block is removable. 173 every section in the block is removable.
174'valid_zones' : read-only: designed to show which zones this memory block
175 can be onlined to.
176 The first column shows it's default zone.
177 "memory6/valid_zones: Normal Movable" shows this memoryblock
178 can be onlined to ZONE_NORMAL by default and to ZONE_MOVABLE
179 by online_movable.
180 "memory7/valid_zones: Movable Normal" shows this memoryblock
181 can be onlined to ZONE_MOVABLE by default and to ZONE_NORMAL
182 by online_kernel.
173 183
174NOTE: 184NOTE:
175 These directories/files appear after physical memory hotplug phase. 185 These directories/files appear after physical memory hotplug phase.
@@ -408,7 +418,6 @@ node if necessary.
408 - allowing memory hot-add to ZONE_MOVABLE. maybe we need some switch like 418 - allowing memory hot-add to ZONE_MOVABLE. maybe we need some switch like
409 sysctl or new control file. 419 sysctl or new control file.
410 - showing memory block and physical device relationship. 420 - showing memory block and physical device relationship.
411 - showing memory block is under ZONE_MOVABLE or not
412 - test and make it better memory offlining. 421 - test and make it better memory offlining.
413 - support HugeTLB page migration and offlining. 422 - support HugeTLB page migration and offlining.
414 - memmap removing at memory offline. 423 - memmap removing at memory offline.
diff --git a/arch/alpha/include/asm/Kbuild b/arch/alpha/include/asm/Kbuild
index a52cbf178c3a..25b49725df07 100644
--- a/arch/alpha/include/asm/Kbuild
+++ b/arch/alpha/include/asm/Kbuild
@@ -8,4 +8,5 @@ generic-y += irq_work.h
8generic-y += mcs_spinlock.h 8generic-y += mcs_spinlock.h
9generic-y += preempt.h 9generic-y += preempt.h
10generic-y += scatterlist.h 10generic-y += scatterlist.h
11generic-y += sections.h
11generic-y += trace_clock.h 12generic-y += trace_clock.h
diff --git a/arch/alpha/include/asm/sections.h b/arch/alpha/include/asm/sections.h
deleted file mode 100644
index 43b40edd6e44..000000000000
--- a/arch/alpha/include/asm/sections.h
+++ /dev/null
@@ -1,7 +0,0 @@
1#ifndef _ALPHA_SECTIONS_H
2#define _ALPHA_SECTIONS_H
3
4/* nothing to see, move along */
5#include <asm-generic/sections.h>
6
7#endif
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index d9d32de9628c..18f392f8b744 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -14,6 +14,7 @@ config ARM
14 select CLONE_BACKWARDS 14 select CLONE_BACKWARDS
15 select CPU_PM if (SUSPEND || CPU_IDLE) 15 select CPU_PM if (SUSPEND || CPU_IDLE)
16 select DCACHE_WORD_ACCESS if HAVE_EFFICIENT_UNALIGNED_ACCESS 16 select DCACHE_WORD_ACCESS if HAVE_EFFICIENT_UNALIGNED_ACCESS
17 select GENERIC_ALLOCATOR
17 select GENERIC_ATOMIC64 if (CPU_V7M || CPU_V6 || !CPU_32v6K || !AEABI) 18 select GENERIC_ATOMIC64 if (CPU_V7M || CPU_V6 || !CPU_32v6K || !AEABI)
18 select GENERIC_CLOCKEVENTS_BROADCAST if SMP 19 select GENERIC_CLOCKEVENTS_BROADCAST if SMP
19 select GENERIC_IDLE_POLL_SETUP 20 select GENERIC_IDLE_POLL_SETUP
@@ -61,6 +62,7 @@ config ARM
61 select HAVE_PERF_EVENTS 62 select HAVE_PERF_EVENTS
62 select HAVE_PERF_REGS 63 select HAVE_PERF_REGS
63 select HAVE_PERF_USER_STACK_DUMP 64 select HAVE_PERF_USER_STACK_DUMP
65 select HAVE_RCU_TABLE_FREE if (SMP && ARM_LPAE)
64 select HAVE_REGS_AND_STACK_ACCESS_API 66 select HAVE_REGS_AND_STACK_ACCESS_API
65 select HAVE_SYSCALL_TRACEPOINTS 67 select HAVE_SYSCALL_TRACEPOINTS
66 select HAVE_UID16 68 select HAVE_UID16
@@ -1659,6 +1661,10 @@ config ARCH_SELECT_MEMORY_MODEL
1659config HAVE_ARCH_PFN_VALID 1661config HAVE_ARCH_PFN_VALID
1660 def_bool ARCH_HAS_HOLES_MEMORYMODEL || !SPARSEMEM 1662 def_bool ARCH_HAS_HOLES_MEMORYMODEL || !SPARSEMEM
1661 1663
1664config HAVE_GENERIC_RCU_GUP
1665 def_bool y
1666 depends on ARM_LPAE
1667
1662config HIGHMEM 1668config HIGHMEM
1663 bool "High Memory Support" 1669 bool "High Memory Support"
1664 depends on MMU 1670 depends on MMU
diff --git a/arch/arm/include/asm/pgtable-2level.h b/arch/arm/include/asm/pgtable-2level.h
index 219ac88a9542..f0279411847d 100644
--- a/arch/arm/include/asm/pgtable-2level.h
+++ b/arch/arm/include/asm/pgtable-2level.h
@@ -182,6 +182,8 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr)
182#define pmd_addr_end(addr,end) (end) 182#define pmd_addr_end(addr,end) (end)
183 183
184#define set_pte_ext(ptep,pte,ext) cpu_set_pte_ext(ptep,pte,ext) 184#define set_pte_ext(ptep,pte,ext) cpu_set_pte_ext(ptep,pte,ext)
185#define pte_special(pte) (0)
186static inline pte_t pte_mkspecial(pte_t pte) { return pte; }
185 187
186/* 188/*
187 * We don't have huge page support for short descriptors, for the moment 189 * We don't have huge page support for short descriptors, for the moment
diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h
index 06e0bc0f8b00..a31ecdad4b59 100644
--- a/arch/arm/include/asm/pgtable-3level.h
+++ b/arch/arm/include/asm/pgtable-3level.h
@@ -213,10 +213,19 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr)
213#define pmd_isclear(pmd, val) (!(pmd_val(pmd) & (val))) 213#define pmd_isclear(pmd, val) (!(pmd_val(pmd) & (val)))
214 214
215#define pmd_young(pmd) (pmd_isset((pmd), PMD_SECT_AF)) 215#define pmd_young(pmd) (pmd_isset((pmd), PMD_SECT_AF))
216#define pte_special(pte) (pte_isset((pte), L_PTE_SPECIAL))
217static inline pte_t pte_mkspecial(pte_t pte)
218{
219 pte_val(pte) |= L_PTE_SPECIAL;
220 return pte;
221}
222#define __HAVE_ARCH_PTE_SPECIAL
216 223
217#define __HAVE_ARCH_PMD_WRITE 224#define __HAVE_ARCH_PMD_WRITE
218#define pmd_write(pmd) (pmd_isclear((pmd), L_PMD_SECT_RDONLY)) 225#define pmd_write(pmd) (pmd_isclear((pmd), L_PMD_SECT_RDONLY))
219#define pmd_dirty(pmd) (pmd_isset((pmd), L_PMD_SECT_DIRTY)) 226#define pmd_dirty(pmd) (pmd_isset((pmd), L_PMD_SECT_DIRTY))
227#define pud_page(pud) pmd_page(__pmd(pud_val(pud)))
228#define pud_write(pud) pmd_write(__pmd(pud_val(pud)))
220 229
221#define pmd_hugewillfault(pmd) (!pmd_young(pmd) || !pmd_write(pmd)) 230#define pmd_hugewillfault(pmd) (!pmd_young(pmd) || !pmd_write(pmd))
222#define pmd_thp_or_huge(pmd) (pmd_huge(pmd) || pmd_trans_huge(pmd)) 231#define pmd_thp_or_huge(pmd) (pmd_huge(pmd) || pmd_trans_huge(pmd))
@@ -224,6 +233,12 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr)
224#ifdef CONFIG_TRANSPARENT_HUGEPAGE 233#ifdef CONFIG_TRANSPARENT_HUGEPAGE
225#define pmd_trans_huge(pmd) (pmd_val(pmd) && !pmd_table(pmd)) 234#define pmd_trans_huge(pmd) (pmd_val(pmd) && !pmd_table(pmd))
226#define pmd_trans_splitting(pmd) (pmd_isset((pmd), L_PMD_SECT_SPLITTING)) 235#define pmd_trans_splitting(pmd) (pmd_isset((pmd), L_PMD_SECT_SPLITTING))
236
237#ifdef CONFIG_HAVE_RCU_TABLE_FREE
238#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
239void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
240 pmd_t *pmdp);
241#endif
227#endif 242#endif
228 243
229#define PMD_BIT_FUNC(fn,op) \ 244#define PMD_BIT_FUNC(fn,op) \
diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h
index 01baef07cd0c..90aa4583b308 100644
--- a/arch/arm/include/asm/pgtable.h
+++ b/arch/arm/include/asm/pgtable.h
@@ -226,7 +226,6 @@ static inline pte_t *pmd_page_vaddr(pmd_t pmd)
226#define pte_dirty(pte) (pte_isset((pte), L_PTE_DIRTY)) 226#define pte_dirty(pte) (pte_isset((pte), L_PTE_DIRTY))
227#define pte_young(pte) (pte_isset((pte), L_PTE_YOUNG)) 227#define pte_young(pte) (pte_isset((pte), L_PTE_YOUNG))
228#define pte_exec(pte) (pte_isclear((pte), L_PTE_XN)) 228#define pte_exec(pte) (pte_isclear((pte), L_PTE_XN))
229#define pte_special(pte) (0)
230 229
231#define pte_valid_user(pte) \ 230#define pte_valid_user(pte) \
232 (pte_valid(pte) && pte_isset((pte), L_PTE_USER) && pte_young(pte)) 231 (pte_valid(pte) && pte_isset((pte), L_PTE_USER) && pte_young(pte))
@@ -245,7 +244,8 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
245 unsigned long ext = 0; 244 unsigned long ext = 0;
246 245
247 if (addr < TASK_SIZE && pte_valid_user(pteval)) { 246 if (addr < TASK_SIZE && pte_valid_user(pteval)) {
248 __sync_icache_dcache(pteval); 247 if (!pte_special(pteval))
248 __sync_icache_dcache(pteval);
249 ext |= PTE_EXT_NG; 249 ext |= PTE_EXT_NG;
250 } 250 }
251 251
@@ -264,8 +264,6 @@ PTE_BIT_FUNC(mkyoung, |= L_PTE_YOUNG);
264PTE_BIT_FUNC(mkexec, &= ~L_PTE_XN); 264PTE_BIT_FUNC(mkexec, &= ~L_PTE_XN);
265PTE_BIT_FUNC(mknexec, |= L_PTE_XN); 265PTE_BIT_FUNC(mknexec, |= L_PTE_XN);
266 266
267static inline pte_t pte_mkspecial(pte_t pte) { return pte; }
268
269static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) 267static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
270{ 268{
271 const pteval_t mask = L_PTE_XN | L_PTE_RDONLY | L_PTE_USER | 269 const pteval_t mask = L_PTE_XN | L_PTE_RDONLY | L_PTE_USER |
diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h
index f1a0dace3efe..3cadb726ec88 100644
--- a/arch/arm/include/asm/tlb.h
+++ b/arch/arm/include/asm/tlb.h
@@ -35,12 +35,39 @@
35 35
36#define MMU_GATHER_BUNDLE 8 36#define MMU_GATHER_BUNDLE 8
37 37
38#ifdef CONFIG_HAVE_RCU_TABLE_FREE
39static inline void __tlb_remove_table(void *_table)
40{
41 free_page_and_swap_cache((struct page *)_table);
42}
43
44struct mmu_table_batch {
45 struct rcu_head rcu;
46 unsigned int nr;
47 void *tables[0];
48};
49
50#define MAX_TABLE_BATCH \
51 ((PAGE_SIZE - sizeof(struct mmu_table_batch)) / sizeof(void *))
52
53extern void tlb_table_flush(struct mmu_gather *tlb);
54extern void tlb_remove_table(struct mmu_gather *tlb, void *table);
55
56#define tlb_remove_entry(tlb, entry) tlb_remove_table(tlb, entry)
57#else
58#define tlb_remove_entry(tlb, entry) tlb_remove_page(tlb, entry)
59#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
60
38/* 61/*
39 * TLB handling. This allows us to remove pages from the page 62 * TLB handling. This allows us to remove pages from the page
40 * tables, and efficiently handle the TLB issues. 63 * tables, and efficiently handle the TLB issues.
41 */ 64 */
42struct mmu_gather { 65struct mmu_gather {
43 struct mm_struct *mm; 66 struct mm_struct *mm;
67#ifdef CONFIG_HAVE_RCU_TABLE_FREE
68 struct mmu_table_batch *batch;
69 unsigned int need_flush;
70#endif
44 unsigned int fullmm; 71 unsigned int fullmm;
45 struct vm_area_struct *vma; 72 struct vm_area_struct *vma;
46 unsigned long start, end; 73 unsigned long start, end;
@@ -101,6 +128,9 @@ static inline void __tlb_alloc_page(struct mmu_gather *tlb)
101static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) 128static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
102{ 129{
103 tlb_flush(tlb); 130 tlb_flush(tlb);
131#ifdef CONFIG_HAVE_RCU_TABLE_FREE
132 tlb_table_flush(tlb);
133#endif
104} 134}
105 135
106static inline void tlb_flush_mmu_free(struct mmu_gather *tlb) 136static inline void tlb_flush_mmu_free(struct mmu_gather *tlb)
@@ -129,6 +159,10 @@ tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start
129 tlb->pages = tlb->local; 159 tlb->pages = tlb->local;
130 tlb->nr = 0; 160 tlb->nr = 0;
131 __tlb_alloc_page(tlb); 161 __tlb_alloc_page(tlb);
162
163#ifdef CONFIG_HAVE_RCU_TABLE_FREE
164 tlb->batch = NULL;
165#endif
132} 166}
133 167
134static inline void 168static inline void
@@ -205,7 +239,7 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte,
205 tlb_add_flush(tlb, addr + SZ_1M); 239 tlb_add_flush(tlb, addr + SZ_1M);
206#endif 240#endif
207 241
208 tlb_remove_page(tlb, pte); 242 tlb_remove_entry(tlb, pte);
209} 243}
210 244
211static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp, 245static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp,
@@ -213,7 +247,7 @@ static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp,
213{ 247{
214#ifdef CONFIG_ARM_LPAE 248#ifdef CONFIG_ARM_LPAE
215 tlb_add_flush(tlb, addr); 249 tlb_add_flush(tlb, addr);
216 tlb_remove_page(tlb, virt_to_page(pmdp)); 250 tlb_remove_entry(tlb, virt_to_page(pmdp));
217#endif 251#endif
218} 252}
219 253
diff --git a/arch/arm/kernel/hibernate.c b/arch/arm/kernel/hibernate.c
index bb8b79648643..c4cc50e58c13 100644
--- a/arch/arm/kernel/hibernate.c
+++ b/arch/arm/kernel/hibernate.c
@@ -21,8 +21,7 @@
21#include <asm/idmap.h> 21#include <asm/idmap.h>
22#include <asm/suspend.h> 22#include <asm/suspend.h>
23#include <asm/memory.h> 23#include <asm/memory.h>
24 24#include <asm/sections.h>
25extern const void __nosave_begin, __nosave_end;
26 25
27int pfn_is_nosave(unsigned long pfn) 26int pfn_is_nosave(unsigned long pfn)
28{ 27{
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index 7a996aaa061e..c245d903927f 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -12,6 +12,7 @@
12#include <linux/bootmem.h> 12#include <linux/bootmem.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/mm.h> 14#include <linux/mm.h>
15#include <linux/genalloc.h>
15#include <linux/gfp.h> 16#include <linux/gfp.h>
16#include <linux/errno.h> 17#include <linux/errno.h>
17#include <linux/list.h> 18#include <linux/list.h>
@@ -298,57 +299,29 @@ static void *
298__dma_alloc_remap(struct page *page, size_t size, gfp_t gfp, pgprot_t prot, 299__dma_alloc_remap(struct page *page, size_t size, gfp_t gfp, pgprot_t prot,
299 const void *caller) 300 const void *caller)
300{ 301{
301 struct vm_struct *area;
302 unsigned long addr;
303
304 /* 302 /*
305 * DMA allocation can be mapped to user space, so lets 303 * DMA allocation can be mapped to user space, so lets
306 * set VM_USERMAP flags too. 304 * set VM_USERMAP flags too.
307 */ 305 */
308 area = get_vm_area_caller(size, VM_ARM_DMA_CONSISTENT | VM_USERMAP, 306 return dma_common_contiguous_remap(page, size,
309 caller); 307 VM_ARM_DMA_CONSISTENT | VM_USERMAP,
310 if (!area) 308 prot, caller);
311 return NULL;
312 addr = (unsigned long)area->addr;
313 area->phys_addr = __pfn_to_phys(page_to_pfn(page));
314
315 if (ioremap_page_range(addr, addr + size, area->phys_addr, prot)) {
316 vunmap((void *)addr);
317 return NULL;
318 }
319 return (void *)addr;
320} 309}
321 310
322static void __dma_free_remap(void *cpu_addr, size_t size) 311static void __dma_free_remap(void *cpu_addr, size_t size)
323{ 312{
324 unsigned int flags = VM_ARM_DMA_CONSISTENT | VM_USERMAP; 313 dma_common_free_remap(cpu_addr, size,
325 struct vm_struct *area = find_vm_area(cpu_addr); 314 VM_ARM_DMA_CONSISTENT | VM_USERMAP);
326 if (!area || (area->flags & flags) != flags) {
327 WARN(1, "trying to free invalid coherent area: %p\n", cpu_addr);
328 return;
329 }
330 unmap_kernel_range((unsigned long)cpu_addr, size);
331 vunmap(cpu_addr);
332} 315}
333 316
334#define DEFAULT_DMA_COHERENT_POOL_SIZE SZ_256K 317#define DEFAULT_DMA_COHERENT_POOL_SIZE SZ_256K
318static struct gen_pool *atomic_pool;
335 319
336struct dma_pool { 320static size_t atomic_pool_size = DEFAULT_DMA_COHERENT_POOL_SIZE;
337 size_t size;
338 spinlock_t lock;
339 unsigned long *bitmap;
340 unsigned long nr_pages;
341 void *vaddr;
342 struct page **pages;
343};
344
345static struct dma_pool atomic_pool = {
346 .size = DEFAULT_DMA_COHERENT_POOL_SIZE,
347};
348 321
349static int __init early_coherent_pool(char *p) 322static int __init early_coherent_pool(char *p)
350{ 323{
351 atomic_pool.size = memparse(p, &p); 324 atomic_pool_size = memparse(p, &p);
352 return 0; 325 return 0;
353} 326}
354early_param("coherent_pool", early_coherent_pool); 327early_param("coherent_pool", early_coherent_pool);
@@ -358,14 +331,14 @@ void __init init_dma_coherent_pool_size(unsigned long size)
358 /* 331 /*
359 * Catch any attempt to set the pool size too late. 332 * Catch any attempt to set the pool size too late.
360 */ 333 */
361 BUG_ON(atomic_pool.vaddr); 334 BUG_ON(atomic_pool);
362 335
363 /* 336 /*
364 * Set architecture specific coherent pool size only if 337 * Set architecture specific coherent pool size only if
365 * it has not been changed by kernel command line parameter. 338 * it has not been changed by kernel command line parameter.
366 */ 339 */
367 if (atomic_pool.size == DEFAULT_DMA_COHERENT_POOL_SIZE) 340 if (atomic_pool_size == DEFAULT_DMA_COHERENT_POOL_SIZE)
368 atomic_pool.size = size; 341 atomic_pool_size = size;
369} 342}
370 343
371/* 344/*
@@ -373,52 +346,44 @@ void __init init_dma_coherent_pool_size(unsigned long size)
373 */ 346 */
374static int __init atomic_pool_init(void) 347static int __init atomic_pool_init(void)
375{ 348{
376 struct dma_pool *pool = &atomic_pool;
377 pgprot_t prot = pgprot_dmacoherent(PAGE_KERNEL); 349 pgprot_t prot = pgprot_dmacoherent(PAGE_KERNEL);
378 gfp_t gfp = GFP_KERNEL | GFP_DMA; 350 gfp_t gfp = GFP_KERNEL | GFP_DMA;
379 unsigned long nr_pages = pool->size >> PAGE_SHIFT;
380 unsigned long *bitmap;
381 struct page *page; 351 struct page *page;
382 struct page **pages;
383 void *ptr; 352 void *ptr;
384 int bitmap_size = BITS_TO_LONGS(nr_pages) * sizeof(long);
385 353
386 bitmap = kzalloc(bitmap_size, GFP_KERNEL); 354 atomic_pool = gen_pool_create(PAGE_SHIFT, -1);
387 if (!bitmap) 355 if (!atomic_pool)
388 goto no_bitmap; 356 goto out;
389
390 pages = kzalloc(nr_pages * sizeof(struct page *), GFP_KERNEL);
391 if (!pages)
392 goto no_pages;
393 357
394 if (dev_get_cma_area(NULL)) 358 if (dev_get_cma_area(NULL))
395 ptr = __alloc_from_contiguous(NULL, pool->size, prot, &page, 359 ptr = __alloc_from_contiguous(NULL, atomic_pool_size, prot,
396 atomic_pool_init); 360 &page, atomic_pool_init);
397 else 361 else
398 ptr = __alloc_remap_buffer(NULL, pool->size, gfp, prot, &page, 362 ptr = __alloc_remap_buffer(NULL, atomic_pool_size, gfp, prot,
399 atomic_pool_init); 363 &page, atomic_pool_init);
400 if (ptr) { 364 if (ptr) {
401 int i; 365 int ret;
402 366
403 for (i = 0; i < nr_pages; i++) 367 ret = gen_pool_add_virt(atomic_pool, (unsigned long)ptr,
404 pages[i] = page + i; 368 page_to_phys(page),
405 369 atomic_pool_size, -1);
406 spin_lock_init(&pool->lock); 370 if (ret)
407 pool->vaddr = ptr; 371 goto destroy_genpool;
408 pool->pages = pages; 372
409 pool->bitmap = bitmap; 373 gen_pool_set_algo(atomic_pool,
410 pool->nr_pages = nr_pages; 374 gen_pool_first_fit_order_align,
411 pr_info("DMA: preallocated %u KiB pool for atomic coherent allocations\n", 375 (void *)PAGE_SHIFT);
412 (unsigned)pool->size / 1024); 376 pr_info("DMA: preallocated %zd KiB pool for atomic coherent allocations\n",
377 atomic_pool_size / 1024);
413 return 0; 378 return 0;
414 } 379 }
415 380
416 kfree(pages); 381destroy_genpool:
417no_pages: 382 gen_pool_destroy(atomic_pool);
418 kfree(bitmap); 383 atomic_pool = NULL;
419no_bitmap: 384out:
420 pr_err("DMA: failed to allocate %u KiB pool for atomic coherent allocation\n", 385 pr_err("DMA: failed to allocate %zx KiB pool for atomic coherent allocation\n",
421 (unsigned)pool->size / 1024); 386 atomic_pool_size / 1024);
422 return -ENOMEM; 387 return -ENOMEM;
423} 388}
424/* 389/*
@@ -522,76 +487,36 @@ static void *__alloc_remap_buffer(struct device *dev, size_t size, gfp_t gfp,
522 487
523static void *__alloc_from_pool(size_t size, struct page **ret_page) 488static void *__alloc_from_pool(size_t size, struct page **ret_page)
524{ 489{
525 struct dma_pool *pool = &atomic_pool; 490 unsigned long val;
526 unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
527 unsigned int pageno;
528 unsigned long flags;
529 void *ptr = NULL; 491 void *ptr = NULL;
530 unsigned long align_mask;
531 492
532 if (!pool->vaddr) { 493 if (!atomic_pool) {
533 WARN(1, "coherent pool not initialised!\n"); 494 WARN(1, "coherent pool not initialised!\n");
534 return NULL; 495 return NULL;
535 } 496 }
536 497
537 /* 498 val = gen_pool_alloc(atomic_pool, size);
538 * Align the region allocation - allocations from pool are rather 499 if (val) {
539 * small, so align them to their order in pages, minimum is a page 500 phys_addr_t phys = gen_pool_virt_to_phys(atomic_pool, val);
540 * size. This helps reduce fragmentation of the DMA space. 501
541 */ 502 *ret_page = phys_to_page(phys);
542 align_mask = (1 << get_order(size)) - 1; 503 ptr = (void *)val;
543
544 spin_lock_irqsave(&pool->lock, flags);
545 pageno = bitmap_find_next_zero_area(pool->bitmap, pool->nr_pages,
546 0, count, align_mask);
547 if (pageno < pool->nr_pages) {
548 bitmap_set(pool->bitmap, pageno, count);
549 ptr = pool->vaddr + PAGE_SIZE * pageno;
550 *ret_page = pool->pages[pageno];
551 } else {
552 pr_err_once("ERROR: %u KiB atomic DMA coherent pool is too small!\n"
553 "Please increase it with coherent_pool= kernel parameter!\n",
554 (unsigned)pool->size / 1024);
555 } 504 }
556 spin_unlock_irqrestore(&pool->lock, flags);
557 505
558 return ptr; 506 return ptr;
559} 507}
560 508
561static bool __in_atomic_pool(void *start, size_t size) 509static bool __in_atomic_pool(void *start, size_t size)
562{ 510{
563 struct dma_pool *pool = &atomic_pool; 511 return addr_in_gen_pool(atomic_pool, (unsigned long)start, size);
564 void *end = start + size;
565 void *pool_start = pool->vaddr;
566 void *pool_end = pool->vaddr + pool->size;
567
568 if (start < pool_start || start >= pool_end)
569 return false;
570
571 if (end <= pool_end)
572 return true;
573
574 WARN(1, "Wrong coherent size(%p-%p) from atomic pool(%p-%p)\n",
575 start, end - 1, pool_start, pool_end - 1);
576
577 return false;
578} 512}
579 513
580static int __free_from_pool(void *start, size_t size) 514static int __free_from_pool(void *start, size_t size)
581{ 515{
582 struct dma_pool *pool = &atomic_pool;
583 unsigned long pageno, count;
584 unsigned long flags;
585
586 if (!__in_atomic_pool(start, size)) 516 if (!__in_atomic_pool(start, size))
587 return 0; 517 return 0;
588 518
589 pageno = (start - pool->vaddr) >> PAGE_SHIFT; 519 gen_pool_free(atomic_pool, (unsigned long)start, size);
590 count = size >> PAGE_SHIFT;
591
592 spin_lock_irqsave(&pool->lock, flags);
593 bitmap_clear(pool->bitmap, pageno, count);
594 spin_unlock_irqrestore(&pool->lock, flags);
595 520
596 return 1; 521 return 1;
597} 522}
@@ -1271,29 +1196,8 @@ static void *
1271__iommu_alloc_remap(struct page **pages, size_t size, gfp_t gfp, pgprot_t prot, 1196__iommu_alloc_remap(struct page **pages, size_t size, gfp_t gfp, pgprot_t prot,
1272 const void *caller) 1197 const void *caller)
1273{ 1198{
1274 unsigned int i, nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT; 1199 return dma_common_pages_remap(pages, size,
1275 struct vm_struct *area; 1200 VM_ARM_DMA_CONSISTENT | VM_USERMAP, prot, caller);
1276 unsigned long p;
1277
1278 area = get_vm_area_caller(size, VM_ARM_DMA_CONSISTENT | VM_USERMAP,
1279 caller);
1280 if (!area)
1281 return NULL;
1282
1283 area->pages = pages;
1284 area->nr_pages = nr_pages;
1285 p = (unsigned long)area->addr;
1286
1287 for (i = 0; i < nr_pages; i++) {
1288 phys_addr_t phys = __pfn_to_phys(page_to_pfn(pages[i]));
1289 if (ioremap_page_range(p, p + PAGE_SIZE, phys, prot))
1290 goto err;
1291 p += PAGE_SIZE;
1292 }
1293 return area->addr;
1294err:
1295 unmap_kernel_range((unsigned long)area->addr, size);
1296 vunmap(area->addr);
1297 return NULL; 1201 return NULL;
1298} 1202}
1299 1203
@@ -1355,11 +1259,13 @@ static int __iommu_remove_mapping(struct device *dev, dma_addr_t iova, size_t si
1355 1259
1356static struct page **__atomic_get_pages(void *addr) 1260static struct page **__atomic_get_pages(void *addr)
1357{ 1261{
1358 struct dma_pool *pool = &atomic_pool; 1262 struct page *page;
1359 struct page **pages = pool->pages; 1263 phys_addr_t phys;
1360 int offs = (addr - pool->vaddr) >> PAGE_SHIFT; 1264
1265 phys = gen_pool_virt_to_phys(atomic_pool, (unsigned long)addr);
1266 page = phys_to_page(phys);
1361 1267
1362 return pages + offs; 1268 return (struct page **)page;
1363} 1269}
1364 1270
1365static struct page **__iommu_get_pages(void *cpu_addr, struct dma_attrs *attrs) 1271static struct page **__iommu_get_pages(void *cpu_addr, struct dma_attrs *attrs)
@@ -1501,8 +1407,8 @@ void arm_iommu_free_attrs(struct device *dev, size_t size, void *cpu_addr,
1501 } 1407 }
1502 1408
1503 if (!dma_get_attr(DMA_ATTR_NO_KERNEL_MAPPING, attrs)) { 1409 if (!dma_get_attr(DMA_ATTR_NO_KERNEL_MAPPING, attrs)) {
1504 unmap_kernel_range((unsigned long)cpu_addr, size); 1410 dma_common_free_remap(cpu_addr, size,
1505 vunmap(cpu_addr); 1411 VM_ARM_DMA_CONSISTENT | VM_USERMAP);
1506 } 1412 }
1507 1413
1508 __iommu_remove_mapping(dev, handle, size); 1414 __iommu_remove_mapping(dev, handle, size);
diff --git a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c
index 43d54f5b26b9..265b836b3bd1 100644
--- a/arch/arm/mm/flush.c
+++ b/arch/arm/mm/flush.c
@@ -400,3 +400,18 @@ void __flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned l
400 */ 400 */
401 __cpuc_flush_dcache_area(page_address(page), PAGE_SIZE); 401 __cpuc_flush_dcache_area(page_address(page), PAGE_SIZE);
402} 402}
403
404#ifdef CONFIG_TRANSPARENT_HUGEPAGE
405#ifdef CONFIG_HAVE_RCU_TABLE_FREE
406void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
407 pmd_t *pmdp)
408{
409 pmd_t pmd = pmd_mksplitting(*pmdp);
410 VM_BUG_ON(address & ~PMD_MASK);
411 set_pmd_at(vma->vm_mm, address, pmdp, pmd);
412
413 /* dummy IPI to serialise against fast_gup */
414 kick_all_cpus_sync();
415}
416#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
417#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index 9221645dd192..92bba32d9230 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -322,7 +322,7 @@ void __init arm_memblock_init(const struct machine_desc *mdesc)
322 * reserve memory for DMA contigouos allocations, 322 * reserve memory for DMA contigouos allocations,
323 * must come from DMA area inside low memory 323 * must come from DMA area inside low memory
324 */ 324 */
325 dma_contiguous_reserve(min(arm_dma_limit, arm_lowmem_limit)); 325 dma_contiguous_reserve(arm_dma_limit);
326 326
327 arm_memblock_steal_permitted = false; 327 arm_memblock_steal_permitted = false;
328 memblock_dump_all(); 328 memblock_dump_all();
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 3f0e854d0ff4..c49ca4c738bb 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -18,6 +18,7 @@ config ARM64
18 select COMMON_CLK 18 select COMMON_CLK
19 select CPU_PM if (SUSPEND || CPU_IDLE) 19 select CPU_PM if (SUSPEND || CPU_IDLE)
20 select DCACHE_WORD_ACCESS 20 select DCACHE_WORD_ACCESS
21 select GENERIC_ALLOCATOR
21 select GENERIC_CLOCKEVENTS 22 select GENERIC_CLOCKEVENTS
22 select GENERIC_CLOCKEVENTS_BROADCAST if SMP 23 select GENERIC_CLOCKEVENTS_BROADCAST if SMP
23 select GENERIC_CPU_AUTOPROBE 24 select GENERIC_CPU_AUTOPROBE
@@ -56,6 +57,7 @@ config ARM64
56 select HAVE_PERF_EVENTS 57 select HAVE_PERF_EVENTS
57 select HAVE_PERF_REGS 58 select HAVE_PERF_REGS
58 select HAVE_PERF_USER_STACK_DUMP 59 select HAVE_PERF_USER_STACK_DUMP
60 select HAVE_RCU_TABLE_FREE
59 select HAVE_SYSCALL_TRACEPOINTS 61 select HAVE_SYSCALL_TRACEPOINTS
60 select IRQ_DOMAIN 62 select IRQ_DOMAIN
61 select MODULES_USE_ELF_RELA 63 select MODULES_USE_ELF_RELA
@@ -109,6 +111,9 @@ config GENERIC_CALIBRATE_DELAY
109config ZONE_DMA 111config ZONE_DMA
110 def_bool y 112 def_bool y
111 113
114config HAVE_GENERIC_RCU_GUP
115 def_bool y
116
112config ARCH_DMA_ADDR_T_64BIT 117config ARCH_DMA_ADDR_T_64BIT
113 def_bool y 118 def_bool y
114 119
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 77dbe1e6398d..cefd3e825612 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -244,6 +244,16 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
244 244
245#define __HAVE_ARCH_PTE_SPECIAL 245#define __HAVE_ARCH_PTE_SPECIAL
246 246
247static inline pte_t pud_pte(pud_t pud)
248{
249 return __pte(pud_val(pud));
250}
251
252static inline pmd_t pud_pmd(pud_t pud)
253{
254 return __pmd(pud_val(pud));
255}
256
247static inline pte_t pmd_pte(pmd_t pmd) 257static inline pte_t pmd_pte(pmd_t pmd)
248{ 258{
249 return __pte(pmd_val(pmd)); 259 return __pte(pmd_val(pmd));
@@ -261,7 +271,13 @@ static inline pmd_t pte_pmd(pte_t pte)
261#ifdef CONFIG_TRANSPARENT_HUGEPAGE 271#ifdef CONFIG_TRANSPARENT_HUGEPAGE
262#define pmd_trans_huge(pmd) (pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT)) 272#define pmd_trans_huge(pmd) (pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT))
263#define pmd_trans_splitting(pmd) pte_special(pmd_pte(pmd)) 273#define pmd_trans_splitting(pmd) pte_special(pmd_pte(pmd))
264#endif 274#ifdef CONFIG_HAVE_RCU_TABLE_FREE
275#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
276struct vm_area_struct;
277void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
278 pmd_t *pmdp);
279#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
280#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
265 281
266#define pmd_young(pmd) pte_young(pmd_pte(pmd)) 282#define pmd_young(pmd) pte_young(pmd_pte(pmd))
267#define pmd_wrprotect(pmd) pte_pmd(pte_wrprotect(pmd_pte(pmd))) 283#define pmd_wrprotect(pmd) pte_pmd(pte_wrprotect(pmd_pte(pmd)))
@@ -282,6 +298,7 @@ static inline pmd_t pte_pmd(pte_t pte)
282#define mk_pmd(page,prot) pfn_pmd(page_to_pfn(page),prot) 298#define mk_pmd(page,prot) pfn_pmd(page_to_pfn(page),prot)
283 299
284#define pmd_page(pmd) pfn_to_page(__phys_to_pfn(pmd_val(pmd) & PHYS_MASK)) 300#define pmd_page(pmd) pfn_to_page(__phys_to_pfn(pmd_val(pmd) & PHYS_MASK))
301#define pud_write(pud) pte_write(pud_pte(pud))
285#define pud_pfn(pud) (((pud_val(pud) & PUD_MASK) & PHYS_MASK) >> PAGE_SHIFT) 302#define pud_pfn(pud) (((pud_val(pud) & PUD_MASK) & PHYS_MASK) >> PAGE_SHIFT)
286 303
287#define set_pmd_at(mm, addr, pmdp, pmd) set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd)) 304#define set_pmd_at(mm, addr, pmdp, pmd) set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd))
@@ -383,6 +400,8 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr)
383 return (pmd_t *)pud_page_vaddr(*pud) + pmd_index(addr); 400 return (pmd_t *)pud_page_vaddr(*pud) + pmd_index(addr);
384} 401}
385 402
403#define pud_page(pud) pmd_page(pud_pmd(pud))
404
386#endif /* CONFIG_ARM64_PGTABLE_LEVELS > 2 */ 405#endif /* CONFIG_ARM64_PGTABLE_LEVELS > 2 */
387 406
388#if CONFIG_ARM64_PGTABLE_LEVELS > 3 407#if CONFIG_ARM64_PGTABLE_LEVELS > 3
diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h
index 62731ef9749a..a82c0c5c8b52 100644
--- a/arch/arm64/include/asm/tlb.h
+++ b/arch/arm64/include/asm/tlb.h
@@ -23,6 +23,20 @@
23 23
24#include <asm-generic/tlb.h> 24#include <asm-generic/tlb.h>
25 25
26#include <linux/pagemap.h>
27#include <linux/swap.h>
28
29#ifdef CONFIG_HAVE_RCU_TABLE_FREE
30
31#define tlb_remove_entry(tlb, entry) tlb_remove_table(tlb, entry)
32static inline void __tlb_remove_table(void *_table)
33{
34 free_page_and_swap_cache((struct page *)_table);
35}
36#else
37#define tlb_remove_entry(tlb, entry) tlb_remove_page(tlb, entry)
38#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
39
26/* 40/*
27 * There's three ways the TLB shootdown code is used: 41 * There's three ways the TLB shootdown code is used:
28 * 1. Unmapping a range of vmas. See zap_page_range(), unmap_region(). 42 * 1. Unmapping a range of vmas. See zap_page_range(), unmap_region().
@@ -88,7 +102,7 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte,
88{ 102{
89 pgtable_page_dtor(pte); 103 pgtable_page_dtor(pte);
90 tlb_add_flush(tlb, addr); 104 tlb_add_flush(tlb, addr);
91 tlb_remove_page(tlb, pte); 105 tlb_remove_entry(tlb, pte);
92} 106}
93 107
94#if CONFIG_ARM64_PGTABLE_LEVELS > 2 108#if CONFIG_ARM64_PGTABLE_LEVELS > 2
@@ -96,7 +110,7 @@ static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp,
96 unsigned long addr) 110 unsigned long addr)
97{ 111{
98 tlb_add_flush(tlb, addr); 112 tlb_add_flush(tlb, addr);
99 tlb_remove_page(tlb, virt_to_page(pmdp)); 113 tlb_remove_entry(tlb, virt_to_page(pmdp));
100} 114}
101#endif 115#endif
102 116
@@ -105,7 +119,7 @@ static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pudp,
105 unsigned long addr) 119 unsigned long addr)
106{ 120{
107 tlb_add_flush(tlb, addr); 121 tlb_add_flush(tlb, addr);
108 tlb_remove_page(tlb, virt_to_page(pudp)); 122 tlb_remove_entry(tlb, virt_to_page(pudp));
109} 123}
110#endif 124#endif
111 125
diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index 2c71077cacfd..d92094203913 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -20,6 +20,7 @@
20#include <linux/gfp.h> 20#include <linux/gfp.h>
21#include <linux/export.h> 21#include <linux/export.h>
22#include <linux/slab.h> 22#include <linux/slab.h>
23#include <linux/genalloc.h>
23#include <linux/dma-mapping.h> 24#include <linux/dma-mapping.h>
24#include <linux/dma-contiguous.h> 25#include <linux/dma-contiguous.h>
25#include <linux/vmalloc.h> 26#include <linux/vmalloc.h>
@@ -38,6 +39,54 @@ static pgprot_t __get_dma_pgprot(struct dma_attrs *attrs, pgprot_t prot,
38 return prot; 39 return prot;
39} 40}
40 41
42static struct gen_pool *atomic_pool;
43
44#define DEFAULT_DMA_COHERENT_POOL_SIZE SZ_256K
45static size_t atomic_pool_size = DEFAULT_DMA_COHERENT_POOL_SIZE;
46
47static int __init early_coherent_pool(char *p)
48{
49 atomic_pool_size = memparse(p, &p);
50 return 0;
51}
52early_param("coherent_pool", early_coherent_pool);
53
54static void *__alloc_from_pool(size_t size, struct page **ret_page)
55{
56 unsigned long val;
57 void *ptr = NULL;
58
59 if (!atomic_pool) {
60 WARN(1, "coherent pool not initialised!\n");
61 return NULL;
62 }
63
64 val = gen_pool_alloc(atomic_pool, size);
65 if (val) {
66 phys_addr_t phys = gen_pool_virt_to_phys(atomic_pool, val);
67
68 *ret_page = phys_to_page(phys);
69 ptr = (void *)val;
70 }
71
72 return ptr;
73}
74
75static bool __in_atomic_pool(void *start, size_t size)
76{
77 return addr_in_gen_pool(atomic_pool, (unsigned long)start, size);
78}
79
80static int __free_from_pool(void *start, size_t size)
81{
82 if (!__in_atomic_pool(start, size))
83 return 0;
84
85 gen_pool_free(atomic_pool, (unsigned long)start, size);
86
87 return 1;
88}
89
41static void *__dma_alloc_coherent(struct device *dev, size_t size, 90static void *__dma_alloc_coherent(struct device *dev, size_t size,
42 dma_addr_t *dma_handle, gfp_t flags, 91 dma_addr_t *dma_handle, gfp_t flags,
43 struct dma_attrs *attrs) 92 struct dma_attrs *attrs)
@@ -50,7 +99,7 @@ static void *__dma_alloc_coherent(struct device *dev, size_t size,
50 if (IS_ENABLED(CONFIG_ZONE_DMA) && 99 if (IS_ENABLED(CONFIG_ZONE_DMA) &&
51 dev->coherent_dma_mask <= DMA_BIT_MASK(32)) 100 dev->coherent_dma_mask <= DMA_BIT_MASK(32))
52 flags |= GFP_DMA; 101 flags |= GFP_DMA;
53 if (IS_ENABLED(CONFIG_DMA_CMA)) { 102 if (IS_ENABLED(CONFIG_DMA_CMA) && (flags & __GFP_WAIT)) {
54 struct page *page; 103 struct page *page;
55 104
56 size = PAGE_ALIGN(size); 105 size = PAGE_ALIGN(size);
@@ -70,50 +119,54 @@ static void __dma_free_coherent(struct device *dev, size_t size,
70 void *vaddr, dma_addr_t dma_handle, 119 void *vaddr, dma_addr_t dma_handle,
71 struct dma_attrs *attrs) 120 struct dma_attrs *attrs)
72{ 121{
122 bool freed;
123 phys_addr_t paddr = dma_to_phys(dev, dma_handle);
124
73 if (dev == NULL) { 125 if (dev == NULL) {
74 WARN_ONCE(1, "Use an actual device structure for DMA allocation\n"); 126 WARN_ONCE(1, "Use an actual device structure for DMA allocation\n");
75 return; 127 return;
76 } 128 }
77 129
78 if (IS_ENABLED(CONFIG_DMA_CMA)) { 130 freed = dma_release_from_contiguous(dev,
79 phys_addr_t paddr = dma_to_phys(dev, dma_handle);
80
81 dma_release_from_contiguous(dev,
82 phys_to_page(paddr), 131 phys_to_page(paddr),
83 size >> PAGE_SHIFT); 132 size >> PAGE_SHIFT);
84 } else { 133 if (!freed)
85 swiotlb_free_coherent(dev, size, vaddr, dma_handle); 134 swiotlb_free_coherent(dev, size, vaddr, dma_handle);
86 }
87} 135}
88 136
89static void *__dma_alloc_noncoherent(struct device *dev, size_t size, 137static void *__dma_alloc_noncoherent(struct device *dev, size_t size,
90 dma_addr_t *dma_handle, gfp_t flags, 138 dma_addr_t *dma_handle, gfp_t flags,
91 struct dma_attrs *attrs) 139 struct dma_attrs *attrs)
92{ 140{
93 struct page *page, **map; 141 struct page *page;
94 void *ptr, *coherent_ptr; 142 void *ptr, *coherent_ptr;
95 int order, i;
96 143
97 size = PAGE_ALIGN(size); 144 size = PAGE_ALIGN(size);
98 order = get_order(size); 145
146 if (!(flags & __GFP_WAIT)) {
147 struct page *page = NULL;
148 void *addr = __alloc_from_pool(size, &page);
149
150 if (addr)
151 *dma_handle = phys_to_dma(dev, page_to_phys(page));
152
153 return addr;
154
155 }
99 156
100 ptr = __dma_alloc_coherent(dev, size, dma_handle, flags, attrs); 157 ptr = __dma_alloc_coherent(dev, size, dma_handle, flags, attrs);
101 if (!ptr) 158 if (!ptr)
102 goto no_mem; 159 goto no_mem;
103 map = kmalloc(sizeof(struct page *) << order, flags & ~GFP_DMA);
104 if (!map)
105 goto no_map;
106 160
107 /* remove any dirty cache lines on the kernel alias */ 161 /* remove any dirty cache lines on the kernel alias */
108 __dma_flush_range(ptr, ptr + size); 162 __dma_flush_range(ptr, ptr + size);
109 163
110 /* create a coherent mapping */ 164 /* create a coherent mapping */
111 page = virt_to_page(ptr); 165 page = virt_to_page(ptr);
112 for (i = 0; i < (size >> PAGE_SHIFT); i++) 166 coherent_ptr = dma_common_contiguous_remap(page, size, VM_USERMAP,
113 map[i] = page + i; 167 __get_dma_pgprot(attrs,
114 coherent_ptr = vmap(map, size >> PAGE_SHIFT, VM_MAP, 168 __pgprot(PROT_NORMAL_NC), false),
115 __get_dma_pgprot(attrs, __pgprot(PROT_NORMAL_NC), false)); 169 NULL);
116 kfree(map);
117 if (!coherent_ptr) 170 if (!coherent_ptr)
118 goto no_map; 171 goto no_map;
119 172
@@ -132,6 +185,8 @@ static void __dma_free_noncoherent(struct device *dev, size_t size,
132{ 185{
133 void *swiotlb_addr = phys_to_virt(dma_to_phys(dev, dma_handle)); 186 void *swiotlb_addr = phys_to_virt(dma_to_phys(dev, dma_handle));
134 187
188 if (__free_from_pool(vaddr, size))
189 return;
135 vunmap(vaddr); 190 vunmap(vaddr);
136 __dma_free_coherent(dev, size, swiotlb_addr, dma_handle, attrs); 191 __dma_free_coherent(dev, size, swiotlb_addr, dma_handle, attrs);
137} 192}
@@ -307,6 +362,67 @@ EXPORT_SYMBOL(coherent_swiotlb_dma_ops);
307 362
308extern int swiotlb_late_init_with_default_size(size_t default_size); 363extern int swiotlb_late_init_with_default_size(size_t default_size);
309 364
365static int __init atomic_pool_init(void)
366{
367 pgprot_t prot = __pgprot(PROT_NORMAL_NC);
368 unsigned long nr_pages = atomic_pool_size >> PAGE_SHIFT;
369 struct page *page;
370 void *addr;
371 unsigned int pool_size_order = get_order(atomic_pool_size);
372
373 if (dev_get_cma_area(NULL))
374 page = dma_alloc_from_contiguous(NULL, nr_pages,
375 pool_size_order);
376 else
377 page = alloc_pages(GFP_DMA, pool_size_order);
378
379 if (page) {
380 int ret;
381 void *page_addr = page_address(page);
382
383 memset(page_addr, 0, atomic_pool_size);
384 __dma_flush_range(page_addr, page_addr + atomic_pool_size);
385
386 atomic_pool = gen_pool_create(PAGE_SHIFT, -1);
387 if (!atomic_pool)
388 goto free_page;
389
390 addr = dma_common_contiguous_remap(page, atomic_pool_size,
391 VM_USERMAP, prot, atomic_pool_init);
392
393 if (!addr)
394 goto destroy_genpool;
395
396 ret = gen_pool_add_virt(atomic_pool, (unsigned long)addr,
397 page_to_phys(page),
398 atomic_pool_size, -1);
399 if (ret)
400 goto remove_mapping;
401
402 gen_pool_set_algo(atomic_pool,
403 gen_pool_first_fit_order_align,
404 (void *)PAGE_SHIFT);
405
406 pr_info("DMA: preallocated %zu KiB pool for atomic allocations\n",
407 atomic_pool_size / 1024);
408 return 0;
409 }
410 goto out;
411
412remove_mapping:
413 dma_common_free_remap(addr, atomic_pool_size, VM_USERMAP);
414destroy_genpool:
415 gen_pool_destroy(atomic_pool);
416 atomic_pool = NULL;
417free_page:
418 if (!dma_release_from_contiguous(NULL, page, nr_pages))
419 __free_pages(page, pool_size_order);
420out:
421 pr_err("DMA: failed to allocate %zu KiB pool for atomic coherent allocation\n",
422 atomic_pool_size / 1024);
423 return -ENOMEM;
424}
425
310static int __init swiotlb_late_init(void) 426static int __init swiotlb_late_init(void)
311{ 427{
312 size_t swiotlb_size = min(SZ_64M, MAX_ORDER_NR_PAGES << PAGE_SHIFT); 428 size_t swiotlb_size = min(SZ_64M, MAX_ORDER_NR_PAGES << PAGE_SHIFT);
@@ -315,7 +431,17 @@ static int __init swiotlb_late_init(void)
315 431
316 return swiotlb_late_init_with_default_size(swiotlb_size); 432 return swiotlb_late_init_with_default_size(swiotlb_size);
317} 433}
318arch_initcall(swiotlb_late_init); 434
435static int __init arm64_dma_init(void)
436{
437 int ret = 0;
438
439 ret |= swiotlb_late_init();
440 ret |= atomic_pool_init();
441
442 return ret;
443}
444arch_initcall(arm64_dma_init);
319 445
320#define PREALLOC_DMA_DEBUG_ENTRIES 4096 446#define PREALLOC_DMA_DEBUG_ENTRIES 4096
321 447
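For reference, a minimal userspace sketch of the sizing arithmetic behind atomic_pool_init(): the pool is grabbed as a single high-order allocation, so its byte size is first converted to a power-of-two page order. The 256 KiB figure and the helper name below are illustrative assumptions, not taken from this hunk.

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE (1UL << PAGE_SHIFT)

/* simplified stand-in for the kernel's get_order() */
static unsigned int get_order_sketch(unsigned long size)
{
	unsigned long pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	unsigned int order = 0;

	while ((1UL << order) < pages)
		order++;
	return order;
}

int main(void)
{
	unsigned long atomic_pool_size = 256 * 1024;	/* assumed default */

	printf("pool: %lu KiB -> %lu pages -> order %u\n",
	       atomic_pool_size / 1024,
	       atomic_pool_size >> PAGE_SHIFT,
	       get_order_sketch(atomic_pool_size));
	return 0;
}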
diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c
index 0d64089d28b5..b6f14e8d2121 100644
--- a/arch/arm64/mm/flush.c
+++ b/arch/arm64/mm/flush.c
@@ -104,3 +104,19 @@ EXPORT_SYMBOL(flush_dcache_page);
104 */ 104 */
105EXPORT_SYMBOL(flush_cache_all); 105EXPORT_SYMBOL(flush_cache_all);
106EXPORT_SYMBOL(flush_icache_range); 106EXPORT_SYMBOL(flush_icache_range);
107
108#ifdef CONFIG_TRANSPARENT_HUGEPAGE
109#ifdef CONFIG_HAVE_RCU_TABLE_FREE
110void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
111 pmd_t *pmdp)
112{
113 pmd_t pmd = pmd_mksplitting(*pmdp);
114
115 VM_BUG_ON(address & ~PMD_MASK);
116 set_pmd_at(vma->vm_mm, address, pmdp, pmd);
117
118 /* dummy IPI to serialise against fast_gup */
119 kick_all_cpus_sync();
120}
121#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
122#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
diff --git a/arch/cris/include/asm/Kbuild b/arch/cris/include/asm/Kbuild
index 802b94c4ca86..2ca489eaadd3 100644
--- a/arch/cris/include/asm/Kbuild
+++ b/arch/cris/include/asm/Kbuild
@@ -15,6 +15,7 @@ generic-y += mcs_spinlock.h
15generic-y += module.h 15generic-y += module.h
16generic-y += preempt.h 16generic-y += preempt.h
17generic-y += scatterlist.h 17generic-y += scatterlist.h
18generic-y += sections.h
18generic-y += trace_clock.h 19generic-y += trace_clock.h
19generic-y += vga.h 20generic-y += vga.h
20generic-y += xor.h 21generic-y += xor.h
diff --git a/arch/cris/include/asm/sections.h b/arch/cris/include/asm/sections.h
deleted file mode 100644
index 2c998ce8967b..000000000000
--- a/arch/cris/include/asm/sections.h
+++ /dev/null
@@ -1,7 +0,0 @@
1#ifndef _CRIS_SECTIONS_H
2#define _CRIS_SECTIONS_H
3
4/* nothing to see, move along */
5#include <asm-generic/sections.h>
6
7#endif
diff --git a/arch/frv/include/asm/processor.h b/arch/frv/include/asm/processor.h
index 6554e78893f2..ae8d423e79d9 100644
--- a/arch/frv/include/asm/processor.h
+++ b/arch/frv/include/asm/processor.h
@@ -35,22 +35,6 @@
35struct task_struct; 35struct task_struct;
36 36
37/* 37/*
38 * CPU type and hardware bug flags. Kept separately for each CPU.
39 */
40struct cpuinfo_frv {
41#ifdef CONFIG_MMU
42 unsigned long *pgd_quick;
43 unsigned long *pte_quick;
44 unsigned long pgtable_cache_sz;
45#endif
46} __cacheline_aligned;
47
48extern struct cpuinfo_frv __nongprelbss boot_cpu_data;
49
50#define cpu_data (&boot_cpu_data)
51#define current_cpu_data boot_cpu_data
52
53/*
54 * Bus types 38 * Bus types
55 */ 39 */
56#define EISA_bus 0 40#define EISA_bus 0
diff --git a/arch/frv/kernel/irq-mb93091.c b/arch/frv/kernel/irq-mb93091.c
index 2cc327a1ca44..091b2839be90 100644
--- a/arch/frv/kernel/irq-mb93091.c
+++ b/arch/frv/kernel/irq-mb93091.c
@@ -107,25 +107,25 @@ static irqreturn_t fpga_interrupt(int irq, void *_mask)
107static struct irqaction fpga_irq[4] = { 107static struct irqaction fpga_irq[4] = {
108 [0] = { 108 [0] = {
109 .handler = fpga_interrupt, 109 .handler = fpga_interrupt,
110 .flags = IRQF_DISABLED | IRQF_SHARED, 110 .flags = IRQF_SHARED,
111 .name = "fpga.0", 111 .name = "fpga.0",
112 .dev_id = (void *) 0x0028UL, 112 .dev_id = (void *) 0x0028UL,
113 }, 113 },
114 [1] = { 114 [1] = {
115 .handler = fpga_interrupt, 115 .handler = fpga_interrupt,
116 .flags = IRQF_DISABLED | IRQF_SHARED, 116 .flags = IRQF_SHARED,
117 .name = "fpga.1", 117 .name = "fpga.1",
118 .dev_id = (void *) 0x0050UL, 118 .dev_id = (void *) 0x0050UL,
119 }, 119 },
120 [2] = { 120 [2] = {
121 .handler = fpga_interrupt, 121 .handler = fpga_interrupt,
122 .flags = IRQF_DISABLED | IRQF_SHARED, 122 .flags = IRQF_SHARED,
123 .name = "fpga.2", 123 .name = "fpga.2",
124 .dev_id = (void *) 0x1c00UL, 124 .dev_id = (void *) 0x1c00UL,
125 }, 125 },
126 [3] = { 126 [3] = {
127 .handler = fpga_interrupt, 127 .handler = fpga_interrupt,
128 .flags = IRQF_DISABLED | IRQF_SHARED, 128 .flags = IRQF_SHARED,
129 .name = "fpga.3", 129 .name = "fpga.3",
130 .dev_id = (void *) 0x6386UL, 130 .dev_id = (void *) 0x6386UL,
131 } 131 }
diff --git a/arch/frv/kernel/irq-mb93093.c b/arch/frv/kernel/irq-mb93093.c
index 95e4eb4f1f38..1f3015cf80f5 100644
--- a/arch/frv/kernel/irq-mb93093.c
+++ b/arch/frv/kernel/irq-mb93093.c
@@ -105,7 +105,6 @@ static irqreturn_t fpga_interrupt(int irq, void *_mask)
105static struct irqaction fpga_irq[1] = { 105static struct irqaction fpga_irq[1] = {
106 [0] = { 106 [0] = {
107 .handler = fpga_interrupt, 107 .handler = fpga_interrupt,
108 .flags = IRQF_DISABLED,
109 .name = "fpga.0", 108 .name = "fpga.0",
110 .dev_id = (void *) 0x0700UL, 109 .dev_id = (void *) 0x0700UL,
111 } 110 }
diff --git a/arch/frv/kernel/irq-mb93493.c b/arch/frv/kernel/irq-mb93493.c
index ba648da0932d..8ca5aa4ff595 100644
--- a/arch/frv/kernel/irq-mb93493.c
+++ b/arch/frv/kernel/irq-mb93493.c
@@ -118,13 +118,13 @@ static irqreturn_t mb93493_interrupt(int irq, void *_piqsr)
118static struct irqaction mb93493_irq[2] = { 118static struct irqaction mb93493_irq[2] = {
119 [0] = { 119 [0] = {
120 .handler = mb93493_interrupt, 120 .handler = mb93493_interrupt,
121 .flags = IRQF_DISABLED | IRQF_SHARED, 121 .flags = IRQF_SHARED,
122 .name = "mb93493.0", 122 .name = "mb93493.0",
123 .dev_id = (void *) __addr_MB93493_IQSR(0), 123 .dev_id = (void *) __addr_MB93493_IQSR(0),
124 }, 124 },
125 [1] = { 125 [1] = {
126 .handler = mb93493_interrupt, 126 .handler = mb93493_interrupt,
127 .flags = IRQF_DISABLED | IRQF_SHARED, 127 .flags = IRQF_SHARED,
128 .name = "mb93493.1", 128 .name = "mb93493.1",
129 .dev_id = (void *) __addr_MB93493_IQSR(1), 129 .dev_id = (void *) __addr_MB93493_IQSR(1),
130 } 130 }
diff --git a/arch/frv/kernel/setup.c b/arch/frv/kernel/setup.c
index 9f3a7a62d787..9f4a9a607dbe 100644
--- a/arch/frv/kernel/setup.c
+++ b/arch/frv/kernel/setup.c
@@ -104,8 +104,6 @@ unsigned long __nongprelbss dma_coherent_mem_end;
104unsigned long __initdata __sdram_old_base; 104unsigned long __initdata __sdram_old_base;
105unsigned long __initdata num_mappedpages; 105unsigned long __initdata num_mappedpages;
106 106
107struct cpuinfo_frv __nongprelbss boot_cpu_data;
108
109char __initdata command_line[COMMAND_LINE_SIZE]; 107char __initdata command_line[COMMAND_LINE_SIZE];
110char __initdata redboot_command_line[COMMAND_LINE_SIZE]; 108char __initdata redboot_command_line[COMMAND_LINE_SIZE];
111 109
diff --git a/arch/frv/kernel/time.c b/arch/frv/kernel/time.c
index b457de496b70..332e00bf9d06 100644
--- a/arch/frv/kernel/time.c
+++ b/arch/frv/kernel/time.c
@@ -44,7 +44,6 @@ static irqreturn_t timer_interrupt(int irq, void *dummy);
44 44
45static struct irqaction timer_irq = { 45static struct irqaction timer_irq = {
46 .handler = timer_interrupt, 46 .handler = timer_interrupt,
47 .flags = IRQF_DISABLED,
48 .name = "timer", 47 .name = "timer",
49}; 48};
50 49
diff --git a/arch/m32r/include/asm/Kbuild b/arch/m32r/include/asm/Kbuild
index e02448b0648b..3796801d6e0c 100644
--- a/arch/m32r/include/asm/Kbuild
+++ b/arch/m32r/include/asm/Kbuild
@@ -8,4 +8,5 @@ generic-y += mcs_spinlock.h
8generic-y += module.h 8generic-y += module.h
9generic-y += preempt.h 9generic-y += preempt.h
10generic-y += scatterlist.h 10generic-y += scatterlist.h
11generic-y += sections.h
11generic-y += trace_clock.h 12generic-y += trace_clock.h
diff --git a/arch/m32r/include/asm/sections.h b/arch/m32r/include/asm/sections.h
deleted file mode 100644
index 5e5d21c4908a..000000000000
--- a/arch/m32r/include/asm/sections.h
+++ /dev/null
@@ -1,7 +0,0 @@
1#ifndef _M32R_SECTIONS_H
2#define _M32R_SECTIONS_H
3
4/* nothing to see, move along */
5#include <asm-generic/sections.h>
6
7#endif /* _M32R_SECTIONS_H */
diff --git a/arch/m32r/kernel/time.c b/arch/m32r/kernel/time.c
index 1a15f81ea1bd..093f2761aa51 100644
--- a/arch/m32r/kernel/time.c
+++ b/arch/m32r/kernel/time.c
@@ -134,7 +134,6 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id)
134 134
135static struct irqaction irq0 = { 135static struct irqaction irq0 = {
136 .handler = timer_interrupt, 136 .handler = timer_interrupt,
137 .flags = IRQF_DISABLED,
138 .name = "MFT2", 137 .name = "MFT2",
139}; 138};
140 139
diff --git a/arch/m68k/kernel/sys_m68k.c b/arch/m68k/kernel/sys_m68k.c
index 3a480b3df0d6..9aa01adb407f 100644
--- a/arch/m68k/kernel/sys_m68k.c
+++ b/arch/m68k/kernel/sys_m68k.c
@@ -376,7 +376,6 @@ cache_flush_060 (unsigned long addr, int scope, int cache, unsigned long len)
376asmlinkage int 376asmlinkage int
377sys_cacheflush (unsigned long addr, int scope, int cache, unsigned long len) 377sys_cacheflush (unsigned long addr, int scope, int cache, unsigned long len)
378{ 378{
379 struct vm_area_struct *vma;
380 int ret = -EINVAL; 379 int ret = -EINVAL;
381 380
382 if (scope < FLUSH_SCOPE_LINE || scope > FLUSH_SCOPE_ALL || 381 if (scope < FLUSH_SCOPE_LINE || scope > FLUSH_SCOPE_ALL ||
@@ -389,17 +388,21 @@ sys_cacheflush (unsigned long addr, int scope, int cache, unsigned long len)
389 if (!capable(CAP_SYS_ADMIN)) 388 if (!capable(CAP_SYS_ADMIN))
390 goto out; 389 goto out;
391 } else { 390 } else {
391 struct vm_area_struct *vma;
392
393 /* Check for overflow. */
394 if (addr + len < addr)
395 goto out;
396
392 /* 397 /*
393 * Verify that the specified address region actually belongs 398 * Verify that the specified address region actually belongs
394 * to this process. 399 * to this process.
395 */ 400 */
396 vma = find_vma (current->mm, addr);
397 ret = -EINVAL; 401 ret = -EINVAL;
398 /* Check for overflow. */ 402 down_read(&current->mm->mmap_sem);
399 if (addr + len < addr) 403 vma = find_vma(current->mm, addr);
400 goto out; 404 if (!vma || addr < vma->vm_start || addr + len > vma->vm_end)
401 if (vma == NULL || addr < vma->vm_start || addr + len > vma->vm_end) 405 goto out_unlock;
402 goto out;
403 } 406 }
404 407
405 if (CPU_IS_020_OR_030) { 408 if (CPU_IS_020_OR_030) {
@@ -429,7 +432,7 @@ sys_cacheflush (unsigned long addr, int scope, int cache, unsigned long len)
429 __asm__ __volatile__ ("movec %0, %%cacr" : : "r" (cacr)); 432 __asm__ __volatile__ ("movec %0, %%cacr" : : "r" (cacr));
430 } 433 }
431 ret = 0; 434 ret = 0;
432 goto out; 435 goto out_unlock;
433 } else { 436 } else {
434 /* 437 /*
435 * 040 or 060: don't blindly trust 'scope', someone could 438 * 040 or 060: don't blindly trust 'scope', someone could
@@ -446,6 +449,8 @@ sys_cacheflush (unsigned long addr, int scope, int cache, unsigned long len)
446 ret = cache_flush_060 (addr, scope, cache, len); 449 ret = cache_flush_060 (addr, scope, cache, len);
447 } 450 }
448 } 451 }
452out_unlock:
453 up_read(&current->mm->mmap_sem);
449out: 454out:
450 return ret; 455 return ret;
451} 456}
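The point of the sys_cacheflush() change is that find_vma() is only safe under mmap_sem, and every failure path taken after the lookup has to drop the lock. A hypothetical userspace analog using a pthread read-write lock (names and address ranges made up) shows the same shape:

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t maps_lock = PTHREAD_RWLOCK_INITIALIZER;
static unsigned long vm_start = 0x1000, vm_end = 0x2000;	/* stand-in for one VMA */

static int range_ok(unsigned long addr, unsigned long len)
{
	int ret = -1;

	pthread_rwlock_rdlock(&maps_lock);	/* like down_read(&mm->mmap_sem) */
	if (addr < vm_start || addr + len > vm_end)
		goto out_unlock;		/* error path still drops the lock */
	ret = 0;
out_unlock:
	pthread_rwlock_unlock(&maps_lock);	/* like up_read(&mm->mmap_sem) */
	return ret;
}

int main(void)
{
	printf("inside range: %d\n", range_ok(0x1100, 0x100));
	printf("past the end: %d\n", range_ok(0x1f00, 0x200));
	return 0;
}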
diff --git a/arch/mips/include/asm/suspend.h b/arch/mips/include/asm/suspend.h
deleted file mode 100644
index 3adac3b53d19..000000000000
--- a/arch/mips/include/asm/suspend.h
+++ /dev/null
@@ -1,7 +0,0 @@
1#ifndef __ASM_SUSPEND_H
2#define __ASM_SUSPEND_H
3
4/* References to section boundaries */
5extern const void __nosave_begin, __nosave_end;
6
7#endif /* __ASM_SUSPEND_H */
diff --git a/arch/mips/power/cpu.c b/arch/mips/power/cpu.c
index 521e5963df05..2129e67723ff 100644
--- a/arch/mips/power/cpu.c
+++ b/arch/mips/power/cpu.c
@@ -7,7 +7,7 @@
7 * Author: Hu Hongbing <huhb@lemote.com> 7 * Author: Hu Hongbing <huhb@lemote.com>
8 * Wu Zhangjin <wuzhangjin@gmail.com> 8 * Wu Zhangjin <wuzhangjin@gmail.com>
9 */ 9 */
10#include <asm/suspend.h> 10#include <asm/sections.h>
11#include <asm/fpu.h> 11#include <asm/fpu.h>
12#include <asm/dsp.h> 12#include <asm/dsp.h>
13 13
diff --git a/arch/mn10300/include/asm/Kbuild b/arch/mn10300/include/asm/Kbuild
index 77eb1a68d13b..54a062cb9f2c 100644
--- a/arch/mn10300/include/asm/Kbuild
+++ b/arch/mn10300/include/asm/Kbuild
@@ -8,4 +8,5 @@ generic-y += irq_work.h
8generic-y += mcs_spinlock.h 8generic-y += mcs_spinlock.h
9generic-y += preempt.h 9generic-y += preempt.h
10generic-y += scatterlist.h 10generic-y += scatterlist.h
11generic-y += sections.h
11generic-y += trace_clock.h 12generic-y += trace_clock.h
diff --git a/arch/mn10300/include/asm/sections.h b/arch/mn10300/include/asm/sections.h
deleted file mode 100644
index 2b8c5160388f..000000000000
--- a/arch/mn10300/include/asm/sections.h
+++ /dev/null
@@ -1 +0,0 @@
1#include <asm-generic/sections.h>
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index d98c1ecc3266..f60d4ea8b50c 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -38,10 +38,9 @@ static inline int pte_none(pte_t pte) { return (pte_val(pte) & ~_PTE_NONE_MASK)
38static inline pgprot_t pte_pgprot(pte_t pte) { return __pgprot(pte_val(pte) & PAGE_PROT_BITS); } 38static inline pgprot_t pte_pgprot(pte_t pte) { return __pgprot(pte_val(pte) & PAGE_PROT_BITS); }
39 39
40#ifdef CONFIG_NUMA_BALANCING 40#ifdef CONFIG_NUMA_BALANCING
41
42static inline int pte_present(pte_t pte) 41static inline int pte_present(pte_t pte)
43{ 42{
44 return pte_val(pte) & (_PAGE_PRESENT | _PAGE_NUMA); 43 return pte_val(pte) & _PAGE_NUMA_MASK;
45} 44}
46 45
47#define pte_present_nonuma pte_present_nonuma 46#define pte_present_nonuma pte_present_nonuma
@@ -50,37 +49,6 @@ static inline int pte_present_nonuma(pte_t pte)
50 return pte_val(pte) & (_PAGE_PRESENT); 49 return pte_val(pte) & (_PAGE_PRESENT);
51} 50}
52 51
53#define pte_numa pte_numa
54static inline int pte_numa(pte_t pte)
55{
56 return (pte_val(pte) &
57 (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA;
58}
59
60#define pte_mknonnuma pte_mknonnuma
61static inline pte_t pte_mknonnuma(pte_t pte)
62{
63 pte_val(pte) &= ~_PAGE_NUMA;
64 pte_val(pte) |= _PAGE_PRESENT | _PAGE_ACCESSED;
65 return pte;
66}
67
68#define pte_mknuma pte_mknuma
69static inline pte_t pte_mknuma(pte_t pte)
70{
71 /*
72 * We should not set _PAGE_NUMA on non present ptes. Also clear the
73 * present bit so that hash_page will return 1 and we collect this
74 * as numa fault.
75 */
76 if (pte_present(pte)) {
77 pte_val(pte) |= _PAGE_NUMA;
78 pte_val(pte) &= ~_PAGE_PRESENT;
79 } else
80 VM_BUG_ON(1);
81 return pte;
82}
83
84#define ptep_set_numa ptep_set_numa 52#define ptep_set_numa ptep_set_numa
85static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr, 53static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr,
86 pte_t *ptep) 54 pte_t *ptep)
@@ -92,12 +60,6 @@ static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr,
92 return; 60 return;
93} 61}
94 62
95#define pmd_numa pmd_numa
96static inline int pmd_numa(pmd_t pmd)
97{
98 return pte_numa(pmd_pte(pmd));
99}
100
101#define pmdp_set_numa pmdp_set_numa 63#define pmdp_set_numa pmdp_set_numa
102static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr, 64static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr,
103 pmd_t *pmdp) 65 pmd_t *pmdp)
@@ -109,16 +71,21 @@ static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr,
109 return; 71 return;
110} 72}
111 73
112#define pmd_mknonnuma pmd_mknonnuma 74/*
113static inline pmd_t pmd_mknonnuma(pmd_t pmd) 75 * Generic NUMA pte helpers expect pteval_t and pmdval_t types to exist
76 * which was inherited from x86. For the purposes of powerpc pte_basic_t and
77 * pmd_t are equivalent
78 */
79#define pteval_t pte_basic_t
80#define pmdval_t pmd_t
81static inline pteval_t ptenuma_flags(pte_t pte)
114{ 82{
115 return pte_pmd(pte_mknonnuma(pmd_pte(pmd))); 83 return pte_val(pte) & _PAGE_NUMA_MASK;
116} 84}
117 85
118#define pmd_mknuma pmd_mknuma 86static inline pmdval_t pmdnuma_flags(pmd_t pmd)
119static inline pmd_t pmd_mknuma(pmd_t pmd)
120{ 87{
121 return pte_pmd(pte_mknuma(pmd_pte(pmd))); 88 return pmd_val(pmd) & _PAGE_NUMA_MASK;
122} 89}
123 90
124# else 91# else
diff --git a/arch/powerpc/include/asm/pte-common.h b/arch/powerpc/include/asm/pte-common.h
index 8d1569c29042..e040c3595129 100644
--- a/arch/powerpc/include/asm/pte-common.h
+++ b/arch/powerpc/include/asm/pte-common.h
@@ -98,6 +98,11 @@ extern unsigned long bad_call_to_PMD_PAGE_SIZE(void);
98 _PAGE_USER | _PAGE_ACCESSED | \ 98 _PAGE_USER | _PAGE_ACCESSED | \
99 _PAGE_RW | _PAGE_HWWRITE | _PAGE_DIRTY | _PAGE_EXEC) 99 _PAGE_RW | _PAGE_HWWRITE | _PAGE_DIRTY | _PAGE_EXEC)
100 100
101#ifdef CONFIG_NUMA_BALANCING
102/* Mask of bits that distinguish present and numa ptes */
103#define _PAGE_NUMA_MASK (_PAGE_NUMA|_PAGE_PRESENT)
104#endif
105
101/* 106/*
102 * We define 2 sets of base prot bits, one for basic pages (ie, 107 * We define 2 sets of base prot bits, one for basic pages (ie,
103 * cacheable kernel and user pages) and one for non cacheable 108 * cacheable kernel and user pages) and one for non cacheable
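The new _PAGE_NUMA_MASK exists so the generic helpers can tell a NUMA-hinting pte (NUMA bit set, PRESENT cleared) from a normally present one with a single mask compare. A small sketch with made-up bit values illustrates the test:

#include <stdio.h>
#include <stdint.h>

/* bit values are hypothetical; only the mask logic matters */
#define _PAGE_PRESENT	0x001UL
#define _PAGE_NUMA	0x010UL
#define _PAGE_NUMA_MASK	(_PAGE_NUMA | _PAGE_PRESENT)

static int pte_is_numa(uint64_t pte)
{
	return (pte & _PAGE_NUMA_MASK) == _PAGE_NUMA;
}

int main(void)
{
	printf("present pte -> numa? %d\n", pte_is_numa(_PAGE_PRESENT | 0x100));
	printf("numa pte    -> numa? %d\n", pte_is_numa(_PAGE_NUMA | 0x100));
	return 0;
}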
diff --git a/arch/powerpc/kernel/suspend.c b/arch/powerpc/kernel/suspend.c
index 0167d53da30c..a531154cc0f3 100644
--- a/arch/powerpc/kernel/suspend.c
+++ b/arch/powerpc/kernel/suspend.c
@@ -9,9 +9,7 @@
9 9
10#include <linux/mm.h> 10#include <linux/mm.h>
11#include <asm/page.h> 11#include <asm/page.h>
12 12#include <asm/sections.h>
13/* References to section boundaries */
14extern const void __nosave_begin, __nosave_end;
15 13
16/* 14/*
17 * pfn_is_nosave - check if given pfn is in the 'nosave' section 15 * pfn_is_nosave - check if given pfn is in the 'nosave' section
diff --git a/arch/s390/kernel/suspend.c b/arch/s390/kernel/suspend.c
index a7a7537ce1e7..1c4c5accd220 100644
--- a/arch/s390/kernel/suspend.c
+++ b/arch/s390/kernel/suspend.c
@@ -13,14 +13,10 @@
13#include <asm/ipl.h> 13#include <asm/ipl.h>
14#include <asm/cio.h> 14#include <asm/cio.h>
15#include <asm/pci.h> 15#include <asm/pci.h>
16#include <asm/sections.h>
16#include "entry.h" 17#include "entry.h"
17 18
18/* 19/*
19 * References to section boundaries
20 */
21extern const void __nosave_begin, __nosave_end;
22
23/*
24 * The restore of the saved pages in an hibernation image will set 20 * The restore of the saved pages in an hibernation image will set
25 * the change and referenced bits in the storage key for each page. 21 * the change and referenced bits in the storage key for each page.
26 * Overindication of the referenced bits after an hibernation cycle 22 * Overindication of the referenced bits after an hibernation cycle
diff --git a/arch/score/include/asm/Kbuild b/arch/score/include/asm/Kbuild
index 3fe5681744f1..46461c19f284 100644
--- a/arch/score/include/asm/Kbuild
+++ b/arch/score/include/asm/Kbuild
@@ -10,6 +10,7 @@ generic-y += irq_work.h
10generic-y += mcs_spinlock.h 10generic-y += mcs_spinlock.h
11generic-y += preempt.h 11generic-y += preempt.h
12generic-y += scatterlist.h 12generic-y += scatterlist.h
13generic-y += sections.h
13generic-y += trace_clock.h 14generic-y += trace_clock.h
14generic-y += xor.h 15generic-y += xor.h
15generic-y += serial.h 16generic-y += serial.h
diff --git a/arch/score/include/asm/sections.h b/arch/score/include/asm/sections.h
deleted file mode 100644
index 9441d23af005..000000000000
--- a/arch/score/include/asm/sections.h
+++ /dev/null
@@ -1,6 +0,0 @@
1#ifndef _ASM_SCORE_SECTIONS_H
2#define _ASM_SCORE_SECTIONS_H
3
4#include <asm-generic/sections.h>
5
6#endif /* _ASM_SCORE_SECTIONS_H */
diff --git a/arch/sh/include/asm/sections.h b/arch/sh/include/asm/sections.h
index 1b6199740e98..7a99e6af6372 100644
--- a/arch/sh/include/asm/sections.h
+++ b/arch/sh/include/asm/sections.h
@@ -3,7 +3,6 @@
3 3
4#include <asm-generic/sections.h> 4#include <asm-generic/sections.h>
5 5
6extern long __nosave_begin, __nosave_end;
7extern long __machvec_start, __machvec_end; 6extern long __machvec_start, __machvec_end;
8extern char __uncached_start, __uncached_end; 7extern char __uncached_start, __uncached_end;
9extern char __start_eh_frame[], __stop_eh_frame[]; 8extern char __start_eh_frame[], __stop_eh_frame[];
diff --git a/arch/sparc/power/hibernate.c b/arch/sparc/power/hibernate.c
index 42b0b8ce699a..17bd2e167e07 100644
--- a/arch/sparc/power/hibernate.c
+++ b/arch/sparc/power/hibernate.c
@@ -9,11 +9,9 @@
9#include <asm/hibernate.h> 9#include <asm/hibernate.h>
10#include <asm/visasm.h> 10#include <asm/visasm.h>
11#include <asm/page.h> 11#include <asm/page.h>
12#include <asm/sections.h>
12#include <asm/tlb.h> 13#include <asm/tlb.h>
13 14
14/* References to section boundaries */
15extern const void __nosave_begin, __nosave_end;
16
17struct saved_context saved_context; 15struct saved_context saved_context;
18 16
19/* 17/*
diff --git a/arch/unicore32/include/mach/pm.h b/arch/unicore32/include/mach/pm.h
index 4dcd34ae194c..77b522694e74 100644
--- a/arch/unicore32/include/mach/pm.h
+++ b/arch/unicore32/include/mach/pm.h
@@ -36,8 +36,5 @@ extern int puv3_pm_enter(suspend_state_t state);
36/* Defined in hibernate_asm.S */ 36/* Defined in hibernate_asm.S */
37extern int restore_image(pgd_t *resume_pg_dir, struct pbe *restore_pblist); 37extern int restore_image(pgd_t *resume_pg_dir, struct pbe *restore_pblist);
38 38
39/* References to section boundaries */
40extern const void __nosave_begin, __nosave_end;
41
42extern struct pbe *restore_pblist; 39extern struct pbe *restore_pblist;
43#endif 40#endif
diff --git a/arch/unicore32/kernel/hibernate.c b/arch/unicore32/kernel/hibernate.c
index d75ef8b6cb56..9969ec374abb 100644
--- a/arch/unicore32/kernel/hibernate.c
+++ b/arch/unicore32/kernel/hibernate.c
@@ -18,6 +18,7 @@
18#include <asm/page.h> 18#include <asm/page.h>
19#include <asm/pgtable.h> 19#include <asm/pgtable.h>
20#include <asm/pgalloc.h> 20#include <asm/pgalloc.h>
21#include <asm/sections.h>
21#include <asm/suspend.h> 22#include <asm/suspend.h>
22 23
23#include "mach/pm.h" 24#include "mach/pm.h"
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index e4b1f431c7ed..3eb8a41509b3 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -30,7 +30,6 @@ config X86
30 select HAVE_UNSTABLE_SCHED_CLOCK 30 select HAVE_UNSTABLE_SCHED_CLOCK
31 select ARCH_SUPPORTS_NUMA_BALANCING if X86_64 31 select ARCH_SUPPORTS_NUMA_BALANCING if X86_64
32 select ARCH_SUPPORTS_INT128 if X86_64 32 select ARCH_SUPPORTS_INT128 if X86_64
33 select ARCH_WANTS_PROT_NUMA_PROT_NONE
34 select HAVE_IDE 33 select HAVE_IDE
35 select HAVE_OPROFILE 34 select HAVE_OPROFILE
36 select HAVE_PCSPKR_PLATFORM 35 select HAVE_PCSPKR_PLATFORM
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index f216963760e5..0f9724c9c510 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -325,6 +325,20 @@ static inline pteval_t pte_flags(pte_t pte)
325 return native_pte_val(pte) & PTE_FLAGS_MASK; 325 return native_pte_val(pte) & PTE_FLAGS_MASK;
326} 326}
327 327
328#ifdef CONFIG_NUMA_BALANCING
329/* Set of bits that distinguishes present, prot_none and numa ptes */
330#define _PAGE_NUMA_MASK (_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT)
331static inline pteval_t ptenuma_flags(pte_t pte)
332{
333 return pte_flags(pte) & _PAGE_NUMA_MASK;
334}
335
336static inline pmdval_t pmdnuma_flags(pmd_t pmd)
337{
338 return pmd_flags(pmd) & _PAGE_NUMA_MASK;
339}
340#endif /* CONFIG_NUMA_BALANCING */
341
328#define pgprot_val(x) ((x).pgprot) 342#define pgprot_val(x) ((x).pgprot)
329#define __pgprot(x) ((pgprot_t) { (x) } ) 343#define __pgprot(x) ((pgprot_t) { (x) } )
330 344
diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c
index 7d28c885d238..291226b952a9 100644
--- a/arch/x86/power/hibernate_32.c
+++ b/arch/x86/power/hibernate_32.c
@@ -13,13 +13,11 @@
13#include <asm/page.h> 13#include <asm/page.h>
14#include <asm/pgtable.h> 14#include <asm/pgtable.h>
15#include <asm/mmzone.h> 15#include <asm/mmzone.h>
16#include <asm/sections.h>
16 17
17/* Defined in hibernate_asm_32.S */ 18/* Defined in hibernate_asm_32.S */
18extern int restore_image(void); 19extern int restore_image(void);
19 20
20/* References to section boundaries */
21extern const void __nosave_begin, __nosave_end;
22
23/* Pointer to the temporary resume page tables */ 21/* Pointer to the temporary resume page tables */
24pgd_t *resume_pg_dir; 22pgd_t *resume_pg_dir;
25 23
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index 35e2bb6c0f37..009947d419a6 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -17,11 +17,9 @@
17#include <asm/page.h> 17#include <asm/page.h>
18#include <asm/pgtable.h> 18#include <asm/pgtable.h>
19#include <asm/mtrr.h> 19#include <asm/mtrr.h>
20#include <asm/sections.h>
20#include <asm/suspend.h> 21#include <asm/suspend.h>
21 22
22/* References to section boundaries */
23extern __visible const void __nosave_begin, __nosave_end;
24
25/* Defined in hibernate_asm_64.S */ 23/* Defined in hibernate_asm_64.S */
26extern asmlinkage __visible int restore_image(void); 24extern asmlinkage __visible int restore_image(void);
27 25
diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig
index 134f763d90fd..61a33f4ba608 100644
--- a/drivers/base/Kconfig
+++ b/drivers/base/Kconfig
@@ -252,6 +252,9 @@ config DMA_CMA
252 to allocate big physically-contiguous blocks of memory for use with 252 to allocate big physically-contiguous blocks of memory for use with
253 hardware components that do not support I/O map nor scatter-gather. 253 hardware components that do not support I/O map nor scatter-gather.
254 254
255 You can disable CMA by specifying "cma=0" on the kernel's command
256 line.
257
255 For more information see <include/linux/dma-contiguous.h>. 258 For more information see <include/linux/dma-contiguous.h>.
256 If unsure, say "n". 259 If unsure, say "n".
257 260
diff --git a/drivers/base/dma-mapping.c b/drivers/base/dma-mapping.c
index 6cd08e145bfa..9e8bbdd470ca 100644
--- a/drivers/base/dma-mapping.c
+++ b/drivers/base/dma-mapping.c
@@ -10,6 +10,8 @@
10#include <linux/dma-mapping.h> 10#include <linux/dma-mapping.h>
11#include <linux/export.h> 11#include <linux/export.h>
12#include <linux/gfp.h> 12#include <linux/gfp.h>
13#include <linux/slab.h>
14#include <linux/vmalloc.h>
13#include <asm-generic/dma-coherent.h> 15#include <asm-generic/dma-coherent.h>
14 16
15/* 17/*
@@ -267,3 +269,73 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma,
267 return ret; 269 return ret;
268} 270}
269EXPORT_SYMBOL(dma_common_mmap); 271EXPORT_SYMBOL(dma_common_mmap);
272
273#ifdef CONFIG_MMU
274/*
275 * remaps an array of PAGE_SIZE pages into another vm_area
276 * Cannot be used in non-sleeping contexts
277 */
278void *dma_common_pages_remap(struct page **pages, size_t size,
279 unsigned long vm_flags, pgprot_t prot,
280 const void *caller)
281{
282 struct vm_struct *area;
283
284 area = get_vm_area_caller(size, vm_flags, caller);
285 if (!area)
286 return NULL;
287
288 area->pages = pages;
289
290 if (map_vm_area(area, prot, pages)) {
291 vunmap(area->addr);
292 return NULL;
293 }
294
295 return area->addr;
296}
297
298/*
299 * remaps an allocated contiguous region into another vm_area.
300 * Cannot be used in non-sleeping contexts
301 */
302
303void *dma_common_contiguous_remap(struct page *page, size_t size,
304 unsigned long vm_flags,
305 pgprot_t prot, const void *caller)
306{
307 int i;
308 struct page **pages;
309 void *ptr;
310 unsigned long pfn;
311
312 pages = kmalloc(sizeof(struct page *) << get_order(size), GFP_KERNEL);
313 if (!pages)
314 return NULL;
315
316 for (i = 0, pfn = page_to_pfn(page); i < (size >> PAGE_SHIFT); i++)
317 pages[i] = pfn_to_page(pfn + i);
318
319 ptr = dma_common_pages_remap(pages, size, vm_flags, prot, caller);
320
321 kfree(pages);
322
323 return ptr;
324}
325
326/*
327 * unmaps a range previously mapped by dma_common_*_remap
328 */
329void dma_common_free_remap(void *cpu_addr, size_t size, unsigned long vm_flags)
330{
331 struct vm_struct *area = find_vm_area(cpu_addr);
332
333 if (!area || (area->flags & vm_flags) != vm_flags) {
334 WARN(1, "trying to free invalid coherent area: %p\n", cpu_addr);
335 return;
336 }
337
338 unmap_kernel_range((unsigned long)cpu_addr, size);
339 vunmap(cpu_addr);
340}
341#endif
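dma_common_contiguous_remap() boils down to expanding one physically contiguous region into a per-page table before handing it to the page-array remap helper. A hypothetical userspace sketch of that bookkeeping (the pfn value and size are invented for illustration):

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SHIFT 12

int main(void)
{
	unsigned long base_pfn = 0x80000;	/* assumed first page frame */
	size_t size = 6 << PAGE_SHIFT;		/* six pages */
	size_t nr_pages = size >> PAGE_SHIFT;
	unsigned long *pfns;
	size_t i;

	pfns = malloc(nr_pages * sizeof(*pfns));
	if (!pfns)
		return 1;

	/* mirrors the pages[i] = pfn_to_page(pfn + i) loop */
	for (i = 0; i < nr_pages; i++)
		pfns[i] = base_pfn + i;

	printf("%zu pages, pfns 0x%lx..0x%lx\n",
	       nr_pages, pfns[0], pfns[nr_pages - 1]);
	free(pfns);
	return 0;
}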
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index a2e13e250bba..7c5d87191b28 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -373,6 +373,45 @@ static ssize_t show_phys_device(struct device *dev,
373 return sprintf(buf, "%d\n", mem->phys_device); 373 return sprintf(buf, "%d\n", mem->phys_device);
374} 374}
375 375
376#ifdef CONFIG_MEMORY_HOTREMOVE
377static ssize_t show_valid_zones(struct device *dev,
378 struct device_attribute *attr, char *buf)
379{
380 struct memory_block *mem = to_memory_block(dev);
381 unsigned long start_pfn, end_pfn;
382 unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
383 struct page *first_page;
384 struct zone *zone;
385
386 start_pfn = section_nr_to_pfn(mem->start_section_nr);
387 end_pfn = start_pfn + nr_pages;
388 first_page = pfn_to_page(start_pfn);
389
390	/* A block that contains more than one zone cannot be offlined. */

391 if (!test_pages_in_a_zone(start_pfn, end_pfn))
392 return sprintf(buf, "none\n");
393
394 zone = page_zone(first_page);
395
396 if (zone_idx(zone) == ZONE_MOVABLE - 1) {
397		/* The mem block is the last memory block of this zone. */
398 if (end_pfn == zone_end_pfn(zone))
399 return sprintf(buf, "%s %s\n",
400 zone->name, (zone + 1)->name);
401 }
402
403 if (zone_idx(zone) == ZONE_MOVABLE) {
404		/* The mem block is the first memory block of ZONE_MOVABLE. */
405 if (start_pfn == zone->zone_start_pfn)
406 return sprintf(buf, "%s %s\n",
407 zone->name, (zone - 1)->name);
408 }
409
410 return sprintf(buf, "%s\n", zone->name);
411}
412static DEVICE_ATTR(valid_zones, 0444, show_valid_zones, NULL);
413#endif
414
376static DEVICE_ATTR(phys_index, 0444, show_mem_start_phys_index, NULL); 415static DEVICE_ATTR(phys_index, 0444, show_mem_start_phys_index, NULL);
377static DEVICE_ATTR(state, 0644, show_mem_state, store_mem_state); 416static DEVICE_ATTR(state, 0644, show_mem_state, store_mem_state);
378static DEVICE_ATTR(phys_device, 0444, show_phys_device, NULL); 417static DEVICE_ATTR(phys_device, 0444, show_phys_device, NULL);
@@ -523,6 +562,9 @@ static struct attribute *memory_memblk_attrs[] = {
523 &dev_attr_state.attr, 562 &dev_attr_state.attr,
524 &dev_attr_phys_device.attr, 563 &dev_attr_phys_device.attr,
525 &dev_attr_removable.attr, 564 &dev_attr_removable.attr,
565#ifdef CONFIG_MEMORY_HOTREMOVE
566 &dev_attr_valid_zones.attr,
567#endif
526 NULL 568 NULL
527}; 569};
528 570
diff --git a/drivers/base/node.c b/drivers/base/node.c
index d51c49c9bafa..472168cd0c97 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -289,8 +289,6 @@ static int register_node(struct node *node, int num, struct node *parent)
289 device_create_file(&node->dev, &dev_attr_distance); 289 device_create_file(&node->dev, &dev_attr_distance);
290 device_create_file(&node->dev, &dev_attr_vmstat); 290 device_create_file(&node->dev, &dev_attr_vmstat);
291 291
292 scan_unevictable_register_node(node);
293
294 hugetlb_register_node(node); 292 hugetlb_register_node(node);
295 293
296 compaction_register_node(node); 294 compaction_register_node(node);
@@ -314,7 +312,6 @@ void unregister_node(struct node *node)
314 device_remove_file(&node->dev, &dev_attr_distance); 312 device_remove_file(&node->dev, &dev_attr_distance);
315 device_remove_file(&node->dev, &dev_attr_vmstat); 313 device_remove_file(&node->dev, &dev_attr_vmstat);
316 314
317 scan_unevictable_unregister_node(node);
318 hugetlb_unregister_node(node); /* no-op, if memoryless node */ 315 hugetlb_unregister_node(node); /* no-op, if memoryless node */
319 316
320 device_unregister(&node->dev); 317 device_unregister(&node->dev);
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index d00831c3d731..3b850164c65c 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -103,10 +103,10 @@ static ssize_t mem_used_total_show(struct device *dev,
103 103
104 down_read(&zram->init_lock); 104 down_read(&zram->init_lock);
105 if (init_done(zram)) 105 if (init_done(zram))
106 val = zs_get_total_size_bytes(meta->mem_pool); 106 val = zs_get_total_pages(meta->mem_pool);
107 up_read(&zram->init_lock); 107 up_read(&zram->init_lock);
108 108
109 return scnprintf(buf, PAGE_SIZE, "%llu\n", val); 109 return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT);
110} 110}
111 111
112static ssize_t max_comp_streams_show(struct device *dev, 112static ssize_t max_comp_streams_show(struct device *dev,
@@ -122,6 +122,72 @@ static ssize_t max_comp_streams_show(struct device *dev,
122 return scnprintf(buf, PAGE_SIZE, "%d\n", val); 122 return scnprintf(buf, PAGE_SIZE, "%d\n", val);
123} 123}
124 124
125static ssize_t mem_limit_show(struct device *dev,
126 struct device_attribute *attr, char *buf)
127{
128 u64 val;
129 struct zram *zram = dev_to_zram(dev);
130
131 down_read(&zram->init_lock);
132 val = zram->limit_pages;
133 up_read(&zram->init_lock);
134
135 return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT);
136}
137
138static ssize_t mem_limit_store(struct device *dev,
139 struct device_attribute *attr, const char *buf, size_t len)
140{
141 u64 limit;
142 char *tmp;
143 struct zram *zram = dev_to_zram(dev);
144
145 limit = memparse(buf, &tmp);
146 if (buf == tmp) /* no chars parsed, invalid input */
147 return -EINVAL;
148
149 down_write(&zram->init_lock);
150 zram->limit_pages = PAGE_ALIGN(limit) >> PAGE_SHIFT;
151 up_write(&zram->init_lock);
152
153 return len;
154}
155
156static ssize_t mem_used_max_show(struct device *dev,
157 struct device_attribute *attr, char *buf)
158{
159 u64 val = 0;
160 struct zram *zram = dev_to_zram(dev);
161
162 down_read(&zram->init_lock);
163 if (init_done(zram))
164 val = atomic_long_read(&zram->stats.max_used_pages);
165 up_read(&zram->init_lock);
166
167 return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT);
168}
169
170static ssize_t mem_used_max_store(struct device *dev,
171 struct device_attribute *attr, const char *buf, size_t len)
172{
173 int err;
174 unsigned long val;
175 struct zram *zram = dev_to_zram(dev);
176 struct zram_meta *meta = zram->meta;
177
178 err = kstrtoul(buf, 10, &val);
179 if (err || val != 0)
180 return -EINVAL;
181
182 down_read(&zram->init_lock);
183 if (init_done(zram))
184 atomic_long_set(&zram->stats.max_used_pages,
185 zs_get_total_pages(meta->mem_pool));
186 up_read(&zram->init_lock);
187
188 return len;
189}
190
125static ssize_t max_comp_streams_store(struct device *dev, 191static ssize_t max_comp_streams_store(struct device *dev,
126 struct device_attribute *attr, const char *buf, size_t len) 192 struct device_attribute *attr, const char *buf, size_t len)
127{ 193{
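mem_limit_store() accepts human-readable sizes and treats "no characters consumed" as invalid input. A hypothetical memparse()-style parser in userspace C shows the shape of that check (suffix handling is simplified and a 4 KiB page size is assumed):

#include <stdio.h>
#include <stdlib.h>

static unsigned long long parse_size(const char *buf, const char **endp)
{
	char *end;
	unsigned long long val = strtoull(buf, &end, 10);

	switch (*end) {		/* optional K/M/G suffix */
	case 'G': case 'g': val <<= 30; end++; break;
	case 'M': case 'm': val <<= 20; end++; break;
	case 'K': case 'k': val <<= 10; end++; break;
	}
	*endp = end;
	return val;
}

int main(void)
{
	const char *inputs[] = { "16M", "1048576", "junk" };

	for (int i = 0; i < 3; i++) {
		const char *end;
		unsigned long long bytes = parse_size(inputs[i], &end);

		if (end == inputs[i])	/* nothing parsed: reject, like the buf == tmp test */
			printf("%-8s -> invalid\n", inputs[i]);
		else
			printf("%-8s -> %llu bytes (%llu pages)\n",
			       inputs[i], bytes, bytes >> 12);
	}
	return 0;
}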
@@ -434,6 +500,21 @@ out_cleanup:
434 return ret; 500 return ret;
435} 501}
436 502
503static inline void update_used_max(struct zram *zram,
504 const unsigned long pages)
505{
506 int old_max, cur_max;
507
508 old_max = atomic_long_read(&zram->stats.max_used_pages);
509
510 do {
511 cur_max = old_max;
512 if (pages > cur_max)
513 old_max = atomic_long_cmpxchg(
514 &zram->stats.max_used_pages, cur_max, pages);
515 } while (old_max != cur_max);
516}
517
437static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, 518static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
438 int offset) 519 int offset)
439{ 520{
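update_used_max() is a lock-free "ratchet": the recorded maximum only ever moves up, and a compare-and-exchange retry copes with concurrent writers. A single-threaded C11 sketch of the same pattern (not the kernel code itself):

#include <stdio.h>
#include <stdatomic.h>

static atomic_long max_used_pages = 0;

static void update_used_max_sketch(long pages)
{
	long cur_max = atomic_load(&max_used_pages);

	do {
		if (pages <= cur_max)
			return;		/* current maximum already covers this */
		/* on failure, cur_max is refreshed with the latest value */
	} while (!atomic_compare_exchange_weak(&max_used_pages, &cur_max, pages));
}

int main(void)
{
	update_used_max_sketch(10);
	update_used_max_sketch(7);	/* smaller value is ignored */
	update_used_max_sketch(42);
	printf("max_used_pages = %ld\n", atomic_load(&max_used_pages));
	return 0;
}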
@@ -445,6 +526,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
445 struct zram_meta *meta = zram->meta; 526 struct zram_meta *meta = zram->meta;
446 struct zcomp_strm *zstrm; 527 struct zcomp_strm *zstrm;
447 bool locked = false; 528 bool locked = false;
529 unsigned long alloced_pages;
448 530
449 page = bvec->bv_page; 531 page = bvec->bv_page;
450 if (is_partial_io(bvec)) { 532 if (is_partial_io(bvec)) {
@@ -513,6 +595,16 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
513 ret = -ENOMEM; 595 ret = -ENOMEM;
514 goto out; 596 goto out;
515 } 597 }
598
599 alloced_pages = zs_get_total_pages(meta->mem_pool);
600 if (zram->limit_pages && alloced_pages > zram->limit_pages) {
601 zs_free(meta->mem_pool, handle);
602 ret = -ENOMEM;
603 goto out;
604 }
605
606 update_used_max(zram, alloced_pages);
607
516 cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_WO); 608 cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_WO);
517 609
518 if ((clen == PAGE_SIZE) && !is_partial_io(bvec)) { 610 if ((clen == PAGE_SIZE) && !is_partial_io(bvec)) {
@@ -606,6 +698,7 @@ static void zram_bio_discard(struct zram *zram, u32 index,
606 bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); 698 bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
607 zram_free_page(zram, index); 699 zram_free_page(zram, index);
608 bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); 700 bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
701 atomic64_inc(&zram->stats.notify_free);
609 index++; 702 index++;
610 n -= PAGE_SIZE; 703 n -= PAGE_SIZE;
611 } 704 }
@@ -617,6 +710,9 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
617 struct zram_meta *meta; 710 struct zram_meta *meta;
618 711
619 down_write(&zram->init_lock); 712 down_write(&zram->init_lock);
713
714 zram->limit_pages = 0;
715
620 if (!init_done(zram)) { 716 if (!init_done(zram)) {
621 up_write(&zram->init_lock); 717 up_write(&zram->init_lock);
622 return; 718 return;
@@ -857,6 +953,10 @@ static DEVICE_ATTR(initstate, S_IRUGO, initstate_show, NULL);
857static DEVICE_ATTR(reset, S_IWUSR, NULL, reset_store); 953static DEVICE_ATTR(reset, S_IWUSR, NULL, reset_store);
858static DEVICE_ATTR(orig_data_size, S_IRUGO, orig_data_size_show, NULL); 954static DEVICE_ATTR(orig_data_size, S_IRUGO, orig_data_size_show, NULL);
859static DEVICE_ATTR(mem_used_total, S_IRUGO, mem_used_total_show, NULL); 955static DEVICE_ATTR(mem_used_total, S_IRUGO, mem_used_total_show, NULL);
956static DEVICE_ATTR(mem_limit, S_IRUGO | S_IWUSR, mem_limit_show,
957 mem_limit_store);
958static DEVICE_ATTR(mem_used_max, S_IRUGO | S_IWUSR, mem_used_max_show,
959 mem_used_max_store);
860static DEVICE_ATTR(max_comp_streams, S_IRUGO | S_IWUSR, 960static DEVICE_ATTR(max_comp_streams, S_IRUGO | S_IWUSR,
861 max_comp_streams_show, max_comp_streams_store); 961 max_comp_streams_show, max_comp_streams_store);
862static DEVICE_ATTR(comp_algorithm, S_IRUGO | S_IWUSR, 962static DEVICE_ATTR(comp_algorithm, S_IRUGO | S_IWUSR,
@@ -885,6 +985,8 @@ static struct attribute *zram_disk_attrs[] = {
885 &dev_attr_orig_data_size.attr, 985 &dev_attr_orig_data_size.attr,
886 &dev_attr_compr_data_size.attr, 986 &dev_attr_compr_data_size.attr,
887 &dev_attr_mem_used_total.attr, 987 &dev_attr_mem_used_total.attr,
988 &dev_attr_mem_limit.attr,
989 &dev_attr_mem_used_max.attr,
888 &dev_attr_max_comp_streams.attr, 990 &dev_attr_max_comp_streams.attr,
889 &dev_attr_comp_algorithm.attr, 991 &dev_attr_comp_algorithm.attr,
890 NULL, 992 NULL,
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index e0f725c87cc6..c6ee271317f5 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -90,6 +90,7 @@ struct zram_stats {
90 atomic64_t notify_free; /* no. of swap slot free notifications */ 90 atomic64_t notify_free; /* no. of swap slot free notifications */
91 atomic64_t zero_pages; /* no. of zero filled pages */ 91 atomic64_t zero_pages; /* no. of zero filled pages */
92 atomic64_t pages_stored; /* no. of pages currently stored */ 92 atomic64_t pages_stored; /* no. of pages currently stored */
93 atomic_long_t max_used_pages; /* no. of maximum pages stored */
93}; 94};
94 95
95struct zram_meta { 96struct zram_meta {
@@ -112,6 +113,11 @@ struct zram {
112 u64 disksize; /* bytes */ 113 u64 disksize; /* bytes */
113 int max_comp_streams; 114 int max_comp_streams;
114 struct zram_stats stats; 115 struct zram_stats stats;
116 /*
117 * the number of pages zram can consume for storing compressed data
118 */
119 unsigned long limit_pages;
120
115 char compressor[10]; 121 char compressor[10];
116}; 122};
117#endif 123#endif
diff --git a/drivers/firmware/memmap.c b/drivers/firmware/memmap.c
index 79f18e6d9c4f..cc016c615c19 100644
--- a/drivers/firmware/memmap.c
+++ b/drivers/firmware/memmap.c
@@ -184,6 +184,9 @@ static int add_sysfs_fw_map_entry(struct firmware_map_entry *entry)
184 static int map_entries_nr; 184 static int map_entries_nr;
185 static struct kset *mmap_kset; 185 static struct kset *mmap_kset;
186 186
187 if (entry->kobj.state_in_sysfs)
188 return -EEXIST;
189
187 if (!mmap_kset) { 190 if (!mmap_kset) {
188 mmap_kset = kset_create_and_add("memmap", NULL, firmware_kobj); 191 mmap_kset = kset_create_and_add("memmap", NULL, firmware_kobj);
189 if (!mmap_kset) 192 if (!mmap_kset)
diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
index c6683f2e396c..00b228638274 100644
--- a/drivers/virtio/Kconfig
+++ b/drivers/virtio/Kconfig
@@ -25,6 +25,7 @@ config VIRTIO_PCI
25config VIRTIO_BALLOON 25config VIRTIO_BALLOON
26 tristate "Virtio balloon driver" 26 tristate "Virtio balloon driver"
27 depends on VIRTIO 27 depends on VIRTIO
28 select MEMORY_BALLOON
28 ---help--- 29 ---help---
29 This driver supports increasing and decreasing the amount 30 This driver supports increasing and decreasing the amount
30 of memory within a KVM guest. 31 of memory within a KVM guest.
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 25ebe8eecdb7..f893148a107b 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -59,7 +59,7 @@ struct virtio_balloon
59 * Each page on this list adds VIRTIO_BALLOON_PAGES_PER_PAGE 59 * Each page on this list adds VIRTIO_BALLOON_PAGES_PER_PAGE
60 * to num_pages above. 60 * to num_pages above.
61 */ 61 */
62 struct balloon_dev_info *vb_dev_info; 62 struct balloon_dev_info vb_dev_info;
63 63
64 /* Synchronize access/update to this struct virtio_balloon elements */ 64 /* Synchronize access/update to this struct virtio_balloon elements */
65 struct mutex balloon_lock; 65 struct mutex balloon_lock;
@@ -127,7 +127,7 @@ static void set_page_pfns(u32 pfns[], struct page *page)
127 127
128static void fill_balloon(struct virtio_balloon *vb, size_t num) 128static void fill_balloon(struct virtio_balloon *vb, size_t num)
129{ 129{
130 struct balloon_dev_info *vb_dev_info = vb->vb_dev_info; 130 struct balloon_dev_info *vb_dev_info = &vb->vb_dev_info;
131 131
132 /* We can only do one array worth at a time. */ 132 /* We can only do one array worth at a time. */
133 num = min(num, ARRAY_SIZE(vb->pfns)); 133 num = min(num, ARRAY_SIZE(vb->pfns));
@@ -163,15 +163,15 @@ static void release_pages_by_pfn(const u32 pfns[], unsigned int num)
163 /* Find pfns pointing at start of each page, get pages and free them. */ 163 /* Find pfns pointing at start of each page, get pages and free them. */
164 for (i = 0; i < num; i += VIRTIO_BALLOON_PAGES_PER_PAGE) { 164 for (i = 0; i < num; i += VIRTIO_BALLOON_PAGES_PER_PAGE) {
165 struct page *page = balloon_pfn_to_page(pfns[i]); 165 struct page *page = balloon_pfn_to_page(pfns[i]);
166 balloon_page_free(page);
167 adjust_managed_page_count(page, 1); 166 adjust_managed_page_count(page, 1);
167 put_page(page); /* balloon reference */
168 } 168 }
169} 169}
170 170
171static void leak_balloon(struct virtio_balloon *vb, size_t num) 171static void leak_balloon(struct virtio_balloon *vb, size_t num)
172{ 172{
173 struct page *page; 173 struct page *page;
174 struct balloon_dev_info *vb_dev_info = vb->vb_dev_info; 174 struct balloon_dev_info *vb_dev_info = &vb->vb_dev_info;
175 175
176 /* We can only do one array worth at a time. */ 176 /* We can only do one array worth at a time. */
177 num = min(num, ARRAY_SIZE(vb->pfns)); 177 num = min(num, ARRAY_SIZE(vb->pfns));
@@ -353,12 +353,11 @@ static int init_vqs(struct virtio_balloon *vb)
353 return 0; 353 return 0;
354} 354}
355 355
356static const struct address_space_operations virtio_balloon_aops;
357#ifdef CONFIG_BALLOON_COMPACTION 356#ifdef CONFIG_BALLOON_COMPACTION
358/* 357/*
359 * virtballoon_migratepage - perform the balloon page migration on behalf of 358 * virtballoon_migratepage - perform the balloon page migration on behalf of
360 * a compaction thread. (called under page lock)                        359 * a compaction thread. (called under page lock)
361 * @mapping: the page->mapping which will be assigned to the new migrated page. 360 * @vb_dev_info: the balloon device
362 * @newpage: page that will replace the isolated page after migration finishes. 361 * @newpage: page that will replace the isolated page after migration finishes.
363 * @page : the isolated (old) page that is about to be migrated to newpage. 362 * @page : the isolated (old) page that is about to be migrated to newpage.
364 * @mode : compaction mode -- not used for balloon page migration. 363 * @mode : compaction mode -- not used for balloon page migration.
@@ -373,17 +372,13 @@ static const struct address_space_operations virtio_balloon_aops;
373 * This function performs the balloon page migration task.              372 * This function performs the balloon page migration task.
374 * Called through balloon_mapping->a_ops->migratepage 373 * Called through balloon_mapping->a_ops->migratepage
375 */ 374 */
376static int virtballoon_migratepage(struct address_space *mapping, 375static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info,
377 struct page *newpage, struct page *page, enum migrate_mode mode) 376 struct page *newpage, struct page *page, enum migrate_mode mode)
378{ 377{
379 struct balloon_dev_info *vb_dev_info = balloon_page_device(page); 378 struct virtio_balloon *vb = container_of(vb_dev_info,
380 struct virtio_balloon *vb; 379 struct virtio_balloon, vb_dev_info);
381 unsigned long flags; 380 unsigned long flags;
382 381
383 BUG_ON(!vb_dev_info);
384
385 vb = vb_dev_info->balloon_device;
386
387 /* 382 /*
388 * In order to avoid lock contention while migrating pages concurrently 383 * In order to avoid lock contention while migrating pages concurrently
389 * to leak_balloon() or fill_balloon() we just give up the balloon_lock 384 * to leak_balloon() or fill_balloon() we just give up the balloon_lock
@@ -395,21 +390,19 @@ static int virtballoon_migratepage(struct address_space *mapping,
395 if (!mutex_trylock(&vb->balloon_lock)) 390 if (!mutex_trylock(&vb->balloon_lock))
396 return -EAGAIN; 391 return -EAGAIN;
397 392
393 get_page(newpage); /* balloon reference */
394
398 /* balloon's page migration 1st step -- inflate "newpage" */ 395 /* balloon's page migration 1st step -- inflate "newpage" */
399 spin_lock_irqsave(&vb_dev_info->pages_lock, flags); 396 spin_lock_irqsave(&vb_dev_info->pages_lock, flags);
400 balloon_page_insert(newpage, mapping, &vb_dev_info->pages); 397 balloon_page_insert(vb_dev_info, newpage);
401 vb_dev_info->isolated_pages--; 398 vb_dev_info->isolated_pages--;
399 __count_vm_event(BALLOON_MIGRATE);
402 spin_unlock_irqrestore(&vb_dev_info->pages_lock, flags); 400 spin_unlock_irqrestore(&vb_dev_info->pages_lock, flags);
403 vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE; 401 vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE;
404 set_page_pfns(vb->pfns, newpage); 402 set_page_pfns(vb->pfns, newpage);
405 tell_host(vb, vb->inflate_vq); 403 tell_host(vb, vb->inflate_vq);
406 404
407 /* 405 /* balloon's page migration 2nd step -- deflate "page" */
408 * balloon's page migration 2nd step -- deflate "page"
409 *
410 * It's safe to delete page->lru here because this page is at
411 * an isolated migration list, and this step is expected to happen here
412 */
413 balloon_page_delete(page); 406 balloon_page_delete(page);
414 vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE; 407 vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE;
415 set_page_pfns(vb->pfns, page); 408 set_page_pfns(vb->pfns, page);
@@ -417,20 +410,15 @@ static int virtballoon_migratepage(struct address_space *mapping,
417 410
418 mutex_unlock(&vb->balloon_lock); 411 mutex_unlock(&vb->balloon_lock);
419 412
420 return MIGRATEPAGE_BALLOON_SUCCESS; 413 put_page(page); /* balloon reference */
421}
422 414
423/* define the balloon_mapping->a_ops callback to allow balloon page migration */ 415 return MIGRATEPAGE_SUCCESS;
424static const struct address_space_operations virtio_balloon_aops = { 416}
425 .migratepage = virtballoon_migratepage,
426};
427#endif /* CONFIG_BALLOON_COMPACTION */ 417#endif /* CONFIG_BALLOON_COMPACTION */
428 418
429static int virtballoon_probe(struct virtio_device *vdev) 419static int virtballoon_probe(struct virtio_device *vdev)
430{ 420{
431 struct virtio_balloon *vb; 421 struct virtio_balloon *vb;
432 struct address_space *vb_mapping;
433 struct balloon_dev_info *vb_devinfo;
434 int err; 422 int err;
435 423
436 vdev->priv = vb = kmalloc(sizeof(*vb), GFP_KERNEL); 424 vdev->priv = vb = kmalloc(sizeof(*vb), GFP_KERNEL);
@@ -446,30 +434,14 @@ static int virtballoon_probe(struct virtio_device *vdev)
446 vb->vdev = vdev; 434 vb->vdev = vdev;
447 vb->need_stats_update = 0; 435 vb->need_stats_update = 0;
448 436
449 vb_devinfo = balloon_devinfo_alloc(vb); 437 balloon_devinfo_init(&vb->vb_dev_info);
450 if (IS_ERR(vb_devinfo)) { 438#ifdef CONFIG_BALLOON_COMPACTION
451 err = PTR_ERR(vb_devinfo); 439 vb->vb_dev_info.migratepage = virtballoon_migratepage;
452 goto out_free_vb; 440#endif
453 }
454
455 vb_mapping = balloon_mapping_alloc(vb_devinfo,
456 (balloon_compaction_check()) ?
457 &virtio_balloon_aops : NULL);
458 if (IS_ERR(vb_mapping)) {
459 /*
460 * IS_ERR(vb_mapping) && PTR_ERR(vb_mapping) == -EOPNOTSUPP
461 * This means !CONFIG_BALLOON_COMPACTION, otherwise we get off.
462 */
463 err = PTR_ERR(vb_mapping);
464 if (err != -EOPNOTSUPP)
465 goto out_free_vb_devinfo;
466 }
467
468 vb->vb_dev_info = vb_devinfo;
469 441
470 err = init_vqs(vb); 442 err = init_vqs(vb);
471 if (err) 443 if (err)
472 goto out_free_vb_mapping; 444 goto out_free_vb;
473 445
474 vb->thread = kthread_run(balloon, vb, "vballoon"); 446 vb->thread = kthread_run(balloon, vb, "vballoon");
475 if (IS_ERR(vb->thread)) { 447 if (IS_ERR(vb->thread)) {
@@ -481,10 +453,6 @@ static int virtballoon_probe(struct virtio_device *vdev)
481 453
482out_del_vqs: 454out_del_vqs:
483 vdev->config->del_vqs(vdev); 455 vdev->config->del_vqs(vdev);
484out_free_vb_mapping:
485 balloon_mapping_free(vb_mapping);
486out_free_vb_devinfo:
487 balloon_devinfo_free(vb_devinfo);
488out_free_vb: 456out_free_vb:
489 kfree(vb); 457 kfree(vb);
490out: 458out:
@@ -510,8 +478,6 @@ static void virtballoon_remove(struct virtio_device *vdev)
510 478
511 kthread_stop(vb->thread); 479 kthread_stop(vb->thread);
512 remove_common(vb); 480 remove_common(vb);
513 balloon_mapping_free(vb->vb_dev_info->mapping);
514 balloon_devinfo_free(vb->vb_dev_info);
515 kfree(vb); 481 kfree(vb);
516} 482}
517 483
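The balloon rework drops the separate back-pointer and address_space and instead embeds balloon_dev_info in struct virtio_balloon, recovering the outer structure with container_of(). A hypothetical userspace sketch of that embed-and-recover pattern (simplified macro, made-up fields):

#include <stdio.h>
#include <stddef.h>

/* simplified version of the kernel's container_of() */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct dev_info {
	long pages;
};

struct balloon {
	int id;
	struct dev_info vb_dev_info;	/* embedded, no back-pointer needed */
};

int main(void)
{
	struct balloon b = { .id = 7, .vb_dev_info = { .pages = 3 } };
	struct dev_info *di = &b.vb_dev_info;
	struct balloon *back = container_of(di, struct balloon, vb_dev_info);

	printf("recovered balloon id %d (pages %ld)\n",
	       back->id, back->vb_dev_info.pages);
	return 0;
}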
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 6d7274619bf9..e2f3ad0879ce 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -304,6 +304,12 @@ static int blkdev_readpage(struct file * file, struct page * page)
304 return block_read_full_page(page, blkdev_get_block); 304 return block_read_full_page(page, blkdev_get_block);
305} 305}
306 306
307static int blkdev_readpages(struct file *file, struct address_space *mapping,
308 struct list_head *pages, unsigned nr_pages)
309{
310 return mpage_readpages(mapping, pages, nr_pages, blkdev_get_block);
311}
312
307static int blkdev_write_begin(struct file *file, struct address_space *mapping, 313static int blkdev_write_begin(struct file *file, struct address_space *mapping,
308 loff_t pos, unsigned len, unsigned flags, 314 loff_t pos, unsigned len, unsigned flags,
309 struct page **pagep, void **fsdata) 315 struct page **pagep, void **fsdata)
@@ -1622,6 +1628,7 @@ static int blkdev_releasepage(struct page *page, gfp_t wait)
1622 1628
1623static const struct address_space_operations def_blk_aops = { 1629static const struct address_space_operations def_blk_aops = {
1624 .readpage = blkdev_readpage, 1630 .readpage = blkdev_readpage,
1631 .readpages = blkdev_readpages,
1625 .writepage = blkdev_writepage, 1632 .writepage = blkdev_writepage,
1626 .write_begin = blkdev_write_begin, 1633 .write_begin = blkdev_write_begin,
1627 .write_end = blkdev_write_end, 1634 .write_end = blkdev_write_end,
diff --git a/fs/buffer.c b/fs/buffer.c
index 3588a80854b2..44c14a87750e 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1253,7 +1253,7 @@ static struct buffer_head *__bread_slow(struct buffer_head *bh)
1253 * a local interrupt disable for that. 1253 * a local interrupt disable for that.
1254 */ 1254 */
1255 1255
1256#define BH_LRU_SIZE 8 1256#define BH_LRU_SIZE 16
1257 1257
1258struct bh_lru { 1258struct bh_lru {
1259 struct buffer_head *bhs[BH_LRU_SIZE]; 1259 struct buffer_head *bhs[BH_LRU_SIZE];
@@ -2956,7 +2956,7 @@ static void end_bio_bh_io_sync(struct bio *bio, int err)
2956 2956
2957/* 2957/*
2958 * This allows us to do IO even on the odd last sectors 2958 * This allows us to do IO even on the odd last sectors
2959 * of a device, even if the bh block size is some multiple 2959 * of a device, even if the block size is some multiple
2960 * of the physical sector size. 2960 * of the physical sector size.
2961 * 2961 *
2962 * We'll just truncate the bio to the size of the device, 2962 * We'll just truncate the bio to the size of the device,
@@ -2966,10 +2966,11 @@ static void end_bio_bh_io_sync(struct bio *bio, int err)
2966 * errors, this only handles the "we need to be able to 2966 * errors, this only handles the "we need to be able to
2967 * do IO at the final sector" case. 2967 * do IO at the final sector" case.
2968 */ 2968 */
2969static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh) 2969void guard_bio_eod(int rw, struct bio *bio)
2970{ 2970{
2971 sector_t maxsector; 2971 sector_t maxsector;
2972 unsigned bytes; 2972 struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
2973 unsigned truncated_bytes;
2973 2974
2974 maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9; 2975 maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9;
2975 if (!maxsector) 2976 if (!maxsector)
@@ -2984,23 +2985,20 @@ static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh)
2984 return; 2985 return;
2985 2986
2986 maxsector -= bio->bi_iter.bi_sector; 2987 maxsector -= bio->bi_iter.bi_sector;
2987 bytes = bio->bi_iter.bi_size; 2988 if (likely((bio->bi_iter.bi_size >> 9) <= maxsector))
2988 if (likely((bytes >> 9) <= maxsector))
2989 return; 2989 return;
2990 2990
2991 /* Uhhuh. We've got a bh that straddles the device size! */ 2991 /* Uhhuh. We've got a bio that straddles the device size! */
2992 bytes = maxsector << 9; 2992 truncated_bytes = bio->bi_iter.bi_size - (maxsector << 9);
2993 2993
2994 /* Truncate the bio.. */ 2994 /* Truncate the bio.. */
2995 bio->bi_iter.bi_size = bytes; 2995 bio->bi_iter.bi_size -= truncated_bytes;
2996 bio->bi_io_vec[0].bv_len = bytes; 2996 bvec->bv_len -= truncated_bytes;
2997 2997
2998 /* ..and clear the end of the buffer for reads */ 2998 /* ..and clear the end of the buffer for reads */
2999 if ((rw & RW_MASK) == READ) { 2999 if ((rw & RW_MASK) == READ) {
3000 void *kaddr = kmap_atomic(bh->b_page); 3000 zero_user(bvec->bv_page, bvec->bv_offset + bvec->bv_len,
3001 memset(kaddr + bh_offset(bh) + bytes, 0, bh->b_size - bytes); 3001 truncated_bytes);
3002 kunmap_atomic(kaddr);
3003 flush_dcache_page(bh->b_page);
3004 } 3002 }
3005} 3003}
3006 3004
@@ -3041,7 +3039,7 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
3041 bio->bi_flags |= bio_flags; 3039 bio->bi_flags |= bio_flags;
3042 3040
3043 /* Take care of bh's that straddle the end of the device */ 3041 /* Take care of bh's that straddle the end of the device */
3044 guard_bh_eod(rw, bio, bh); 3042 guard_bio_eod(rw, bio);
3045 3043
3046 if (buffer_meta(bh)) 3044 if (buffer_meta(bh))
3047 rw |= REQ_META; 3045 rw |= REQ_META;
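guard_bio_eod() clips a bio that runs past the end of the device instead of failing the whole I/O. The arithmetic is easy to check in isolation; the device and bio geometry below are invented for illustration:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t dev_sectors = 10240;	/* hypothetical device size in 512-byte sectors */
	uint64_t bi_sector = 10236;	/* bio starts 4 sectors before the end */
	uint32_t bi_size = 4096;	/* but carries 8 sectors of data */

	uint64_t maxsector = dev_sectors - bi_sector;

	if ((bi_size >> 9) > maxsector) {
		uint32_t truncated_bytes = bi_size - (uint32_t)(maxsector << 9);

		bi_size -= truncated_bytes;	/* the tail would be zeroed for reads */
		printf("truncated %u bytes, bio now %u bytes (%u sectors)\n",
		       truncated_bytes, bi_size, bi_size >> 9);
	}
	return 0;
}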
diff --git a/fs/internal.h b/fs/internal.h
index e325b4f9c799..b2623200107b 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -35,6 +35,11 @@ static inline int __sync_blockdev(struct block_device *bdev, int wait)
35#endif 35#endif
36 36
37/* 37/*
38 * buffer.c
39 */
40extern void guard_bio_eod(int rw, struct bio *bio);
41
42/*
38 * char_dev.c 43 * char_dev.c
39 */ 44 */
40extern void __init chrdev_init(void); 45extern void __init chrdev_init(void);
diff --git a/fs/mpage.c b/fs/mpage.c
index 5f9ed622274f..3e79220babac 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -28,6 +28,7 @@
28#include <linux/backing-dev.h> 28#include <linux/backing-dev.h>
29#include <linux/pagevec.h> 29#include <linux/pagevec.h>
30#include <linux/cleancache.h> 30#include <linux/cleancache.h>
31#include "internal.h"
31 32
32/* 33/*
33 * I/O completion handler for multipage BIOs. 34 * I/O completion handler for multipage BIOs.
@@ -57,6 +58,7 @@ static void mpage_end_io(struct bio *bio, int err)
57static struct bio *mpage_bio_submit(int rw, struct bio *bio) 58static struct bio *mpage_bio_submit(int rw, struct bio *bio)
58{ 59{
59 bio->bi_end_io = mpage_end_io; 60 bio->bi_end_io = mpage_end_io;
61 guard_bio_eod(rw, bio);
60 submit_bio(rw, bio); 62 submit_bio(rw, bio);
61 return NULL; 63 return NULL;
62} 64}
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index b13992a41bd9..c991616acca9 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -78,7 +78,7 @@ static int create_fd(struct fsnotify_group *group,
78 78
79 pr_debug("%s: group=%p event=%p\n", __func__, group, event); 79 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
80 80
81 client_fd = get_unused_fd(); 81 client_fd = get_unused_fd_flags(group->fanotify_data.f_flags);
82 if (client_fd < 0) 82 if (client_fd < 0)
83 return client_fd; 83 return client_fd;
84 84
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 85e7d2b431d9..9c0898c4cfe1 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -23,9 +23,6 @@ extern int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
23 struct fsnotify_group *group, struct vfsmount *mnt, 23 struct fsnotify_group *group, struct vfsmount *mnt,
24 int allow_dups); 24 int allow_dups);
25 25
26/* final kfree of a group */
27extern void fsnotify_final_destroy_group(struct fsnotify_group *group);
28
29/* vfsmount specific destruction of a mark */ 26/* vfsmount specific destruction of a mark */
30extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark); 27extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark);
31/* inode specific destruction of a mark */ 28/* inode specific destruction of a mark */
diff --git a/fs/notify/group.c b/fs/notify/group.c
index ad1995980456..d16b62cb2854 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -31,7 +31,7 @@
31/* 31/*
32 * Final freeing of a group 32 * Final freeing of a group
33 */ 33 */
34void fsnotify_final_destroy_group(struct fsnotify_group *group) 34static void fsnotify_final_destroy_group(struct fsnotify_group *group)
35{ 35{
36 if (group->ops->free_group_priv) 36 if (group->ops->free_group_priv)
37 group->ops->free_group_priv(group); 37 group->ops->free_group_priv(group);
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 0f88bc0b4e6c..7d888d77d59a 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -165,8 +165,10 @@ static void inotify_free_group_priv(struct fsnotify_group *group)
165 /* ideally the idr is empty and we won't hit the BUG in the callback */ 165 /* ideally the idr is empty and we won't hit the BUG in the callback */
166 idr_for_each(&group->inotify_data.idr, idr_callback, group); 166 idr_for_each(&group->inotify_data.idr, idr_callback, group);
167 idr_destroy(&group->inotify_data.idr); 167 idr_destroy(&group->inotify_data.idr);
168 atomic_dec(&group->inotify_data.user->inotify_devs); 168 if (group->inotify_data.user) {
169 free_uid(group->inotify_data.user); 169 atomic_dec(&group->inotify_data.user->inotify_devs);
170 free_uid(group->inotify_data.user);
171 }
170} 172}
171 173
172static void inotify_free_event(struct fsnotify_event *fsn_event) 174static void inotify_free_event(struct fsnotify_event *fsn_event)
diff --git a/fs/ntfs/debug.c b/fs/ntfs/debug.c
index dd6103cc93c1..825a54e8f490 100644
--- a/fs/ntfs/debug.c
+++ b/fs/ntfs/debug.c
@@ -112,7 +112,7 @@ void __ntfs_error(const char *function, const struct super_block *sb,
112/* If 1, output debug messages, and if 0, don't. */ 112/* If 1, output debug messages, and if 0, don't. */
113int debug_msgs = 0; 113int debug_msgs = 0;
114 114
115void __ntfs_debug (const char *file, int line, const char *function, 115void __ntfs_debug(const char *file, int line, const char *function,
116 const char *fmt, ...) 116 const char *fmt, ...)
117{ 117{
118 struct va_format vaf; 118 struct va_format vaf;
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index f5ec1ce7a532..643faa44f22b 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * file.c - NTFS kernel file operations. Part of the Linux-NTFS project. 2 * file.c - NTFS kernel file operations. Part of the Linux-NTFS project.
3 * 3 *
4 * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc. 4 * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.
5 * 5 *
6 * This program/include file is free software; you can redistribute it and/or 6 * This program/include file is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as published 7 * modify it under the terms of the GNU General Public License as published
@@ -410,7 +410,8 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
410 BUG_ON(!nr_pages); 410 BUG_ON(!nr_pages);
411 err = nr = 0; 411 err = nr = 0;
412 do { 412 do {
413 pages[nr] = find_lock_page(mapping, index); 413 pages[nr] = find_get_page_flags(mapping, index, FGP_LOCK |
414 FGP_ACCESSED);
414 if (!pages[nr]) { 415 if (!pages[nr]) {
415 if (!*cached_page) { 416 if (!*cached_page) {
416 *cached_page = page_cache_alloc(mapping); 417 *cached_page = page_cache_alloc(mapping);
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 6c3296e546c3..9e1e112074fb 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -3208,7 +3208,7 @@ static void __exit exit_ntfs_fs(void)
3208} 3208}
3209 3209
3210MODULE_AUTHOR("Anton Altaparmakov <anton@tuxera.com>"); 3210MODULE_AUTHOR("Anton Altaparmakov <anton@tuxera.com>");
3211MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc."); 3211MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.");
3212MODULE_VERSION(NTFS_VERSION); 3212MODULE_VERSION(NTFS_VERSION);
3213MODULE_LICENSE("GPL"); 3213MODULE_LICENSE("GPL");
3214#ifdef DEBUG 3214#ifdef DEBUG
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 4a231a166cf8..1ef547e49373 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1481,8 +1481,16 @@ static int ocfs2_write_begin_inline(struct address_space *mapping,
1481 handle_t *handle; 1481 handle_t *handle;
1482 struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; 1482 struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
1483 1483
1484 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1485 if (IS_ERR(handle)) {
1486 ret = PTR_ERR(handle);
1487 mlog_errno(ret);
1488 goto out;
1489 }
1490
1484 page = find_or_create_page(mapping, 0, GFP_NOFS); 1491 page = find_or_create_page(mapping, 0, GFP_NOFS);
1485 if (!page) { 1492 if (!page) {
1493 ocfs2_commit_trans(osb, handle);
1486 ret = -ENOMEM; 1494 ret = -ENOMEM;
1487 mlog_errno(ret); 1495 mlog_errno(ret);
1488 goto out; 1496 goto out;
@@ -1494,13 +1502,6 @@ static int ocfs2_write_begin_inline(struct address_space *mapping,
1494 wc->w_pages[0] = wc->w_target_page = page; 1502 wc->w_pages[0] = wc->w_target_page = page;
1495 wc->w_num_pages = 1; 1503 wc->w_num_pages = 1;
1496 1504
1497 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1498 if (IS_ERR(handle)) {
1499 ret = PTR_ERR(handle);
1500 mlog_errno(ret);
1501 goto out;
1502 }
1503
1504 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh, 1505 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
1505 OCFS2_JOURNAL_ACCESS_WRITE); 1506 OCFS2_JOURNAL_ACCESS_WRITE);
1506 if (ret) { 1507 if (ret) {
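
The ocfs2_write_begin_inline() hunk above starts the journal transaction before taking the page lock and, if the page allocation fails, commits the handle before bailing out; a matching change to ocfs2_write_zero_page() appears later in this series. Keeping a single order between the two resources (transaction first, page lock second, commit after the page is released) avoids lock-order inversions between the page lock and the journal. A rough sketch of the shape both functions converge on (do_update() is a placeholder for the real journalled work, and error handling is trimmed):

	/* Hedged sketch, assuming the usual ocfs2 headers; do_update() is invented. */
	static int transaction_then_page(struct inode *inode, struct buffer_head *di_bh)
	{
		handle_t *handle;
		struct page *page;
		int ret;

		handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb),
					   OCFS2_INODE_UPDATE_CREDITS);	/* 1. journal first */
		if (IS_ERR(handle))
			return PTR_ERR(handle);

		page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS); /* 2. then page lock */
		if (!page) {
			ret = -ENOMEM;
			goto out_commit;
		}

		ret = do_update(handle, page, di_bh);	/* placeholder for the journalled work */

		unlock_page(page);			/* 3. drop the page lock */
		page_cache_release(page);
	out_commit:
		ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);	/* 4. commit last */
		return ret;
	}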
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 73039295d0d1..d13385448168 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -2572,6 +2572,25 @@ int o2hb_check_node_heartbeating(u8 node_num)
2572} 2572}
2573EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating); 2573EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating);
2574 2574
2575int o2hb_check_node_heartbeating_no_sem(u8 node_num)
2576{
2577 unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
2578 unsigned long flags;
2579
2580 spin_lock_irqsave(&o2hb_live_lock, flags);
2581 o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map));
2582 spin_unlock_irqrestore(&o2hb_live_lock, flags);
2583 if (!test_bit(node_num, testing_map)) {
2584 mlog(ML_HEARTBEAT,
2585 "node (%u) does not have heartbeating enabled.\n",
2586 node_num);
2587 return 0;
2588 }
2589
2590 return 1;
2591}
2592EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating_no_sem);
2593
2575int o2hb_check_node_heartbeating_from_callback(u8 node_num) 2594int o2hb_check_node_heartbeating_from_callback(u8 node_num)
2576{ 2595{
2577 unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; 2596 unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
index 00ad8e8fea51..3ef5137dc362 100644
--- a/fs/ocfs2/cluster/heartbeat.h
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -80,6 +80,7 @@ void o2hb_fill_node_map(unsigned long *map,
80void o2hb_exit(void); 80void o2hb_exit(void);
81int o2hb_init(void); 81int o2hb_init(void);
82int o2hb_check_node_heartbeating(u8 node_num); 82int o2hb_check_node_heartbeating(u8 node_num);
83int o2hb_check_node_heartbeating_no_sem(u8 node_num);
83int o2hb_check_node_heartbeating_from_callback(u8 node_num); 84int o2hb_check_node_heartbeating_from_callback(u8 node_num);
84int o2hb_check_local_node_heartbeating(void); 85int o2hb_check_local_node_heartbeating(void);
85void o2hb_stop_all_regions(void); 86void o2hb_stop_all_regions(void);
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index 73ba81928bce..27d1242c8383 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -185,29 +185,13 @@ static const struct seq_operations nst_seq_ops = {
185static int nst_fop_open(struct inode *inode, struct file *file) 185static int nst_fop_open(struct inode *inode, struct file *file)
186{ 186{
187 struct o2net_send_tracking *dummy_nst; 187 struct o2net_send_tracking *dummy_nst;
188 struct seq_file *seq;
189 int ret;
190 188
191 dummy_nst = kmalloc(sizeof(struct o2net_send_tracking), GFP_KERNEL); 189 dummy_nst = __seq_open_private(file, &nst_seq_ops, sizeof(*dummy_nst));
192 if (dummy_nst == NULL) { 190 if (!dummy_nst)
193 ret = -ENOMEM; 191 return -ENOMEM;
194 goto out;
195 }
196 dummy_nst->st_task = NULL;
197
198 ret = seq_open(file, &nst_seq_ops);
199 if (ret)
200 goto out;
201
202 seq = file->private_data;
203 seq->private = dummy_nst;
204 o2net_debug_add_nst(dummy_nst); 192 o2net_debug_add_nst(dummy_nst);
205 193
206 dummy_nst = NULL; 194 return 0;
207
208out:
209 kfree(dummy_nst);
210 return ret;
211} 195}
212 196
213static int nst_fop_release(struct inode *inode, struct file *file) 197static int nst_fop_release(struct inode *inode, struct file *file)
@@ -412,33 +396,27 @@ static const struct seq_operations sc_seq_ops = {
412 .show = sc_seq_show, 396 .show = sc_seq_show,
413}; 397};
414 398
415static int sc_common_open(struct file *file, struct o2net_sock_debug *sd) 399static int sc_common_open(struct file *file, int ctxt)
416{ 400{
401 struct o2net_sock_debug *sd;
417 struct o2net_sock_container *dummy_sc; 402 struct o2net_sock_container *dummy_sc;
418 struct seq_file *seq;
419 int ret;
420 403
421 dummy_sc = kmalloc(sizeof(struct o2net_sock_container), GFP_KERNEL); 404 dummy_sc = kzalloc(sizeof(*dummy_sc), GFP_KERNEL);
422 if (dummy_sc == NULL) { 405 if (!dummy_sc)
423 ret = -ENOMEM; 406 return -ENOMEM;
424 goto out;
425 }
426 dummy_sc->sc_page = NULL;
427 407
428 ret = seq_open(file, &sc_seq_ops); 408 sd = __seq_open_private(file, &sc_seq_ops, sizeof(*sd));
429 if (ret) 409 if (!sd) {
430 goto out; 410 kfree(dummy_sc);
411 return -ENOMEM;
412 }
431 413
432 seq = file->private_data; 414 sd->dbg_ctxt = ctxt;
433 seq->private = sd;
434 sd->dbg_sock = dummy_sc; 415 sd->dbg_sock = dummy_sc;
435 o2net_debug_add_sc(dummy_sc);
436 416
437 dummy_sc = NULL; 417 o2net_debug_add_sc(dummy_sc);
438 418
439out: 419 return 0;
440 kfree(dummy_sc);
441 return ret;
442} 420}
443 421
444static int sc_fop_release(struct inode *inode, struct file *file) 422static int sc_fop_release(struct inode *inode, struct file *file)
@@ -453,16 +431,7 @@ static int sc_fop_release(struct inode *inode, struct file *file)
453 431
454static int stats_fop_open(struct inode *inode, struct file *file) 432static int stats_fop_open(struct inode *inode, struct file *file)
455{ 433{
456 struct o2net_sock_debug *sd; 434 return sc_common_open(file, SHOW_SOCK_STATS);
457
458 sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL);
459 if (sd == NULL)
460 return -ENOMEM;
461
462 sd->dbg_ctxt = SHOW_SOCK_STATS;
463 sd->dbg_sock = NULL;
464
465 return sc_common_open(file, sd);
466} 435}
467 436
468static const struct file_operations stats_seq_fops = { 437static const struct file_operations stats_seq_fops = {
@@ -474,16 +443,7 @@ static const struct file_operations stats_seq_fops = {
474 443
475static int sc_fop_open(struct inode *inode, struct file *file) 444static int sc_fop_open(struct inode *inode, struct file *file)
476{ 445{
477 struct o2net_sock_debug *sd; 446 return sc_common_open(file, SHOW_SOCK_CONTAINERS);
478
479 sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL);
480 if (sd == NULL)
481 return -ENOMEM;
482
483 sd->dbg_ctxt = SHOW_SOCK_CONTAINERS;
484 sd->dbg_sock = NULL;
485
486 return sc_common_open(file, sd);
487} 447}
488 448
489static const struct file_operations sc_seq_fops = { 449static const struct file_operations sc_seq_fops = {
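
The netdebug conversions above replace the open-coded kmalloc() + seq_open() + "seq->private = ..." sequence with __seq_open_private(), which allocates the per-open private area, opens the seq_file and wires the two together in one call; seq_release_private() later undoes both. A minimal sketch of the pairing, with made-up names (struct my_iter and my_seq_ops stand in for the caller's own types):

	#include <linux/fs.h>
	#include <linux/seq_file.h>

	/* Illustrative per-open iterator state; not from this patch. */
	struct my_iter {
		void *ctx;
	};

	static const struct seq_operations my_seq_ops;	/* .start/.next/.stop/.show defined elsewhere */

	static int my_debug_open(struct inode *inode, struct file *file)
	{
		struct my_iter *it;

		/* allocates 'it', calls seq_open() and sets seq->private = it */
		it = __seq_open_private(file, &my_seq_ops, sizeof(*it));
		if (!it)
			return -ENOMEM;		/* nothing to unwind */
		it->ctx = inode->i_private;	/* stash per-open state */
		return 0;
	}

	static int my_debug_release(struct inode *inode, struct file *file)
	{
		/* frees both the seq_file and the private area allocated at open */
		return seq_release_private(inode, file);
	}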
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index ea34952f9496..97de0fbd9f78 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -536,7 +536,7 @@ static void o2net_set_nn_state(struct o2net_node *nn,
536 if (nn->nn_persistent_error || nn->nn_sc_valid) 536 if (nn->nn_persistent_error || nn->nn_sc_valid)
537 wake_up(&nn->nn_sc_wq); 537 wake_up(&nn->nn_sc_wq);
538 538
539 if (!was_err && nn->nn_persistent_error) { 539 if (was_valid && !was_err && nn->nn_persistent_error) {
540 o2quo_conn_err(o2net_num_from_nn(nn)); 540 o2quo_conn_err(o2net_num_from_nn(nn));
541 queue_delayed_work(o2net_wq, &nn->nn_still_up, 541 queue_delayed_work(o2net_wq, &nn->nn_still_up,
542 msecs_to_jiffies(O2NET_QUORUM_DELAY_MS)); 542 msecs_to_jiffies(O2NET_QUORUM_DELAY_MS));
@@ -1601,7 +1601,15 @@ static void o2net_start_connect(struct work_struct *work)
1601 struct sockaddr_in myaddr = {0, }, remoteaddr = {0, }; 1601 struct sockaddr_in myaddr = {0, }, remoteaddr = {0, };
1602 int ret = 0, stop; 1602 int ret = 0, stop;
1603 unsigned int timeout; 1603 unsigned int timeout;
1604 unsigned int noio_flag;
1604 1605
1606 /*
1607 * sock_create allocates the sock with GFP_KERNEL. We must set
1608 * per-process flag PF_MEMALLOC_NOIO so that all allocations done
1609 * by this process are done as if GFP_NOIO was specified. So we
1610 * are not reentering filesystem while doing memory reclaim.
1611 */
1612 noio_flag = memalloc_noio_save();
1605 /* if we're greater we initiate tx, otherwise we accept */ 1613 /* if we're greater we initiate tx, otherwise we accept */
1606 if (o2nm_this_node() <= o2net_num_from_nn(nn)) 1614 if (o2nm_this_node() <= o2net_num_from_nn(nn))
1607 goto out; 1615 goto out;
@@ -1710,6 +1718,7 @@ out:
1710 if (mynode) 1718 if (mynode)
1711 o2nm_node_put(mynode); 1719 o2nm_node_put(mynode);
1712 1720
1721 memalloc_noio_restore(noio_flag);
1713 return; 1722 return;
1714} 1723}
1715 1724
@@ -1721,7 +1730,8 @@ static void o2net_connect_expired(struct work_struct *work)
1721 spin_lock(&nn->nn_lock); 1730 spin_lock(&nn->nn_lock);
1722 if (!nn->nn_sc_valid) { 1731 if (!nn->nn_sc_valid) {
1723 printk(KERN_NOTICE "o2net: No connection established with " 1732 printk(KERN_NOTICE "o2net: No connection established with "
1724 "node %u after %u.%u seconds, giving up.\n", 1733 "node %u after %u.%u seconds, check network and"
1734 " cluster configuration.\n",
1725 o2net_num_from_nn(nn), 1735 o2net_num_from_nn(nn),
1726 o2net_idle_timeout() / 1000, 1736 o2net_idle_timeout() / 1000,
1727 o2net_idle_timeout() % 1000); 1737 o2net_idle_timeout() % 1000);
@@ -1835,6 +1845,15 @@ static int o2net_accept_one(struct socket *sock, int *more)
1835 struct o2nm_node *local_node = NULL; 1845 struct o2nm_node *local_node = NULL;
1836 struct o2net_sock_container *sc = NULL; 1846 struct o2net_sock_container *sc = NULL;
1837 struct o2net_node *nn; 1847 struct o2net_node *nn;
1848 unsigned int noio_flag;
1849
1850 /*
1851 * sock_create_lite allocates the sock with GFP_KERNEL. We must set
1852 * per-process flag PF_MEMALLOC_NOIO so that all allocations done
1853 * by this process are done as if GFP_NOIO was specified. So we
1854 * are not reentering filesystem while doing memory reclaim.
1855 */
1856 noio_flag = memalloc_noio_save();
1838 1857
1839 BUG_ON(sock == NULL); 1858 BUG_ON(sock == NULL);
1840 *more = 0; 1859 *more = 0;
@@ -1951,6 +1970,8 @@ out:
1951 o2nm_node_put(local_node); 1970 o2nm_node_put(local_node);
1952 if (sc) 1971 if (sc)
1953 sc_put(sc); 1972 sc_put(sc);
1973
1974 memalloc_noio_restore(noio_flag);
1954 return ret; 1975 return ret;
1955} 1976}
1956 1977
@@ -2146,17 +2167,13 @@ int o2net_init(void)
2146 o2quo_init(); 2167 o2quo_init();
2147 2168
2148 if (o2net_debugfs_init()) 2169 if (o2net_debugfs_init())
2149 return -ENOMEM; 2170 goto out;
2150 2171
2151 o2net_hand = kzalloc(sizeof(struct o2net_handshake), GFP_KERNEL); 2172 o2net_hand = kzalloc(sizeof(struct o2net_handshake), GFP_KERNEL);
2152 o2net_keep_req = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL); 2173 o2net_keep_req = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL);
2153 o2net_keep_resp = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL); 2174 o2net_keep_resp = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL);
2154 if (!o2net_hand || !o2net_keep_req || !o2net_keep_resp) { 2175 if (!o2net_hand || !o2net_keep_req || !o2net_keep_resp)
2155 kfree(o2net_hand); 2176 goto out;
2156 kfree(o2net_keep_req);
2157 kfree(o2net_keep_resp);
2158 return -ENOMEM;
2159 }
2160 2177
2161 o2net_hand->protocol_version = cpu_to_be64(O2NET_PROTOCOL_VERSION); 2178 o2net_hand->protocol_version = cpu_to_be64(O2NET_PROTOCOL_VERSION);
2162 o2net_hand->connector_id = cpu_to_be64(1); 2179 o2net_hand->connector_id = cpu_to_be64(1);
@@ -2181,6 +2198,14 @@ int o2net_init(void)
2181 } 2198 }
2182 2199
2183 return 0; 2200 return 0;
2201
2202out:
2203 kfree(o2net_hand);
2204 kfree(o2net_keep_req);
2205 kfree(o2net_keep_resp);
2206
2207 o2quo_exit();
2208 return -ENOMEM;
2184} 2209}
2185 2210
2186void o2net_exit(void) 2211void o2net_exit(void)
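
The memalloc_noio_save()/memalloc_noio_restore() pairs added to o2net_start_connect() and o2net_accept_one() set the per-task PF_MEMALLOC_NOIO flag for the duration of the call, so even the GFP_KERNEL allocations buried inside sock_create()/sock_create_lite() behave as GFP_NOIO and direct reclaim cannot re-enter the filesystem while the cluster socket is being set up. The bracket is simply save, allocate, restore; a minimal sketch (setup_something() is a placeholder for any code that allocates internally, and the header location reflects kernels of this era):

	#include <linux/sched.h>	/* memalloc_noio_save()/memalloc_noio_restore() here in this era */

	/* Hedged sketch: keep reclaim out of the filesystem across an allocating region. */
	static int noio_bracketed_setup(int (*setup_something)(void))
	{
		unsigned int noio_flag;
		int err;

		noio_flag = memalloc_noio_save();	/* sets PF_MEMALLOC_NOIO for this task */
		err = setup_something();		/* GFP_KERNEL inside now acts as GFP_NOIO */
		memalloc_noio_restore(noio_flag);	/* restore the previous flag state */
		return err;
	}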
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 18f13c2e4a10..149eb556b8c6 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -647,41 +647,30 @@ static const struct seq_operations debug_lockres_ops = {
647static int debug_lockres_open(struct inode *inode, struct file *file) 647static int debug_lockres_open(struct inode *inode, struct file *file)
648{ 648{
649 struct dlm_ctxt *dlm = inode->i_private; 649 struct dlm_ctxt *dlm = inode->i_private;
650 int ret = -ENOMEM; 650 struct debug_lockres *dl;
651 struct seq_file *seq; 651 void *buf;
652 struct debug_lockres *dl = NULL;
653 652
654 dl = kzalloc(sizeof(struct debug_lockres), GFP_KERNEL); 653 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
655 if (!dl) { 654 if (!buf)
656 mlog_errno(ret);
657 goto bail; 655 goto bail;
658 }
659 656
660 dl->dl_len = PAGE_SIZE; 657 dl = __seq_open_private(file, &debug_lockres_ops, sizeof(*dl));
661 dl->dl_buf = kmalloc(dl->dl_len, GFP_KERNEL); 658 if (!dl)
662 if (!dl->dl_buf) { 659 goto bailfree;
663 mlog_errno(ret);
664 goto bail;
665 }
666 660
667 ret = seq_open(file, &debug_lockres_ops); 661 dl->dl_len = PAGE_SIZE;
668 if (ret) { 662 dl->dl_buf = buf;
669 mlog_errno(ret);
670 goto bail;
671 }
672
673 seq = file->private_data;
674 seq->private = dl;
675 663
676 dlm_grab(dlm); 664 dlm_grab(dlm);
677 dl->dl_ctxt = dlm; 665 dl->dl_ctxt = dlm;
678 666
679 return 0; 667 return 0;
668
669bailfree:
670 kfree(buf);
680bail: 671bail:
681 if (dl) 672 mlog_errno(-ENOMEM);
682 kfree(dl->dl_buf); 673 return -ENOMEM;
683 kfree(dl);
684 return ret;
685} 674}
686 675
687static int debug_lockres_release(struct inode *inode, struct file *file) 676static int debug_lockres_release(struct inode *inode, struct file *file)
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 3fcf205ee900..02d315fef432 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -839,7 +839,7 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data,
839 * to back off and try again. This gives heartbeat a chance 839 * to back off and try again. This gives heartbeat a chance
840 * to catch up. 840 * to catch up.
841 */ 841 */
842 if (!o2hb_check_node_heartbeating(query->node_idx)) { 842 if (!o2hb_check_node_heartbeating_no_sem(query->node_idx)) {
843 mlog(0, "node %u is not in our live map yet\n", 843 mlog(0, "node %u is not in our live map yet\n",
844 query->node_idx); 844 query->node_idx);
845 845
@@ -1975,24 +1975,22 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
1975 1975
1976 dlm = kzalloc(sizeof(*dlm), GFP_KERNEL); 1976 dlm = kzalloc(sizeof(*dlm), GFP_KERNEL);
1977 if (!dlm) { 1977 if (!dlm) {
1978 mlog_errno(-ENOMEM); 1978 ret = -ENOMEM;
1979 mlog_errno(ret);
1979 goto leave; 1980 goto leave;
1980 } 1981 }
1981 1982
1982 dlm->name = kstrdup(domain, GFP_KERNEL); 1983 dlm->name = kstrdup(domain, GFP_KERNEL);
1983 if (dlm->name == NULL) { 1984 if (dlm->name == NULL) {
1984 mlog_errno(-ENOMEM); 1985 ret = -ENOMEM;
1985 kfree(dlm); 1986 mlog_errno(ret);
1986 dlm = NULL;
1987 goto leave; 1987 goto leave;
1988 } 1988 }
1989 1989
1990 dlm->lockres_hash = (struct hlist_head **)dlm_alloc_pagevec(DLM_HASH_PAGES); 1990 dlm->lockres_hash = (struct hlist_head **)dlm_alloc_pagevec(DLM_HASH_PAGES);
1991 if (!dlm->lockres_hash) { 1991 if (!dlm->lockres_hash) {
1992 mlog_errno(-ENOMEM); 1992 ret = -ENOMEM;
1993 kfree(dlm->name); 1993 mlog_errno(ret);
1994 kfree(dlm);
1995 dlm = NULL;
1996 goto leave; 1994 goto leave;
1997 } 1995 }
1998 1996
@@ -2002,11 +2000,8 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
2002 dlm->master_hash = (struct hlist_head **) 2000 dlm->master_hash = (struct hlist_head **)
2003 dlm_alloc_pagevec(DLM_HASH_PAGES); 2001 dlm_alloc_pagevec(DLM_HASH_PAGES);
2004 if (!dlm->master_hash) { 2002 if (!dlm->master_hash) {
2005 mlog_errno(-ENOMEM); 2003 ret = -ENOMEM;
2006 dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES); 2004 mlog_errno(ret);
2007 kfree(dlm->name);
2008 kfree(dlm);
2009 dlm = NULL;
2010 goto leave; 2005 goto leave;
2011 } 2006 }
2012 2007
@@ -2017,14 +2012,8 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
2017 dlm->node_num = o2nm_this_node(); 2012 dlm->node_num = o2nm_this_node();
2018 2013
2019 ret = dlm_create_debugfs_subroot(dlm); 2014 ret = dlm_create_debugfs_subroot(dlm);
2020 if (ret < 0) { 2015 if (ret < 0)
2021 dlm_free_pagevec((void **)dlm->master_hash, DLM_HASH_PAGES);
2022 dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
2023 kfree(dlm->name);
2024 kfree(dlm);
2025 dlm = NULL;
2026 goto leave; 2016 goto leave;
2027 }
2028 2017
2029 spin_lock_init(&dlm->spinlock); 2018 spin_lock_init(&dlm->spinlock);
2030 spin_lock_init(&dlm->master_lock); 2019 spin_lock_init(&dlm->master_lock);
@@ -2085,6 +2074,19 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
2085 atomic_read(&dlm->dlm_refs.refcount)); 2074 atomic_read(&dlm->dlm_refs.refcount));
2086 2075
2087leave: 2076leave:
2077 if (ret < 0 && dlm) {
2078 if (dlm->master_hash)
2079 dlm_free_pagevec((void **)dlm->master_hash,
2080 DLM_HASH_PAGES);
2081
2082 if (dlm->lockres_hash)
2083 dlm_free_pagevec((void **)dlm->lockres_hash,
2084 DLM_HASH_PAGES);
2085
2086 kfree(dlm->name);
2087 kfree(dlm);
2088 dlm = NULL;
2089 }
2088 return dlm; 2090 return dlm;
2089} 2091}
2090 2092
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 12ba682fc53c..215e41abf101 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -625,9 +625,6 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
625 return res; 625 return res;
626 626
627error: 627error:
628 if (res && res->lockname.name)
629 kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name);
630
631 if (res) 628 if (res)
632 kmem_cache_free(dlm_lockres_cache, res); 629 kmem_cache_free(dlm_lockres_cache, res);
633 return NULL; 630 return NULL;
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 45067faf5695..3365839d2971 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1710,9 +1710,12 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data,
1710 BUG(); 1710 BUG();
1711 } else 1711 } else
1712 __dlm_lockres_grab_inflight_worker(dlm, res); 1712 __dlm_lockres_grab_inflight_worker(dlm, res);
1713 } else /* put.. incase we are not the master */ 1713 spin_unlock(&res->spinlock);
1714 } else {
1715 /* put.. incase we are not the master */
1716 spin_unlock(&res->spinlock);
1714 dlm_lockres_put(res); 1717 dlm_lockres_put(res);
1715 spin_unlock(&res->spinlock); 1718 }
1716 } 1719 }
1717 spin_unlock(&dlm->spinlock); 1720 spin_unlock(&dlm->spinlock);
1718 1721
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 52cfe99ae056..21262f2b1654 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2892,37 +2892,24 @@ static int ocfs2_dlm_debug_release(struct inode *inode, struct file *file)
2892 2892
2893static int ocfs2_dlm_debug_open(struct inode *inode, struct file *file) 2893static int ocfs2_dlm_debug_open(struct inode *inode, struct file *file)
2894{ 2894{
2895 int ret;
2896 struct ocfs2_dlm_seq_priv *priv; 2895 struct ocfs2_dlm_seq_priv *priv;
2897 struct seq_file *seq;
2898 struct ocfs2_super *osb; 2896 struct ocfs2_super *osb;
2899 2897
2900 priv = kzalloc(sizeof(struct ocfs2_dlm_seq_priv), GFP_KERNEL); 2898 priv = __seq_open_private(file, &ocfs2_dlm_seq_ops, sizeof(*priv));
2901 if (!priv) { 2899 if (!priv) {
2902 ret = -ENOMEM; 2900 mlog_errno(-ENOMEM);
2903 mlog_errno(ret); 2901 return -ENOMEM;
2904 goto out;
2905 } 2902 }
2903
2906 osb = inode->i_private; 2904 osb = inode->i_private;
2907 ocfs2_get_dlm_debug(osb->osb_dlm_debug); 2905 ocfs2_get_dlm_debug(osb->osb_dlm_debug);
2908 priv->p_dlm_debug = osb->osb_dlm_debug; 2906 priv->p_dlm_debug = osb->osb_dlm_debug;
2909 INIT_LIST_HEAD(&priv->p_iter_res.l_debug_list); 2907 INIT_LIST_HEAD(&priv->p_iter_res.l_debug_list);
2910 2908
2911 ret = seq_open(file, &ocfs2_dlm_seq_ops);
2912 if (ret) {
2913 kfree(priv);
2914 mlog_errno(ret);
2915 goto out;
2916 }
2917
2918 seq = file->private_data;
2919 seq->private = priv;
2920
2921 ocfs2_add_lockres_tracking(&priv->p_iter_res, 2909 ocfs2_add_lockres_tracking(&priv->p_iter_res,
2922 priv->p_dlm_debug); 2910 priv->p_dlm_debug);
2923 2911
2924out: 2912 return 0;
2925 return ret;
2926} 2913}
2927 2914
2928static const struct file_operations ocfs2_dlm_debug_fops = { 2915static const struct file_operations ocfs2_dlm_debug_fops = {
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 2930e231f3f9..682732f3f0d8 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -760,7 +760,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
760 struct address_space *mapping = inode->i_mapping; 760 struct address_space *mapping = inode->i_mapping;
761 struct page *page; 761 struct page *page;
762 unsigned long index = abs_from >> PAGE_CACHE_SHIFT; 762 unsigned long index = abs_from >> PAGE_CACHE_SHIFT;
763 handle_t *handle = NULL; 763 handle_t *handle;
764 int ret = 0; 764 int ret = 0;
765 unsigned zero_from, zero_to, block_start, block_end; 765 unsigned zero_from, zero_to, block_start, block_end;
766 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 766 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
@@ -769,11 +769,17 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
769 BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT)); 769 BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT));
770 BUG_ON(abs_from & (inode->i_blkbits - 1)); 770 BUG_ON(abs_from & (inode->i_blkbits - 1));
771 771
772 handle = ocfs2_zero_start_ordered_transaction(inode, di_bh);
773 if (IS_ERR(handle)) {
774 ret = PTR_ERR(handle);
775 goto out;
776 }
777
772 page = find_or_create_page(mapping, index, GFP_NOFS); 778 page = find_or_create_page(mapping, index, GFP_NOFS);
773 if (!page) { 779 if (!page) {
774 ret = -ENOMEM; 780 ret = -ENOMEM;
775 mlog_errno(ret); 781 mlog_errno(ret);
776 goto out; 782 goto out_commit_trans;
777 } 783 }
778 784
779 /* Get the offsets within the page that we want to zero */ 785 /* Get the offsets within the page that we want to zero */
@@ -805,15 +811,6 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
805 goto out_unlock; 811 goto out_unlock;
806 } 812 }
807 813
808 if (!handle) {
809 handle = ocfs2_zero_start_ordered_transaction(inode,
810 di_bh);
811 if (IS_ERR(handle)) {
812 ret = PTR_ERR(handle);
813 handle = NULL;
814 break;
815 }
816 }
817 814
818 /* must not update i_size! */ 815 /* must not update i_size! */
819 ret = block_commit_write(page, block_start + 1, 816 ret = block_commit_write(page, block_start + 1,
@@ -824,27 +821,29 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
824 ret = 0; 821 ret = 0;
825 } 822 }
826 823
824 /*
825 * fs-writeback will release the dirty pages without page lock
826 * whose offset are over inode size, the release happens at
827 * block_write_full_page().
828 */
829 i_size_write(inode, abs_to);
830 inode->i_blocks = ocfs2_inode_sector_count(inode);
831 di->i_size = cpu_to_le64((u64)i_size_read(inode));
832 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
833 di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
834 di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
835 di->i_mtime_nsec = di->i_ctime_nsec;
827 if (handle) { 836 if (handle) {
828 /*
829 * fs-writeback will release the dirty pages without page lock
830 * whose offset are over inode size, the release happens at
831 * block_write_full_page().
832 */
833 i_size_write(inode, abs_to);
834 inode->i_blocks = ocfs2_inode_sector_count(inode);
835 di->i_size = cpu_to_le64((u64)i_size_read(inode));
836 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
837 di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
838 di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
839 di->i_mtime_nsec = di->i_ctime_nsec;
840 ocfs2_journal_dirty(handle, di_bh); 837 ocfs2_journal_dirty(handle, di_bh);
841 ocfs2_update_inode_fsync_trans(handle, inode, 1); 838 ocfs2_update_inode_fsync_trans(handle, inode, 1);
842 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
843 } 839 }
844 840
845out_unlock: 841out_unlock:
846 unlock_page(page); 842 unlock_page(page);
847 page_cache_release(page); 843 page_cache_release(page);
844out_commit_trans:
845 if (handle)
846 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
848out: 847out:
849 return ret; 848 return ret;
850} 849}
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index a6c991c0fc98..a9b76de46047 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -162,7 +162,7 @@ static inline blkcnt_t ocfs2_inode_sector_count(struct inode *inode)
162{ 162{
163 int c_to_s_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits - 9; 163 int c_to_s_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits - 9;
164 164
165 return (blkcnt_t)(OCFS2_I(inode)->ip_clusters << c_to_s_bits); 165 return (blkcnt_t)OCFS2_I(inode)->ip_clusters << c_to_s_bits;
166} 166}
167 167
168/* Validate that a bh contains a valid inode */ 168/* Validate that a bh contains a valid inode */
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 6219aaadeb08..74caffeeee1d 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -404,7 +404,7 @@ static int ocfs2_find_victim_alloc_group(struct inode *inode,
404 * 'vict_blkno' was out of the valid range. 404 * 'vict_blkno' was out of the valid range.
405 */ 405 */
406 if ((vict_blkno < le64_to_cpu(rec->c_blkno)) || 406 if ((vict_blkno < le64_to_cpu(rec->c_blkno)) ||
407 (vict_blkno >= (le32_to_cpu(ac_dinode->id1.bitmap1.i_total) << 407 (vict_blkno >= ((u64)le32_to_cpu(ac_dinode->id1.bitmap1.i_total) <<
408 bits_per_unit))) { 408 bits_per_unit))) {
409 ret = -EINVAL; 409 ret = -EINVAL;
410 goto out; 410 goto out;
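
Both the ocfs2_inode_sector_count() change just above and this ocfs2_find_victim_alloc_group() change fix the same class of bug: the value was widened to 64 bits only after the left shift, so the shift itself was still done in 32-bit arithmetic and the high bits were already gone. Casting before shifting performs the shift in the wider type. A small standalone demonstration (the numbers are made up, chosen only so the 32-bit shift overflows):

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint32_t clusters = 0x00400000;	/* 4M clusters */
		int shift = 11;			/* e.g. 1 MiB clusters, 512-byte sectors */

		uint64_t wrong = (uint64_t)(clusters << shift);	/* shift done in 32 bits: wraps to 0 */
		uint64_t right = (uint64_t)clusters << shift;	/* widen first, then shift */

		printf("wrong=%llu right=%llu\n",
		       (unsigned long long)wrong, (unsigned long long)right);
		return 0;
	}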
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 13a8537d8e8b..720aa389e0ea 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -591,7 +591,7 @@ static int ocfs2_control_release(struct inode *inode, struct file *file)
591 */ 591 */
592 ocfs2_control_this_node = -1; 592 ocfs2_control_this_node = -1;
593 running_proto.pv_major = 0; 593 running_proto.pv_major = 0;
594 running_proto.pv_major = 0; 594 running_proto.pv_minor = 0;
595 } 595 }
596 596
597out: 597out:
diff --git a/fs/proc/base.c b/fs/proc/base.c
index baf852b648ad..4c542b907754 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -632,29 +632,35 @@ static const struct file_operations proc_single_file_operations = {
632 .release = single_release, 632 .release = single_release,
633}; 633};
634 634
635static int __mem_open(struct inode *inode, struct file *file, unsigned int mode) 635
636struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode)
636{ 637{
637 struct task_struct *task = get_proc_task(file_inode(file)); 638 struct task_struct *task = get_proc_task(inode);
638 struct mm_struct *mm; 639 struct mm_struct *mm = ERR_PTR(-ESRCH);
639 640
640 if (!task) 641 if (task) {
641 return -ESRCH; 642 mm = mm_access(task, mode);
643 put_task_struct(task);
642 644
643 mm = mm_access(task, mode); 645 if (!IS_ERR_OR_NULL(mm)) {
644 put_task_struct(task); 646 /* ensure this mm_struct can't be freed */
647 atomic_inc(&mm->mm_count);
648 /* but do not pin its memory */
649 mmput(mm);
650 }
651 }
652
653 return mm;
654}
655
656static int __mem_open(struct inode *inode, struct file *file, unsigned int mode)
657{
658 struct mm_struct *mm = proc_mem_open(inode, mode);
645 659
646 if (IS_ERR(mm)) 660 if (IS_ERR(mm))
647 return PTR_ERR(mm); 661 return PTR_ERR(mm);
648 662
649 if (mm) {
650 /* ensure this mm_struct can't be freed */
651 atomic_inc(&mm->mm_count);
652 /* but do not pin its memory */
653 mmput(mm);
654 }
655
656 file->private_data = mm; 663 file->private_data = mm;
657
658 return 0; 664 return 0;
659} 665}
660 666
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 7da13e49128a..aa7a0ee182e1 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -268,8 +268,9 @@ extern int proc_remount(struct super_block *, int *, char *);
268 * task_[no]mmu.c 268 * task_[no]mmu.c
269 */ 269 */
270struct proc_maps_private { 270struct proc_maps_private {
271 struct pid *pid; 271 struct inode *inode;
272 struct task_struct *task; 272 struct task_struct *task;
273 struct mm_struct *mm;
273#ifdef CONFIG_MMU 274#ifdef CONFIG_MMU
274 struct vm_area_struct *tail_vma; 275 struct vm_area_struct *tail_vma;
275#endif 276#endif
@@ -278,6 +279,8 @@ struct proc_maps_private {
278#endif 279#endif
279}; 280};
280 281
282struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode);
283
281extern const struct file_operations proc_pid_maps_operations; 284extern const struct file_operations proc_pid_maps_operations;
282extern const struct file_operations proc_tid_maps_operations; 285extern const struct file_operations proc_tid_maps_operations;
283extern const struct file_operations proc_pid_numa_maps_operations; 286extern const struct file_operations proc_pid_numa_maps_operations;
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 6df8d0722c97..91a4e6426321 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -610,8 +610,10 @@ static void __init proc_kcore_text_init(void)
610struct kcore_list kcore_modules; 610struct kcore_list kcore_modules;
611static void __init add_modules_range(void) 611static void __init add_modules_range(void)
612{ 612{
613 kclist_add(&kcore_modules, (void *)MODULES_VADDR, 613 if (MODULES_VADDR != VMALLOC_START && MODULES_END != VMALLOC_END) {
614 kclist_add(&kcore_modules, (void *)MODULES_VADDR,
614 MODULES_END - MODULES_VADDR, KCORE_VMALLOC); 615 MODULES_END - MODULES_VADDR, KCORE_VMALLOC);
616 }
615} 617}
616#else 618#else
617static void __init add_modules_range(void) 619static void __init add_modules_range(void)
diff --git a/fs/proc/page.c b/fs/proc/page.c
index e647c55275d9..1e3187da1fed 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -133,6 +133,9 @@ u64 stable_page_flags(struct page *page)
133 if (PageBuddy(page)) 133 if (PageBuddy(page))
134 u |= 1 << KPF_BUDDY; 134 u |= 1 << KPF_BUDDY;
135 135
136 if (PageBalloon(page))
137 u |= 1 << KPF_BALLOON;
138
136 u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); 139 u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
137 140
138 u |= kpf_copy_bit(k, KPF_SLAB, PG_slab); 141 u |= kpf_copy_bit(k, KPF_SLAB, PG_slab);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index c34156888d70..b7a7dc963a35 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -87,32 +87,14 @@ unsigned long task_statm(struct mm_struct *mm,
87 87
88#ifdef CONFIG_NUMA 88#ifdef CONFIG_NUMA
89/* 89/*
90 * These functions are for numa_maps but called in generic **maps seq_file 90 * Save get_task_policy() for show_numa_map().
91 * ->start(), ->stop() ops.
92 *
93 * numa_maps scans all vmas under mmap_sem and checks their mempolicy.
94 * Each mempolicy object is controlled by reference counting. The problem here
95 * is how to avoid accessing dead mempolicy object.
96 *
97 * Because we're holding mmap_sem while reading seq_file, it's safe to access
98 * each vma's mempolicy, no vma objects will never drop refs to mempolicy.
99 *
100 * A task's mempolicy (task->mempolicy) has different behavior. task->mempolicy
101 * is set and replaced under mmap_sem but unrefed and cleared under task_lock().
102 * So, without task_lock(), we cannot trust get_vma_policy() because we cannot
103 * gurantee the task never exits under us. But taking task_lock() around
104 * get_vma_plicy() causes lock order problem.
105 *
106 * To access task->mempolicy without lock, we hold a reference count of an
107 * object pointed by task->mempolicy and remember it. This will guarantee
108 * that task->mempolicy points to an alive object or NULL in numa_maps accesses.
109 */ 91 */
110static void hold_task_mempolicy(struct proc_maps_private *priv) 92static void hold_task_mempolicy(struct proc_maps_private *priv)
111{ 93{
112 struct task_struct *task = priv->task; 94 struct task_struct *task = priv->task;
113 95
114 task_lock(task); 96 task_lock(task);
115 priv->task_mempolicy = task->mempolicy; 97 priv->task_mempolicy = get_task_policy(task);
116 mpol_get(priv->task_mempolicy); 98 mpol_get(priv->task_mempolicy);
117 task_unlock(task); 99 task_unlock(task);
118} 100}
@@ -129,124 +111,154 @@ static void release_task_mempolicy(struct proc_maps_private *priv)
129} 111}
130#endif 112#endif
131 113
132static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma) 114static void vma_stop(struct proc_maps_private *priv)
133{ 115{
134 if (vma && vma != priv->tail_vma) { 116 struct mm_struct *mm = priv->mm;
135 struct mm_struct *mm = vma->vm_mm; 117
136 release_task_mempolicy(priv); 118 release_task_mempolicy(priv);
137 up_read(&mm->mmap_sem); 119 up_read(&mm->mmap_sem);
138 mmput(mm); 120 mmput(mm);
139 } 121}
122
123static struct vm_area_struct *
124m_next_vma(struct proc_maps_private *priv, struct vm_area_struct *vma)
125{
126 if (vma == priv->tail_vma)
127 return NULL;
128 return vma->vm_next ?: priv->tail_vma;
129}
130
131static void m_cache_vma(struct seq_file *m, struct vm_area_struct *vma)
132{
133 if (m->count < m->size) /* vma is copied successfully */
134 m->version = m_next_vma(m->private, vma) ? vma->vm_start : -1UL;
140} 135}
141 136
142static void *m_start(struct seq_file *m, loff_t *pos) 137static void *m_start(struct seq_file *m, loff_t *ppos)
143{ 138{
144 struct proc_maps_private *priv = m->private; 139 struct proc_maps_private *priv = m->private;
145 unsigned long last_addr = m->version; 140 unsigned long last_addr = m->version;
146 struct mm_struct *mm; 141 struct mm_struct *mm;
147 struct vm_area_struct *vma, *tail_vma = NULL; 142 struct vm_area_struct *vma;
148 loff_t l = *pos; 143 unsigned int pos = *ppos;
149
150 /* Clear the per syscall fields in priv */
151 priv->task = NULL;
152 priv->tail_vma = NULL;
153
154 /*
155 * We remember last_addr rather than next_addr to hit with
156 * vmacache most of the time. We have zero last_addr at
157 * the beginning and also after lseek. We will have -1 last_addr
158 * after the end of the vmas.
159 */
160 144
145 /* See m_cache_vma(). Zero at the start or after lseek. */
161 if (last_addr == -1UL) 146 if (last_addr == -1UL)
162 return NULL; 147 return NULL;
163 148
164 priv->task = get_pid_task(priv->pid, PIDTYPE_PID); 149 priv->task = get_proc_task(priv->inode);
165 if (!priv->task) 150 if (!priv->task)
166 return ERR_PTR(-ESRCH); 151 return ERR_PTR(-ESRCH);
167 152
168 mm = mm_access(priv->task, PTRACE_MODE_READ); 153 mm = priv->mm;
169 if (!mm || IS_ERR(mm)) 154 if (!mm || !atomic_inc_not_zero(&mm->mm_users))
170 return mm; 155 return NULL;
171 down_read(&mm->mmap_sem);
172 156
173 tail_vma = get_gate_vma(priv->task->mm); 157 down_read(&mm->mmap_sem);
174 priv->tail_vma = tail_vma;
175 hold_task_mempolicy(priv); 158 hold_task_mempolicy(priv);
176 /* Start with last addr hint */ 159 priv->tail_vma = get_gate_vma(mm);
177 vma = find_vma(mm, last_addr); 160
178 if (last_addr && vma) { 161 if (last_addr) {
179 vma = vma->vm_next; 162 vma = find_vma(mm, last_addr);
180 goto out; 163 if (vma && (vma = m_next_vma(priv, vma)))
164 return vma;
181 } 165 }
182 166
183 /* 167 m->version = 0;
184 * Check the vma index is within the range and do 168 if (pos < mm->map_count) {
185 * sequential scan until m_index. 169 for (vma = mm->mmap; pos; pos--) {
186 */ 170 m->version = vma->vm_start;
187 vma = NULL;
188 if ((unsigned long)l < mm->map_count) {
189 vma = mm->mmap;
190 while (l-- && vma)
191 vma = vma->vm_next; 171 vma = vma->vm_next;
192 goto out; 172 }
173 return vma;
193 } 174 }
194 175
195 if (l != mm->map_count) 176 /* we do not bother to update m->version in this case */
196 tail_vma = NULL; /* After gate vma */ 177 if (pos == mm->map_count && priv->tail_vma)
197 178 return priv->tail_vma;
198out:
199 if (vma)
200 return vma;
201 179
202 release_task_mempolicy(priv); 180 vma_stop(priv);
203 /* End of vmas has been reached */ 181 return NULL;
204 m->version = (tail_vma != NULL)? 0: -1UL;
205 up_read(&mm->mmap_sem);
206 mmput(mm);
207 return tail_vma;
208} 182}
209 183
210static void *m_next(struct seq_file *m, void *v, loff_t *pos) 184static void *m_next(struct seq_file *m, void *v, loff_t *pos)
211{ 185{
212 struct proc_maps_private *priv = m->private; 186 struct proc_maps_private *priv = m->private;
213 struct vm_area_struct *vma = v; 187 struct vm_area_struct *next;
214 struct vm_area_struct *tail_vma = priv->tail_vma;
215 188
216 (*pos)++; 189 (*pos)++;
217 if (vma && (vma != tail_vma) && vma->vm_next) 190 next = m_next_vma(priv, v);
218 return vma->vm_next; 191 if (!next)
219 vma_stop(priv, vma); 192 vma_stop(priv);
220 return (vma != tail_vma)? tail_vma: NULL; 193 return next;
221} 194}
222 195
223static void m_stop(struct seq_file *m, void *v) 196static void m_stop(struct seq_file *m, void *v)
224{ 197{
225 struct proc_maps_private *priv = m->private; 198 struct proc_maps_private *priv = m->private;
226 struct vm_area_struct *vma = v;
227 199
228 if (!IS_ERR(vma)) 200 if (!IS_ERR_OR_NULL(v))
229 vma_stop(priv, vma); 201 vma_stop(priv);
230 if (priv->task) 202 if (priv->task) {
231 put_task_struct(priv->task); 203 put_task_struct(priv->task);
204 priv->task = NULL;
205 }
206}
207
208static int proc_maps_open(struct inode *inode, struct file *file,
209 const struct seq_operations *ops, int psize)
210{
211 struct proc_maps_private *priv = __seq_open_private(file, ops, psize);
212
213 if (!priv)
214 return -ENOMEM;
215
216 priv->inode = inode;
217 priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
218 if (IS_ERR(priv->mm)) {
219 int err = PTR_ERR(priv->mm);
220
221 seq_release_private(inode, file);
222 return err;
223 }
224
225 return 0;
226}
227
228static int proc_map_release(struct inode *inode, struct file *file)
229{
230 struct seq_file *seq = file->private_data;
231 struct proc_maps_private *priv = seq->private;
232
233 if (priv->mm)
234 mmdrop(priv->mm);
235
236 return seq_release_private(inode, file);
232} 237}
233 238
234static int do_maps_open(struct inode *inode, struct file *file, 239static int do_maps_open(struct inode *inode, struct file *file,
235 const struct seq_operations *ops) 240 const struct seq_operations *ops)
236{ 241{
237 struct proc_maps_private *priv; 242 return proc_maps_open(inode, file, ops,
238 int ret = -ENOMEM; 243 sizeof(struct proc_maps_private));
239 priv = kzalloc(sizeof(*priv), GFP_KERNEL); 244}
240 if (priv) { 245
241 priv->pid = proc_pid(inode); 246static pid_t pid_of_stack(struct proc_maps_private *priv,
242 ret = seq_open(file, ops); 247 struct vm_area_struct *vma, bool is_pid)
243 if (!ret) { 248{
244 struct seq_file *m = file->private_data; 249 struct inode *inode = priv->inode;
245 m->private = priv; 250 struct task_struct *task;
246 } else { 251 pid_t ret = 0;
247 kfree(priv); 252
248 } 253 rcu_read_lock();
254 task = pid_task(proc_pid(inode), PIDTYPE_PID);
255 if (task) {
256 task = task_of_stack(task, vma, is_pid);
257 if (task)
258 ret = task_pid_nr_ns(task, inode->i_sb->s_fs_info);
249 } 259 }
260 rcu_read_unlock();
261
250 return ret; 262 return ret;
251} 263}
252 264
@@ -256,7 +268,6 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
256 struct mm_struct *mm = vma->vm_mm; 268 struct mm_struct *mm = vma->vm_mm;
257 struct file *file = vma->vm_file; 269 struct file *file = vma->vm_file;
258 struct proc_maps_private *priv = m->private; 270 struct proc_maps_private *priv = m->private;
259 struct task_struct *task = priv->task;
260 vm_flags_t flags = vma->vm_flags; 271 vm_flags_t flags = vma->vm_flags;
261 unsigned long ino = 0; 272 unsigned long ino = 0;
262 unsigned long long pgoff = 0; 273 unsigned long long pgoff = 0;
@@ -321,8 +332,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
321 goto done; 332 goto done;
322 } 333 }
323 334
324 tid = vm_is_stack(task, vma, is_pid); 335 tid = pid_of_stack(priv, vma, is_pid);
325
326 if (tid != 0) { 336 if (tid != 0) {
327 /* 337 /*
328 * Thread stack in /proc/PID/task/TID/maps or 338 * Thread stack in /proc/PID/task/TID/maps or
@@ -349,15 +359,8 @@ done:
349 359
350static int show_map(struct seq_file *m, void *v, int is_pid) 360static int show_map(struct seq_file *m, void *v, int is_pid)
351{ 361{
352 struct vm_area_struct *vma = v; 362 show_map_vma(m, v, is_pid);
353 struct proc_maps_private *priv = m->private; 363 m_cache_vma(m, v);
354 struct task_struct *task = priv->task;
355
356 show_map_vma(m, vma, is_pid);
357
358 if (m->count < m->size) /* vma is copied successfully */
359 m->version = (vma != get_gate_vma(task->mm))
360 ? vma->vm_start : 0;
361 return 0; 364 return 0;
362} 365}
363 366
@@ -399,14 +402,14 @@ const struct file_operations proc_pid_maps_operations = {
399 .open = pid_maps_open, 402 .open = pid_maps_open,
400 .read = seq_read, 403 .read = seq_read,
401 .llseek = seq_lseek, 404 .llseek = seq_lseek,
402 .release = seq_release_private, 405 .release = proc_map_release,
403}; 406};
404 407
405const struct file_operations proc_tid_maps_operations = { 408const struct file_operations proc_tid_maps_operations = {
406 .open = tid_maps_open, 409 .open = tid_maps_open,
407 .read = seq_read, 410 .read = seq_read,
408 .llseek = seq_lseek, 411 .llseek = seq_lseek,
409 .release = seq_release_private, 412 .release = proc_map_release,
410}; 413};
411 414
412/* 415/*
@@ -583,8 +586,6 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
583 586
584static int show_smap(struct seq_file *m, void *v, int is_pid) 587static int show_smap(struct seq_file *m, void *v, int is_pid)
585{ 588{
586 struct proc_maps_private *priv = m->private;
587 struct task_struct *task = priv->task;
588 struct vm_area_struct *vma = v; 589 struct vm_area_struct *vma = v;
589 struct mem_size_stats mss; 590 struct mem_size_stats mss;
590 struct mm_walk smaps_walk = { 591 struct mm_walk smaps_walk = {
@@ -637,10 +638,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
637 mss.nonlinear >> 10); 638 mss.nonlinear >> 10);
638 639
639 show_smap_vma_flags(m, vma); 640 show_smap_vma_flags(m, vma);
640 641 m_cache_vma(m, vma);
641 if (m->count < m->size) /* vma is copied successfully */
642 m->version = (vma != get_gate_vma(task->mm))
643 ? vma->vm_start : 0;
644 return 0; 642 return 0;
645} 643}
646 644
@@ -682,14 +680,14 @@ const struct file_operations proc_pid_smaps_operations = {
682 .open = pid_smaps_open, 680 .open = pid_smaps_open,
683 .read = seq_read, 681 .read = seq_read,
684 .llseek = seq_lseek, 682 .llseek = seq_lseek,
685 .release = seq_release_private, 683 .release = proc_map_release,
686}; 684};
687 685
688const struct file_operations proc_tid_smaps_operations = { 686const struct file_operations proc_tid_smaps_operations = {
689 .open = tid_smaps_open, 687 .open = tid_smaps_open,
690 .read = seq_read, 688 .read = seq_read,
691 .llseek = seq_lseek, 689 .llseek = seq_lseek,
692 .release = seq_release_private, 690 .release = proc_map_release,
693}; 691};
694 692
695/* 693/*
@@ -1029,7 +1027,6 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
1029 spinlock_t *ptl; 1027 spinlock_t *ptl;
1030 pte_t *pte; 1028 pte_t *pte;
1031 int err = 0; 1029 int err = 0;
1032 pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
1033 1030
1034 /* find the first VMA at or above 'addr' */ 1031 /* find the first VMA at or above 'addr' */
1035 vma = find_vma(walk->mm, addr); 1032 vma = find_vma(walk->mm, addr);
@@ -1043,6 +1040,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
1043 1040
1044 for (; addr != end; addr += PAGE_SIZE) { 1041 for (; addr != end; addr += PAGE_SIZE) {
1045 unsigned long offset; 1042 unsigned long offset;
1043 pagemap_entry_t pme;
1046 1044
1047 offset = (addr & ~PAGEMAP_WALK_MASK) >> 1045 offset = (addr & ~PAGEMAP_WALK_MASK) >>
1048 PAGE_SHIFT; 1046 PAGE_SHIFT;
@@ -1057,32 +1055,51 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
1057 1055
1058 if (pmd_trans_unstable(pmd)) 1056 if (pmd_trans_unstable(pmd))
1059 return 0; 1057 return 0;
1060 for (; addr != end; addr += PAGE_SIZE) { 1058
1061 int flags2; 1059 while (1) {
1062 1060 /* End of address space hole, which we mark as non-present. */
1063 /* check to see if we've left 'vma' behind 1061 unsigned long hole_end;
1064 * and need a new, higher one */ 1062
1065 if (vma && (addr >= vma->vm_end)) { 1063 if (vma)
1066 vma = find_vma(walk->mm, addr); 1064 hole_end = min(end, vma->vm_start);
1067 if (vma && (vma->vm_flags & VM_SOFTDIRTY)) 1065 else
1068 flags2 = __PM_SOFT_DIRTY; 1066 hole_end = end;
1069 else 1067
1070 flags2 = 0; 1068 for (; addr < hole_end; addr += PAGE_SIZE) {
1071 pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2)); 1069 pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
1070
1071 err = add_to_pagemap(addr, &pme, pm);
1072 if (err)
1073 return err;
1072 } 1074 }
1073 1075
1074 /* check that 'vma' actually covers this address, 1076 if (!vma || vma->vm_start >= end)
1075 * and that it isn't a huge page vma */ 1077 break;
1076 if (vma && (vma->vm_start <= addr) && 1078 /*
1077 !is_vm_hugetlb_page(vma)) { 1079 * We can't possibly be in a hugetlb VMA. In general,
1080 * for a mm_walk with a pmd_entry and a hugetlb_entry,
1081 * the pmd_entry can only be called on addresses in a
1082 * hugetlb if the walk starts in a non-hugetlb VMA and
1083 * spans a hugepage VMA. Since pagemap_read walks are
1084 * PMD-sized and PMD-aligned, this will never be true.
1085 */
1086 BUG_ON(is_vm_hugetlb_page(vma));
1087
1088 /* Addresses in the VMA. */
1089 for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
1090 pagemap_entry_t pme;
1078 pte = pte_offset_map(pmd, addr); 1091 pte = pte_offset_map(pmd, addr);
1079 pte_to_pagemap_entry(&pme, pm, vma, addr, *pte); 1092 pte_to_pagemap_entry(&pme, pm, vma, addr, *pte);
1080 /* unmap before userspace copy */
1081 pte_unmap(pte); 1093 pte_unmap(pte);
1094 err = add_to_pagemap(addr, &pme, pm);
1095 if (err)
1096 return err;
1082 } 1097 }
1083 err = add_to_pagemap(addr, &pme, pm); 1098
1084 if (err) 1099 if (addr == end)
1085 return err; 1100 break;
1101
1102 vma = find_vma(walk->mm, addr);
1086 } 1103 }
1087 1104
1088 cond_resched(); 1105 cond_resched();
@@ -1415,7 +1432,6 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1415 struct vm_area_struct *vma = v; 1432 struct vm_area_struct *vma = v;
1416 struct numa_maps *md = &numa_priv->md; 1433 struct numa_maps *md = &numa_priv->md;
1417 struct file *file = vma->vm_file; 1434 struct file *file = vma->vm_file;
1418 struct task_struct *task = proc_priv->task;
1419 struct mm_struct *mm = vma->vm_mm; 1435 struct mm_struct *mm = vma->vm_mm;
1420 struct mm_walk walk = {}; 1436 struct mm_walk walk = {};
1421 struct mempolicy *pol; 1437 struct mempolicy *pol;
@@ -1435,9 +1451,13 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1435 walk.private = md; 1451 walk.private = md;
1436 walk.mm = mm; 1452 walk.mm = mm;
1437 1453
1438 pol = get_vma_policy(task, vma, vma->vm_start); 1454 pol = __get_vma_policy(vma, vma->vm_start);
1439 mpol_to_str(buffer, sizeof(buffer), pol); 1455 if (pol) {
1440 mpol_cond_put(pol); 1456 mpol_to_str(buffer, sizeof(buffer), pol);
1457 mpol_cond_put(pol);
1458 } else {
1459 mpol_to_str(buffer, sizeof(buffer), proc_priv->task_mempolicy);
1460 }
1441 1461
1442 seq_printf(m, "%08lx %s", vma->vm_start, buffer); 1462 seq_printf(m, "%08lx %s", vma->vm_start, buffer);
1443 1463
@@ -1447,7 +1467,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1447 } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { 1467 } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
1448 seq_puts(m, " heap"); 1468 seq_puts(m, " heap");
1449 } else { 1469 } else {
1450 pid_t tid = vm_is_stack(task, vma, is_pid); 1470 pid_t tid = pid_of_stack(proc_priv, vma, is_pid);
1451 if (tid != 0) { 1471 if (tid != 0) {
1452 /* 1472 /*
1453 * Thread stack in /proc/PID/task/TID/maps or 1473 * Thread stack in /proc/PID/task/TID/maps or
@@ -1495,9 +1515,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1495 seq_printf(m, " N%d=%lu", nid, md->node[nid]); 1515 seq_printf(m, " N%d=%lu", nid, md->node[nid]);
1496out: 1516out:
1497 seq_putc(m, '\n'); 1517 seq_putc(m, '\n');
1498 1518 m_cache_vma(m, vma);
1499 if (m->count < m->size)
1500 m->version = (vma != proc_priv->tail_vma) ? vma->vm_start : 0;
1501 return 0; 1519 return 0;
1502} 1520}
1503 1521
@@ -1528,20 +1546,8 @@ static const struct seq_operations proc_tid_numa_maps_op = {
1528static int numa_maps_open(struct inode *inode, struct file *file, 1546static int numa_maps_open(struct inode *inode, struct file *file,
1529 const struct seq_operations *ops) 1547 const struct seq_operations *ops)
1530{ 1548{
1531 struct numa_maps_private *priv; 1549 return proc_maps_open(inode, file, ops,
1532 int ret = -ENOMEM; 1550 sizeof(struct numa_maps_private));
1533 priv = kzalloc(sizeof(*priv), GFP_KERNEL);
1534 if (priv) {
1535 priv->proc_maps.pid = proc_pid(inode);
1536 ret = seq_open(file, ops);
1537 if (!ret) {
1538 struct seq_file *m = file->private_data;
1539 m->private = priv;
1540 } else {
1541 kfree(priv);
1542 }
1543 }
1544 return ret;
1545} 1551}
1546 1552
1547static int pid_numa_maps_open(struct inode *inode, struct file *file) 1553static int pid_numa_maps_open(struct inode *inode, struct file *file)
@@ -1558,13 +1564,13 @@ const struct file_operations proc_pid_numa_maps_operations = {
1558 .open = pid_numa_maps_open, 1564 .open = pid_numa_maps_open,
1559 .read = seq_read, 1565 .read = seq_read,
1560 .llseek = seq_lseek, 1566 .llseek = seq_lseek,
1561 .release = seq_release_private, 1567 .release = proc_map_release,
1562}; 1568};
1563 1569
1564const struct file_operations proc_tid_numa_maps_operations = { 1570const struct file_operations proc_tid_numa_maps_operations = {
1565 .open = tid_numa_maps_open, 1571 .open = tid_numa_maps_open,
1566 .read = seq_read, 1572 .read = seq_read,
1567 .llseek = seq_lseek, 1573 .llseek = seq_lseek,
1568 .release = seq_release_private, 1574 .release = proc_map_release,
1569}; 1575};
1570#endif /* CONFIG_NUMA */ 1576#endif /* CONFIG_NUMA */
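
The m_start()/m_next()/m_stop() rewrite above follows the standard seq_file contract: ->start() acquires whatever the walk needs (task, a temporary mm_users reference, mmap_sem) and returns the element for the requested position, ->next() returns the following element or NULL, and ->stop() releases what ->start() took; m->version is kept as a resume hint so a later read can restart near the last VMA it emitted instead of rescanning from mm->mmap. A stripped-down iterator over a plain linked list shows the same shape (struct item and the global head are illustrative only):

	#include <linux/seq_file.h>

	/* Illustrative list element; not from this patch. */
	struct item {
		struct item *next;
		unsigned long start;
	};

	static struct item *head;	/* protected by whatever ->start() acquires */

	static void *it_start(struct seq_file *m, loff_t *ppos)
	{
		struct item *it = head;
		loff_t pos = *ppos;

		/* lock/refcount acquisition would go here */
		while (it && pos--)
			it = it->next;
		return it;		/* NULL ends the sequence */
	}

	static void *it_next(struct seq_file *m, void *v, loff_t *ppos)
	{
		struct item *it = v;

		(*ppos)++;
		return it->next;	/* NULL triggers ->stop() */
	}

	static void it_stop(struct seq_file *m, void *v)
	{
		/* release whatever ->start() took; called even on error */
	}

	static int it_show(struct seq_file *m, void *v)
	{
		struct item *it = v;

		seq_printf(m, "%08lx\n", it->start);
		return 0;
	}

	static const struct seq_operations it_seq_ops = {
		.start	= it_start,
		.next	= it_next,
		.stop	= it_stop,
		.show	= it_show,
	};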
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 678455d2d683..599ec2e20104 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -123,6 +123,25 @@ unsigned long task_statm(struct mm_struct *mm,
123 return size; 123 return size;
124} 124}
125 125
126static pid_t pid_of_stack(struct proc_maps_private *priv,
127 struct vm_area_struct *vma, bool is_pid)
128{
129 struct inode *inode = priv->inode;
130 struct task_struct *task;
131 pid_t ret = 0;
132
133 rcu_read_lock();
134 task = pid_task(proc_pid(inode), PIDTYPE_PID);
135 if (task) {
136 task = task_of_stack(task, vma, is_pid);
137 if (task)
138 ret = task_pid_nr_ns(task, inode->i_sb->s_fs_info);
139 }
140 rcu_read_unlock();
141
142 return ret;
143}
144
126/* 145/*
127 * display a single VMA to a sequenced file 146 * display a single VMA to a sequenced file
128 */ 147 */
@@ -163,7 +182,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,
163 seq_pad(m, ' '); 182 seq_pad(m, ' ');
164 seq_path(m, &file->f_path, ""); 183 seq_path(m, &file->f_path, "");
165 } else if (mm) { 184 } else if (mm) {
166 pid_t tid = vm_is_stack(priv->task, vma, is_pid); 185 pid_t tid = pid_of_stack(priv, vma, is_pid);
167 186
168 if (tid != 0) { 187 if (tid != 0) {
169 seq_pad(m, ' '); 188 seq_pad(m, ' ');
@@ -212,22 +231,22 @@ static void *m_start(struct seq_file *m, loff_t *pos)
212 loff_t n = *pos; 231 loff_t n = *pos;
213 232
214 /* pin the task and mm whilst we play with them */ 233 /* pin the task and mm whilst we play with them */
215 priv->task = get_pid_task(priv->pid, PIDTYPE_PID); 234 priv->task = get_proc_task(priv->inode);
216 if (!priv->task) 235 if (!priv->task)
217 return ERR_PTR(-ESRCH); 236 return ERR_PTR(-ESRCH);
218 237
219 mm = mm_access(priv->task, PTRACE_MODE_READ); 238 mm = priv->mm;
220 if (!mm || IS_ERR(mm)) { 239 if (!mm || !atomic_inc_not_zero(&mm->mm_users))
221 put_task_struct(priv->task); 240 return NULL;
222 priv->task = NULL;
223 return mm;
224 }
225 down_read(&mm->mmap_sem);
226 241
242 down_read(&mm->mmap_sem);
227 /* start from the Nth VMA */ 243 /* start from the Nth VMA */
228 for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) 244 for (p = rb_first(&mm->mm_rb); p; p = rb_next(p))
229 if (n-- == 0) 245 if (n-- == 0)
230 return p; 246 return p;
247
248 up_read(&mm->mmap_sem);
249 mmput(mm);
231 return NULL; 250 return NULL;
232} 251}
233 252
@@ -235,11 +254,13 @@ static void m_stop(struct seq_file *m, void *_vml)
235{ 254{
236 struct proc_maps_private *priv = m->private; 255 struct proc_maps_private *priv = m->private;
237 256
257 if (!IS_ERR_OR_NULL(_vml)) {
258 up_read(&priv->mm->mmap_sem);
259 mmput(priv->mm);
260 }
238 if (priv->task) { 261 if (priv->task) {
239 struct mm_struct *mm = priv->task->mm;
240 up_read(&mm->mmap_sem);
241 mmput(mm);
242 put_task_struct(priv->task); 262 put_task_struct(priv->task);
263 priv->task = NULL;
243 } 264 }
244} 265}
245 266
@@ -269,20 +290,33 @@ static int maps_open(struct inode *inode, struct file *file,
269 const struct seq_operations *ops) 290 const struct seq_operations *ops)
270{ 291{
271 struct proc_maps_private *priv; 292 struct proc_maps_private *priv;
272 int ret = -ENOMEM; 293
273 294 priv = __seq_open_private(file, ops, sizeof(*priv));
274 priv = kzalloc(sizeof(*priv), GFP_KERNEL); 295 if (!priv)
275 if (priv) { 296 return -ENOMEM;
276 priv->pid = proc_pid(inode); 297
277 ret = seq_open(file, ops); 298 priv->inode = inode;
278 if (!ret) { 299 priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
279 struct seq_file *m = file->private_data; 300 if (IS_ERR(priv->mm)) {
280 m->private = priv; 301 int err = PTR_ERR(priv->mm);
281 } else { 302
282 kfree(priv); 303 seq_release_private(inode, file);
283 } 304 return err;
284 } 305 }
285 return ret; 306
307 return 0;
308}
309
310
311static int map_release(struct inode *inode, struct file *file)
312{
313 struct seq_file *seq = file->private_data;
314 struct proc_maps_private *priv = seq->private;
315
316 if (priv->mm)
317 mmdrop(priv->mm);
318
319 return seq_release_private(inode, file);
286} 320}
287 321
288static int pid_maps_open(struct inode *inode, struct file *file) 322static int pid_maps_open(struct inode *inode, struct file *file)
@@ -299,13 +333,13 @@ const struct file_operations proc_pid_maps_operations = {
299 .open = pid_maps_open, 333 .open = pid_maps_open,
300 .read = seq_read, 334 .read = seq_read,
301 .llseek = seq_lseek, 335 .llseek = seq_lseek,
302 .release = seq_release_private, 336 .release = map_release,
303}; 337};
304 338
305const struct file_operations proc_tid_maps_operations = { 339const struct file_operations proc_tid_maps_operations = {
306 .open = tid_maps_open, 340 .open = tid_maps_open,
307 .read = seq_read, 341 .read = seq_read,
308 .llseek = seq_lseek, 342 .llseek = seq_lseek,
309 .release = seq_release_private, 343 .release = map_release,
310}; 344};
311 345
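
The reference-counting scheme behind the rewritten m_start()/m_stop()/map_release() is worth spelling out. proc_mem_open() is reworked in fs/proc/base.c in this same series, so the body below is an assumption; the idea is that it pins the mm_struct itself without pinning its address space:

/* Assumed shape of proc_mem_open(): keep the mm_struct alive (mm_count)
 * without pinning its address space (mm_users). */
struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode)
{
        struct task_struct *task = get_proc_task(inode);
        struct mm_struct *mm = ERR_PTR(-ESRCH);

        if (task) {
                mm = mm_access(task, mode);
                put_task_struct(task);

                if (!IS_ERR_OR_NULL(mm)) {
                        atomic_inc(&mm->mm_count);  /* struct stays valid   */
                        mmput(mm);                  /* memory may still go  */
                }
        }
        return mm;
}

Each read then re-takes mm_users with atomic_inc_not_zero() in m_start() and drops it with mmput() in m_stop(), while map_release() finally drops the long-lived reference with mmdrop().
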
diff --git a/include/asm-generic/dma-mapping-common.h b/include/asm-generic/dma-mapping-common.h
index de8bf89940f8..a9fd248f5d48 100644
--- a/include/asm-generic/dma-mapping-common.h
+++ b/include/asm-generic/dma-mapping-common.h
@@ -179,6 +179,15 @@ dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
179extern int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, 179extern int dma_common_mmap(struct device *dev, struct vm_area_struct *vma,
180 void *cpu_addr, dma_addr_t dma_addr, size_t size); 180 void *cpu_addr, dma_addr_t dma_addr, size_t size);
181 181
182void *dma_common_contiguous_remap(struct page *page, size_t size,
183 unsigned long vm_flags,
184 pgprot_t prot, const void *caller);
185
186void *dma_common_pages_remap(struct page **pages, size_t size,
187 unsigned long vm_flags, pgprot_t prot,
188 const void *caller);
189void dma_common_free_remap(void *cpu_addr, size_t size, unsigned long vm_flags);
190
182/** 191/**
183 * dma_mmap_attrs - map a coherent DMA allocation into user space 192 * dma_mmap_attrs - map a coherent DMA allocation into user space
184 * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices 193 * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
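
These three helpers let architectures put a kernel mapping with non-default attributes over a contiguous (e.g. CMA) allocation or an array of pages, and tear it down again; the arm/arm64 dma-mapping changes in this diffstat are the users. A hedged usage sketch (the writecombine attribute and the VM_USERMAP flag are illustrative choices, not something the interface requires):

/* Sketch: give a contiguous DMA buffer a writecombine kernel mapping. */
void *cpu_addr = dma_common_contiguous_remap(page, size, VM_USERMAP,
                                pgprot_writecombine(PAGE_KERNEL),
                                __builtin_return_address(0));
if (cpu_addr) {
        /* ... use cpu_addr as the buffer's CPU address ... */
        dma_common_free_remap(cpu_addr, size, VM_USERMAP);
}
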
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 977e545a64c3..081ff8826bf6 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -664,11 +664,12 @@ static inline int pmd_trans_unstable(pmd_t *pmd)
664} 664}
665 665
666#ifdef CONFIG_NUMA_BALANCING 666#ifdef CONFIG_NUMA_BALANCING
667#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
668/* 667/*
669 * _PAGE_NUMA works identical to _PAGE_PROTNONE (it's actually the 668 * _PAGE_NUMA distinguishes between an unmapped page table entry, an entry that
670 * same bit too). It's set only when _PAGE_PRESET is not set and it's 669 * is protected for PROT_NONE and a NUMA hinting fault entry. If the
671 * never set if _PAGE_PRESENT is set. 670 * architecture defines __PAGE_PROTNONE then it should take that into account
671 * but those that do not can rely on the fact that the NUMA hinting scanner
672 * skips inaccessible VMAs.
672 * 673 *
673 * pte/pmd_present() returns true if pte/pmd_numa returns true. Page 674 * pte/pmd_present() returns true if pte/pmd_numa returns true. Page
674 * fault triggers on those regions if pte/pmd_numa returns true 675 * fault triggers on those regions if pte/pmd_numa returns true
@@ -677,16 +678,14 @@ static inline int pmd_trans_unstable(pmd_t *pmd)
677#ifndef pte_numa 678#ifndef pte_numa
678static inline int pte_numa(pte_t pte) 679static inline int pte_numa(pte_t pte)
679{ 680{
680 return (pte_flags(pte) & 681 return ptenuma_flags(pte) == _PAGE_NUMA;
681 (_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT)) == _PAGE_NUMA;
682} 682}
683#endif 683#endif
684 684
685#ifndef pmd_numa 685#ifndef pmd_numa
686static inline int pmd_numa(pmd_t pmd) 686static inline int pmd_numa(pmd_t pmd)
687{ 687{
688 return (pmd_flags(pmd) & 688 return pmdnuma_flags(pmd) == _PAGE_NUMA;
689 (_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT)) == _PAGE_NUMA;
690} 689}
691#endif 690#endif
692 691
@@ -726,6 +725,8 @@ static inline pte_t pte_mknuma(pte_t pte)
726{ 725{
727 pteval_t val = pte_val(pte); 726 pteval_t val = pte_val(pte);
728 727
728 VM_BUG_ON(!(val & _PAGE_PRESENT));
729
729 val &= ~_PAGE_PRESENT; 730 val &= ~_PAGE_PRESENT;
730 val |= _PAGE_NUMA; 731 val |= _PAGE_NUMA;
731 732
@@ -769,16 +770,6 @@ static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr,
769} 770}
770#endif 771#endif
771#else 772#else
772extern int pte_numa(pte_t pte);
773extern int pmd_numa(pmd_t pmd);
774extern pte_t pte_mknonnuma(pte_t pte);
775extern pmd_t pmd_mknonnuma(pmd_t pmd);
776extern pte_t pte_mknuma(pte_t pte);
777extern pmd_t pmd_mknuma(pmd_t pmd);
778extern void ptep_set_numa(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
779extern void pmdp_set_numa(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp);
780#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
781#else
782static inline int pmd_numa(pmd_t pmd) 773static inline int pmd_numa(pmd_t pmd)
783{ 774{
784 return 0; 775 return 0;
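
The generic pte_numa()/pmd_numa() now depend on arch-provided ptenuma_flags()/pmdnuma_flags() helpers (the x86 side lives in the pgtable_types.h change listed in this diffstat). A hypothetical definition, consistent with the open-coded test that was removed above:

/* Hypothetical arch-side helpers; the mask value is illustrative. */
#define _PAGE_NUMA_MASK         (_PAGE_NUMA | _PAGE_PROTNONE | _PAGE_PRESENT)
#define ptenuma_flags(pte)      (pte_flags(pte) & _PAGE_NUMA_MASK)
#define pmdnuma_flags(pmd)      (pmd_flags(pmd) & _PAGE_NUMA_MASK)
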
diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h
index f1a24b5c3b90..b58fd667f87b 100644
--- a/include/asm-generic/sections.h
+++ b/include/asm-generic/sections.h
@@ -3,6 +3,8 @@
3 3
4/* References to section boundaries */ 4/* References to section boundaries */
5 5
6#include <linux/compiler.h>
7
6/* 8/*
7 * Usage guidelines: 9 * Usage guidelines:
8 * _text, _data: architecture specific, don't use them in arch-independent code 10 * _text, _data: architecture specific, don't use them in arch-independent code
@@ -37,6 +39,8 @@ extern char __start_rodata[], __end_rodata[];
37/* Start and end of .ctors section - used for constructor calls. */ 39/* Start and end of .ctors section - used for constructor calls. */
38extern char __ctors_start[], __ctors_end[]; 40extern char __ctors_start[], __ctors_end[];
39 41
42extern __visible const void __nosave_begin, __nosave_end;
43
40/* function descriptor handling (if any). Override 44/* function descriptor handling (if any). Override
41 * in asm/sections.h */ 45 * in asm/sections.h */
42#ifndef dereference_function_descriptor 46#ifndef dereference_function_descriptor
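
The typed __nosave_begin/__nosave_end declarations exist so the hibernation code consolidated elsewhere in this series can drop its per-arch externs. The usual consumer looks roughly like this (a sketch of the common pfn_is_nosave() pattern, not a copy of any one architecture):

int pfn_is_nosave(unsigned long pfn)
{
        unsigned long nosave_begin_pfn = PFN_DOWN(__pa(&__nosave_begin));
        unsigned long nosave_end_pfn   = PFN_UP(__pa(&__nosave_end));

        /* pages in [__nosave_begin, __nosave_end) are not saved/restored */
        return pfn >= nosave_begin_pfn && pfn < nosave_end_pfn;
}
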
diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h
index 089743ade734..9b0a15d06a4f 100644
--- a/include/linux/balloon_compaction.h
+++ b/include/linux/balloon_compaction.h
@@ -27,10 +27,13 @@
27 * counter raised only while it is under our special handling; 27 * counter raised only while it is under our special handling;
28 * 28 *
29 * iii. after the lockless scan step have selected a potential balloon page for 29 * iii. after the lockless scan step have selected a potential balloon page for
30 * isolation, re-test the page->mapping flags and the page ref counter 30 * isolation, re-test the PageBalloon mark and the PagePrivate flag
31 * under the proper page lock, to ensure isolating a valid balloon page 31 * under the proper page lock, to ensure isolating a valid balloon page
32 * (not yet isolated, nor under release procedure) 32 * (not yet isolated, nor under release procedure)
33 * 33 *
34 * iv. isolation or dequeueing procedure must clear PagePrivate flag under
35 * page lock together with removing page from balloon device page list.
36 *
34 * The functions provided by this interface are placed to help on coping with 37 * The functions provided by this interface are placed to help on coping with
35 * the aforementioned balloon page corner case, as well as to ensure the simple 38 * the aforementioned balloon page corner case, as well as to ensure the simple
36 * set of exposed rules are satisfied while we are dealing with balloon pages 39 * set of exposed rules are satisfied while we are dealing with balloon pages
@@ -54,43 +57,22 @@
54 * balloon driver as a page book-keeper for its registered balloon devices. 57 * balloon driver as a page book-keeper for its registered balloon devices.
55 */ 58 */
56struct balloon_dev_info { 59struct balloon_dev_info {
57 void *balloon_device; /* balloon device descriptor */
58 struct address_space *mapping; /* balloon special page->mapping */
59 unsigned long isolated_pages; /* # of isolated pages for migration */ 60 unsigned long isolated_pages; /* # of isolated pages for migration */
60 spinlock_t pages_lock; /* Protection to pages list */ 61 spinlock_t pages_lock; /* Protection to pages list */
61 struct list_head pages; /* Pages enqueued & handled to Host */ 62 struct list_head pages; /* Pages enqueued & handled to Host */
63 int (*migratepage)(struct balloon_dev_info *, struct page *newpage,
64 struct page *page, enum migrate_mode mode);
62}; 65};
63 66
64extern struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info); 67extern struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info);
65extern struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info); 68extern struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info);
66extern struct balloon_dev_info *balloon_devinfo_alloc(
67 void *balloon_dev_descriptor);
68 69
69static inline void balloon_devinfo_free(struct balloon_dev_info *b_dev_info) 70static inline void balloon_devinfo_init(struct balloon_dev_info *balloon)
70{
71 kfree(b_dev_info);
72}
73
74/*
75 * balloon_page_free - release a balloon page back to the page free lists
76 * @page: ballooned page to be set free
77 *
78 * This function must be used to properly set free an isolated/dequeued balloon
79 * page at the end of a sucessful page migration, or at the balloon driver's
80 * page release procedure.
81 */
82static inline void balloon_page_free(struct page *page)
83{ 71{
84 /* 72 balloon->isolated_pages = 0;
85 * Balloon pages always get an extra refcount before being isolated 73 spin_lock_init(&balloon->pages_lock);
86 * and before being dequeued to help on sorting out fortuite colisions 74 INIT_LIST_HEAD(&balloon->pages);
87 * between a thread attempting to isolate and another thread attempting 75 balloon->migratepage = NULL;
88 * to release the very same balloon page.
89 *
90 * Before we handle the page back to Buddy, lets drop its extra refcnt.
91 */
92 put_page(page);
93 __free_page(page);
94} 76}
95 77
96#ifdef CONFIG_BALLOON_COMPACTION 78#ifdef CONFIG_BALLOON_COMPACTION
@@ -98,107 +80,58 @@ extern bool balloon_page_isolate(struct page *page);
98extern void balloon_page_putback(struct page *page); 80extern void balloon_page_putback(struct page *page);
99extern int balloon_page_migrate(struct page *newpage, 81extern int balloon_page_migrate(struct page *newpage,
100 struct page *page, enum migrate_mode mode); 82 struct page *page, enum migrate_mode mode);
101extern struct address_space
102*balloon_mapping_alloc(struct balloon_dev_info *b_dev_info,
103 const struct address_space_operations *a_ops);
104
105static inline void balloon_mapping_free(struct address_space *balloon_mapping)
106{
107 kfree(balloon_mapping);
108}
109 83
110/* 84/*
111 * page_flags_cleared - helper to perform balloon @page ->flags tests. 85 * __is_movable_balloon_page - helper to perform @page PageBalloon tests
112 *
113 * As balloon pages are obtained from buddy and we do not play with page->flags
114 * at driver level (exception made when we get the page lock for compaction),
115 * we can safely identify a ballooned page by checking if the
116 * PAGE_FLAGS_CHECK_AT_PREP page->flags are all cleared. This approach also
117 * helps us skip ballooned pages that are locked for compaction or release, thus
118 * mitigating their racy check at balloon_page_movable()
119 */
120static inline bool page_flags_cleared(struct page *page)
121{
122 return !(page->flags & PAGE_FLAGS_CHECK_AT_PREP);
123}
124
125/*
126 * __is_movable_balloon_page - helper to perform @page mapping->flags tests
127 */ 86 */
128static inline bool __is_movable_balloon_page(struct page *page) 87static inline bool __is_movable_balloon_page(struct page *page)
129{ 88{
130 struct address_space *mapping = page->mapping; 89 return PageBalloon(page);
131 return mapping_balloon(mapping);
132} 90}
133 91
134/* 92/*
135 * balloon_page_movable - test page->mapping->flags to identify balloon pages 93 * balloon_page_movable - test PageBalloon to identify balloon pages
136 * that can be moved by compaction/migration. 94 * and PagePrivate to check that the page is not
137 * 95 * isolated and can be moved by compaction/migration.
138 * This function is used at core compaction's page isolation scheme, therefore
139 * most pages exposed to it are not enlisted as balloon pages and so, to avoid
140 * undesired side effects like racing against __free_pages(), we cannot afford
141 * holding the page locked while testing page->mapping->flags here.
142 * 96 *
143 * As we might return false positives in the case of a balloon page being just 97 * As we might return false positives in the case of a balloon page being just
144 * released under us, the page->mapping->flags need to be re-tested later, 98 * released under us, this needs to be re-tested later, under the page lock.
145 * under the proper page lock, at the functions that will be coping with the
146 * balloon page case.
147 */ 99 */
148static inline bool balloon_page_movable(struct page *page) 100static inline bool balloon_page_movable(struct page *page)
149{ 101{
150 /* 102 return PageBalloon(page) && PagePrivate(page);
151 * Before dereferencing and testing mapping->flags, let's make sure
152 * this is not a page that uses ->mapping in a different way
153 */
154 if (page_flags_cleared(page) && !page_mapped(page) &&
155 page_count(page) == 1)
156 return __is_movable_balloon_page(page);
157
158 return false;
159} 103}
160 104
161/* 105/*
162 * isolated_balloon_page - identify an isolated balloon page on private 106 * isolated_balloon_page - identify an isolated balloon page on private
163 * compaction/migration page lists. 107 * compaction/migration page lists.
164 *
165 * After a compaction thread isolates a balloon page for migration, it raises
166 * the page refcount to prevent concurrent compaction threads from re-isolating
167 * the same page. For that reason putback_movable_pages(), or other routines
168 * that need to identify isolated balloon pages on private pagelists, cannot
169 * rely on balloon_page_movable() to accomplish the task.
170 */ 108 */
171static inline bool isolated_balloon_page(struct page *page) 109static inline bool isolated_balloon_page(struct page *page)
172{ 110{
173 /* Already isolated balloon pages, by default, have a raised refcount */ 111 return PageBalloon(page);
174 if (page_flags_cleared(page) && !page_mapped(page) &&
175 page_count(page) >= 2)
176 return __is_movable_balloon_page(page);
177
178 return false;
179} 112}
180 113
181/* 114/*
182 * balloon_page_insert - insert a page into the balloon's page list and make 115 * balloon_page_insert - insert a page into the balloon's page list and make
183 * the page->mapping assignment accordingly. 116 * the page->private assignment accordingly.
117 * @balloon : pointer to balloon device
184 * @page : page to be assigned as a 'balloon page' 118 * @page : page to be assigned as a 'balloon page'
185 * @mapping : allocated special 'balloon_mapping'
186 * @head : balloon's device page list head
187 * 119 *
188 * Caller must ensure the page is locked and the spin_lock protecting balloon 120 * Caller must ensure the page is locked and the spin_lock protecting balloon
189 * pages list is held before inserting a page into the balloon device. 121 * pages list is held before inserting a page into the balloon device.
190 */ 122 */
191static inline void balloon_page_insert(struct page *page, 123static inline void balloon_page_insert(struct balloon_dev_info *balloon,
192 struct address_space *mapping, 124 struct page *page)
193 struct list_head *head)
194{ 125{
195 page->mapping = mapping; 126 __SetPageBalloon(page);
196 list_add(&page->lru, head); 127 SetPagePrivate(page);
128 set_page_private(page, (unsigned long)balloon);
129 list_add(&page->lru, &balloon->pages);
197} 130}
198 131
199/* 132/*
200 * balloon_page_delete - delete a page from balloon's page list and clear 133 * balloon_page_delete - delete a page from balloon's page list and clear
201 * the page->mapping assignment accordingly. 134 * the page->private assignment accordingly.
202 * @page : page to be released from balloon's page list 135 * @page : page to be released from balloon's page list
203 * 136 *
204 * Caller must ensure the page is locked and the spin_lock protecting balloon 137 * Caller must ensure the page is locked and the spin_lock protecting balloon
@@ -206,8 +139,12 @@ static inline void balloon_page_insert(struct page *page,
206 */ 139 */
207static inline void balloon_page_delete(struct page *page) 140static inline void balloon_page_delete(struct page *page)
208{ 141{
209 page->mapping = NULL; 142 __ClearPageBalloon(page);
210 list_del(&page->lru); 143 set_page_private(page, 0);
144 if (PagePrivate(page)) {
145 ClearPagePrivate(page);
146 list_del(&page->lru);
147 }
211} 148}
212 149
213/* 150/*
@@ -216,11 +153,7 @@ static inline void balloon_page_delete(struct page *page)
216 */ 153 */
217static inline struct balloon_dev_info *balloon_page_device(struct page *page) 154static inline struct balloon_dev_info *balloon_page_device(struct page *page)
218{ 155{
219 struct address_space *mapping = page->mapping; 156 return (struct balloon_dev_info *)page_private(page);
220 if (likely(mapping))
221 return mapping->private_data;
222
223 return NULL;
224} 157}
225 158
226static inline gfp_t balloon_mapping_gfp_mask(void) 159static inline gfp_t balloon_mapping_gfp_mask(void)
@@ -228,34 +161,24 @@ static inline gfp_t balloon_mapping_gfp_mask(void)
228 return GFP_HIGHUSER_MOVABLE; 161 return GFP_HIGHUSER_MOVABLE;
229} 162}
230 163
231static inline bool balloon_compaction_check(void)
232{
233 return true;
234}
235
236#else /* !CONFIG_BALLOON_COMPACTION */ 164#else /* !CONFIG_BALLOON_COMPACTION */
237 165
238static inline void *balloon_mapping_alloc(void *balloon_device, 166static inline void balloon_page_insert(struct balloon_dev_info *balloon,
239 const struct address_space_operations *a_ops) 167 struct page *page)
240{
241 return ERR_PTR(-EOPNOTSUPP);
242}
243
244static inline void balloon_mapping_free(struct address_space *balloon_mapping)
245{ 168{
246 return; 169 __SetPageBalloon(page);
170 list_add(&page->lru, &balloon->pages);
247} 171}
248 172
249static inline void balloon_page_insert(struct page *page, 173static inline void balloon_page_delete(struct page *page)
250 struct address_space *mapping,
251 struct list_head *head)
252{ 174{
253 list_add(&page->lru, head); 175 __ClearPageBalloon(page);
176 list_del(&page->lru);
254} 177}
255 178
256static inline void balloon_page_delete(struct page *page) 179static inline bool __is_movable_balloon_page(struct page *page)
257{ 180{
258 list_del(&page->lru); 181 return false;
259} 182}
260 183
261static inline bool balloon_page_movable(struct page *page) 184static inline bool balloon_page_movable(struct page *page)
@@ -289,9 +212,5 @@ static inline gfp_t balloon_mapping_gfp_mask(void)
289 return GFP_HIGHUSER; 212 return GFP_HIGHUSER;
290} 213}
291 214
292static inline bool balloon_compaction_check(void)
293{
294 return false;
295}
296#endif /* CONFIG_BALLOON_COMPACTION */ 215#endif /* CONFIG_BALLOON_COMPACTION */
297#endif /* _LINUX_BALLOON_COMPACTION_H */ 216#endif /* _LINUX_BALLOON_COMPACTION_H */
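
With the balloon-specific address_space gone, a balloon driver now embeds a struct balloon_dev_info, initializes it with balloon_devinfo_init() and hooks its migration callback directly; compaction finds the device again through page_private(). A hedged driver-side sketch (the names are made up; the virtio_balloon.c change in this diffstat is the real conversion):

static struct balloon_dev_info my_balloon;      /* hypothetical device */

static int my_migratepage(struct balloon_dev_info *info,
                          struct page *newpage, struct page *page,
                          enum migrate_mode mode)
{
        /* tell the host about newpage, forget the old page, then: */
        return MIGRATEPAGE_SUCCESS;
}

static int __init my_balloon_init(void)
{
        balloon_devinfo_init(&my_balloon);
        my_balloon.migratepage = my_migratepage;
        return 0;
}
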
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 518b46555b80..87be398166d3 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1564,7 +1564,7 @@ static inline int blk_rq_map_integrity_sg(struct request_queue *q,
1564} 1564}
1565static inline struct blk_integrity *bdev_get_integrity(struct block_device *b) 1565static inline struct blk_integrity *bdev_get_integrity(struct block_device *b)
1566{ 1566{
1567 return 0; 1567 return NULL;
1568} 1568}
1569static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk) 1569static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk)
1570{ 1570{
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 01e3132820da..60bdf8dc02a3 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -2,14 +2,24 @@
2#define _LINUX_COMPACTION_H 2#define _LINUX_COMPACTION_H
3 3
4/* Return values for compact_zone() and try_to_compact_pages() */ 4/* Return values for compact_zone() and try_to_compact_pages() */
5/* compaction didn't start as it was deferred due to past failures */
6#define COMPACT_DEFERRED 0
5/* compaction didn't start as it was not possible or direct reclaim was more suitable */ 7/* compaction didn't start as it was not possible or direct reclaim was more suitable */
6#define COMPACT_SKIPPED 0 8#define COMPACT_SKIPPED 1
7/* compaction should continue to another pageblock */ 9/* compaction should continue to another pageblock */
8#define COMPACT_CONTINUE 1 10#define COMPACT_CONTINUE 2
9/* direct compaction partially compacted a zone and there are suitable pages */ 11/* direct compaction partially compacted a zone and there are suitable pages */
10#define COMPACT_PARTIAL 2 12#define COMPACT_PARTIAL 3
11/* The full zone was compacted */ 13/* The full zone was compacted */
12#define COMPACT_COMPLETE 3 14#define COMPACT_COMPLETE 4
15
16/* Used to signal whether compaction detected need_sched() or lock contention */
17/* No contention detected */
18#define COMPACT_CONTENDED_NONE 0
19/* Either need_sched() was true or fatal signal pending */
20#define COMPACT_CONTENDED_SCHED 1
21/* Zone lock or lru_lock was contended in async compaction */
22#define COMPACT_CONTENDED_LOCK 2
13 23
14#ifdef CONFIG_COMPACTION 24#ifdef CONFIG_COMPACTION
15extern int sysctl_compact_memory; 25extern int sysctl_compact_memory;
@@ -22,7 +32,8 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write,
22extern int fragmentation_index(struct zone *zone, unsigned int order); 32extern int fragmentation_index(struct zone *zone, unsigned int order);
23extern unsigned long try_to_compact_pages(struct zonelist *zonelist, 33extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
24 int order, gfp_t gfp_mask, nodemask_t *mask, 34 int order, gfp_t gfp_mask, nodemask_t *mask,
25 enum migrate_mode mode, bool *contended); 35 enum migrate_mode mode, int *contended,
36 struct zone **candidate_zone);
26extern void compact_pgdat(pg_data_t *pgdat, int order); 37extern void compact_pgdat(pg_data_t *pgdat, int order);
27extern void reset_isolation_suitable(pg_data_t *pgdat); 38extern void reset_isolation_suitable(pg_data_t *pgdat);
28extern unsigned long compaction_suitable(struct zone *zone, int order); 39extern unsigned long compaction_suitable(struct zone *zone, int order);
@@ -91,7 +102,8 @@ static inline bool compaction_restarting(struct zone *zone, int order)
91#else 102#else
92static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, 103static inline unsigned long try_to_compact_pages(struct zonelist *zonelist,
93 int order, gfp_t gfp_mask, nodemask_t *nodemask, 104 int order, gfp_t gfp_mask, nodemask_t *nodemask,
94 enum migrate_mode mode, bool *contended) 105 enum migrate_mode mode, int *contended,
106 struct zone **candidate_zone)
95{ 107{
96 return COMPACT_CONTINUE; 108 return COMPACT_CONTINUE;
97} 109}
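
try_to_compact_pages() callers now get an ordered result (COMPACT_DEFERRED < SKIPPED < CONTINUE < PARTIAL < COMPLETE), a tri-state contention report, and on success a hint which zone the allocation should be retried in. A sketch of the new call shape from a direct-compaction path (variable and function names are illustrative):

static unsigned long compact_for_alloc(struct zonelist *zonelist, int order,
                                       gfp_t gfp_mask, nodemask_t *nodemask,
                                       struct zone **preferred)
{
        int contended = COMPACT_CONTENDED_NONE;
        unsigned long rc;

        rc = try_to_compact_pages(zonelist, order, gfp_mask, nodemask,
                                  MIGRATE_ASYNC, &contended, preferred);

        if (rc == COMPACT_DEFERRED)
                return rc;      /* recently failed, don't insist          */
        if (contended == COMPACT_CONTENDED_LOCK)
                return rc;      /* async compaction backed off a lock     */
        /* on COMPACT_PARTIAL, retry the allocation in *preferred first   */
        return rc;
}
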
diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h
index 1c2fdaa2ffc3..1ccaab44abcc 100644
--- a/include/linux/genalloc.h
+++ b/include/linux/genalloc.h
@@ -110,6 +110,10 @@ extern void gen_pool_set_algo(struct gen_pool *pool, genpool_algo_t algo,
110extern unsigned long gen_pool_first_fit(unsigned long *map, unsigned long size, 110extern unsigned long gen_pool_first_fit(unsigned long *map, unsigned long size,
111 unsigned long start, unsigned int nr, void *data); 111 unsigned long start, unsigned int nr, void *data);
112 112
113extern unsigned long gen_pool_first_fit_order_align(unsigned long *map,
114 unsigned long size, unsigned long start, unsigned int nr,
115 void *data);
116
113extern unsigned long gen_pool_best_fit(unsigned long *map, unsigned long size, 117extern unsigned long gen_pool_best_fit(unsigned long *map, unsigned long size,
114 unsigned long start, unsigned int nr, void *data); 118 unsigned long start, unsigned int nr, void *data);
115 119
@@ -117,6 +121,9 @@ extern struct gen_pool *devm_gen_pool_create(struct device *dev,
117 int min_alloc_order, int nid); 121 int min_alloc_order, int nid);
118extern struct gen_pool *dev_get_gen_pool(struct device *dev); 122extern struct gen_pool *dev_get_gen_pool(struct device *dev);
119 123
124bool addr_in_gen_pool(struct gen_pool *pool, unsigned long start,
125 size_t size);
126
120#ifdef CONFIG_OF 127#ifdef CONFIG_OF
121extern struct gen_pool *of_get_named_gen_pool(struct device_node *np, 128extern struct gen_pool *of_get_named_gen_pool(struct device_node *np,
122 const char *propname, int index); 129 const char *propname, int index);
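
The two additions support the DMA atomic-pool work elsewhere in this series: an allocation algorithm that aligns each block to its own (power-of-two) size, and a test for whether an address range belongs to a pool. Usage sketch, assuming an already-created gen_pool:

/* Hand out naturally aligned blocks from this pool. */
gen_pool_set_algo(pool, gen_pool_first_fit_order_align, NULL);

/* Only free the address if it really came from the pool. */
if (addr_in_gen_pool(pool, (unsigned long)vaddr, size))
        gen_pool_free(pool, (unsigned long)vaddr, size);
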
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 5e7219dc0fae..41b30fd4d041 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -156,7 +156,7 @@ struct vm_area_struct;
156#define GFP_DMA32 __GFP_DMA32 156#define GFP_DMA32 __GFP_DMA32
157 157
158/* Convert GFP flags to their corresponding migrate type */ 158/* Convert GFP flags to their corresponding migrate type */
159static inline int allocflags_to_migratetype(gfp_t gfp_flags) 159static inline int gfpflags_to_migratetype(const gfp_t gfp_flags)
160{ 160{
161 WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK); 161 WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK);
162 162
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 63579cb8d3dc..ad9051bab267 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -132,7 +132,7 @@ extern int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
132static inline int pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, 132static inline int pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
133 spinlock_t **ptl) 133 spinlock_t **ptl)
134{ 134{
135 VM_BUG_ON(!rwsem_is_locked(&vma->vm_mm->mmap_sem)); 135 VM_BUG_ON_VMA(!rwsem_is_locked(&vma->vm_mm->mmap_sem), vma);
136 if (pmd_trans_huge(*pmd)) 136 if (pmd_trans_huge(*pmd))
137 return __pmd_trans_huge_lock(pmd, vma, ptl); 137 return __pmd_trans_huge_lock(pmd, vma, ptl);
138 else 138 else
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 95624bed87ef..e9e420b6d931 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -715,23 +715,8 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { }
715 (void) (&_max1 == &_max2); \ 715 (void) (&_max1 == &_max2); \
716 _max1 > _max2 ? _max1 : _max2; }) 716 _max1 > _max2 ? _max1 : _max2; })
717 717
718#define min3(x, y, z) ({ \ 718#define min3(x, y, z) min((typeof(x))min(x, y), z)
719 typeof(x) _min1 = (x); \ 719#define max3(x, y, z) max((typeof(x))max(x, y), z)
720 typeof(y) _min2 = (y); \
721 typeof(z) _min3 = (z); \
722 (void) (&_min1 == &_min2); \
723 (void) (&_min1 == &_min3); \
724 _min1 < _min2 ? (_min1 < _min3 ? _min1 : _min3) : \
725 (_min2 < _min3 ? _min2 : _min3); })
726
727#define max3(x, y, z) ({ \
728 typeof(x) _max1 = (x); \
729 typeof(y) _max2 = (y); \
730 typeof(z) _max3 = (z); \
731 (void) (&_max1 == &_max2); \
732 (void) (&_max1 == &_max3); \
733 _max1 > _max2 ? (_max1 > _max3 ? _max1 : _max3) : \
734 (_max2 > _max3 ? _max2 : _max3); })
735 720
736/** 721/**
737 * min_not_zero - return the minimum that is _not_ zero, unless both are zero 722 * min_not_zero - return the minimum that is _not_ zero, unless both are zero
@@ -746,20 +731,13 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { }
746/** 731/**
747 * clamp - return a value clamped to a given range with strict typechecking 732 * clamp - return a value clamped to a given range with strict typechecking
748 * @val: current value 733 * @val: current value
749 * @min: minimum allowable value 734 * @lo: lowest allowable value
750 * @max: maximum allowable value 735 * @hi: highest allowable value
751 * 736 *
752 * This macro does strict typechecking of min/max to make sure they are of the 737 * This macro does strict typechecking of lo/hi to make sure they are of the
753 * same type as val. See the unnecessary pointer comparisons. 738 * same type as val. See the unnecessary pointer comparisons.
754 */ 739 */
755#define clamp(val, min, max) ({ \ 740#define clamp(val, lo, hi) min((typeof(val))max(val, lo), hi)
756 typeof(val) __val = (val); \
757 typeof(min) __min = (min); \
758 typeof(max) __max = (max); \
759 (void) (&__val == &__min); \
760 (void) (&__val == &__max); \
761 __val = __val < __min ? __min: __val; \
762 __val > __max ? __max: __val; })
763 741
764/* 742/*
765 * ..and if you can't take the strict 743 * ..and if you can't take the strict
@@ -781,36 +759,26 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { }
781 * clamp_t - return a value clamped to a given range using a given type 759 * clamp_t - return a value clamped to a given range using a given type
782 * @type: the type of variable to use 760 * @type: the type of variable to use
783 * @val: current value 761 * @val: current value
784 * @min: minimum allowable value 762 * @lo: minimum allowable value
785 * @max: maximum allowable value 763 * @hi: maximum allowable value
786 * 764 *
787 * This macro does no typechecking and uses temporary variables of type 765 * This macro does no typechecking and uses temporary variables of type
788 * 'type' to make all the comparisons. 766 * 'type' to make all the comparisons.
789 */ 767 */
790#define clamp_t(type, val, min, max) ({ \ 768#define clamp_t(type, val, lo, hi) min_t(type, max_t(type, val, lo), hi)
791 type __val = (val); \
792 type __min = (min); \
793 type __max = (max); \
794 __val = __val < __min ? __min: __val; \
795 __val > __max ? __max: __val; })
796 769
797/** 770/**
798 * clamp_val - return a value clamped to a given range using val's type 771 * clamp_val - return a value clamped to a given range using val's type
799 * @val: current value 772 * @val: current value
800 * @min: minimum allowable value 773 * @lo: minimum allowable value
801 * @max: maximum allowable value 774 * @hi: maximum allowable value
802 * 775 *
803 * This macro does no typechecking and uses temporary variables of whatever 776 * This macro does no typechecking and uses temporary variables of whatever
804 * type the input argument 'val' is. This is useful when val is an unsigned 777 * type the input argument 'val' is. This is useful when val is an unsigned
805 * type and min and max are literals that will otherwise be assigned a signed 778 * type and min and max are literals that will otherwise be assigned a signed
806 * integer type. 779 * integer type.
807 */ 780 */
808#define clamp_val(val, min, max) ({ \ 781#define clamp_val(val, lo, hi) clamp_t(typeof(val), val, lo, hi)
809 typeof(val) __val = (val); \
810 typeof(val) __min = (min); \
811 typeof(val) __max = (max); \
812 __val = __val < __min ? __min: __val; \
813 __val > __max ? __max: __val; })
814 782
815 783
816/* 784/*
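
The rewritten helpers keep the strict type checking but expand to nested min()/max(); for instance clamp(val, lo, hi) is now literally min(max(val, lo), hi). A small worked example:

unsigned int req   = 200u;
unsigned int batch = clamp(req, 1u, 64u);
/* max(200u, 1u) == 200u, min(200u, 64u) == 64u  ->  batch == 64 */
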
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index e0752d204d9e..19df5d857411 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -440,11 +440,6 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order);
440 440
441int memcg_cache_id(struct mem_cgroup *memcg); 441int memcg_cache_id(struct mem_cgroup *memcg);
442 442
443int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
444 struct kmem_cache *root_cache);
445void memcg_free_cache_params(struct kmem_cache *s);
446
447int memcg_update_cache_size(struct kmem_cache *s, int num_groups);
448void memcg_update_array_size(int num_groups); 443void memcg_update_array_size(int num_groups);
449 444
450struct kmem_cache * 445struct kmem_cache *
@@ -574,16 +569,6 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg)
574 return -1; 569 return -1;
575} 570}
576 571
577static inline int memcg_alloc_cache_params(struct mem_cgroup *memcg,
578 struct kmem_cache *s, struct kmem_cache *root_cache)
579{
580 return 0;
581}
582
583static inline void memcg_free_cache_params(struct kmem_cache *s)
584{
585}
586
587static inline struct kmem_cache * 572static inline struct kmem_cache *
588memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) 573memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
589{ 574{
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index d9524c49d767..8f1a41951df9 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -84,6 +84,7 @@ extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages);
84extern int add_one_highpage(struct page *page, int pfn, int bad_ppro); 84extern int add_one_highpage(struct page *page, int pfn, int bad_ppro);
85/* VM interface that may be used by firmware interface */ 85/* VM interface that may be used by firmware interface */
86extern int online_pages(unsigned long, unsigned long, int); 86extern int online_pages(unsigned long, unsigned long, int);
87extern int test_pages_in_a_zone(unsigned long, unsigned long);
87extern void __offline_isolated_pages(unsigned long, unsigned long); 88extern void __offline_isolated_pages(unsigned long, unsigned long);
88 89
89typedef void (*online_page_callback_t)(struct page *page); 90typedef void (*online_page_callback_t)(struct page *page);
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index f230a978e6ba..3d385c81c153 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -134,9 +134,10 @@ void mpol_free_shared_policy(struct shared_policy *p);
134struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, 134struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
135 unsigned long idx); 135 unsigned long idx);
136 136
137struct mempolicy *get_vma_policy(struct task_struct *tsk, 137struct mempolicy *get_task_policy(struct task_struct *p);
138 struct vm_area_struct *vma, unsigned long addr); 138struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
139bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma); 139 unsigned long addr);
140bool vma_policy_mof(struct vm_area_struct *vma);
140 141
141extern void numa_default_policy(void); 142extern void numa_default_policy(void);
142extern void numa_policy_init(void); 143extern void numa_policy_init(void);
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index a2901c414664..01aad3ed89ec 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -13,18 +13,9 @@ typedef void free_page_t(struct page *page, unsigned long private);
13 * Return values from address_space_operations.migratepage(): 13 * Return values from address_space_operations.migratepage():
14 * - negative errno on page migration failure; 14 * - negative errno on page migration failure;
15 * - zero on page migration success; 15 * - zero on page migration success;
16 *
17 * The balloon page migration introduces this special case where a 'distinct'
18 * return code is used to flag a successful page migration to unmap_and_move().
19 * This approach is necessary because page migration can race against balloon
20 * deflation procedure, and for such case we could introduce a nasty page leak
21 * if a successfully migrated balloon page gets released concurrently with
22 * migration's unmap_and_move() wrap-up steps.
23 */ 16 */
24#define MIGRATEPAGE_SUCCESS 0 17#define MIGRATEPAGE_SUCCESS 0
25#define MIGRATEPAGE_BALLOON_SUCCESS 1 /* special ret code for balloon page 18
26 * sucessful migration case.
27 */
28enum migrate_reason { 19enum migrate_reason {
29 MR_COMPACTION, 20 MR_COMPACTION,
30 MR_MEMORY_FAILURE, 21 MR_MEMORY_FAILURE,
@@ -82,9 +73,6 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
82 return -ENOSYS; 73 return -ENOSYS;
83} 74}
84 75
85/* Possible settings for the migrate_page() method in address_operations */
86#define migrate_page NULL
87
88#endif /* CONFIG_MIGRATION */ 76#endif /* CONFIG_MIGRATION */
89 77
90#ifdef CONFIG_NUMA_BALANCING 78#ifdef CONFIG_NUMA_BALANCING
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0f4196a0bc20..fa0d74e06428 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -18,6 +18,7 @@
18#include <linux/pfn.h> 18#include <linux/pfn.h>
19#include <linux/bit_spinlock.h> 19#include <linux/bit_spinlock.h>
20#include <linux/shrinker.h> 20#include <linux/shrinker.h>
21#include <linux/resource.h>
21 22
22struct mempolicy; 23struct mempolicy;
23struct anon_vma; 24struct anon_vma;
@@ -553,6 +554,25 @@ static inline void __ClearPageBuddy(struct page *page)
553 atomic_set(&page->_mapcount, -1); 554 atomic_set(&page->_mapcount, -1);
554} 555}
555 556
557#define PAGE_BALLOON_MAPCOUNT_VALUE (-256)
558
559static inline int PageBalloon(struct page *page)
560{
561 return atomic_read(&page->_mapcount) == PAGE_BALLOON_MAPCOUNT_VALUE;
562}
563
564static inline void __SetPageBalloon(struct page *page)
565{
566 VM_BUG_ON_PAGE(atomic_read(&page->_mapcount) != -1, page);
567 atomic_set(&page->_mapcount, PAGE_BALLOON_MAPCOUNT_VALUE);
568}
569
570static inline void __ClearPageBalloon(struct page *page)
571{
572 VM_BUG_ON_PAGE(!PageBalloon(page), page);
573 atomic_set(&page->_mapcount, -1);
574}
575
556void put_page(struct page *page); 576void put_page(struct page *page);
557void put_pages_list(struct list_head *pages); 577void put_pages_list(struct list_head *pages);
558 578
@@ -1247,8 +1267,8 @@ static inline int stack_guard_page_end(struct vm_area_struct *vma,
1247 !vma_growsup(vma->vm_next, addr); 1267 !vma_growsup(vma->vm_next, addr);
1248} 1268}
1249 1269
1250extern pid_t 1270extern struct task_struct *task_of_stack(struct task_struct *task,
1251vm_is_stack(struct task_struct *task, struct vm_area_struct *vma, int in_group); 1271 struct vm_area_struct *vma, bool in_group);
1252 1272
1253extern unsigned long move_page_tables(struct vm_area_struct *vma, 1273extern unsigned long move_page_tables(struct vm_area_struct *vma,
1254 unsigned long old_addr, struct vm_area_struct *new_vma, 1274 unsigned long old_addr, struct vm_area_struct *new_vma,
@@ -1780,6 +1800,20 @@ extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
1780 bool *need_rmap_locks); 1800 bool *need_rmap_locks);
1781extern void exit_mmap(struct mm_struct *); 1801extern void exit_mmap(struct mm_struct *);
1782 1802
1803static inline int check_data_rlimit(unsigned long rlim,
1804 unsigned long new,
1805 unsigned long start,
1806 unsigned long end_data,
1807 unsigned long start_data)
1808{
1809 if (rlim < RLIM_INFINITY) {
1810 if (((new - start) + (end_data - start_data)) > rlim)
1811 return -ENOSPC;
1812 }
1813
1814 return 0;
1815}
1816
1783extern int mm_take_all_locks(struct mm_struct *mm); 1817extern int mm_take_all_locks(struct mm_struct *mm);
1784extern void mm_drop_all_locks(struct mm_struct *mm); 1818extern void mm_drop_all_locks(struct mm_struct *mm);
1785 1819
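
check_data_rlimit() factors the RLIMIT_DATA test out so that brk() and the prctl(PR_SET_MM) paths apply the same rule; a sketch of the expected call site (presumably the brk() path, with the mm already locked):

/* Reject a new program break that would push the data size past the limit. */
if (check_data_rlimit(rlimit(RLIMIT_DATA), brk, mm->start_brk,
                      mm->end_data, mm->start_data))
        goto out;       /* leave the break unchanged */
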
diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h
index 2f348d02f640..877ef226f90f 100644
--- a/include/linux/mmdebug.h
+++ b/include/linux/mmdebug.h
@@ -4,10 +4,14 @@
4#include <linux/stringify.h> 4#include <linux/stringify.h>
5 5
6struct page; 6struct page;
7struct vm_area_struct;
8struct mm_struct;
7 9
8extern void dump_page(struct page *page, const char *reason); 10extern void dump_page(struct page *page, const char *reason);
9extern void dump_page_badflags(struct page *page, const char *reason, 11extern void dump_page_badflags(struct page *page, const char *reason,
10 unsigned long badflags); 12 unsigned long badflags);
13void dump_vma(const struct vm_area_struct *vma);
14void dump_mm(const struct mm_struct *mm);
11 15
12#ifdef CONFIG_DEBUG_VM 16#ifdef CONFIG_DEBUG_VM
13#define VM_BUG_ON(cond) BUG_ON(cond) 17#define VM_BUG_ON(cond) BUG_ON(cond)
@@ -18,12 +22,28 @@ extern void dump_page_badflags(struct page *page, const char *reason,
18 BUG(); \ 22 BUG(); \
19 } \ 23 } \
20 } while (0) 24 } while (0)
25#define VM_BUG_ON_VMA(cond, vma) \
26 do { \
27 if (unlikely(cond)) { \
28 dump_vma(vma); \
29 BUG(); \
30 } \
31 } while (0)
32#define VM_BUG_ON_MM(cond, mm) \
33 do { \
34 if (unlikely(cond)) { \
35 dump_mm(mm); \
36 BUG(); \
37 } \
38 } while (0)
21#define VM_WARN_ON(cond) WARN_ON(cond) 39#define VM_WARN_ON(cond) WARN_ON(cond)
22#define VM_WARN_ON_ONCE(cond) WARN_ON_ONCE(cond) 40#define VM_WARN_ON_ONCE(cond) WARN_ON_ONCE(cond)
23#define VM_WARN_ONCE(cond, format...) WARN_ONCE(cond, format) 41#define VM_WARN_ONCE(cond, format...) WARN_ONCE(cond, format)
24#else 42#else
25#define VM_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond) 43#define VM_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond)
26#define VM_BUG_ON_PAGE(cond, page) VM_BUG_ON(cond) 44#define VM_BUG_ON_PAGE(cond, page) VM_BUG_ON(cond)
45#define VM_BUG_ON_VMA(cond, vma) VM_BUG_ON(cond)
46#define VM_BUG_ON_MM(cond, mm) VM_BUG_ON(cond)
27#define VM_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond) 47#define VM_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond)
28#define VM_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond) 48#define VM_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond)
29#define VM_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond) 49#define VM_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond)
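
The new variants dump the whole VMA or mm before BUG()ing, which is what the huge_mm.h and rmap.h hunks in this diff switch their assertions to; typical use is simply:

VM_BUG_ON_VMA(vma->vm_start >= vma->vm_end, vma);       /* dumps the vma */
VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);      /* dumps the mm  */
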
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 318df7051850..48bf12ef6620 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -521,13 +521,13 @@ struct zone {
521 atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; 521 atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
522} ____cacheline_internodealigned_in_smp; 522} ____cacheline_internodealigned_in_smp;
523 523
524typedef enum { 524enum zone_flags {
525 ZONE_RECLAIM_LOCKED, /* prevents concurrent reclaim */ 525 ZONE_RECLAIM_LOCKED, /* prevents concurrent reclaim */
526 ZONE_OOM_LOCKED, /* zone is in OOM killer zonelist */ 526 ZONE_OOM_LOCKED, /* zone is in OOM killer zonelist */
527 ZONE_CONGESTED, /* zone has many dirty pages backed by 527 ZONE_CONGESTED, /* zone has many dirty pages backed by
528 * a congested BDI 528 * a congested BDI
529 */ 529 */
530 ZONE_TAIL_LRU_DIRTY, /* reclaim scanning has recently found 530 ZONE_DIRTY, /* reclaim scanning has recently found
531 * many dirty file pages at the tail 531 * many dirty file pages at the tail
532 * of the LRU. 532 * of the LRU.
533 */ 533 */
@@ -535,52 +535,7 @@ typedef enum {
535 * many pages under writeback 535 * many pages under writeback
536 */ 536 */
537 ZONE_FAIR_DEPLETED, /* fair zone policy batch depleted */ 537 ZONE_FAIR_DEPLETED, /* fair zone policy batch depleted */
538} zone_flags_t; 538};
539
540static inline void zone_set_flag(struct zone *zone, zone_flags_t flag)
541{
542 set_bit(flag, &zone->flags);
543}
544
545static inline int zone_test_and_set_flag(struct zone *zone, zone_flags_t flag)
546{
547 return test_and_set_bit(flag, &zone->flags);
548}
549
550static inline void zone_clear_flag(struct zone *zone, zone_flags_t flag)
551{
552 clear_bit(flag, &zone->flags);
553}
554
555static inline int zone_is_reclaim_congested(const struct zone *zone)
556{
557 return test_bit(ZONE_CONGESTED, &zone->flags);
558}
559
560static inline int zone_is_reclaim_dirty(const struct zone *zone)
561{
562 return test_bit(ZONE_TAIL_LRU_DIRTY, &zone->flags);
563}
564
565static inline int zone_is_reclaim_writeback(const struct zone *zone)
566{
567 return test_bit(ZONE_WRITEBACK, &zone->flags);
568}
569
570static inline int zone_is_reclaim_locked(const struct zone *zone)
571{
572 return test_bit(ZONE_RECLAIM_LOCKED, &zone->flags);
573}
574
575static inline int zone_is_fair_depleted(const struct zone *zone)
576{
577 return test_bit(ZONE_FAIR_DEPLETED, &zone->flags);
578}
579
580static inline int zone_is_oom_locked(const struct zone *zone)
581{
582 return test_bit(ZONE_OOM_LOCKED, &zone->flags);
583}
584 539
585static inline unsigned long zone_end_pfn(const struct zone *zone) 540static inline unsigned long zone_end_pfn(const struct zone *zone)
586{ 541{
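
With the zone_set_flag()/zone_is_*() wrappers removed, callers operate on zone->flags with the ordinary bitops; e.g. what vmscan used to express through the helpers becomes (sketch, the throttle call is illustrative):

set_bit(ZONE_DIRTY, &zone->flags);              /* was zone_set_flag()            */
clear_bit(ZONE_CONGESTED, &zone->flags);        /* was zone_clear_flag()          */
if (test_bit(ZONE_WRITEBACK, &zone->flags))     /* was zone_is_reclaim_writeback()*/
        congestion_wait(BLK_RW_ASYNC, HZ/10);
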
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 19191d39c4f3..7ea069cd3257 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -24,8 +24,7 @@ enum mapping_flags {
24 AS_ENOSPC = __GFP_BITS_SHIFT + 1, /* ENOSPC on async write */ 24 AS_ENOSPC = __GFP_BITS_SHIFT + 1, /* ENOSPC on async write */
25 AS_MM_ALL_LOCKS = __GFP_BITS_SHIFT + 2, /* under mm_take_all_locks() */ 25 AS_MM_ALL_LOCKS = __GFP_BITS_SHIFT + 2, /* under mm_take_all_locks() */
26 AS_UNEVICTABLE = __GFP_BITS_SHIFT + 3, /* e.g., ramdisk, SHM_LOCK */ 26 AS_UNEVICTABLE = __GFP_BITS_SHIFT + 3, /* e.g., ramdisk, SHM_LOCK */
27 AS_BALLOON_MAP = __GFP_BITS_SHIFT + 4, /* balloon page special map */ 27 AS_EXITING = __GFP_BITS_SHIFT + 4, /* final truncate in progress */
28 AS_EXITING = __GFP_BITS_SHIFT + 5, /* final truncate in progress */
29}; 28};
30 29
31static inline void mapping_set_error(struct address_space *mapping, int error) 30static inline void mapping_set_error(struct address_space *mapping, int error)
@@ -55,21 +54,6 @@ static inline int mapping_unevictable(struct address_space *mapping)
55 return !!mapping; 54 return !!mapping;
56} 55}
57 56
58static inline void mapping_set_balloon(struct address_space *mapping)
59{
60 set_bit(AS_BALLOON_MAP, &mapping->flags);
61}
62
63static inline void mapping_clear_balloon(struct address_space *mapping)
64{
65 clear_bit(AS_BALLOON_MAP, &mapping->flags);
66}
67
68static inline int mapping_balloon(struct address_space *mapping)
69{
70 return mapping && test_bit(AS_BALLOON_MAP, &mapping->flags);
71}
72
73static inline void mapping_set_exiting(struct address_space *mapping) 57static inline void mapping_set_exiting(struct address_space *mapping)
74{ 58{
75 set_bit(AS_EXITING, &mapping->flags); 59 set_bit(AS_EXITING, &mapping->flags);
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index be574506e6a9..c0c2bce6b0b7 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -150,7 +150,7 @@ int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *);
150static inline void anon_vma_merge(struct vm_area_struct *vma, 150static inline void anon_vma_merge(struct vm_area_struct *vma,
151 struct vm_area_struct *next) 151 struct vm_area_struct *next)
152{ 152{
153 VM_BUG_ON(vma->anon_vma != next->anon_vma); 153 VM_BUG_ON_VMA(vma->anon_vma != next->anon_vma, vma);
154 unlink_anon_vmas(next); 154 unlink_anon_vmas(next);
155} 155}
156 156
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9c6353d9e63a..5e63ba59258c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1935,11 +1935,13 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut,
1935#define tsk_used_math(p) ((p)->flags & PF_USED_MATH) 1935#define tsk_used_math(p) ((p)->flags & PF_USED_MATH)
1936#define used_math() tsk_used_math(current) 1936#define used_math() tsk_used_math(current)
1937 1937
1938/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current->flags */ 1938/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current->flags
1939 * __GFP_FS is also cleared as it implies __GFP_IO.
1940 */
1939static inline gfp_t memalloc_noio_flags(gfp_t flags) 1941static inline gfp_t memalloc_noio_flags(gfp_t flags)
1940{ 1942{
1941 if (unlikely(current->flags & PF_MEMALLOC_NOIO)) 1943 if (unlikely(current->flags & PF_MEMALLOC_NOIO))
1942 flags &= ~__GFP_IO; 1944 flags &= ~(__GFP_IO | __GFP_FS);
1943 return flags; 1945 return flags;
1944} 1946}
1945 1947
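
memalloc_noio_flags() is consulted by the allocator when servicing allocations from a PF_MEMALLOC_NOIO task, so such allocations are now prevented from recursing into the filesystem as well as into block I/O. The region is normally scoped with the save/restore helpers already in this header (sketch):

unsigned int noio_flag = memalloc_noio_save();

/* page allocations here behave as if __GFP_IO and __GFP_FS were clear */
buf = kmalloc(len, GFP_KERNEL);

memalloc_noio_restore(noio_flag);
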
diff --git a/include/linux/screen_info.h b/include/linux/screen_info.h
index 005bf3e38db5..f0f8bad54be9 100644
--- a/include/linux/screen_info.h
+++ b/include/linux/screen_info.h
@@ -5,12 +5,4 @@
5 5
6extern struct screen_info screen_info; 6extern struct screen_info screen_info;
7 7
8#define ORIG_X (screen_info.orig_x)
9#define ORIG_Y (screen_info.orig_y)
10#define ORIG_VIDEO_MODE (screen_info.orig_video_mode)
11#define ORIG_VIDEO_COLS (screen_info.orig_video_cols)
12#define ORIG_VIDEO_EGA_BX (screen_info.orig_video_ega_bx)
13#define ORIG_VIDEO_LINES (screen_info.orig_video_lines)
14#define ORIG_VIDEO_ISVGA (screen_info.orig_video_isVGA)
15#define ORIG_VIDEO_POINTS (screen_info.orig_video_points)
16#endif /* _SCREEN_INFO_H */ 8#endif /* _SCREEN_INFO_H */
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 1d9abb7d22a0..c265bec6a57d 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -158,31 +158,6 @@ size_t ksize(const void *);
158#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) 158#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
159#endif 159#endif
160 160
161#ifdef CONFIG_SLOB
162/*
163 * Common fields provided in kmem_cache by all slab allocators
164 * This struct is either used directly by the allocator (SLOB)
165 * or the allocator must include definitions for all fields
166 * provided in kmem_cache_common in their definition of kmem_cache.
167 *
168 * Once we can do anonymous structs (C11 standard) we could put a
169 * anonymous struct definition in these allocators so that the
170 * separate allocations in the kmem_cache structure of SLAB and
171 * SLUB is no longer needed.
172 */
173struct kmem_cache {
174 unsigned int object_size;/* The original size of the object */
175 unsigned int size; /* The aligned/padded/added on size */
176 unsigned int align; /* Alignment as calculated */
177 unsigned long flags; /* Active flags on the slab */
178 const char *name; /* Slab name for sysfs */
179 int refcount; /* Use counter */
180 void (*ctor)(void *); /* Called on object slot creation */
181 struct list_head list; /* List of all slab caches on the system */
182};
183
184#endif /* CONFIG_SLOB */
185
186/* 161/*
187 * Kmalloc array related definitions 162 * Kmalloc array related definitions
188 */ 163 */
@@ -363,14 +338,6 @@ kmem_cache_alloc_node_trace(struct kmem_cache *s,
363} 338}
364#endif /* CONFIG_TRACING */ 339#endif /* CONFIG_TRACING */
365 340
366#ifdef CONFIG_SLAB
367#include <linux/slab_def.h>
368#endif
369
370#ifdef CONFIG_SLUB
371#include <linux/slub_def.h>
372#endif
373
374extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order); 341extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order);
375 342
376#ifdef CONFIG_TRACING 343#ifdef CONFIG_TRACING
@@ -582,37 +549,15 @@ static inline void *kcalloc(size_t n, size_t size, gfp_t flags)
582 * allocator where we care about the real place the memory allocation 549 * allocator where we care about the real place the memory allocation
583 * request comes from. 550 * request comes from.
584 */ 551 */
585#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB) || \
586 (defined(CONFIG_SLAB) && defined(CONFIG_TRACING)) || \
587 (defined(CONFIG_SLOB) && defined(CONFIG_TRACING))
588extern void *__kmalloc_track_caller(size_t, gfp_t, unsigned long); 552extern void *__kmalloc_track_caller(size_t, gfp_t, unsigned long);
589#define kmalloc_track_caller(size, flags) \ 553#define kmalloc_track_caller(size, flags) \
590 __kmalloc_track_caller(size, flags, _RET_IP_) 554 __kmalloc_track_caller(size, flags, _RET_IP_)
591#else
592#define kmalloc_track_caller(size, flags) \
593 __kmalloc(size, flags)
594#endif /* DEBUG_SLAB */
595 555
596#ifdef CONFIG_NUMA 556#ifdef CONFIG_NUMA
597/*
598 * kmalloc_node_track_caller is a special version of kmalloc_node that
599 * records the calling function of the routine calling it for slab leak
600 * tracking instead of just the calling function (confusing, eh?).
601 * It's useful when the call to kmalloc_node comes from a widely-used
602 * standard allocator where we care about the real place the memory
603 * allocation request comes from.
604 */
605#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB) || \
606 (defined(CONFIG_SLAB) && defined(CONFIG_TRACING)) || \
607 (defined(CONFIG_SLOB) && defined(CONFIG_TRACING))
608extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, unsigned long); 557extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, unsigned long);
609#define kmalloc_node_track_caller(size, flags, node) \ 558#define kmalloc_node_track_caller(size, flags, node) \
610 __kmalloc_node_track_caller(size, flags, node, \ 559 __kmalloc_node_track_caller(size, flags, node, \
611 _RET_IP_) 560 _RET_IP_)
612#else
613#define kmalloc_node_track_caller(size, flags, node) \
614 __kmalloc_node(size, flags, node)
615#endif
616 561
617#else /* CONFIG_NUMA */ 562#else /* CONFIG_NUMA */
618 563
@@ -650,14 +595,7 @@ static inline void *kzalloc_node(size_t size, gfp_t flags, int node)
650 return kmalloc_node(size, flags | __GFP_ZERO, node); 595 return kmalloc_node(size, flags | __GFP_ZERO, node);
651} 596}
652 597
653/* 598unsigned int kmem_cache_size(struct kmem_cache *s);
654 * Determine the size of a slab object
655 */
656static inline unsigned int kmem_cache_size(struct kmem_cache *s)
657{
658 return s->object_size;
659}
660
661void __init kmem_cache_init_late(void); 599void __init kmem_cache_init_late(void);
662 600
663#endif /* _LINUX_SLAB_H */ 601#endif /* _LINUX_SLAB_H */
diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h
index 8235dfbb3b05..b869d1662ba3 100644
--- a/include/linux/slab_def.h
+++ b/include/linux/slab_def.h
@@ -8,6 +8,8 @@
8 */ 8 */
9 9
10struct kmem_cache { 10struct kmem_cache {
11 struct array_cache __percpu *cpu_cache;
12
11/* 1) Cache tunables. Protected by slab_mutex */ 13/* 1) Cache tunables. Protected by slab_mutex */
12 unsigned int batchcount; 14 unsigned int batchcount;
13 unsigned int limit; 15 unsigned int limit;
@@ -71,23 +73,7 @@ struct kmem_cache {
71 struct memcg_cache_params *memcg_params; 73 struct memcg_cache_params *memcg_params;
72#endif 74#endif
73 75
74/* 6) per-cpu/per-node data, touched during every alloc/free */ 76 struct kmem_cache_node *node[MAX_NUMNODES];
75 /*
76 * We put array[] at the end of kmem_cache, because we want to size
77 * this array to nr_cpu_ids slots instead of NR_CPUS
78 * (see kmem_cache_init())
79 * We still use [NR_CPUS] and not [1] or [0] because cache_cache
80 * is statically defined, so we reserve the max number of cpus.
81 *
82 * We also need to guarantee that the list is able to accomodate a
83 * pointer for each node since "nodelists" uses the remainder of
84 * available pointers.
85 */
86 struct kmem_cache_node **node;
87 struct array_cache *array[NR_CPUS + MAX_NUMNODES];
88 /*
89 * Do not add fields after array[]
90 */
91}; 77};
92 78
93#endif /* _LINUX_SLAB_DEF_H */ 79#endif /* _LINUX_SLAB_DEF_H */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 1b72060f093a..37a585beef5c 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -327,8 +327,10 @@ extern void lru_cache_add_active_or_unevictable(struct page *page,
327extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, 327extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
328 gfp_t gfp_mask, nodemask_t *mask); 328 gfp_t gfp_mask, nodemask_t *mask);
329extern int __isolate_lru_page(struct page *page, isolate_mode_t mode); 329extern int __isolate_lru_page(struct page *page, isolate_mode_t mode);
330extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem, 330extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
331 gfp_t gfp_mask, bool noswap); 331 unsigned long nr_pages,
332 gfp_t gfp_mask,
333 bool may_swap);
332extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, 334extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
333 gfp_t gfp_mask, bool noswap, 335 gfp_t gfp_mask, bool noswap,
334 struct zone *zone, 336 struct zone *zone,
@@ -354,22 +356,6 @@ static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order)
354extern int page_evictable(struct page *page); 356extern int page_evictable(struct page *page);
355extern void check_move_unevictable_pages(struct page **, int nr_pages); 357extern void check_move_unevictable_pages(struct page **, int nr_pages);
356 358
357extern unsigned long scan_unevictable_pages;
358extern int scan_unevictable_handler(struct ctl_table *, int,
359 void __user *, size_t *, loff_t *);
360#ifdef CONFIG_NUMA
361extern int scan_unevictable_register_node(struct node *node);
362extern void scan_unevictable_unregister_node(struct node *node);
363#else
364static inline int scan_unevictable_register_node(struct node *node)
365{
366 return 0;
367}
368static inline void scan_unevictable_unregister_node(struct node *node)
369{
370}
371#endif
372
373extern int kswapd_run(int nid); 359extern int kswapd_run(int nid);
374extern void kswapd_stop(int nid); 360extern void kswapd_stop(int nid);
375#ifdef CONFIG_MEMCG 361#ifdef CONFIG_MEMCG
diff --git a/include/linux/topology.h b/include/linux/topology.h
index dda6ee521e74..909b6e43b694 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -119,11 +119,20 @@ static inline int numa_node_id(void)
119 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem(). 119 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem().
120 */ 120 */
121DECLARE_PER_CPU(int, _numa_mem_); 121DECLARE_PER_CPU(int, _numa_mem_);
122extern int _node_numa_mem_[MAX_NUMNODES];
122 123
123#ifndef set_numa_mem 124#ifndef set_numa_mem
124static inline void set_numa_mem(int node) 125static inline void set_numa_mem(int node)
125{ 126{
126 this_cpu_write(_numa_mem_, node); 127 this_cpu_write(_numa_mem_, node);
128 _node_numa_mem_[numa_node_id()] = node;
129}
130#endif
131
132#ifndef node_to_mem_node
133static inline int node_to_mem_node(int node)
134{
135 return _node_numa_mem_[node];
127} 136}
128#endif 137#endif
129 138
@@ -146,6 +155,7 @@ static inline int cpu_to_mem(int cpu)
146static inline void set_cpu_numa_mem(int cpu, int node) 155static inline void set_cpu_numa_mem(int cpu, int node)
147{ 156{
148 per_cpu(_numa_mem_, cpu) = node; 157 per_cpu(_numa_mem_, cpu) = node;
158 _node_numa_mem_[cpu_to_node(cpu)] = node;
149} 159}
150#endif 160#endif
151 161
@@ -159,6 +169,13 @@ static inline int numa_mem_id(void)
159} 169}
160#endif 170#endif
161 171
172#ifndef node_to_mem_node
173static inline int node_to_mem_node(int node)
174{
175 return node;
176}
177#endif
178
162#ifndef cpu_to_mem 179#ifndef cpu_to_mem
163static inline int cpu_to_mem(int cpu) 180static inline int cpu_to_mem(int cpu)
164{ 181{
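
The new _node_numa_mem_[] table and node_to_mem_node() accessor above let callers on memoryless nodes find the nearest node that actually has memory. A minimal, hypothetical sketch of how an allocator might use it follows; alloc_on_nearest_node() and its parameters are invented for illustration, while numa_mem_id(), node_to_mem_node(), node_present_pages() and alloc_pages_node() are existing kernel helpers.

#include <linux/gfp.h>
#include <linux/mmzone.h>
#include <linux/topology.h>

/* Illustrative only: route an allocation to the nearest node with memory. */
static struct page *alloc_on_nearest_node(gfp_t gfp, unsigned int order, int node)
{
	int searchnode = node;

	if (node == NUMA_NO_NODE)
		searchnode = numa_mem_id();		/* this CPU's nearest memory node */
	else if (!node_present_pages(node))
		searchnode = node_to_mem_node(node);	/* @node's nearest memory node */

	return alloc_pages_node(searchnode, gfp, order);
}
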
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index ced92345c963..730334cdf037 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -72,6 +72,13 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
72 THP_ZERO_PAGE_ALLOC, 72 THP_ZERO_PAGE_ALLOC,
73 THP_ZERO_PAGE_ALLOC_FAILED, 73 THP_ZERO_PAGE_ALLOC_FAILED,
74#endif 74#endif
75#ifdef CONFIG_MEMORY_BALLOON
76 BALLOON_INFLATE,
77 BALLOON_DEFLATE,
78#ifdef CONFIG_BALLOON_COMPACTION
79 BALLOON_MIGRATE,
80#endif
81#endif
75#ifdef CONFIG_DEBUG_TLBFLUSH 82#ifdef CONFIG_DEBUG_TLBFLUSH
76#ifdef CONFIG_SMP 83#ifdef CONFIG_SMP
77 NR_TLB_REMOTE_FLUSH, /* cpu tried to flush others' tlbs */ 84 NR_TLB_REMOTE_FLUSH, /* cpu tried to flush others' tlbs */
diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h
index e44d634e7fb7..05c214760977 100644
--- a/include/linux/zsmalloc.h
+++ b/include/linux/zsmalloc.h
@@ -46,6 +46,6 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
46 enum zs_mapmode mm); 46 enum zs_mapmode mm);
47void zs_unmap_object(struct zs_pool *pool, unsigned long handle); 47void zs_unmap_object(struct zs_pool *pool, unsigned long handle);
48 48
49u64 zs_get_total_size_bytes(struct zs_pool *pool); 49unsigned long zs_get_total_pages(struct zs_pool *pool);
50 50
51#endif 51#endif
diff --git a/include/uapi/linux/kernel-page-flags.h b/include/uapi/linux/kernel-page-flags.h
index 5116a0e48172..2f96d233c980 100644
--- a/include/uapi/linux/kernel-page-flags.h
+++ b/include/uapi/linux/kernel-page-flags.h
@@ -31,6 +31,7 @@
31 31
32#define KPF_KSM 21 32#define KPF_KSM 21
33#define KPF_THP 22 33#define KPF_THP 22
34#define KPF_BALLOON 23
34 35
35 36
36#endif /* _UAPILINUX_KERNEL_PAGE_FLAGS_H */ 37#endif /* _UAPILINUX_KERNEL_PAGE_FLAGS_H */
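
The new KPF_BALLOON bit is reported through the existing /proc/kpageflags interface, which exposes one 64-bit flags word per PFN. A small, hypothetical userspace sketch of testing it is shown below; the function name and error handling are illustrative only.

#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>

#define KPF_BALLOON 23	/* new bit introduced above */

/* Returns 1 if the page at @pfn belongs to a balloon, 0 if not, -1 on error. */
int pfn_is_balloon(uint64_t pfn)
{
	uint64_t flags;
	int ret = -1;
	int fd = open("/proc/kpageflags", O_RDONLY);

	if (fd < 0)
		return -1;
	if (pread(fd, &flags, sizeof(flags), pfn * sizeof(flags)) == sizeof(flags))
		ret = !!(flags & (1ULL << KPF_BALLOON));
	close(fd);
	return ret;
}
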
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 58afc04c107e..513df75d0fc9 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -1,6 +1,8 @@
1#ifndef _LINUX_PRCTL_H 1#ifndef _LINUX_PRCTL_H
2#define _LINUX_PRCTL_H 2#define _LINUX_PRCTL_H
3 3
4#include <linux/types.h>
5
4/* Values to pass as first argument to prctl() */ 6/* Values to pass as first argument to prctl() */
5 7
6#define PR_SET_PDEATHSIG 1 /* Second arg is a signal */ 8#define PR_SET_PDEATHSIG 1 /* Second arg is a signal */
@@ -119,6 +121,31 @@
119# define PR_SET_MM_ENV_END 11 121# define PR_SET_MM_ENV_END 11
120# define PR_SET_MM_AUXV 12 122# define PR_SET_MM_AUXV 12
121# define PR_SET_MM_EXE_FILE 13 123# define PR_SET_MM_EXE_FILE 13
124# define PR_SET_MM_MAP 14
125# define PR_SET_MM_MAP_SIZE 15
126
127/*
 128 * This structure provides a new memory descriptor
 129 * map which mostly modifies /proc/pid/stat[m]
 130 * output for a task. This is done mostly for the
 131 * sake of checkpoint/restore functionality.
132 */
133struct prctl_mm_map {
134 __u64 start_code; /* code section bounds */
135 __u64 end_code;
136 __u64 start_data; /* data section bounds */
137 __u64 end_data;
138 __u64 start_brk; /* heap for brk() syscall */
139 __u64 brk;
140 __u64 start_stack; /* stack starts at */
141 __u64 arg_start; /* command line arguments bounds */
142 __u64 arg_end;
143 __u64 env_start; /* environment variables bounds */
144 __u64 env_end;
145 __u64 *auxv; /* auxiliary vector */
146 __u32 auxv_size; /* vector size */
147 __u32 exe_fd; /* /proc/$pid/exe link file */
148};
122 149
123/* 150/*
124 * Set specific pid that is allowed to ptrace the current task. 151 * Set specific pid that is allowed to ptrace the current task.
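
A hypothetical userspace sketch of how a checkpoint/restore tool might drive the new interface: it first asks the kernel for its expected structure size via PR_SET_MM_MAP_SIZE, then installs a previously saved map in a single PR_SET_MM_MAP call. restore_mm_map() is invented for illustration; the prctl() calling convention follows the kernel/sys.c changes later in this patch, and changing exe_fd additionally requires root in the caller's user namespace.

#include <stdio.h>
#include <sys/prctl.h>
#include <linux/prctl.h>

/* Illustrative only: install a saved memory map for the current task. */
int restore_mm_map(struct prctl_mm_map *map)
{
	unsigned int size;

	/* Ask the kernel how large it expects struct prctl_mm_map to be. */
	if (prctl(PR_SET_MM, PR_SET_MM_MAP_SIZE, (unsigned long)&size, 0, 0))
		return -1;
	if (size != sizeof(*map)) {
		fprintf(stderr, "prctl_mm_map ABI mismatch\n");
		return -1;
	}
	/* Install the whole descriptor map in a single call. */
	return prctl(PR_SET_MM, PR_SET_MM_MAP, (unsigned long)map, sizeof(*map), 0);
}
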
diff --git a/init/Kconfig b/init/Kconfig
index e25a82a291a6..d2355812ba48 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -889,17 +889,6 @@ config ARCH_SUPPORTS_INT128
889config ARCH_WANT_NUMA_VARIABLE_LOCALITY 889config ARCH_WANT_NUMA_VARIABLE_LOCALITY
890 bool 890 bool
891 891
892#
893# For architectures that are willing to define _PAGE_NUMA as _PAGE_PROTNONE
894config ARCH_WANTS_PROT_NUMA_PROT_NONE
895 bool
896
897config ARCH_USES_NUMA_PROT_NONE
898 bool
899 default y
900 depends on ARCH_WANTS_PROT_NUMA_PROT_NONE
901 depends on NUMA_BALANCING
902
903config NUMA_BALANCING_DEFAULT_ENABLED 892config NUMA_BALANCING_DEFAULT_ENABLED
904 bool "Automatically enable NUMA aware memory/task placement" 893 bool "Automatically enable NUMA aware memory/task placement"
905 default y 894 default y
diff --git a/kernel/acct.c b/kernel/acct.c
index b4c667d22e79..33738ef972f3 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -472,7 +472,6 @@ static void do_acct_process(struct bsd_acct_struct *acct)
472 acct_t ac; 472 acct_t ac;
473 unsigned long flim; 473 unsigned long flim;
474 const struct cred *orig_cred; 474 const struct cred *orig_cred;
475 struct pid_namespace *ns = acct->ns;
476 struct file *file = acct->file; 475 struct file *file = acct->file;
477 476
478 /* 477 /*
@@ -500,10 +499,15 @@ static void do_acct_process(struct bsd_acct_struct *acct)
500 ac.ac_gid16 = ac.ac_gid; 499 ac.ac_gid16 = ac.ac_gid;
501#endif 500#endif
502#if ACCT_VERSION == 3 501#if ACCT_VERSION == 3
503 ac.ac_pid = task_tgid_nr_ns(current, ns); 502 {
504 rcu_read_lock(); 503 struct pid_namespace *ns = acct->ns;
505 ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns); 504
506 rcu_read_unlock(); 505 ac.ac_pid = task_tgid_nr_ns(current, ns);
506 rcu_read_lock();
507 ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent),
508 ns);
509 rcu_read_unlock();
510 }
507#endif 511#endif
508 /* 512 /*
509 * Get freeze protection. If the fs is frozen, just skip the write 513 * Get freeze protection. If the fs is frozen, just skip the write
diff --git a/kernel/async.c b/kernel/async.c
index 61f023ce0228..4c3773c0bf63 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -115,7 +115,7 @@ static void async_run_entry_fn(struct work_struct *work)
115 115
116 /* 1) run (and print duration) */ 116 /* 1) run (and print duration) */
117 if (initcall_debug && system_state == SYSTEM_BOOTING) { 117 if (initcall_debug && system_state == SYSTEM_BOOTING) {
118 printk(KERN_DEBUG "calling %lli_%pF @ %i\n", 118 pr_debug("calling %lli_%pF @ %i\n",
119 (long long)entry->cookie, 119 (long long)entry->cookie,
120 entry->func, task_pid_nr(current)); 120 entry->func, task_pid_nr(current));
121 calltime = ktime_get(); 121 calltime = ktime_get();
@@ -124,7 +124,7 @@ static void async_run_entry_fn(struct work_struct *work)
124 if (initcall_debug && system_state == SYSTEM_BOOTING) { 124 if (initcall_debug && system_state == SYSTEM_BOOTING) {
125 rettime = ktime_get(); 125 rettime = ktime_get();
126 delta = ktime_sub(rettime, calltime); 126 delta = ktime_sub(rettime, calltime);
127 printk(KERN_DEBUG "initcall %lli_%pF returned 0 after %lld usecs\n", 127 pr_debug("initcall %lli_%pF returned 0 after %lld usecs\n",
128 (long long)entry->cookie, 128 (long long)entry->cookie,
129 entry->func, 129 entry->func,
130 (long long)ktime_to_ns(delta) >> 10); 130 (long long)ktime_to_ns(delta) >> 10);
@@ -285,7 +285,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain
285 ktime_t uninitialized_var(starttime), delta, endtime; 285 ktime_t uninitialized_var(starttime), delta, endtime;
286 286
287 if (initcall_debug && system_state == SYSTEM_BOOTING) { 287 if (initcall_debug && system_state == SYSTEM_BOOTING) {
288 printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current)); 288 pr_debug("async_waiting @ %i\n", task_pid_nr(current));
289 starttime = ktime_get(); 289 starttime = ktime_get();
290 } 290 }
291 291
@@ -295,7 +295,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain
295 endtime = ktime_get(); 295 endtime = ktime_get();
296 delta = ktime_sub(endtime, starttime); 296 delta = ktime_sub(endtime, starttime);
297 297
298 printk(KERN_DEBUG "async_continuing @ %i after %lli usec\n", 298 pr_debug("async_continuing @ %i after %lli usec\n",
299 task_pid_nr(current), 299 task_pid_nr(current),
300 (long long)ktime_to_ns(delta) >> 10); 300 (long long)ktime_to_ns(delta) >> 10);
301 } 301 }
diff --git a/kernel/fork.c b/kernel/fork.c
index a91e47d86de2..8c162d102740 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -601,9 +601,8 @@ static void check_mm(struct mm_struct *mm)
601 printk(KERN_ALERT "BUG: Bad rss-counter state " 601 printk(KERN_ALERT "BUG: Bad rss-counter state "
602 "mm:%p idx:%d val:%ld\n", mm, i, x); 602 "mm:%p idx:%d val:%ld\n", mm, i, x);
603 } 603 }
604
605#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS 604#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
606 VM_BUG_ON(mm->pmd_huge_pte); 605 VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
607#endif 606#endif
608} 607}
609 608
diff --git a/kernel/kthread.c b/kernel/kthread.c
index ef483220e855..10e489c448fe 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -369,7 +369,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
369{ 369{
370 struct task_struct *p; 370 struct task_struct *p;
371 371
372 p = kthread_create_on_node(threadfn, data, cpu_to_mem(cpu), namefmt, 372 p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt,
373 cpu); 373 cpu);
374 if (IS_ERR(p)) 374 if (IS_ERR(p))
375 return p; 375 return p;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bfa3c86d0d68..82088b29704e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1946,7 +1946,7 @@ void task_numa_work(struct callback_head *work)
1946 vma = mm->mmap; 1946 vma = mm->mmap;
1947 } 1947 }
1948 for (; vma; vma = vma->vm_next) { 1948 for (; vma; vma = vma->vm_next) {
1949 if (!vma_migratable(vma) || !vma_policy_mof(p, vma)) 1949 if (!vma_migratable(vma) || !vma_policy_mof(vma))
1950 continue; 1950 continue;
1951 1951
1952 /* 1952 /*
diff --git a/kernel/sys.c b/kernel/sys.c
index ce8129192a26..dfce4debd138 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -62,28 +62,28 @@
62#include <asm/unistd.h> 62#include <asm/unistd.h>
63 63
64#ifndef SET_UNALIGN_CTL 64#ifndef SET_UNALIGN_CTL
65# define SET_UNALIGN_CTL(a,b) (-EINVAL) 65# define SET_UNALIGN_CTL(a, b) (-EINVAL)
66#endif 66#endif
67#ifndef GET_UNALIGN_CTL 67#ifndef GET_UNALIGN_CTL
68# define GET_UNALIGN_CTL(a,b) (-EINVAL) 68# define GET_UNALIGN_CTL(a, b) (-EINVAL)
69#endif 69#endif
70#ifndef SET_FPEMU_CTL 70#ifndef SET_FPEMU_CTL
71# define SET_FPEMU_CTL(a,b) (-EINVAL) 71# define SET_FPEMU_CTL(a, b) (-EINVAL)
72#endif 72#endif
73#ifndef GET_FPEMU_CTL 73#ifndef GET_FPEMU_CTL
74# define GET_FPEMU_CTL(a,b) (-EINVAL) 74# define GET_FPEMU_CTL(a, b) (-EINVAL)
75#endif 75#endif
76#ifndef SET_FPEXC_CTL 76#ifndef SET_FPEXC_CTL
77# define SET_FPEXC_CTL(a,b) (-EINVAL) 77# define SET_FPEXC_CTL(a, b) (-EINVAL)
78#endif 78#endif
79#ifndef GET_FPEXC_CTL 79#ifndef GET_FPEXC_CTL
80# define GET_FPEXC_CTL(a,b) (-EINVAL) 80# define GET_FPEXC_CTL(a, b) (-EINVAL)
81#endif 81#endif
82#ifndef GET_ENDIAN 82#ifndef GET_ENDIAN
83# define GET_ENDIAN(a,b) (-EINVAL) 83# define GET_ENDIAN(a, b) (-EINVAL)
84#endif 84#endif
85#ifndef SET_ENDIAN 85#ifndef SET_ENDIAN
86# define SET_ENDIAN(a,b) (-EINVAL) 86# define SET_ENDIAN(a, b) (-EINVAL)
87#endif 87#endif
88#ifndef GET_TSC_CTL 88#ifndef GET_TSC_CTL
89# define GET_TSC_CTL(a) (-EINVAL) 89# define GET_TSC_CTL(a) (-EINVAL)
@@ -182,39 +182,40 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
182 rcu_read_lock(); 182 rcu_read_lock();
183 read_lock(&tasklist_lock); 183 read_lock(&tasklist_lock);
184 switch (which) { 184 switch (which) {
185 case PRIO_PROCESS: 185 case PRIO_PROCESS:
186 if (who) 186 if (who)
187 p = find_task_by_vpid(who); 187 p = find_task_by_vpid(who);
188 else 188 else
189 p = current; 189 p = current;
190 if (p) 190 if (p)
191 error = set_one_prio(p, niceval, error); 191 error = set_one_prio(p, niceval, error);
192 break; 192 break;
193 case PRIO_PGRP: 193 case PRIO_PGRP:
194 if (who) 194 if (who)
195 pgrp = find_vpid(who); 195 pgrp = find_vpid(who);
196 else 196 else
197 pgrp = task_pgrp(current); 197 pgrp = task_pgrp(current);
198 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { 198 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
199 error = set_one_prio(p, niceval, error); 199 error = set_one_prio(p, niceval, error);
200 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); 200 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
201 break; 201 break;
202 case PRIO_USER: 202 case PRIO_USER:
203 uid = make_kuid(cred->user_ns, who); 203 uid = make_kuid(cred->user_ns, who);
204 user = cred->user; 204 user = cred->user;
205 if (!who) 205 if (!who)
206 uid = cred->uid; 206 uid = cred->uid;
207 else if (!uid_eq(uid, cred->uid) && 207 else if (!uid_eq(uid, cred->uid)) {
208 !(user = find_user(uid))) 208 user = find_user(uid);
209 if (!user)
209 goto out_unlock; /* No processes for this user */ 210 goto out_unlock; /* No processes for this user */
210 211 }
211 do_each_thread(g, p) { 212 do_each_thread(g, p) {
212 if (uid_eq(task_uid(p), uid)) 213 if (uid_eq(task_uid(p), uid))
213 error = set_one_prio(p, niceval, error); 214 error = set_one_prio(p, niceval, error);
214 } while_each_thread(g, p); 215 } while_each_thread(g, p);
215 if (!uid_eq(uid, cred->uid)) 216 if (!uid_eq(uid, cred->uid))
216 free_uid(user); /* For find_user() */ 217 free_uid(user); /* For find_user() */
217 break; 218 break;
218 } 219 }
219out_unlock: 220out_unlock:
220 read_unlock(&tasklist_lock); 221 read_unlock(&tasklist_lock);
@@ -244,47 +245,48 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
244 rcu_read_lock(); 245 rcu_read_lock();
245 read_lock(&tasklist_lock); 246 read_lock(&tasklist_lock);
246 switch (which) { 247 switch (which) {
247 case PRIO_PROCESS: 248 case PRIO_PROCESS:
248 if (who) 249 if (who)
249 p = find_task_by_vpid(who); 250 p = find_task_by_vpid(who);
250 else 251 else
251 p = current; 252 p = current;
252 if (p) { 253 if (p) {
254 niceval = nice_to_rlimit(task_nice(p));
255 if (niceval > retval)
256 retval = niceval;
257 }
258 break;
259 case PRIO_PGRP:
260 if (who)
261 pgrp = find_vpid(who);
262 else
263 pgrp = task_pgrp(current);
264 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
265 niceval = nice_to_rlimit(task_nice(p));
266 if (niceval > retval)
267 retval = niceval;
268 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
269 break;
270 case PRIO_USER:
271 uid = make_kuid(cred->user_ns, who);
272 user = cred->user;
273 if (!who)
274 uid = cred->uid;
275 else if (!uid_eq(uid, cred->uid)) {
276 user = find_user(uid);
277 if (!user)
278 goto out_unlock; /* No processes for this user */
279 }
280 do_each_thread(g, p) {
281 if (uid_eq(task_uid(p), uid)) {
253 niceval = nice_to_rlimit(task_nice(p)); 282 niceval = nice_to_rlimit(task_nice(p));
254 if (niceval > retval) 283 if (niceval > retval)
255 retval = niceval; 284 retval = niceval;
256 } 285 }
257 break; 286 } while_each_thread(g, p);
258 case PRIO_PGRP: 287 if (!uid_eq(uid, cred->uid))
259 if (who) 288 free_uid(user); /* for find_user() */
260 pgrp = find_vpid(who); 289 break;
261 else
262 pgrp = task_pgrp(current);
263 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
264 niceval = nice_to_rlimit(task_nice(p));
265 if (niceval > retval)
266 retval = niceval;
267 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
268 break;
269 case PRIO_USER:
270 uid = make_kuid(cred->user_ns, who);
271 user = cred->user;
272 if (!who)
273 uid = cred->uid;
274 else if (!uid_eq(uid, cred->uid) &&
275 !(user = find_user(uid)))
276 goto out_unlock; /* No processes for this user */
277
278 do_each_thread(g, p) {
279 if (uid_eq(task_uid(p), uid)) {
280 niceval = nice_to_rlimit(task_nice(p));
281 if (niceval > retval)
282 retval = niceval;
283 }
284 } while_each_thread(g, p);
285 if (!uid_eq(uid, cred->uid))
286 free_uid(user); /* for find_user() */
287 break;
288 } 290 }
289out_unlock: 291out_unlock:
290 read_unlock(&tasklist_lock); 292 read_unlock(&tasklist_lock);
@@ -306,7 +308,7 @@ out_unlock:
306 * 308 *
307 * The general idea is that a program which uses just setregid() will be 309 * The general idea is that a program which uses just setregid() will be
308 * 100% compatible with BSD. A program which uses just setgid() will be 310 * 100% compatible with BSD. A program which uses just setgid() will be
309 * 100% compatible with POSIX with saved IDs. 311 * 100% compatible with POSIX with saved IDs.
310 * 312 *
311 * SMP: There are not races, the GIDs are checked only by filesystem 313 * SMP: There are not races, the GIDs are checked only by filesystem
312 * operations (as far as semantic preservation is concerned). 314 * operations (as far as semantic preservation is concerned).
@@ -364,7 +366,7 @@ error:
364} 366}
365 367
366/* 368/*
367 * setgid() is implemented like SysV w/ SAVED_IDS 369 * setgid() is implemented like SysV w/ SAVED_IDS
368 * 370 *
369 * SMP: Same implicit races as above. 371 * SMP: Same implicit races as above.
370 */ 372 */
@@ -442,7 +444,7 @@ static int set_user(struct cred *new)
442 * 444 *
443 * The general idea is that a program which uses just setreuid() will be 445 * The general idea is that a program which uses just setreuid() will be
444 * 100% compatible with BSD. A program which uses just setuid() will be 446 * 100% compatible with BSD. A program which uses just setuid() will be
445 * 100% compatible with POSIX with saved IDs. 447 * 100% compatible with POSIX with saved IDs.
446 */ 448 */
447SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) 449SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
448{ 450{
@@ -503,17 +505,17 @@ error:
503 abort_creds(new); 505 abort_creds(new);
504 return retval; 506 return retval;
505} 507}
506 508
507/* 509/*
508 * setuid() is implemented like SysV with SAVED_IDS 510 * setuid() is implemented like SysV with SAVED_IDS
509 * 511 *
510 * Note that SAVED_ID's is deficient in that a setuid root program 512 * Note that SAVED_ID's is deficient in that a setuid root program
511 * like sendmail, for example, cannot set its uid to be a normal 513 * like sendmail, for example, cannot set its uid to be a normal
512 * user and then switch back, because if you're root, setuid() sets 514 * user and then switch back, because if you're root, setuid() sets
513 * the saved uid too. If you don't like this, blame the bright people 515 * the saved uid too. If you don't like this, blame the bright people
514 * in the POSIX committee and/or USG. Note that the BSD-style setreuid() 516 * in the POSIX committee and/or USG. Note that the BSD-style setreuid()
515 * will allow a root program to temporarily drop privileges and be able to 517 * will allow a root program to temporarily drop privileges and be able to
516 * regain them by swapping the real and effective uid. 518 * regain them by swapping the real and effective uid.
517 */ 519 */
518SYSCALL_DEFINE1(setuid, uid_t, uid) 520SYSCALL_DEFINE1(setuid, uid_t, uid)
519{ 521{
@@ -637,10 +639,12 @@ SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t _
637 euid = from_kuid_munged(cred->user_ns, cred->euid); 639 euid = from_kuid_munged(cred->user_ns, cred->euid);
638 suid = from_kuid_munged(cred->user_ns, cred->suid); 640 suid = from_kuid_munged(cred->user_ns, cred->suid);
639 641
640 if (!(retval = put_user(ruid, ruidp)) && 642 retval = put_user(ruid, ruidp);
641 !(retval = put_user(euid, euidp))) 643 if (!retval) {
642 retval = put_user(suid, suidp); 644 retval = put_user(euid, euidp);
643 645 if (!retval)
646 return put_user(suid, suidp);
647 }
644 return retval; 648 return retval;
645} 649}
646 650
@@ -709,9 +713,12 @@ SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t _
709 egid = from_kgid_munged(cred->user_ns, cred->egid); 713 egid = from_kgid_munged(cred->user_ns, cred->egid);
710 sgid = from_kgid_munged(cred->user_ns, cred->sgid); 714 sgid = from_kgid_munged(cred->user_ns, cred->sgid);
711 715
712 if (!(retval = put_user(rgid, rgidp)) && 716 retval = put_user(rgid, rgidp);
713 !(retval = put_user(egid, egidp))) 717 if (!retval) {
714 retval = put_user(sgid, sgidp); 718 retval = put_user(egid, egidp);
719 if (!retval)
720 retval = put_user(sgid, sgidp);
721 }
715 722
716 return retval; 723 return retval;
717} 724}
@@ -1284,7 +1291,6 @@ SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim)
1284/* 1291/*
1285 * Back compatibility for getrlimit. Needed for some apps. 1292 * Back compatibility for getrlimit. Needed for some apps.
1286 */ 1293 */
1287
1288SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, 1294SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
1289 struct rlimit __user *, rlim) 1295 struct rlimit __user *, rlim)
1290{ 1296{
@@ -1299,7 +1305,7 @@ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
1299 x.rlim_cur = 0x7FFFFFFF; 1305 x.rlim_cur = 0x7FFFFFFF;
1300 if (x.rlim_max > 0x7FFFFFFF) 1306 if (x.rlim_max > 0x7FFFFFFF)
1301 x.rlim_max = 0x7FFFFFFF; 1307 x.rlim_max = 0x7FFFFFFF;
1302 return copy_to_user(rlim, &x, sizeof(x))?-EFAULT:0; 1308 return copy_to_user(rlim, &x, sizeof(x)) ? -EFAULT : 0;
1303} 1309}
1304 1310
1305#endif 1311#endif
@@ -1527,7 +1533,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1527 cputime_t tgutime, tgstime, utime, stime; 1533 cputime_t tgutime, tgstime, utime, stime;
1528 unsigned long maxrss = 0; 1534 unsigned long maxrss = 0;
1529 1535
1530 memset((char *) r, 0, sizeof *r); 1536 memset((char *)r, 0, sizeof (*r));
1531 utime = stime = 0; 1537 utime = stime = 0;
1532 1538
1533 if (who == RUSAGE_THREAD) { 1539 if (who == RUSAGE_THREAD) {
@@ -1541,41 +1547,41 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1541 return; 1547 return;
1542 1548
1543 switch (who) { 1549 switch (who) {
1544 case RUSAGE_BOTH: 1550 case RUSAGE_BOTH:
1545 case RUSAGE_CHILDREN: 1551 case RUSAGE_CHILDREN:
1546 utime = p->signal->cutime; 1552 utime = p->signal->cutime;
1547 stime = p->signal->cstime; 1553 stime = p->signal->cstime;
1548 r->ru_nvcsw = p->signal->cnvcsw; 1554 r->ru_nvcsw = p->signal->cnvcsw;
1549 r->ru_nivcsw = p->signal->cnivcsw; 1555 r->ru_nivcsw = p->signal->cnivcsw;
1550 r->ru_minflt = p->signal->cmin_flt; 1556 r->ru_minflt = p->signal->cmin_flt;
1551 r->ru_majflt = p->signal->cmaj_flt; 1557 r->ru_majflt = p->signal->cmaj_flt;
1552 r->ru_inblock = p->signal->cinblock; 1558 r->ru_inblock = p->signal->cinblock;
1553 r->ru_oublock = p->signal->coublock; 1559 r->ru_oublock = p->signal->coublock;
1554 maxrss = p->signal->cmaxrss; 1560 maxrss = p->signal->cmaxrss;
1555 1561
1556 if (who == RUSAGE_CHILDREN) 1562 if (who == RUSAGE_CHILDREN)
1557 break;
1558
1559 case RUSAGE_SELF:
1560 thread_group_cputime_adjusted(p, &tgutime, &tgstime);
1561 utime += tgutime;
1562 stime += tgstime;
1563 r->ru_nvcsw += p->signal->nvcsw;
1564 r->ru_nivcsw += p->signal->nivcsw;
1565 r->ru_minflt += p->signal->min_flt;
1566 r->ru_majflt += p->signal->maj_flt;
1567 r->ru_inblock += p->signal->inblock;
1568 r->ru_oublock += p->signal->oublock;
1569 if (maxrss < p->signal->maxrss)
1570 maxrss = p->signal->maxrss;
1571 t = p;
1572 do {
1573 accumulate_thread_rusage(t, r);
1574 } while_each_thread(p, t);
1575 break; 1563 break;
1576 1564
1577 default: 1565 case RUSAGE_SELF:
1578 BUG(); 1566 thread_group_cputime_adjusted(p, &tgutime, &tgstime);
1567 utime += tgutime;
1568 stime += tgstime;
1569 r->ru_nvcsw += p->signal->nvcsw;
1570 r->ru_nivcsw += p->signal->nivcsw;
1571 r->ru_minflt += p->signal->min_flt;
1572 r->ru_majflt += p->signal->maj_flt;
1573 r->ru_inblock += p->signal->inblock;
1574 r->ru_oublock += p->signal->oublock;
1575 if (maxrss < p->signal->maxrss)
1576 maxrss = p->signal->maxrss;
1577 t = p;
1578 do {
1579 accumulate_thread_rusage(t, r);
1580 } while_each_thread(p, t);
1581 break;
1582
1583 default:
1584 BUG();
1579 } 1585 }
1580 unlock_task_sighand(p, &flags); 1586 unlock_task_sighand(p, &flags);
1581 1587
@@ -1585,6 +1591,7 @@ out:
1585 1591
1586 if (who != RUSAGE_CHILDREN) { 1592 if (who != RUSAGE_CHILDREN) {
1587 struct mm_struct *mm = get_task_mm(p); 1593 struct mm_struct *mm = get_task_mm(p);
1594
1588 if (mm) { 1595 if (mm) {
1589 setmax_mm_hiwater_rss(&maxrss, mm); 1596 setmax_mm_hiwater_rss(&maxrss, mm);
1590 mmput(mm); 1597 mmput(mm);
@@ -1596,6 +1603,7 @@ out:
1596int getrusage(struct task_struct *p, int who, struct rusage __user *ru) 1603int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
1597{ 1604{
1598 struct rusage r; 1605 struct rusage r;
1606
1599 k_getrusage(p, who, &r); 1607 k_getrusage(p, who, &r);
1600 return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; 1608 return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0;
1601} 1609}
@@ -1628,12 +1636,14 @@ SYSCALL_DEFINE1(umask, int, mask)
1628 return mask; 1636 return mask;
1629} 1637}
1630 1638
1631static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) 1639static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd)
1632{ 1640{
1633 struct fd exe; 1641 struct fd exe;
1634 struct inode *inode; 1642 struct inode *inode;
1635 int err; 1643 int err;
1636 1644
1645 VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
1646
1637 exe = fdget(fd); 1647 exe = fdget(fd);
1638 if (!exe.file) 1648 if (!exe.file)
1639 return -EBADF; 1649 return -EBADF;
@@ -1654,8 +1664,6 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1654 if (err) 1664 if (err)
1655 goto exit; 1665 goto exit;
1656 1666
1657 down_write(&mm->mmap_sem);
1658
1659 /* 1667 /*
1660 * Forbid mm->exe_file change if old file still mapped. 1668 * Forbid mm->exe_file change if old file still mapped.
1661 */ 1669 */
@@ -1667,7 +1675,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1667 if (vma->vm_file && 1675 if (vma->vm_file &&
1668 path_equal(&vma->vm_file->f_path, 1676 path_equal(&vma->vm_file->f_path,
1669 &mm->exe_file->f_path)) 1677 &mm->exe_file->f_path))
1670 goto exit_unlock; 1678 goto exit;
1671 } 1679 }
1672 1680
1673 /* 1681 /*
@@ -1678,34 +1686,222 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1678 */ 1686 */
1679 err = -EPERM; 1687 err = -EPERM;
1680 if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags)) 1688 if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags))
1681 goto exit_unlock; 1689 goto exit;
1682 1690
1683 err = 0; 1691 err = 0;
1684 set_mm_exe_file(mm, exe.file); /* this grabs a reference to exe.file */ 1692 set_mm_exe_file(mm, exe.file); /* this grabs a reference to exe.file */
1685exit_unlock:
1686 up_write(&mm->mmap_sem);
1687
1688exit: 1693exit:
1689 fdput(exe); 1694 fdput(exe);
1690 return err; 1695 return err;
1691} 1696}
1692 1697
1698#ifdef CONFIG_CHECKPOINT_RESTORE
1699/*
1700 * WARNING: we don't require any capability here so be very careful
1701 * in what is allowed for modification from userspace.
1702 */
1703static int validate_prctl_map(struct prctl_mm_map *prctl_map)
1704{
1705 unsigned long mmap_max_addr = TASK_SIZE;
1706 struct mm_struct *mm = current->mm;
1707 int error = -EINVAL, i;
1708
1709 static const unsigned char offsets[] = {
1710 offsetof(struct prctl_mm_map, start_code),
1711 offsetof(struct prctl_mm_map, end_code),
1712 offsetof(struct prctl_mm_map, start_data),
1713 offsetof(struct prctl_mm_map, end_data),
1714 offsetof(struct prctl_mm_map, start_brk),
1715 offsetof(struct prctl_mm_map, brk),
1716 offsetof(struct prctl_mm_map, start_stack),
1717 offsetof(struct prctl_mm_map, arg_start),
1718 offsetof(struct prctl_mm_map, arg_end),
1719 offsetof(struct prctl_mm_map, env_start),
1720 offsetof(struct prctl_mm_map, env_end),
1721 };
1722
1723 /*
1724 * Make sure the members are not somewhere outside
 1725 	 * of the allowed address space.
1726 */
1727 for (i = 0; i < ARRAY_SIZE(offsets); i++) {
1728 u64 val = *(u64 *)((char *)prctl_map + offsets[i]);
1729
1730 if ((unsigned long)val >= mmap_max_addr ||
1731 (unsigned long)val < mmap_min_addr)
1732 goto out;
1733 }
1734
1735 /*
1736 * Make sure the pairs are ordered.
1737 */
1738#define __prctl_check_order(__m1, __op, __m2) \
1739 ((unsigned long)prctl_map->__m1 __op \
1740 (unsigned long)prctl_map->__m2) ? 0 : -EINVAL
1741 error = __prctl_check_order(start_code, <, end_code);
1742 error |= __prctl_check_order(start_data, <, end_data);
1743 error |= __prctl_check_order(start_brk, <=, brk);
1744 error |= __prctl_check_order(arg_start, <=, arg_end);
1745 error |= __prctl_check_order(env_start, <=, env_end);
1746 if (error)
1747 goto out;
1748#undef __prctl_check_order
1749
1750 error = -EINVAL;
1751
1752 /*
1753 * @brk should be after @end_data in traditional maps.
1754 */
1755 if (prctl_map->start_brk <= prctl_map->end_data ||
1756 prctl_map->brk <= prctl_map->end_data)
1757 goto out;
1758
1759 /*
 1760 	 * Nor should we allow limits to be overridden if they are set.
1761 */
1762 if (check_data_rlimit(rlimit(RLIMIT_DATA), prctl_map->brk,
1763 prctl_map->start_brk, prctl_map->end_data,
1764 prctl_map->start_data))
1765 goto out;
1766
1767 /*
1768 * Someone is trying to cheat the auxv vector.
1769 */
1770 if (prctl_map->auxv_size) {
1771 if (!prctl_map->auxv || prctl_map->auxv_size > sizeof(mm->saved_auxv))
1772 goto out;
1773 }
1774
1775 /*
 1776 	 * Finally, make sure the caller has the right to
 1777 	 * change the /proc/pid/exe link: only local root should
 1778 	 * be allowed to do so.
1779 */
1780 if (prctl_map->exe_fd != (u32)-1) {
1781 struct user_namespace *ns = current_user_ns();
1782 const struct cred *cred = current_cred();
1783
1784 if (!uid_eq(cred->uid, make_kuid(ns, 0)) ||
1785 !gid_eq(cred->gid, make_kgid(ns, 0)))
1786 goto out;
1787 }
1788
1789 error = 0;
1790out:
1791 return error;
1792}
1793
1794static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data_size)
1795{
1796 struct prctl_mm_map prctl_map = { .exe_fd = (u32)-1, };
1797 unsigned long user_auxv[AT_VECTOR_SIZE];
1798 struct mm_struct *mm = current->mm;
1799 int error;
1800
1801 BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv));
1802 BUILD_BUG_ON(sizeof(struct prctl_mm_map) > 256);
1803
1804 if (opt == PR_SET_MM_MAP_SIZE)
1805 return put_user((unsigned int)sizeof(prctl_map),
1806 (unsigned int __user *)addr);
1807
1808 if (data_size != sizeof(prctl_map))
1809 return -EINVAL;
1810
1811 if (copy_from_user(&prctl_map, addr, sizeof(prctl_map)))
1812 return -EFAULT;
1813
1814 error = validate_prctl_map(&prctl_map);
1815 if (error)
1816 return error;
1817
1818 if (prctl_map.auxv_size) {
1819 memset(user_auxv, 0, sizeof(user_auxv));
1820 if (copy_from_user(user_auxv,
1821 (const void __user *)prctl_map.auxv,
1822 prctl_map.auxv_size))
1823 return -EFAULT;
1824
1825 /* Last entry must be AT_NULL as specification requires */
1826 user_auxv[AT_VECTOR_SIZE - 2] = AT_NULL;
1827 user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL;
1828 }
1829
1830 down_write(&mm->mmap_sem);
1831 if (prctl_map.exe_fd != (u32)-1)
1832 error = prctl_set_mm_exe_file_locked(mm, prctl_map.exe_fd);
1833 downgrade_write(&mm->mmap_sem);
1834 if (error)
1835 goto out;
1836
1837 /*
 1838 	 * We don't validate that these members point to real,
 1839 	 * present VMAs, because the application may already have
 1840 	 * unmapped the corresponding VMAs and the kernel mostly uses
 1841 	 * these members for statistics output in procfs, except for
 1842 	 *
 1843 	 * - @start_brk/@brk, which are used in do_brk, but the kernel
 1844 	 * looks up VMAs when updating these members, so bad values
 1845 	 * written here make the kernel complain about the userspace
 1846 	 * program without causing any problem in the kernel itself.
1847 */
1848
1849 mm->start_code = prctl_map.start_code;
1850 mm->end_code = prctl_map.end_code;
1851 mm->start_data = prctl_map.start_data;
1852 mm->end_data = prctl_map.end_data;
1853 mm->start_brk = prctl_map.start_brk;
1854 mm->brk = prctl_map.brk;
1855 mm->start_stack = prctl_map.start_stack;
1856 mm->arg_start = prctl_map.arg_start;
1857 mm->arg_end = prctl_map.arg_end;
1858 mm->env_start = prctl_map.env_start;
1859 mm->env_end = prctl_map.env_end;
1860
1861 /*
 1862 	 * Note this update of @saved_auxv is lockless, so
 1863 	 * anyone reading this member via procfs while we're
 1864 	 * updating may see partially updated results. This is
 1865 	 * a known and acceptable trade-off: we leave it as is
 1866 	 * rather than introduce additional locks and make the
 1867 	 * kernel more complex.
1868 */
1869 if (prctl_map.auxv_size)
1870 memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv));
1871
1872 error = 0;
1873out:
1874 up_read(&mm->mmap_sem);
1875 return error;
1876}
1877#endif /* CONFIG_CHECKPOINT_RESTORE */
1878
1693static int prctl_set_mm(int opt, unsigned long addr, 1879static int prctl_set_mm(int opt, unsigned long addr,
1694 unsigned long arg4, unsigned long arg5) 1880 unsigned long arg4, unsigned long arg5)
1695{ 1881{
1696 unsigned long rlim = rlimit(RLIMIT_DATA);
1697 struct mm_struct *mm = current->mm; 1882 struct mm_struct *mm = current->mm;
1698 struct vm_area_struct *vma; 1883 struct vm_area_struct *vma;
1699 int error; 1884 int error;
1700 1885
1701 if (arg5 || (arg4 && opt != PR_SET_MM_AUXV)) 1886 if (arg5 || (arg4 && (opt != PR_SET_MM_AUXV &&
1887 opt != PR_SET_MM_MAP &&
1888 opt != PR_SET_MM_MAP_SIZE)))
1702 return -EINVAL; 1889 return -EINVAL;
1703 1890
1891#ifdef CONFIG_CHECKPOINT_RESTORE
1892 if (opt == PR_SET_MM_MAP || opt == PR_SET_MM_MAP_SIZE)
1893 return prctl_set_mm_map(opt, (const void __user *)addr, arg4);
1894#endif
1895
1704 if (!capable(CAP_SYS_RESOURCE)) 1896 if (!capable(CAP_SYS_RESOURCE))
1705 return -EPERM; 1897 return -EPERM;
1706 1898
1707 if (opt == PR_SET_MM_EXE_FILE) 1899 if (opt == PR_SET_MM_EXE_FILE) {
1708 return prctl_set_mm_exe_file(mm, (unsigned int)addr); 1900 down_write(&mm->mmap_sem);
1901 error = prctl_set_mm_exe_file_locked(mm, (unsigned int)addr);
1902 up_write(&mm->mmap_sem);
1903 return error;
1904 }
1709 1905
1710 if (addr >= TASK_SIZE || addr < mmap_min_addr) 1906 if (addr >= TASK_SIZE || addr < mmap_min_addr)
1711 return -EINVAL; 1907 return -EINVAL;
@@ -1733,9 +1929,8 @@ static int prctl_set_mm(int opt, unsigned long addr,
1733 if (addr <= mm->end_data) 1929 if (addr <= mm->end_data)
1734 goto out; 1930 goto out;
1735 1931
1736 if (rlim < RLIM_INFINITY && 1932 if (check_data_rlimit(rlimit(RLIMIT_DATA), mm->brk, addr,
1737 (mm->brk - addr) + 1933 mm->end_data, mm->start_data))
1738 (mm->end_data - mm->start_data) > rlim)
1739 goto out; 1934 goto out;
1740 1935
1741 mm->start_brk = addr; 1936 mm->start_brk = addr;
@@ -1745,9 +1940,8 @@ static int prctl_set_mm(int opt, unsigned long addr,
1745 if (addr <= mm->end_data) 1940 if (addr <= mm->end_data)
1746 goto out; 1941 goto out;
1747 1942
1748 if (rlim < RLIM_INFINITY && 1943 if (check_data_rlimit(rlimit(RLIMIT_DATA), addr, mm->start_brk,
1749 (addr - mm->start_brk) + 1944 mm->end_data, mm->start_data))
1750 (mm->end_data - mm->start_data) > rlim)
1751 goto out; 1945 goto out;
1752 1946
1753 mm->brk = addr; 1947 mm->brk = addr;
@@ -2023,6 +2217,7 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
2023{ 2217{
2024 int err = 0; 2218 int err = 0;
2025 int cpu = raw_smp_processor_id(); 2219 int cpu = raw_smp_processor_id();
2220
2026 if (cpup) 2221 if (cpup)
2027 err |= put_user(cpu, cpup); 2222 err |= put_user(cpu, cpup);
2028 if (nodep) 2223 if (nodep)
@@ -2135,7 +2330,7 @@ COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info)
2135 /* Check to see if any memory value is too large for 32-bit and scale 2330 /* Check to see if any memory value is too large for 32-bit and scale
2136 * down if needed 2331 * down if needed
2137 */ 2332 */
2138 if ((s.totalram >> 32) || (s.totalswap >> 32)) { 2333 if (upper_32_bits(s.totalram) || upper_32_bits(s.totalswap)) {
2139 int bitcount = 0; 2334 int bitcount = 0;
2140 2335
2141 while (s.mem_unit < PAGE_SIZE) { 2336 while (s.mem_unit < PAGE_SIZE) {
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 75875a741b5e..91180987e40e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1460,13 +1460,6 @@ static struct ctl_table vm_table[] = {
1460 .extra2 = &one, 1460 .extra2 = &one,
1461 }, 1461 },
1462#endif 1462#endif
1463 {
1464 .procname = "scan_unevictable_pages",
1465 .data = &scan_unevictable_pages,
1466 .maxlen = sizeof(scan_unevictable_pages),
1467 .mode = 0644,
1468 .proc_handler = scan_unevictable_handler,
1469 },
1470#ifdef CONFIG_MEMORY_FAILURE 1463#ifdef CONFIG_MEMORY_FAILURE
1471 { 1464 {
1472 .procname = "memory_failure_early_kill", 1465 .procname = "memory_failure_early_kill",
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index a8d6914030fe..7b223b212683 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -47,6 +47,7 @@ static DEFINE_PER_CPU(bool, softlockup_touch_sync);
47static DEFINE_PER_CPU(bool, soft_watchdog_warn); 47static DEFINE_PER_CPU(bool, soft_watchdog_warn);
48static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); 48static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
49static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt); 49static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt);
50static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved);
50#ifdef CONFIG_HARDLOCKUP_DETECTOR 51#ifdef CONFIG_HARDLOCKUP_DETECTOR
51static DEFINE_PER_CPU(bool, hard_watchdog_warn); 52static DEFINE_PER_CPU(bool, hard_watchdog_warn);
52static DEFINE_PER_CPU(bool, watchdog_nmi_touch); 53static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
@@ -333,8 +334,22 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
333 return HRTIMER_RESTART; 334 return HRTIMER_RESTART;
334 335
335 /* only warn once */ 336 /* only warn once */
336 if (__this_cpu_read(soft_watchdog_warn) == true) 337 if (__this_cpu_read(soft_watchdog_warn) == true) {
338 /*
339 * When multiple processes are causing softlockups the
340 * softlockup detector only warns on the first one
341 * because the code relies on a full quiet cycle to
342 * re-arm. The second process prevents the quiet cycle
343 * and never gets reported. Use task pointers to detect
344 * this.
345 */
346 if (__this_cpu_read(softlockup_task_ptr_saved) !=
347 current) {
348 __this_cpu_write(soft_watchdog_warn, false);
349 __touch_watchdog();
350 }
337 return HRTIMER_RESTART; 351 return HRTIMER_RESTART;
352 }
338 353
339 if (softlockup_all_cpu_backtrace) { 354 if (softlockup_all_cpu_backtrace) {
340 /* Prevent multiple soft-lockup reports if one cpu is already 355 /* Prevent multiple soft-lockup reports if one cpu is already
@@ -350,6 +365,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
350 pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", 365 pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
351 smp_processor_id(), duration, 366 smp_processor_id(), duration,
352 current->comm, task_pid_nr(current)); 367 current->comm, task_pid_nr(current));
368 __this_cpu_write(softlockup_task_ptr_saved, current);
353 print_modules(); 369 print_modules();
354 print_irqtrace_events(current); 370 print_irqtrace_events(current);
355 if (regs) 371 if (regs)
diff --git a/lib/genalloc.c b/lib/genalloc.c
index 38d2db82228c..cce4dd68c40d 100644
--- a/lib/genalloc.c
+++ b/lib/genalloc.c
@@ -403,6 +403,35 @@ void gen_pool_for_each_chunk(struct gen_pool *pool,
403EXPORT_SYMBOL(gen_pool_for_each_chunk); 403EXPORT_SYMBOL(gen_pool_for_each_chunk);
404 404
405/** 405/**
406 * addr_in_gen_pool - checks if an address falls within the range of a pool
407 * @pool: the generic memory pool
408 * @start: start address
409 * @size: size of the region
410 *
411 * Check if the range of addresses falls within the specified pool. Returns
412 * true if the entire range is contained in the pool and false otherwise.
413 */
414bool addr_in_gen_pool(struct gen_pool *pool, unsigned long start,
415 size_t size)
416{
417 bool found = false;
418 unsigned long end = start + size;
419 struct gen_pool_chunk *chunk;
420
421 rcu_read_lock();
422 list_for_each_entry_rcu(chunk, &(pool)->chunks, next_chunk) {
423 if (start >= chunk->start_addr && start <= chunk->end_addr) {
424 if (end <= chunk->end_addr) {
425 found = true;
426 break;
427 }
428 }
429 }
430 rcu_read_unlock();
431 return found;
432}
433
434/**
406 * gen_pool_avail - get available free space of the pool 435 * gen_pool_avail - get available free space of the pool
407 * @pool: pool to get available free space 436 * @pool: pool to get available free space
408 * 437 *
@@ -481,6 +510,26 @@ unsigned long gen_pool_first_fit(unsigned long *map, unsigned long size,
481EXPORT_SYMBOL(gen_pool_first_fit); 510EXPORT_SYMBOL(gen_pool_first_fit);
482 511
483/** 512/**
513 * gen_pool_first_fit_order_align - find the first available region
514 * of memory matching the size requirement. The region will be aligned
515 * to the order of the size specified.
516 * @map: The address to base the search on
517 * @size: The bitmap size in bits
518 * @start: The bitnumber to start searching at
519 * @nr: The number of zeroed bits we're looking for
520 * @data: additional data - unused
521 */
522unsigned long gen_pool_first_fit_order_align(unsigned long *map,
523 unsigned long size, unsigned long start,
524 unsigned int nr, void *data)
525{
526 unsigned long align_mask = roundup_pow_of_two(nr) - 1;
527
528 return bitmap_find_next_zero_area(map, size, start, nr, align_mask);
529}
530EXPORT_SYMBOL(gen_pool_first_fit_order_align);
531
532/**
484 * gen_pool_best_fit - find the best fitting region of memory 533 * gen_pool_best_fit - find the best fitting region of memory
485 * matching the size requirement (no alignment constraint) 534
486 * @map: The address to base the search on 535 * @map: The address to base the search on
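
A hypothetical kernel-side sketch of how a caller (for example, a DMA atomic-pool implementation) might combine the two new helpers: gen_pool_first_fit_order_align() as the pool's allocation algorithm and addr_in_gen_pool() as a cheap ownership test before freeing. The setup values and example_* names are illustrative; gen_pool_create(), gen_pool_set_algo(), gen_pool_add() and gen_pool_destroy() are existing genalloc APIs.

#include <linux/errno.h>
#include <linux/genalloc.h>
#include <linux/mm.h>

static struct gen_pool *example_pool;

/* Illustrative pool setup: page-granular allocations, order-aligned. */
static int example_pool_init(unsigned long vaddr, size_t size)
{
	int ret;

	example_pool = gen_pool_create(PAGE_SHIFT, -1);
	if (!example_pool)
		return -ENOMEM;

	/* Align each allocation to the order of its size. */
	gen_pool_set_algo(example_pool, gen_pool_first_fit_order_align, NULL);

	ret = gen_pool_add(example_pool, vaddr, size, -1);
	if (ret)
		gen_pool_destroy(example_pool);
	return ret;
}

/* Cheap ownership test before freeing back into the pool. */
static bool example_addr_from_pool(void *addr, size_t size)
{
	return addr_in_gen_pool(example_pool, (unsigned long)addr, size);
}
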
diff --git a/mm/Kconfig b/mm/Kconfig
index 886db2158538..1d1ae6b078fd 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -137,6 +137,9 @@ config HAVE_MEMBLOCK_NODE_MAP
137config HAVE_MEMBLOCK_PHYS_MAP 137config HAVE_MEMBLOCK_PHYS_MAP
138 boolean 138 boolean
139 139
140config HAVE_GENERIC_RCU_GUP
141 boolean
142
140config ARCH_DISCARD_MEMBLOCK 143config ARCH_DISCARD_MEMBLOCK
141 boolean 144 boolean
142 145
@@ -228,11 +231,16 @@ config ARCH_ENABLE_SPLIT_PMD_PTLOCK
228 boolean 231 boolean
229 232
230# 233#
234# support for memory balloon
235config MEMORY_BALLOON
236 boolean
237
238#
231# support for memory balloon compaction 239# support for memory balloon compaction
232config BALLOON_COMPACTION 240config BALLOON_COMPACTION
233 bool "Allow for balloon memory compaction/migration" 241 bool "Allow for balloon memory compaction/migration"
234 def_bool y 242 def_bool y
235 depends on COMPACTION && VIRTIO_BALLOON 243 depends on COMPACTION && MEMORY_BALLOON
236 help 244 help
237 Memory fragmentation introduced by ballooning might reduce 245 Memory fragmentation introduced by ballooning might reduce
238 significantly the number of 2MB contiguous memory blocks that can be 246 significantly the number of 2MB contiguous memory blocks that can be
diff --git a/mm/Makefile b/mm/Makefile
index fe7a053c0f45..1f534a7f0a71 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -16,9 +16,9 @@ obj-y := filemap.o mempool.o oom_kill.o \
16 readahead.o swap.o truncate.o vmscan.o shmem.o \ 16 readahead.o swap.o truncate.o vmscan.o shmem.o \
17 util.o mmzone.o vmstat.o backing-dev.o \ 17 util.o mmzone.o vmstat.o backing-dev.o \
18 mm_init.o mmu_context.o percpu.o slab_common.o \ 18 mm_init.o mmu_context.o percpu.o slab_common.o \
19 compaction.o balloon_compaction.o vmacache.o \ 19 compaction.o vmacache.o \
20 interval_tree.o list_lru.o workingset.o \ 20 interval_tree.o list_lru.o workingset.o \
21 iov_iter.o $(mmu-y) 21 iov_iter.o debug.o $(mmu-y)
22 22
23obj-y += init-mm.o 23obj-y += init-mm.o
24 24
@@ -67,3 +67,4 @@ obj-$(CONFIG_ZBUD) += zbud.o
67obj-$(CONFIG_ZSMALLOC) += zsmalloc.o 67obj-$(CONFIG_ZSMALLOC) += zsmalloc.o
68obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o 68obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
69obj-$(CONFIG_CMA) += cma.o 69obj-$(CONFIG_CMA) += cma.o
70obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 1706cbbdf5f0..b27714f1b40f 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -631,7 +631,7 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout)
631 * of sleeping on the congestion queue 631 * of sleeping on the congestion queue
632 */ 632 */
633 if (atomic_read(&nr_bdi_congested[sync]) == 0 || 633 if (atomic_read(&nr_bdi_congested[sync]) == 0 ||
634 !zone_is_reclaim_congested(zone)) { 634 !test_bit(ZONE_CONGESTED, &zone->flags)) {
635 cond_resched(); 635 cond_resched();
636 636
637 /* In case we scheduled, work out time remaining */ 637 /* In case we scheduled, work out time remaining */
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index 6e45a5074bf0..b3cbe19f71b5 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -11,32 +11,6 @@
11#include <linux/balloon_compaction.h> 11#include <linux/balloon_compaction.h>
12 12
13/* 13/*
14 * balloon_devinfo_alloc - allocates a balloon device information descriptor.
15 * @balloon_dev_descriptor: pointer to reference the balloon device which
16 * this struct balloon_dev_info will be servicing.
17 *
18 * Driver must call it to properly allocate and initialize an instance of
19 * struct balloon_dev_info which will be used to reference a balloon device
20 * as well as to keep track of the balloon device page list.
21 */
22struct balloon_dev_info *balloon_devinfo_alloc(void *balloon_dev_descriptor)
23{
24 struct balloon_dev_info *b_dev_info;
25 b_dev_info = kmalloc(sizeof(*b_dev_info), GFP_KERNEL);
26 if (!b_dev_info)
27 return ERR_PTR(-ENOMEM);
28
29 b_dev_info->balloon_device = balloon_dev_descriptor;
30 b_dev_info->mapping = NULL;
31 b_dev_info->isolated_pages = 0;
32 spin_lock_init(&b_dev_info->pages_lock);
33 INIT_LIST_HEAD(&b_dev_info->pages);
34
35 return b_dev_info;
36}
37EXPORT_SYMBOL_GPL(balloon_devinfo_alloc);
38
39/*
40 * balloon_page_enqueue - allocates a new page and inserts it into the balloon 14 * balloon_page_enqueue - allocates a new page and inserts it into the balloon
41 * page list. 15 * page list.
42 * @b_dev_info: balloon device descriptor where we will insert a new page to 16
@@ -61,7 +35,8 @@ struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info)
61 */ 35 */
62 BUG_ON(!trylock_page(page)); 36 BUG_ON(!trylock_page(page));
63 spin_lock_irqsave(&b_dev_info->pages_lock, flags); 37 spin_lock_irqsave(&b_dev_info->pages_lock, flags);
64 balloon_page_insert(page, b_dev_info->mapping, &b_dev_info->pages); 38 balloon_page_insert(b_dev_info, page);
39 __count_vm_event(BALLOON_INFLATE);
65 spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); 40 spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
66 unlock_page(page); 41 unlock_page(page);
67 return page; 42 return page;
@@ -93,18 +68,14 @@ struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info)
93 * to be released by the balloon driver. 68 * to be released by the balloon driver.
94 */ 69 */
95 if (trylock_page(page)) { 70 if (trylock_page(page)) {
71 if (!PagePrivate(page)) {
72 /* raced with isolation */
73 unlock_page(page);
74 continue;
75 }
96 spin_lock_irqsave(&b_dev_info->pages_lock, flags); 76 spin_lock_irqsave(&b_dev_info->pages_lock, flags);
97 /*
98 * Raise the page refcount here to prevent any wrong
99 * attempt to isolate this page, in case of coliding
100 * with balloon_page_isolate() just after we release
101 * the page lock.
102 *
103 * balloon_page_free() will take care of dropping
104 * this extra refcount later.
105 */
106 get_page(page);
107 balloon_page_delete(page); 77 balloon_page_delete(page);
78 __count_vm_event(BALLOON_DEFLATE);
108 spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); 79 spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
109 unlock_page(page); 80 unlock_page(page);
110 dequeued_page = true; 81 dequeued_page = true;
@@ -132,62 +103,14 @@ struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info)
132EXPORT_SYMBOL_GPL(balloon_page_dequeue); 103EXPORT_SYMBOL_GPL(balloon_page_dequeue);
133 104
134#ifdef CONFIG_BALLOON_COMPACTION 105#ifdef CONFIG_BALLOON_COMPACTION
135/*
136 * balloon_mapping_alloc - allocates a special ->mapping for ballooned pages.
137 * @b_dev_info: holds the balloon device information descriptor.
138 * @a_ops: balloon_mapping address_space_operations descriptor.
139 *
140 * Driver must call it to properly allocate and initialize an instance of
141 * struct address_space which will be used as the special page->mapping for
142 * balloon device enlisted page instances.
143 */
144struct address_space *balloon_mapping_alloc(struct balloon_dev_info *b_dev_info,
145 const struct address_space_operations *a_ops)
146{
147 struct address_space *mapping;
148
149 mapping = kmalloc(sizeof(*mapping), GFP_KERNEL);
150 if (!mapping)
151 return ERR_PTR(-ENOMEM);
152
153 /*
154 * Give a clean 'zeroed' status to all elements of this special
155 * balloon page->mapping struct address_space instance.
156 */
157 address_space_init_once(mapping);
158
159 /*
160 * Set mapping->flags appropriately, to allow balloon pages
161 * ->mapping identification.
162 */
163 mapping_set_balloon(mapping);
164 mapping_set_gfp_mask(mapping, balloon_mapping_gfp_mask());
165
166 /* balloon's page->mapping->a_ops callback descriptor */
167 mapping->a_ops = a_ops;
168
169 /*
170 * Establish a pointer reference back to the balloon device descriptor
171 * this particular page->mapping will be servicing.
172 * This is used by compaction / migration procedures to identify and
173 * access the balloon device pageset while isolating / migrating pages.
174 *
175 * As some balloon drivers can register multiple balloon devices
176 * for a single guest, this also helps compaction / migration to
177 * properly deal with multiple balloon pagesets, when required.
178 */
179 mapping->private_data = b_dev_info;
180 b_dev_info->mapping = mapping;
181
182 return mapping;
183}
184EXPORT_SYMBOL_GPL(balloon_mapping_alloc);
185 106
186static inline void __isolate_balloon_page(struct page *page) 107static inline void __isolate_balloon_page(struct page *page)
187{ 108{
188 struct balloon_dev_info *b_dev_info = page->mapping->private_data; 109 struct balloon_dev_info *b_dev_info = balloon_page_device(page);
189 unsigned long flags; 110 unsigned long flags;
111
190 spin_lock_irqsave(&b_dev_info->pages_lock, flags); 112 spin_lock_irqsave(&b_dev_info->pages_lock, flags);
113 ClearPagePrivate(page);
191 list_del(&page->lru); 114 list_del(&page->lru);
192 b_dev_info->isolated_pages++; 115 b_dev_info->isolated_pages++;
193 spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); 116 spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
@@ -195,20 +118,16 @@ static inline void __isolate_balloon_page(struct page *page)
195 118
196static inline void __putback_balloon_page(struct page *page) 119static inline void __putback_balloon_page(struct page *page)
197{ 120{
198 struct balloon_dev_info *b_dev_info = page->mapping->private_data; 121 struct balloon_dev_info *b_dev_info = balloon_page_device(page);
199 unsigned long flags; 122 unsigned long flags;
123
200 spin_lock_irqsave(&b_dev_info->pages_lock, flags); 124 spin_lock_irqsave(&b_dev_info->pages_lock, flags);
125 SetPagePrivate(page);
201 list_add(&page->lru, &b_dev_info->pages); 126 list_add(&page->lru, &b_dev_info->pages);
202 b_dev_info->isolated_pages--; 127 b_dev_info->isolated_pages--;
203 spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); 128 spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
204} 129}
205 130
206static inline int __migrate_balloon_page(struct address_space *mapping,
207 struct page *newpage, struct page *page, enum migrate_mode mode)
208{
209 return page->mapping->a_ops->migratepage(mapping, newpage, page, mode);
210}
211
212/* __isolate_lru_page() counterpart for a ballooned page */ 131/* __isolate_lru_page() counterpart for a ballooned page */
213bool balloon_page_isolate(struct page *page) 132bool balloon_page_isolate(struct page *page)
214{ 133{
@@ -235,12 +154,11 @@ bool balloon_page_isolate(struct page *page)
235 */ 154 */
236 if (likely(trylock_page(page))) { 155 if (likely(trylock_page(page))) {
237 /* 156 /*
238 * A ballooned page, by default, has just one refcount. 157 * A ballooned page, by default, has PagePrivate set.
239 * Prevent concurrent compaction threads from isolating 158 * Prevent concurrent compaction threads from isolating
240 * an already isolated balloon page by refcount check. 159 * an already isolated balloon page by clearing it.
241 */ 160 */
242 if (__is_movable_balloon_page(page) && 161 if (balloon_page_movable(page)) {
243 page_count(page) == 2) {
244 __isolate_balloon_page(page); 162 __isolate_balloon_page(page);
245 unlock_page(page); 163 unlock_page(page);
246 return true; 164 return true;
@@ -276,7 +194,7 @@ void balloon_page_putback(struct page *page)
276int balloon_page_migrate(struct page *newpage, 194int balloon_page_migrate(struct page *newpage,
277 struct page *page, enum migrate_mode mode) 195 struct page *page, enum migrate_mode mode)
278{ 196{
279 struct address_space *mapping; 197 struct balloon_dev_info *balloon = balloon_page_device(page);
280 int rc = -EAGAIN; 198 int rc = -EAGAIN;
281 199
282 /* 200 /*
@@ -292,9 +210,8 @@ int balloon_page_migrate(struct page *newpage,
292 return rc; 210 return rc;
293 } 211 }
294 212
295 mapping = page->mapping; 213 if (balloon && balloon->migratepage)
296 if (mapping) 214 rc = balloon->migratepage(balloon, newpage, page, mode);
297 rc = __migrate_balloon_page(mapping, newpage, page, mode);
298 215
299 unlock_page(newpage); 216 unlock_page(newpage);
300 return rc; 217 return rc;
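
The balloon_compaction.c changes above stop inferring isolation from the page refcount and instead key it off PagePrivate: the flag is set while a page sits on the balloon list, cleared by __isolate_balloon_page() and restored by __putback_balloon_page(), always under pages_lock, and migration now goes through the balloon_dev_info callback rather than page->mapping->a_ops. A minimal userspace model of the flag-under-lock part is sketched below; struct toy_page, struct toy_balloon_dev and the pthread mutex are invented stand-ins, not kernel types.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* Toy stand-ins for the balloon device and its pages (not kernel types). */
struct toy_page {
	bool private_set;		/* models PagePrivate: set while on the balloon list */
};

struct toy_balloon_dev {
	pthread_mutex_t pages_lock;
	int isolated_pages;
};

/* Isolation succeeds at most once: the flag is the "already isolated?" marker. */
static bool toy_isolate(struct toy_balloon_dev *b, struct toy_page *p)
{
	bool ok = false;

	pthread_mutex_lock(&b->pages_lock);
	if (p->private_set) {
		p->private_set = false;		/* like ClearPagePrivate() */
		b->isolated_pages++;
		ok = true;
	}
	pthread_mutex_unlock(&b->pages_lock);
	return ok;
}

static void toy_putback(struct toy_balloon_dev *b, struct toy_page *p)
{
	pthread_mutex_lock(&b->pages_lock);
	p->private_set = true;			/* like SetPagePrivate() */
	b->isolated_pages--;
	pthread_mutex_unlock(&b->pages_lock);
}

int main(void)
{
	struct toy_balloon_dev b = { PTHREAD_MUTEX_INITIALIZER, 0 };
	struct toy_page p = { true };

	printf("first isolate: %d\n", toy_isolate(&b, &p));	/* 1 */
	printf("second isolate: %d\n", toy_isolate(&b, &p));	/* 0, already isolated */
	toy_putback(&b, &p);
	return 0;
}

Because the flag is only tested and flipped under the same lock, two concurrent isolators cannot both see it set, which is the property the patch relies on.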
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 90bd3507b413..8a000cebb0d7 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -16,9 +16,9 @@
16#include <linux/kmemleak.h> 16#include <linux/kmemleak.h>
17#include <linux/range.h> 17#include <linux/range.h>
18#include <linux/memblock.h> 18#include <linux/memblock.h>
19#include <linux/bug.h>
20#include <linux/io.h>
19 21
20#include <asm/bug.h>
21#include <asm/io.h>
22#include <asm/processor.h> 22#include <asm/processor.h>
23 23
24#include "internal.h" 24#include "internal.h"
diff --git a/mm/cma.c b/mm/cma.c
index c17751c0dcaf..474c644a0dc6 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -32,6 +32,7 @@
32#include <linux/slab.h> 32#include <linux/slab.h>
33#include <linux/log2.h> 33#include <linux/log2.h>
34#include <linux/cma.h> 34#include <linux/cma.h>
35#include <linux/highmem.h>
35 36
36struct cma { 37struct cma {
37 unsigned long base_pfn; 38 unsigned long base_pfn;
@@ -163,6 +164,8 @@ int __init cma_declare_contiguous(phys_addr_t base,
163 bool fixed, struct cma **res_cma) 164 bool fixed, struct cma **res_cma)
164{ 165{
165 struct cma *cma; 166 struct cma *cma;
167 phys_addr_t memblock_end = memblock_end_of_DRAM();
168 phys_addr_t highmem_start = __pa(high_memory);
166 int ret = 0; 169 int ret = 0;
167 170
168 pr_debug("%s(size %lx, base %08lx, limit %08lx alignment %08lx)\n", 171 pr_debug("%s(size %lx, base %08lx, limit %08lx alignment %08lx)\n",
@@ -196,6 +199,24 @@ int __init cma_declare_contiguous(phys_addr_t base,
196 if (!IS_ALIGNED(size >> PAGE_SHIFT, 1 << order_per_bit)) 199 if (!IS_ALIGNED(size >> PAGE_SHIFT, 1 << order_per_bit))
197 return -EINVAL; 200 return -EINVAL;
198 201
202 /*
203 * adjust limit to avoid crossing low/high memory boundary for
204 * automatically allocated regions
205 */
206 if (((limit == 0 || limit > memblock_end) &&
207 (memblock_end - size < highmem_start &&
208 memblock_end > highmem_start)) ||
209 (!fixed && limit > highmem_start && limit - size < highmem_start)) {
210 limit = highmem_start;
211 }
212
213 if (fixed && base < highmem_start && base+size > highmem_start) {
214 ret = -EINVAL;
215 pr_err("Region at %08lx defined on low/high memory boundary (%08lx)\n",
216 (unsigned long)base, (unsigned long)highmem_start);
217 goto err;
218 }
219
199 /* Reserve memory */ 220 /* Reserve memory */
200 if (base && fixed) { 221 if (base && fixed) {
201 if (memblock_is_region_reserved(base, size) || 222 if (memblock_is_region_reserved(base, size) ||
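
The new checks in cma_declare_contiguous() clamp the search limit so an automatically placed region cannot straddle the low/high memory boundary, and reject a fixed region that would. The helper below is a hedged restatement of that placement rule with plain integers; adjust_cma_limit(), phys_t and the sample addresses are illustrative, not kernel symbols.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t phys_t;

/*
 * Clamp *limit so an automatically placed region of "size" bytes cannot
 * cross highmem_start; reject a fixed region that would straddle it.
 * Returns false for the invalid fixed case, true otherwise.
 */
static bool adjust_cma_limit(phys_t base, phys_t size, bool fixed,
			     phys_t memblock_end, phys_t highmem_start,
			     phys_t *limit)
{
	if (((*limit == 0 || *limit > memblock_end) &&
	     (memblock_end - size < highmem_start && memblock_end > highmem_start)) ||
	    (!fixed && *limit > highmem_start && *limit - size < highmem_start))
		*limit = highmem_start;

	if (fixed && base < highmem_start && base + size > highmem_start)
		return false;	/* region defined on the low/high boundary */

	return true;
}

int main(void)
{
	phys_t limit = 0;
	bool ok = adjust_cma_limit(0, 256ULL << 20, false,
				   1ULL << 30, 896ULL << 20, &limit);

	/* prints: ok=1 limit=0x38000000 (clamped to the highmem boundary) */
	printf("ok=%d limit=%#llx\n", ok, (unsigned long long)limit);
	return 0;
}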
diff --git a/mm/compaction.c b/mm/compaction.c
index 21bf292b642a..edba18aed173 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -67,6 +67,49 @@ static inline bool migrate_async_suitable(int migratetype)
67 return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; 67 return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
68} 68}
69 69
70/*
71 * Check that the whole (or subset of) a pageblock given by the interval of
72 * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
 73 * with the migration or free compaction scanner. The scanners then need to
74 * use only pfn_valid_within() check for arches that allow holes within
75 * pageblocks.
76 *
77 * Return struct page pointer of start_pfn, or NULL if checks were not passed.
78 *
79 * It's possible on some configurations to have a setup like node0 node1 node0
 80 * i.e. it's possible that all pages within a zone's range of pages do not
81 * belong to a single zone. We assume that a border between node0 and node1
82 * can occur within a single pageblock, but not a node0 node1 node0
83 * interleaving within a single pageblock. It is therefore sufficient to check
84 * the first and last page of a pageblock and avoid checking each individual
85 * page in a pageblock.
86 */
87static struct page *pageblock_pfn_to_page(unsigned long start_pfn,
88 unsigned long end_pfn, struct zone *zone)
89{
90 struct page *start_page;
91 struct page *end_page;
92
93 /* end_pfn is one past the range we are checking */
94 end_pfn--;
95
96 if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
97 return NULL;
98
99 start_page = pfn_to_page(start_pfn);
100
101 if (page_zone(start_page) != zone)
102 return NULL;
103
104 end_page = pfn_to_page(end_pfn);
105
106 /* This gives a shorter code than deriving page_zone(end_page) */
107 if (page_zone_id(start_page) != page_zone_id(end_page))
108 return NULL;
109
110 return start_page;
111}
112
70#ifdef CONFIG_COMPACTION 113#ifdef CONFIG_COMPACTION
71/* Returns true if the pageblock should be scanned for pages to isolate. */ 114/* Returns true if the pageblock should be scanned for pages to isolate. */
72static inline bool isolation_suitable(struct compact_control *cc, 115static inline bool isolation_suitable(struct compact_control *cc,
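
pageblock_pfn_to_page() above leans on the stated assumption that a node boundary can cut a pageblock at most once, so checking the zone of the first and last pfn is enough to validate the whole block. A standalone sketch of that endpoint-only check follows; zone_of(), block_in_zone() and the zone_map table are made up for illustration.

#include <stdio.h>

/* Invented for illustration: which "zone" a pfn belongs to. */
static int zone_of(unsigned long pfn, const int *zone_map)
{
	return zone_map[pfn];
}

/*
 * Return the start pfn if [start, end) lies entirely in "zone", or -1.
 * Only the two endpoints are inspected, mirroring the first/last-page check.
 */
static long block_in_zone(unsigned long start, unsigned long end,
			  int zone, const int *zone_map)
{
	end--;					/* end is one past the range */
	if (zone_of(start, zone_map) != zone)
		return -1;
	if (zone_of(start, zone_map) != zone_of(end, zone_map))
		return -1;
	return (long)start;
}

int main(void)
{
	/* node0 pages, then node1 pages: the boundary splits the second block. */
	int zone_map[16] = { 0,0,0,0,0,0,0,0, 0,0,0,0,1,1,1,1 };

	printf("%ld\n", block_in_zone(0, 8, 0, zone_map));	/* 0: fully node0 */
	printf("%ld\n", block_in_zone(8, 16, 0, zone_map));	/* -1: straddles  */
	return 0;
}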
@@ -132,7 +175,7 @@ void reset_isolation_suitable(pg_data_t *pgdat)
132 */ 175 */
133static void update_pageblock_skip(struct compact_control *cc, 176static void update_pageblock_skip(struct compact_control *cc,
134 struct page *page, unsigned long nr_isolated, 177 struct page *page, unsigned long nr_isolated,
135 bool set_unsuitable, bool migrate_scanner) 178 bool migrate_scanner)
136{ 179{
137 struct zone *zone = cc->zone; 180 struct zone *zone = cc->zone;
138 unsigned long pfn; 181 unsigned long pfn;
@@ -146,12 +189,7 @@ static void update_pageblock_skip(struct compact_control *cc,
146 if (nr_isolated) 189 if (nr_isolated)
147 return; 190 return;
148 191
149 /* 192 set_pageblock_skip(page);
150 * Only skip pageblocks when all forms of compaction will be known to
151 * fail in the near future.
152 */
153 if (set_unsuitable)
154 set_pageblock_skip(page);
155 193
156 pfn = page_to_pfn(page); 194 pfn = page_to_pfn(page);
157 195
@@ -180,52 +218,77 @@ static inline bool isolation_suitable(struct compact_control *cc,
180 218
181static void update_pageblock_skip(struct compact_control *cc, 219static void update_pageblock_skip(struct compact_control *cc,
182 struct page *page, unsigned long nr_isolated, 220 struct page *page, unsigned long nr_isolated,
183 bool set_unsuitable, bool migrate_scanner) 221 bool migrate_scanner)
184{ 222{
185} 223}
186#endif /* CONFIG_COMPACTION */ 224#endif /* CONFIG_COMPACTION */
187 225
188static inline bool should_release_lock(spinlock_t *lock) 226/*
227 * Compaction requires the taking of some coarse locks that are potentially
228 * very heavily contended. For async compaction, back out if the lock cannot
229 * be taken immediately. For sync compaction, spin on the lock if needed.
230 *
231 * Returns true if the lock is held
232 * Returns false if the lock is not held and compaction should abort
233 */
234static bool compact_trylock_irqsave(spinlock_t *lock, unsigned long *flags,
235 struct compact_control *cc)
189{ 236{
190 return need_resched() || spin_is_contended(lock); 237 if (cc->mode == MIGRATE_ASYNC) {
238 if (!spin_trylock_irqsave(lock, *flags)) {
239 cc->contended = COMPACT_CONTENDED_LOCK;
240 return false;
241 }
242 } else {
243 spin_lock_irqsave(lock, *flags);
244 }
245
246 return true;
191} 247}
192 248
193/* 249/*
194 * Compaction requires the taking of some coarse locks that are potentially 250 * Compaction requires the taking of some coarse locks that are potentially
195 * very heavily contended. Check if the process needs to be scheduled or 251 * very heavily contended. The lock should be periodically unlocked to avoid
196 * if the lock is contended. For async compaction, back out in the event 252 * having disabled IRQs for a long time, even when there is nobody waiting on
197 * if contention is severe. For sync compaction, schedule. 253 * the lock. It might also be that allowing the IRQs will result in
254 * need_resched() becoming true. If scheduling is needed, async compaction
255 * aborts. Sync compaction schedules.
256 * Either compaction type will also abort if a fatal signal is pending.
257 * In either case if the lock was locked, it is dropped and not regained.
198 * 258 *
199 * Returns true if the lock is held. 259 * Returns true if compaction should abort due to fatal signal pending, or
200 * Returns false if the lock is released and compaction should abort 260 * async compaction due to need_resched()
261 * Returns false when compaction can continue (sync compaction might have
262 * scheduled)
201 */ 263 */
202static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, 264static bool compact_unlock_should_abort(spinlock_t *lock,
203 bool locked, struct compact_control *cc) 265 unsigned long flags, bool *locked, struct compact_control *cc)
204{ 266{
205 if (should_release_lock(lock)) { 267 if (*locked) {
206 if (locked) { 268 spin_unlock_irqrestore(lock, flags);
207 spin_unlock_irqrestore(lock, *flags); 269 *locked = false;
208 locked = false; 270 }
209 } 271
272 if (fatal_signal_pending(current)) {
273 cc->contended = COMPACT_CONTENDED_SCHED;
274 return true;
275 }
210 276
211 /* async aborts if taking too long or contended */ 277 if (need_resched()) {
212 if (cc->mode == MIGRATE_ASYNC) { 278 if (cc->mode == MIGRATE_ASYNC) {
213 cc->contended = true; 279 cc->contended = COMPACT_CONTENDED_SCHED;
214 return false; 280 return true;
215 } 281 }
216
217 cond_resched(); 282 cond_resched();
218 } 283 }
219 284
220 if (!locked) 285 return false;
221 spin_lock_irqsave(lock, *flags);
222 return true;
223} 286}
224 287
225/* 288/*
226 * Aside from avoiding lock contention, compaction also periodically checks 289 * Aside from avoiding lock contention, compaction also periodically checks
227 * need_resched() and either schedules in sync compaction or aborts async 290 * need_resched() and either schedules in sync compaction or aborts async
228 * compaction. This is similar to what compact_checklock_irqsave() does, but 291 * compaction. This is similar to what compact_unlock_should_abort() does, but
229 * is used where no lock is concerned. 292 * is used where no lock is concerned.
230 * 293 *
231 * Returns false when no scheduling was needed, or sync compaction scheduled. 294 * Returns false when no scheduling was needed, or sync compaction scheduled.
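
compact_trylock_irqsave() above encodes the acquisition policy: async compaction gives up immediately on a busy lock and records the contention, while sync compaction simply spins. A hedged pthread-based sketch of the same policy is below; toy_mode, toy_cc and toy_trylock() are invented names, and a userspace mutex stands in for the irqsave spinlock.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

enum toy_mode { TOY_ASYNC, TOY_SYNC };

struct toy_cc {
	enum toy_mode mode;
	bool contended;
};

/* Async: bail out immediately if the lock is busy. Sync: block until held. */
static bool toy_trylock(pthread_mutex_t *lock, struct toy_cc *cc)
{
	if (cc->mode == TOY_ASYNC) {
		if (pthread_mutex_trylock(lock) != 0) {
			cc->contended = true;
			return false;
		}
	} else {
		pthread_mutex_lock(lock);
	}
	return true;
}

int main(void)
{
	pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	struct toy_cc cc = { TOY_ASYNC, false };
	bool got;

	pthread_mutex_lock(&lock);		/* simulate contention */
	got = toy_trylock(&lock, &cc);
	printf("got lock: %d contended: %d\n", got, cc.contended);	/* 0, 1 */
	pthread_mutex_unlock(&lock);
	return 0;
}

Centralising the policy in one helper, as the patch does, keeps every caller's async/sync behaviour identical instead of open-coding the trylock-or-spin decision at each site.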
@@ -236,7 +299,7 @@ static inline bool compact_should_abort(struct compact_control *cc)
236 /* async compaction aborts if contended */ 299 /* async compaction aborts if contended */
237 if (need_resched()) { 300 if (need_resched()) {
238 if (cc->mode == MIGRATE_ASYNC) { 301 if (cc->mode == MIGRATE_ASYNC) {
239 cc->contended = true; 302 cc->contended = COMPACT_CONTENDED_SCHED;
240 return true; 303 return true;
241 } 304 }
242 305
@@ -250,8 +313,15 @@ static inline bool compact_should_abort(struct compact_control *cc)
250static bool suitable_migration_target(struct page *page) 313static bool suitable_migration_target(struct page *page)
251{ 314{
252 /* If the page is a large free page, then disallow migration */ 315 /* If the page is a large free page, then disallow migration */
253 if (PageBuddy(page) && page_order(page) >= pageblock_order) 316 if (PageBuddy(page)) {
254 return false; 317 /*
318 * We are checking page_order without zone->lock taken. But
319 * the only small danger is that we skip a potentially suitable
 320 * pageblock, so it's not worth checking whether the order is in a valid range.
321 */
322 if (page_order_unsafe(page) >= pageblock_order)
323 return false;
324 }
255 325
256 /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ 326 /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
257 if (migrate_async_suitable(get_pageblock_migratetype(page))) 327 if (migrate_async_suitable(get_pageblock_migratetype(page)))
@@ -267,16 +337,16 @@ static bool suitable_migration_target(struct page *page)
267 * (even though it may still end up isolating some pages). 337 * (even though it may still end up isolating some pages).
268 */ 338 */
269static unsigned long isolate_freepages_block(struct compact_control *cc, 339static unsigned long isolate_freepages_block(struct compact_control *cc,
270 unsigned long blockpfn, 340 unsigned long *start_pfn,
271 unsigned long end_pfn, 341 unsigned long end_pfn,
272 struct list_head *freelist, 342 struct list_head *freelist,
273 bool strict) 343 bool strict)
274{ 344{
275 int nr_scanned = 0, total_isolated = 0; 345 int nr_scanned = 0, total_isolated = 0;
276 struct page *cursor, *valid_page = NULL; 346 struct page *cursor, *valid_page = NULL;
277 unsigned long flags; 347 unsigned long flags = 0;
278 bool locked = false; 348 bool locked = false;
279 bool checked_pageblock = false; 349 unsigned long blockpfn = *start_pfn;
280 350
281 cursor = pfn_to_page(blockpfn); 351 cursor = pfn_to_page(blockpfn);
282 352
@@ -285,6 +355,16 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
285 int isolated, i; 355 int isolated, i;
286 struct page *page = cursor; 356 struct page *page = cursor;
287 357
358 /*
359 * Periodically drop the lock (if held) regardless of its
360 * contention, to give chance to IRQs. Abort if fatal signal
361 * pending or async compaction detects need_resched()
362 */
363 if (!(blockpfn % SWAP_CLUSTER_MAX)
364 && compact_unlock_should_abort(&cc->zone->lock, flags,
365 &locked, cc))
366 break;
367
288 nr_scanned++; 368 nr_scanned++;
289 if (!pfn_valid_within(blockpfn)) 369 if (!pfn_valid_within(blockpfn))
290 goto isolate_fail; 370 goto isolate_fail;
@@ -295,33 +375,30 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
295 goto isolate_fail; 375 goto isolate_fail;
296 376
297 /* 377 /*
298 * The zone lock must be held to isolate freepages. 378 * If we already hold the lock, we can skip some rechecking.
299 * Unfortunately this is a very coarse lock and can be 379 * Note that if we hold the lock now, checked_pageblock was
300 * heavily contended if there are parallel allocations 380 * already set in some previous iteration (or strict is true),
301 * or parallel compactions. For async compaction do not 381 * so it is correct to skip the suitable migration target
302 * spin on the lock and we acquire the lock as late as 382 * recheck as well.
303 * possible.
304 */ 383 */
305 locked = compact_checklock_irqsave(&cc->zone->lock, &flags, 384 if (!locked) {
306 locked, cc);
307 if (!locked)
308 break;
309
310 /* Recheck this is a suitable migration target under lock */
311 if (!strict && !checked_pageblock) {
312 /* 385 /*
313 * We need to check suitability of pageblock only once 386 * The zone lock must be held to isolate freepages.
314 * and this isolate_freepages_block() is called with 387 * Unfortunately this is a very coarse lock and can be
315 * pageblock range, so just check once is sufficient. 388 * heavily contended if there are parallel allocations
389 * or parallel compactions. For async compaction do not
390 * spin on the lock and we acquire the lock as late as
391 * possible.
316 */ 392 */
317 checked_pageblock = true; 393 locked = compact_trylock_irqsave(&cc->zone->lock,
318 if (!suitable_migration_target(page)) 394 &flags, cc);
395 if (!locked)
319 break; 396 break;
320 }
321 397
322 /* Recheck this is a buddy page under lock */ 398 /* Recheck this is a buddy page under lock */
323 if (!PageBuddy(page)) 399 if (!PageBuddy(page))
324 goto isolate_fail; 400 goto isolate_fail;
401 }
325 402
326 /* Found a free page, break it into order-0 pages */ 403 /* Found a free page, break it into order-0 pages */
327 isolated = split_free_page(page); 404 isolated = split_free_page(page);
@@ -346,6 +423,9 @@ isolate_fail:
346 423
347 } 424 }
348 425
426 /* Record how far we have got within the block */
427 *start_pfn = blockpfn;
428
349 trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated); 429 trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated);
350 430
351 /* 431 /*
@@ -361,8 +441,7 @@ isolate_fail:
361 441
362 /* Update the pageblock-skip if the whole pageblock was scanned */ 442 /* Update the pageblock-skip if the whole pageblock was scanned */
363 if (blockpfn == end_pfn) 443 if (blockpfn == end_pfn)
364 update_pageblock_skip(cc, valid_page, total_isolated, true, 444 update_pageblock_skip(cc, valid_page, total_isolated, false);
365 false);
366 445
367 count_compact_events(COMPACTFREE_SCANNED, nr_scanned); 446 count_compact_events(COMPACTFREE_SCANNED, nr_scanned);
368 if (total_isolated) 447 if (total_isolated)
@@ -390,19 +469,21 @@ isolate_freepages_range(struct compact_control *cc,
390 unsigned long isolated, pfn, block_end_pfn; 469 unsigned long isolated, pfn, block_end_pfn;
391 LIST_HEAD(freelist); 470 LIST_HEAD(freelist);
392 471
393 for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) { 472 pfn = start_pfn;
394 if (!pfn_valid(pfn) || cc->zone != page_zone(pfn_to_page(pfn))) 473 block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
395 break; 474
475 for (; pfn < end_pfn; pfn += isolated,
476 block_end_pfn += pageblock_nr_pages) {
477 /* Protect pfn from changing by isolate_freepages_block */
478 unsigned long isolate_start_pfn = pfn;
396 479
397 /*
398 * On subsequent iterations ALIGN() is actually not needed,
399 * but we keep it that we not to complicate the code.
400 */
401 block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
402 block_end_pfn = min(block_end_pfn, end_pfn); 480 block_end_pfn = min(block_end_pfn, end_pfn);
403 481
404 isolated = isolate_freepages_block(cc, pfn, block_end_pfn, 482 if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone))
405 &freelist, true); 483 break;
484
485 isolated = isolate_freepages_block(cc, &isolate_start_pfn,
486 block_end_pfn, &freelist, true);
406 487
407 /* 488 /*
408 * In strict mode, isolate_freepages_block() returns 0 if 489 * In strict mode, isolate_freepages_block() returns 0 if
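
isolate_freepages_range() now walks the pfn range one pageblock at a time, rounding the first (possibly partial) block up with ALIGN and clamping the last one to end_pfn; the reworked isolate_migratepages_range() later in this patch uses the same shape. The loop skeleton can be restated on its own as below; BLOCK, ALIGN_UP and visit() are stand-ins, and the kernel's ALIGN additionally assumes a power-of-two block size.

#include <stdio.h>

#define BLOCK 16UL					/* pretend pageblock_nr_pages */
#define ALIGN_UP(x, a) (((x) + (a) - 1) / (a) * (a))

static void visit(unsigned long start, unsigned long end)
{
	printf("block [%lu, %lu)\n", start, end);	/* stand-in for per-block work */
}

static void walk_blocks(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn = start_pfn;
	unsigned long block_end = ALIGN_UP(pfn + 1, BLOCK);	/* end of first block */

	for (; pfn < end_pfn; pfn = block_end, block_end += BLOCK) {
		if (block_end > end_pfn)
			block_end = end_pfn;		/* last block may be partial */
		visit(pfn, block_end);
	}
}

int main(void)
{
	walk_blocks(5, 70);	/* [5,16), full blocks in between, then [64,70) */
	return 0;
}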
@@ -433,22 +514,19 @@ isolate_freepages_range(struct compact_control *cc,
433} 514}
434 515
435/* Update the number of anon and file isolated pages in the zone */ 516/* Update the number of anon and file isolated pages in the zone */
436static void acct_isolated(struct zone *zone, bool locked, struct compact_control *cc) 517static void acct_isolated(struct zone *zone, struct compact_control *cc)
437{ 518{
438 struct page *page; 519 struct page *page;
439 unsigned int count[2] = { 0, }; 520 unsigned int count[2] = { 0, };
440 521
522 if (list_empty(&cc->migratepages))
523 return;
524
441 list_for_each_entry(page, &cc->migratepages, lru) 525 list_for_each_entry(page, &cc->migratepages, lru)
442 count[!!page_is_file_cache(page)]++; 526 count[!!page_is_file_cache(page)]++;
443 527
444 /* If locked we can use the interrupt unsafe versions */ 528 mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
445 if (locked) { 529 mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
446 __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
447 __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
448 } else {
449 mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
450 mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
451 }
452} 530}
453 531
454/* Similar to reclaim, but different enough that they don't share logic */ 532/* Similar to reclaim, but different enough that they don't share logic */
@@ -467,40 +545,34 @@ static bool too_many_isolated(struct zone *zone)
467} 545}
468 546
469/** 547/**
470 * isolate_migratepages_range() - isolate all migrate-able pages in range. 548 * isolate_migratepages_block() - isolate all migrate-able pages within
471 * @zone: Zone pages are in. 549 * a single pageblock
472 * @cc: Compaction control structure. 550 * @cc: Compaction control structure.
473 * @low_pfn: The first PFN of the range. 551 * @low_pfn: The first PFN to isolate
474 * @end_pfn: The one-past-the-last PFN of the range. 552 * @end_pfn: The one-past-the-last PFN to isolate, within same pageblock
475 * @unevictable: true if it allows to isolate unevictable pages 553 * @isolate_mode: Isolation mode to be used.
476 * 554 *
477 * Isolate all pages that can be migrated from the range specified by 555 * Isolate all pages that can be migrated from the range specified by
478 * [low_pfn, end_pfn). Returns zero if there is a fatal signal 556 * [low_pfn, end_pfn). The range is expected to be within same pageblock.
479 * pending), otherwise PFN of the first page that was not scanned 557 * Returns zero if there is a fatal signal pending, otherwise PFN of the
480 * (which may be both less, equal to or more then end_pfn). 558 * first page that was not scanned (which may be both less, equal to or more
559 * than end_pfn).
481 * 560 *
482 * Assumes that cc->migratepages is empty and cc->nr_migratepages is 561 * The pages are isolated on cc->migratepages list (not required to be empty),
483 * zero. 562 * and cc->nr_migratepages is updated accordingly. The cc->migrate_pfn field
484 * 563 * is neither read nor updated.
485 * Apart from cc->migratepages and cc->nr_migratetypes this function
486 * does not modify any cc's fields, in particular it does not modify
487 * (or read for that matter) cc->migrate_pfn.
488 */ 564 */
489unsigned long 565static unsigned long
490isolate_migratepages_range(struct zone *zone, struct compact_control *cc, 566isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
491 unsigned long low_pfn, unsigned long end_pfn, bool unevictable) 567 unsigned long end_pfn, isolate_mode_t isolate_mode)
492{ 568{
493 unsigned long last_pageblock_nr = 0, pageblock_nr; 569 struct zone *zone = cc->zone;
494 unsigned long nr_scanned = 0, nr_isolated = 0; 570 unsigned long nr_scanned = 0, nr_isolated = 0;
495 struct list_head *migratelist = &cc->migratepages; 571 struct list_head *migratelist = &cc->migratepages;
496 struct lruvec *lruvec; 572 struct lruvec *lruvec;
497 unsigned long flags; 573 unsigned long flags = 0;
498 bool locked = false; 574 bool locked = false;
499 struct page *page = NULL, *valid_page = NULL; 575 struct page *page = NULL, *valid_page = NULL;
500 bool set_unsuitable = true;
501 const isolate_mode_t mode = (cc->mode == MIGRATE_ASYNC ?
502 ISOLATE_ASYNC_MIGRATE : 0) |
503 (unevictable ? ISOLATE_UNEVICTABLE : 0);
504 576
505 /* 577 /*
506 * Ensure that there are not too many pages isolated from the LRU 578 * Ensure that there are not too many pages isolated from the LRU
@@ -523,72 +595,43 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
523 595
524 /* Time to isolate some pages for migration */ 596 /* Time to isolate some pages for migration */
525 for (; low_pfn < end_pfn; low_pfn++) { 597 for (; low_pfn < end_pfn; low_pfn++) {
526 /* give a chance to irqs before checking need_resched() */
527 if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) {
528 if (should_release_lock(&zone->lru_lock)) {
529 spin_unlock_irqrestore(&zone->lru_lock, flags);
530 locked = false;
531 }
532 }
533
534 /* 598 /*
535 * migrate_pfn does not necessarily start aligned to a 599 * Periodically drop the lock (if held) regardless of its
536 * pageblock. Ensure that pfn_valid is called when moving 600 * contention, to give chance to IRQs. Abort async compaction
537 * into a new MAX_ORDER_NR_PAGES range in case of large 601 * if contended.
538 * memory holes within the zone
539 */ 602 */
540 if ((low_pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) { 603 if (!(low_pfn % SWAP_CLUSTER_MAX)
541 if (!pfn_valid(low_pfn)) { 604 && compact_unlock_should_abort(&zone->lru_lock, flags,
542 low_pfn += MAX_ORDER_NR_PAGES - 1; 605 &locked, cc))
543 continue; 606 break;
544 }
545 }
546 607
547 if (!pfn_valid_within(low_pfn)) 608 if (!pfn_valid_within(low_pfn))
548 continue; 609 continue;
549 nr_scanned++; 610 nr_scanned++;
550 611
551 /*
552 * Get the page and ensure the page is within the same zone.
553 * See the comment in isolate_freepages about overlapping
554 * nodes. It is deliberate that the new zone lock is not taken
555 * as memory compaction should not move pages between nodes.
556 */
557 page = pfn_to_page(low_pfn); 612 page = pfn_to_page(low_pfn);
558 if (page_zone(page) != zone)
559 continue;
560 613
561 if (!valid_page) 614 if (!valid_page)
562 valid_page = page; 615 valid_page = page;
563 616
564 /* If isolation recently failed, do not retry */ 617 /*
565 pageblock_nr = low_pfn >> pageblock_order; 618 * Skip if free. We read page order here without zone lock
566 if (last_pageblock_nr != pageblock_nr) { 619 * which is generally unsafe, but the race window is small and
567 int mt; 620 * the worst thing that can happen is that we skip some
568 621 * potential isolation targets.
569 last_pageblock_nr = pageblock_nr; 622 */
570 if (!isolation_suitable(cc, page)) 623 if (PageBuddy(page)) {
571 goto next_pageblock; 624 unsigned long freepage_order = page_order_unsafe(page);
572 625
573 /* 626 /*
574 * For async migration, also only scan in MOVABLE 627 * Without lock, we cannot be sure that what we got is
575 * blocks. Async migration is optimistic to see if 628 * a valid page order. Consider only values in the
576 * the minimum amount of work satisfies the allocation 629 * valid order range to prevent low_pfn overflow.
577 */ 630 */
578 mt = get_pageblock_migratetype(page); 631 if (freepage_order > 0 && freepage_order < MAX_ORDER)
579 if (cc->mode == MIGRATE_ASYNC && 632 low_pfn += (1UL << freepage_order) - 1;
580 !migrate_async_suitable(mt)) {
581 set_unsuitable = false;
582 goto next_pageblock;
583 }
584 }
585
586 /*
587 * Skip if free. page_order cannot be used without zone->lock
588 * as nothing prevents parallel allocations or buddy merging.
589 */
590 if (PageBuddy(page))
591 continue; 633 continue;
634 }
592 635
593 /* 636 /*
594 * Check may be lockless but that's ok as we recheck later. 637 * Check may be lockless but that's ok as we recheck later.
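
In the migration scanner above, a free buddy page lets the scan jump ahead by 1 << order pages, but the order is read without zone->lock and may be stale or torn, so only values strictly between 0 and MAX_ORDER are trusted. A hedged sketch of that clamp follows, with racy_read_order() standing in for page_order_unsafe() and TOY_MAX_ORDER for MAX_ORDER.

#include <stdio.h>

#define TOY_MAX_ORDER 11		/* stand-in for MAX_ORDER */

/* Pretend racy read of a buddy page's order; it could return junk. */
static unsigned long racy_read_order(unsigned long pfn)
{
	return (pfn == 64) ? 4 : 9999;	/* 9999 models a torn/invalid read */
}

/* Advance pfn past a free buddy chunk only if the order looks plausible. */
static unsigned long skip_free_chunk(unsigned long pfn)
{
	unsigned long order = racy_read_order(pfn);

	if (order > 0 && order < TOY_MAX_ORDER)
		pfn += (1UL << order) - 1;	/* the loop's pfn++ supplies the final +1 */
	return pfn;
}

int main(void)
{
	printf("%lu\n", skip_free_chunk(64));	/* 64 + 15 = 79 */
	printf("%lu\n", skip_free_chunk(65));	/* 65: bogus order, no skip */
	return 0;
}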
@@ -597,7 +640,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
597 */ 640 */
598 if (!PageLRU(page)) { 641 if (!PageLRU(page)) {
599 if (unlikely(balloon_page_movable(page))) { 642 if (unlikely(balloon_page_movable(page))) {
600 if (locked && balloon_page_isolate(page)) { 643 if (balloon_page_isolate(page)) {
601 /* Successfully isolated */ 644 /* Successfully isolated */
602 goto isolate_success; 645 goto isolate_success;
603 } 646 }
@@ -617,8 +660,11 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
617 */ 660 */
618 if (PageTransHuge(page)) { 661 if (PageTransHuge(page)) {
619 if (!locked) 662 if (!locked)
620 goto next_pageblock; 663 low_pfn = ALIGN(low_pfn + 1,
621 low_pfn += (1 << compound_order(page)) - 1; 664 pageblock_nr_pages) - 1;
665 else
666 low_pfn += (1 << compound_order(page)) - 1;
667
622 continue; 668 continue;
623 } 669 }
624 670
@@ -631,24 +677,26 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
631 page_count(page) > page_mapcount(page)) 677 page_count(page) > page_mapcount(page))
632 continue; 678 continue;
633 679
634 /* Check if it is ok to still hold the lock */ 680 /* If we already hold the lock, we can skip some rechecking */
635 locked = compact_checklock_irqsave(&zone->lru_lock, &flags, 681 if (!locked) {
636 locked, cc); 682 locked = compact_trylock_irqsave(&zone->lru_lock,
637 if (!locked || fatal_signal_pending(current)) 683 &flags, cc);
638 break; 684 if (!locked)
685 break;
639 686
640 /* Recheck PageLRU and PageTransHuge under lock */ 687 /* Recheck PageLRU and PageTransHuge under lock */
641 if (!PageLRU(page)) 688 if (!PageLRU(page))
642 continue; 689 continue;
643 if (PageTransHuge(page)) { 690 if (PageTransHuge(page)) {
644 low_pfn += (1 << compound_order(page)) - 1; 691 low_pfn += (1 << compound_order(page)) - 1;
645 continue; 692 continue;
693 }
646 } 694 }
647 695
648 lruvec = mem_cgroup_page_lruvec(page, zone); 696 lruvec = mem_cgroup_page_lruvec(page, zone);
649 697
650 /* Try isolate the page */ 698 /* Try isolate the page */
651 if (__isolate_lru_page(page, mode) != 0) 699 if (__isolate_lru_page(page, isolate_mode) != 0)
652 continue; 700 continue;
653 701
654 VM_BUG_ON_PAGE(PageTransCompound(page), page); 702 VM_BUG_ON_PAGE(PageTransCompound(page), page);
@@ -667,14 +715,14 @@ isolate_success:
667 ++low_pfn; 715 ++low_pfn;
668 break; 716 break;
669 } 717 }
670
671 continue;
672
673next_pageblock:
674 low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1;
675 } 718 }
676 719
677 acct_isolated(zone, locked, cc); 720 /*
721 * The PageBuddy() check could have potentially brought us outside
722 * the range to be scanned.
723 */
724 if (unlikely(low_pfn > end_pfn))
725 low_pfn = end_pfn;
678 726
679 if (locked) 727 if (locked)
680 spin_unlock_irqrestore(&zone->lru_lock, flags); 728 spin_unlock_irqrestore(&zone->lru_lock, flags);
@@ -684,8 +732,7 @@ next_pageblock:
684 * if the whole pageblock was scanned without isolating any page. 732 * if the whole pageblock was scanned without isolating any page.
685 */ 733 */
686 if (low_pfn == end_pfn) 734 if (low_pfn == end_pfn)
687 update_pageblock_skip(cc, valid_page, nr_isolated, 735 update_pageblock_skip(cc, valid_page, nr_isolated, true);
688 set_unsuitable, true);
689 736
690 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); 737 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
691 738
@@ -696,17 +743,65 @@ next_pageblock:
696 return low_pfn; 743 return low_pfn;
697} 744}
698 745
746/**
747 * isolate_migratepages_range() - isolate migrate-able pages in a PFN range
748 * @cc: Compaction control structure.
749 * @start_pfn: The first PFN to start isolating.
750 * @end_pfn: The one-past-last PFN.
751 *
752 * Returns zero if isolation fails fatally due to e.g. pending signal.
 753 * Otherwise, the function returns the one-past-the-last PFN of isolated pages
 754 * (which may be greater than end_pfn if end fell in the middle of a THP page).
755 */
756unsigned long
757isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
758 unsigned long end_pfn)
759{
760 unsigned long pfn, block_end_pfn;
761
762 /* Scan block by block. First and last block may be incomplete */
763 pfn = start_pfn;
764 block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
765
766 for (; pfn < end_pfn; pfn = block_end_pfn,
767 block_end_pfn += pageblock_nr_pages) {
768
769 block_end_pfn = min(block_end_pfn, end_pfn);
770
771 if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone))
772 continue;
773
774 pfn = isolate_migratepages_block(cc, pfn, block_end_pfn,
775 ISOLATE_UNEVICTABLE);
776
777 /*
778 * In case of fatal failure, release everything that might
779 * have been isolated in the previous iteration, and signal
780 * the failure back to caller.
781 */
782 if (!pfn) {
783 putback_movable_pages(&cc->migratepages);
784 cc->nr_migratepages = 0;
785 break;
786 }
787 }
788 acct_isolated(cc->zone, cc);
789
790 return pfn;
791}
792
699#endif /* CONFIG_COMPACTION || CONFIG_CMA */ 793#endif /* CONFIG_COMPACTION || CONFIG_CMA */
700#ifdef CONFIG_COMPACTION 794#ifdef CONFIG_COMPACTION
701/* 795/*
702 * Based on information in the current compact_control, find blocks 796 * Based on information in the current compact_control, find blocks
703 * suitable for isolating free pages from and then isolate them. 797 * suitable for isolating free pages from and then isolate them.
704 */ 798 */
705static void isolate_freepages(struct zone *zone, 799static void isolate_freepages(struct compact_control *cc)
706 struct compact_control *cc)
707{ 800{
801 struct zone *zone = cc->zone;
708 struct page *page; 802 struct page *page;
709 unsigned long block_start_pfn; /* start of current pageblock */ 803 unsigned long block_start_pfn; /* start of current pageblock */
804 unsigned long isolate_start_pfn; /* exact pfn we start at */
710 unsigned long block_end_pfn; /* end of current pageblock */ 805 unsigned long block_end_pfn; /* end of current pageblock */
711 unsigned long low_pfn; /* lowest pfn scanner is able to scan */ 806 unsigned long low_pfn; /* lowest pfn scanner is able to scan */
712 int nr_freepages = cc->nr_freepages; 807 int nr_freepages = cc->nr_freepages;
@@ -715,14 +810,15 @@ static void isolate_freepages(struct zone *zone,
715 /* 810 /*
716 * Initialise the free scanner. The starting point is where we last 811 * Initialise the free scanner. The starting point is where we last
717 * successfully isolated from, zone-cached value, or the end of the 812 * successfully isolated from, zone-cached value, or the end of the
718 * zone when isolating for the first time. We need this aligned to 813 * zone when isolating for the first time. For looping we also need
719 * the pageblock boundary, because we do 814 * this pfn aligned down to the pageblock boundary, because we do
720 * block_start_pfn -= pageblock_nr_pages in the for loop. 815 * block_start_pfn -= pageblock_nr_pages in the for loop.
721 * For ending point, take care when isolating in last pageblock of a 816 * For ending point, take care when isolating in last pageblock of a
 722 * zone which ends in the middle of a pageblock. 817 * zone which ends in the middle of a pageblock.
723 * The low boundary is the end of the pageblock the migration scanner 818 * The low boundary is the end of the pageblock the migration scanner
724 * is using. 819 * is using.
725 */ 820 */
821 isolate_start_pfn = cc->free_pfn;
726 block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages-1); 822 block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages-1);
727 block_end_pfn = min(block_start_pfn + pageblock_nr_pages, 823 block_end_pfn = min(block_start_pfn + pageblock_nr_pages,
728 zone_end_pfn(zone)); 824 zone_end_pfn(zone));
@@ -735,7 +831,8 @@ static void isolate_freepages(struct zone *zone,
735 */ 831 */
736 for (; block_start_pfn >= low_pfn && cc->nr_migratepages > nr_freepages; 832 for (; block_start_pfn >= low_pfn && cc->nr_migratepages > nr_freepages;
737 block_end_pfn = block_start_pfn, 833 block_end_pfn = block_start_pfn,
738 block_start_pfn -= pageblock_nr_pages) { 834 block_start_pfn -= pageblock_nr_pages,
835 isolate_start_pfn = block_start_pfn) {
739 unsigned long isolated; 836 unsigned long isolated;
740 837
741 /* 838 /*
@@ -747,18 +844,9 @@ static void isolate_freepages(struct zone *zone,
747 && compact_should_abort(cc)) 844 && compact_should_abort(cc))
748 break; 845 break;
749 846
750 if (!pfn_valid(block_start_pfn)) 847 page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
751 continue; 848 zone);
752 849 if (!page)
753 /*
754 * Check for overlapping nodes/zones. It's possible on some
755 * configurations to have a setup like
756 * node0 node1 node0
757 * i.e. it's possible that all pages within a zones range of
758 * pages do not belong to a single zone.
759 */
760 page = pfn_to_page(block_start_pfn);
761 if (page_zone(page) != zone)
762 continue; 850 continue;
763 851
764 /* Check the block is suitable for migration */ 852 /* Check the block is suitable for migration */
@@ -769,13 +857,25 @@ static void isolate_freepages(struct zone *zone,
769 if (!isolation_suitable(cc, page)) 857 if (!isolation_suitable(cc, page))
770 continue; 858 continue;
771 859
772 /* Found a block suitable for isolating free pages from */ 860 /* Found a block suitable for isolating free pages from. */
773 cc->free_pfn = block_start_pfn; 861 isolated = isolate_freepages_block(cc, &isolate_start_pfn,
774 isolated = isolate_freepages_block(cc, block_start_pfn,
775 block_end_pfn, freelist, false); 862 block_end_pfn, freelist, false);
776 nr_freepages += isolated; 863 nr_freepages += isolated;
777 864
778 /* 865 /*
866 * Remember where the free scanner should restart next time,
867 * which is where isolate_freepages_block() left off.
868 * But if it scanned the whole pageblock, isolate_start_pfn
869 * now points at block_end_pfn, which is the start of the next
870 * pageblock.
871 * In that case we will however want to restart at the start
872 * of the previous pageblock.
873 */
874 cc->free_pfn = (isolate_start_pfn < block_end_pfn) ?
875 isolate_start_pfn :
876 block_start_pfn - pageblock_nr_pages;
877
878 /*
779 * Set a flag that we successfully isolated in this pageblock. 879 * Set a flag that we successfully isolated in this pageblock.
780 * In the next loop iteration, zone->compact_cached_free_pfn 880 * In the next loop iteration, zone->compact_cached_free_pfn
781 * will not be updated and thus it will effectively contain the 881 * will not be updated and thus it will effectively contain the
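
The free scanner above now remembers exactly where isolate_freepages_block() stopped: if the block was only partially scanned it resumes there, and if the whole block was consumed it restarts one pageblock lower, because isolate_start_pfn would otherwise point at the next (higher) block. That decision can be restated as a small pure function; the names and the BLOCK constant are illustrative only.

#include <stdio.h>

#define BLOCK 16UL	/* pretend pageblock_nr_pages */

/*
 * Pick the pfn the free scanner should restart from next time.
 * stopped_at is where isolation left off; block_start/block_end bound the
 * block that was just scanned (all names here are invented).
 */
static unsigned long next_free_pfn(unsigned long stopped_at,
				   unsigned long block_start,
				   unsigned long block_end)
{
	if (stopped_at < block_end)
		return stopped_at;		/* resume mid-block */
	return block_start - BLOCK;		/* whole block done: go one block lower */
}

int main(void)
{
	printf("%lu\n", next_free_pfn(100, 96, 112));	/* 100: partial scan */
	printf("%lu\n", next_free_pfn(112, 96, 112));	/* 80: previous block */
	return 0;
}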
@@ -822,7 +922,7 @@ static struct page *compaction_alloc(struct page *migratepage,
822 */ 922 */
823 if (list_empty(&cc->freepages)) { 923 if (list_empty(&cc->freepages)) {
824 if (!cc->contended) 924 if (!cc->contended)
825 isolate_freepages(cc->zone, cc); 925 isolate_freepages(cc);
826 926
827 if (list_empty(&cc->freepages)) 927 if (list_empty(&cc->freepages))
828 return NULL; 928 return NULL;
@@ -856,38 +956,84 @@ typedef enum {
856} isolate_migrate_t; 956} isolate_migrate_t;
857 957
858/* 958/*
859 * Isolate all pages that can be migrated from the block pointed to by 959 * Isolate all pages that can be migrated from the first suitable block,
860 * the migrate scanner within compact_control. 960 * starting at the block pointed to by the migrate scanner pfn within
961 * compact_control.
861 */ 962 */
862static isolate_migrate_t isolate_migratepages(struct zone *zone, 963static isolate_migrate_t isolate_migratepages(struct zone *zone,
863 struct compact_control *cc) 964 struct compact_control *cc)
864{ 965{
865 unsigned long low_pfn, end_pfn; 966 unsigned long low_pfn, end_pfn;
967 struct page *page;
968 const isolate_mode_t isolate_mode =
969 (cc->mode == MIGRATE_ASYNC ? ISOLATE_ASYNC_MIGRATE : 0);
866 970
867 /* Do not scan outside zone boundaries */ 971 /*
868 low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn); 972 * Start at where we last stopped, or beginning of the zone as
973 * initialized by compact_zone()
974 */
975 low_pfn = cc->migrate_pfn;
869 976
870 /* Only scan within a pageblock boundary */ 977 /* Only scan within a pageblock boundary */
871 end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages); 978 end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages);
872 979
873 /* Do not cross the free scanner or scan within a memory hole */ 980 /*
874 if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) { 981 * Iterate over whole pageblocks until we find the first suitable.
875 cc->migrate_pfn = end_pfn; 982 * Do not cross the free scanner.
876 return ISOLATE_NONE; 983 */
877 } 984 for (; end_pfn <= cc->free_pfn;
985 low_pfn = end_pfn, end_pfn += pageblock_nr_pages) {
878 986
879 /* Perform the isolation */ 987 /*
880 low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn, false); 988 * This can potentially iterate a massively long zone with
881 if (!low_pfn || cc->contended) 989 * many pageblocks unsuitable, so periodically check if we
882 return ISOLATE_ABORT; 990 * need to schedule, or even abort async compaction.
991 */
992 if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
993 && compact_should_abort(cc))
994 break;
995
996 page = pageblock_pfn_to_page(low_pfn, end_pfn, zone);
997 if (!page)
998 continue;
999
1000 /* If isolation recently failed, do not retry */
1001 if (!isolation_suitable(cc, page))
1002 continue;
1003
1004 /*
1005 * For async compaction, also only scan in MOVABLE blocks.
1006 * Async compaction is optimistic to see if the minimum amount
1007 * of work satisfies the allocation.
1008 */
1009 if (cc->mode == MIGRATE_ASYNC &&
1010 !migrate_async_suitable(get_pageblock_migratetype(page)))
1011 continue;
1012
1013 /* Perform the isolation */
1014 low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn,
1015 isolate_mode);
883 1016
1017 if (!low_pfn || cc->contended)
1018 return ISOLATE_ABORT;
1019
1020 /*
1021 * Either we isolated something and proceed with migration. Or
1022 * we failed and compact_zone should decide if we should
1023 * continue or not.
1024 */
1025 break;
1026 }
1027
1028 acct_isolated(zone, cc);
1029 /* Record where migration scanner will be restarted */
884 cc->migrate_pfn = low_pfn; 1030 cc->migrate_pfn = low_pfn;
885 1031
886 return ISOLATE_SUCCESS; 1032 return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
887} 1033}
888 1034
889static int compact_finished(struct zone *zone, 1035static int compact_finished(struct zone *zone, struct compact_control *cc,
890 struct compact_control *cc) 1036 const int migratetype)
891{ 1037{
892 unsigned int order; 1038 unsigned int order;
893 unsigned long watermark; 1039 unsigned long watermark;
@@ -933,7 +1079,7 @@ static int compact_finished(struct zone *zone,
933 struct free_area *area = &zone->free_area[order]; 1079 struct free_area *area = &zone->free_area[order];
934 1080
935 /* Job done if page is free of the right migratetype */ 1081 /* Job done if page is free of the right migratetype */
936 if (!list_empty(&area->free_list[cc->migratetype])) 1082 if (!list_empty(&area->free_list[migratetype]))
937 return COMPACT_PARTIAL; 1083 return COMPACT_PARTIAL;
938 1084
939 /* Job done if allocation would set block type */ 1085 /* Job done if allocation would set block type */
@@ -999,6 +1145,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
999 int ret; 1145 int ret;
1000 unsigned long start_pfn = zone->zone_start_pfn; 1146 unsigned long start_pfn = zone->zone_start_pfn;
1001 unsigned long end_pfn = zone_end_pfn(zone); 1147 unsigned long end_pfn = zone_end_pfn(zone);
1148 const int migratetype = gfpflags_to_migratetype(cc->gfp_mask);
1002 const bool sync = cc->mode != MIGRATE_ASYNC; 1149 const bool sync = cc->mode != MIGRATE_ASYNC;
1003 1150
1004 ret = compaction_suitable(zone, cc->order); 1151 ret = compaction_suitable(zone, cc->order);
@@ -1041,7 +1188,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
1041 1188
1042 migrate_prep_local(); 1189 migrate_prep_local();
1043 1190
1044 while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { 1191 while ((ret = compact_finished(zone, cc, migratetype)) ==
1192 COMPACT_CONTINUE) {
1045 int err; 1193 int err;
1046 1194
1047 switch (isolate_migratepages(zone, cc)) { 1195 switch (isolate_migratepages(zone, cc)) {
@@ -1056,9 +1204,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
1056 ; 1204 ;
1057 } 1205 }
1058 1206
1059 if (!cc->nr_migratepages)
1060 continue;
1061
1062 err = migrate_pages(&cc->migratepages, compaction_alloc, 1207 err = migrate_pages(&cc->migratepages, compaction_alloc,
1063 compaction_free, (unsigned long)cc, cc->mode, 1208 compaction_free, (unsigned long)cc, cc->mode,
1064 MR_COMPACTION); 1209 MR_COMPACTION);
@@ -1092,14 +1237,14 @@ out:
1092} 1237}
1093 1238
1094static unsigned long compact_zone_order(struct zone *zone, int order, 1239static unsigned long compact_zone_order(struct zone *zone, int order,
1095 gfp_t gfp_mask, enum migrate_mode mode, bool *contended) 1240 gfp_t gfp_mask, enum migrate_mode mode, int *contended)
1096{ 1241{
1097 unsigned long ret; 1242 unsigned long ret;
1098 struct compact_control cc = { 1243 struct compact_control cc = {
1099 .nr_freepages = 0, 1244 .nr_freepages = 0,
1100 .nr_migratepages = 0, 1245 .nr_migratepages = 0,
1101 .order = order, 1246 .order = order,
1102 .migratetype = allocflags_to_migratetype(gfp_mask), 1247 .gfp_mask = gfp_mask,
1103 .zone = zone, 1248 .zone = zone,
1104 .mode = mode, 1249 .mode = mode,
1105 }; 1250 };
@@ -1124,48 +1269,117 @@ int sysctl_extfrag_threshold = 500;
1124 * @gfp_mask: The GFP mask of the current allocation 1269 * @gfp_mask: The GFP mask of the current allocation
1125 * @nodemask: The allowed nodes to allocate from 1270 * @nodemask: The allowed nodes to allocate from
1126 * @mode: The migration mode for async, sync light, or sync migration 1271 * @mode: The migration mode for async, sync light, or sync migration
1127 * @contended: Return value that is true if compaction was aborted due to lock contention 1272 * @contended: Return value that determines if compaction was aborted due to
1128 * @page: Optionally capture a free page of the requested order during compaction 1273 * need_resched() or lock contention
1274 * @candidate_zone: Return the zone where we think allocation should succeed
1129 * 1275 *
1130 * This is the main entry point for direct page compaction. 1276 * This is the main entry point for direct page compaction.
1131 */ 1277 */
1132unsigned long try_to_compact_pages(struct zonelist *zonelist, 1278unsigned long try_to_compact_pages(struct zonelist *zonelist,
1133 int order, gfp_t gfp_mask, nodemask_t *nodemask, 1279 int order, gfp_t gfp_mask, nodemask_t *nodemask,
1134 enum migrate_mode mode, bool *contended) 1280 enum migrate_mode mode, int *contended,
1281 struct zone **candidate_zone)
1135{ 1282{
1136 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 1283 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
1137 int may_enter_fs = gfp_mask & __GFP_FS; 1284 int may_enter_fs = gfp_mask & __GFP_FS;
1138 int may_perform_io = gfp_mask & __GFP_IO; 1285 int may_perform_io = gfp_mask & __GFP_IO;
1139 struct zoneref *z; 1286 struct zoneref *z;
1140 struct zone *zone; 1287 struct zone *zone;
1141 int rc = COMPACT_SKIPPED; 1288 int rc = COMPACT_DEFERRED;
1142 int alloc_flags = 0; 1289 int alloc_flags = 0;
1290 int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */
1291
1292 *contended = COMPACT_CONTENDED_NONE;
1143 1293
1144 /* Check if the GFP flags allow compaction */ 1294 /* Check if the GFP flags allow compaction */
1145 if (!order || !may_enter_fs || !may_perform_io) 1295 if (!order || !may_enter_fs || !may_perform_io)
1146 return rc; 1296 return COMPACT_SKIPPED;
1147
1148 count_compact_event(COMPACTSTALL);
1149 1297
1150#ifdef CONFIG_CMA 1298#ifdef CONFIG_CMA
1151 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) 1299 if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
1152 alloc_flags |= ALLOC_CMA; 1300 alloc_flags |= ALLOC_CMA;
1153#endif 1301#endif
1154 /* Compact each zone in the list */ 1302 /* Compact each zone in the list */
1155 for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, 1303 for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
1156 nodemask) { 1304 nodemask) {
1157 int status; 1305 int status;
1306 int zone_contended;
1307
1308 if (compaction_deferred(zone, order))
1309 continue;
1158 1310
1159 status = compact_zone_order(zone, order, gfp_mask, mode, 1311 status = compact_zone_order(zone, order, gfp_mask, mode,
1160 contended); 1312 &zone_contended);
1161 rc = max(status, rc); 1313 rc = max(status, rc);
1314 /*
1315 * It takes at least one zone that wasn't lock contended
1316 * to clear all_zones_contended.
1317 */
1318 all_zones_contended &= zone_contended;
1162 1319
1163 /* If a normal allocation would succeed, stop compacting */ 1320 /* If a normal allocation would succeed, stop compacting */
1164 if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 1321 if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0,
1165 alloc_flags)) 1322 alloc_flags)) {
1166 break; 1323 *candidate_zone = zone;
1324 /*
1325 * We think the allocation will succeed in this zone,
1326 * but it is not certain, hence the false. The caller
1327 * will repeat this with true if allocation indeed
1328 * succeeds in this zone.
1329 */
1330 compaction_defer_reset(zone, order, false);
1331 /*
1332 * It is possible that async compaction aborted due to
1333 * need_resched() and the watermarks were ok thanks to
1334 * somebody else freeing memory. The allocation can
1335 * however still fail so we better signal the
1336 * need_resched() contention anyway (this will not
1337 * prevent the allocation attempt).
1338 */
1339 if (zone_contended == COMPACT_CONTENDED_SCHED)
1340 *contended = COMPACT_CONTENDED_SCHED;
1341
1342 goto break_loop;
1343 }
1344
1345 if (mode != MIGRATE_ASYNC) {
1346 /*
1347 * We think that allocation won't succeed in this zone
1348 * so we defer compaction there. If it ends up
1349 * succeeding after all, it will be reset.
1350 */
1351 defer_compaction(zone, order);
1352 }
1353
1354 /*
1355 * We might have stopped compacting due to need_resched() in
1356 * async compaction, or due to a fatal signal detected. In that
1357 * case do not try further zones and signal need_resched()
1358 * contention.
1359 */
1360 if ((zone_contended == COMPACT_CONTENDED_SCHED)
1361 || fatal_signal_pending(current)) {
1362 *contended = COMPACT_CONTENDED_SCHED;
1363 goto break_loop;
1364 }
1365
1366 continue;
1367break_loop:
1368 /*
1369 * We might not have tried all the zones, so be conservative
1370 * and assume they are not all lock contended.
1371 */
1372 all_zones_contended = 0;
1373 break;
1167 } 1374 }
1168 1375
1376 /*
1377 * If at least one zone wasn't deferred or skipped, we report if all
1378 * zones that were tried were lock contended.
1379 */
1380 if (rc > COMPACT_SKIPPED && all_zones_contended)
1381 *contended = COMPACT_CONTENDED_LOCK;
1382
1169 return rc; 1383 return rc;
1170} 1384}
1171 1385
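
try_to_compact_pages() above reduces the per-zone results with all_zones_contended &= zone_contended, so a single zone that was not lock contended clears the aggregate, and only then is lock contention reported to the caller. A toy version of that AND-reduction is below; the enum values are placeholders chosen so the arithmetic mirrors the reduction, not the real COMPACT_CONTENDED_* constants.

#include <stdio.h>

enum toy_contended {
	TOY_NONE  = 0,
	TOY_SCHED = 1,
	TOY_LOCK  = 2,
};

/* AND-reduce per-zone results; any zone that wasn't lock-contended clears it. */
static int all_lock_contended(const int *zone_status, int nr_zones)
{
	int all = TOY_LOCK;

	for (int i = 0; i < nr_zones; i++)
		all &= zone_status[i];
	return all == TOY_LOCK;
}

int main(void)
{
	int a[] = { TOY_LOCK, TOY_LOCK, TOY_LOCK };
	int b[] = { TOY_LOCK, TOY_NONE, TOY_LOCK };

	printf("%d\n", all_lock_contended(a, 3));	/* 1 */
	printf("%d\n", all_lock_contended(b, 3));	/* 0 */
	return 0;
}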
diff --git a/mm/debug.c b/mm/debug.c
new file mode 100644
index 000000000000..5ce45c9a29b5
--- /dev/null
+++ b/mm/debug.c
@@ -0,0 +1,237 @@
1/*
2 * mm/debug.c
3 *
4 * mm/ specific debug routines.
5 *
6 */
7
8#include <linux/kernel.h>
9#include <linux/mm.h>
10#include <linux/ftrace_event.h>
11#include <linux/memcontrol.h>
12
13static const struct trace_print_flags pageflag_names[] = {
14 {1UL << PG_locked, "locked" },
15 {1UL << PG_error, "error" },
16 {1UL << PG_referenced, "referenced" },
17 {1UL << PG_uptodate, "uptodate" },
18 {1UL << PG_dirty, "dirty" },
19 {1UL << PG_lru, "lru" },
20 {1UL << PG_active, "active" },
21 {1UL << PG_slab, "slab" },
22 {1UL << PG_owner_priv_1, "owner_priv_1" },
23 {1UL << PG_arch_1, "arch_1" },
24 {1UL << PG_reserved, "reserved" },
25 {1UL << PG_private, "private" },
26 {1UL << PG_private_2, "private_2" },
27 {1UL << PG_writeback, "writeback" },
28#ifdef CONFIG_PAGEFLAGS_EXTENDED
29 {1UL << PG_head, "head" },
30 {1UL << PG_tail, "tail" },
31#else
32 {1UL << PG_compound, "compound" },
33#endif
34 {1UL << PG_swapcache, "swapcache" },
35 {1UL << PG_mappedtodisk, "mappedtodisk" },
36 {1UL << PG_reclaim, "reclaim" },
37 {1UL << PG_swapbacked, "swapbacked" },
38 {1UL << PG_unevictable, "unevictable" },
39#ifdef CONFIG_MMU
40 {1UL << PG_mlocked, "mlocked" },
41#endif
42#ifdef CONFIG_ARCH_USES_PG_UNCACHED
43 {1UL << PG_uncached, "uncached" },
44#endif
45#ifdef CONFIG_MEMORY_FAILURE
46 {1UL << PG_hwpoison, "hwpoison" },
47#endif
48#ifdef CONFIG_TRANSPARENT_HUGEPAGE
49 {1UL << PG_compound_lock, "compound_lock" },
50#endif
51};
52
53static void dump_flags(unsigned long flags,
54 const struct trace_print_flags *names, int count)
55{
56 const char *delim = "";
57 unsigned long mask;
58 int i;
59
60 pr_emerg("flags: %#lx(", flags);
61
62 /* remove zone id */
63 flags &= (1UL << NR_PAGEFLAGS) - 1;
64
65 for (i = 0; i < count && flags; i++) {
66
67 mask = names[i].mask;
68 if ((flags & mask) != mask)
69 continue;
70
71 flags &= ~mask;
72 pr_cont("%s%s", delim, names[i].name);
73 delim = "|";
74 }
75
76 /* check for left over flags */
77 if (flags)
78 pr_cont("%s%#lx", delim, flags);
79
80 pr_cont(")\n");
81}
82
83void dump_page_badflags(struct page *page, const char *reason,
84 unsigned long badflags)
85{
86 pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
87 page, atomic_read(&page->_count), page_mapcount(page),
88 page->mapping, page->index);
89 BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
90 dump_flags(page->flags, pageflag_names, ARRAY_SIZE(pageflag_names));
91 if (reason)
92 pr_alert("page dumped because: %s\n", reason);
93 if (page->flags & badflags) {
94 pr_alert("bad because of flags:\n");
95 dump_flags(page->flags & badflags,
96 pageflag_names, ARRAY_SIZE(pageflag_names));
97 }
98 mem_cgroup_print_bad_page(page);
99}
100
101void dump_page(struct page *page, const char *reason)
102{
103 dump_page_badflags(page, reason, 0);
104}
105EXPORT_SYMBOL(dump_page);
106
107#ifdef CONFIG_DEBUG_VM
108
109static const struct trace_print_flags vmaflags_names[] = {
110 {VM_READ, "read" },
111 {VM_WRITE, "write" },
112 {VM_EXEC, "exec" },
113 {VM_SHARED, "shared" },
114 {VM_MAYREAD, "mayread" },
115 {VM_MAYWRITE, "maywrite" },
116 {VM_MAYEXEC, "mayexec" },
117 {VM_MAYSHARE, "mayshare" },
118 {VM_GROWSDOWN, "growsdown" },
119 {VM_PFNMAP, "pfnmap" },
120 {VM_DENYWRITE, "denywrite" },
121 {VM_LOCKED, "locked" },
122 {VM_IO, "io" },
123 {VM_SEQ_READ, "seqread" },
124 {VM_RAND_READ, "randread" },
125 {VM_DONTCOPY, "dontcopy" },
126 {VM_DONTEXPAND, "dontexpand" },
127 {VM_ACCOUNT, "account" },
128 {VM_NORESERVE, "noreserve" },
129 {VM_HUGETLB, "hugetlb" },
130 {VM_NONLINEAR, "nonlinear" },
131#if defined(CONFIG_X86)
132 {VM_PAT, "pat" },
133#elif defined(CONFIG_PPC)
134 {VM_SAO, "sao" },
135#elif defined(CONFIG_PARISC) || defined(CONFIG_METAG) || defined(CONFIG_IA64)
136 {VM_GROWSUP, "growsup" },
137#elif !defined(CONFIG_MMU)
138 {VM_MAPPED_COPY, "mappedcopy" },
139#else
140 {VM_ARCH_1, "arch_1" },
141#endif
142 {VM_DONTDUMP, "dontdump" },
143#ifdef CONFIG_MEM_SOFT_DIRTY
144 {VM_SOFTDIRTY, "softdirty" },
145#endif
146 {VM_MIXEDMAP, "mixedmap" },
147 {VM_HUGEPAGE, "hugepage" },
148 {VM_NOHUGEPAGE, "nohugepage" },
149 {VM_MERGEABLE, "mergeable" },
150};
151
152void dump_vma(const struct vm_area_struct *vma)
153{
154 pr_emerg("vma %p start %p end %p\n"
155 "next %p prev %p mm %p\n"
156 "prot %lx anon_vma %p vm_ops %p\n"
157 "pgoff %lx file %p private_data %p\n",
158 vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_next,
159 vma->vm_prev, vma->vm_mm,
160 (unsigned long)pgprot_val(vma->vm_page_prot),
161 vma->anon_vma, vma->vm_ops, vma->vm_pgoff,
162 vma->vm_file, vma->vm_private_data);
163 dump_flags(vma->vm_flags, vmaflags_names, ARRAY_SIZE(vmaflags_names));
164}
165EXPORT_SYMBOL(dump_vma);
166
167void dump_mm(const struct mm_struct *mm)
168{
169 pr_emerg("mm %p mmap %p seqnum %d task_size %lu\n"
170#ifdef CONFIG_MMU
171 "get_unmapped_area %p\n"
172#endif
173 "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n"
174 "pgd %p mm_users %d mm_count %d nr_ptes %lu map_count %d\n"
175 "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n"
176 "pinned_vm %lx shared_vm %lx exec_vm %lx stack_vm %lx\n"
177 "start_code %lx end_code %lx start_data %lx end_data %lx\n"
178 "start_brk %lx brk %lx start_stack %lx\n"
179 "arg_start %lx arg_end %lx env_start %lx env_end %lx\n"
180 "binfmt %p flags %lx core_state %p\n"
181#ifdef CONFIG_AIO
182 "ioctx_table %p\n"
183#endif
184#ifdef CONFIG_MEMCG
185 "owner %p "
186#endif
187 "exe_file %p\n"
188#ifdef CONFIG_MMU_NOTIFIER
189 "mmu_notifier_mm %p\n"
190#endif
191#ifdef CONFIG_NUMA_BALANCING
192 "numa_next_scan %lu numa_scan_offset %lu numa_scan_seq %d\n"
193#endif
194#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
195 "tlb_flush_pending %d\n"
196#endif
197 "%s", /* This is here to hold the comma */
198
199 mm, mm->mmap, mm->vmacache_seqnum, mm->task_size,
200#ifdef CONFIG_MMU
201 mm->get_unmapped_area,
202#endif
203 mm->mmap_base, mm->mmap_legacy_base, mm->highest_vm_end,
204 mm->pgd, atomic_read(&mm->mm_users),
205 atomic_read(&mm->mm_count),
206 atomic_long_read((atomic_long_t *)&mm->nr_ptes),
207 mm->map_count,
208 mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm,
209 mm->pinned_vm, mm->shared_vm, mm->exec_vm, mm->stack_vm,
210 mm->start_code, mm->end_code, mm->start_data, mm->end_data,
211 mm->start_brk, mm->brk, mm->start_stack,
212 mm->arg_start, mm->arg_end, mm->env_start, mm->env_end,
213 mm->binfmt, mm->flags, mm->core_state,
214#ifdef CONFIG_AIO
215 mm->ioctx_table,
216#endif
217#ifdef CONFIG_MEMCG
218 mm->owner,
219#endif
220 mm->exe_file,
221#ifdef CONFIG_MMU_NOTIFIER
222 mm->mmu_notifier_mm,
223#endif
224#ifdef CONFIG_NUMA_BALANCING
225 mm->numa_next_scan, mm->numa_scan_offset, mm->numa_scan_seq,
226#endif
227#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
228 mm->tlb_flush_pending,
229#endif
230 "" /* This is here to not have a comma! */
231 );
232
233 dump_flags(mm->def_flags, vmaflags_names,
234 ARRAY_SIZE(vmaflags_names));
235}
236
237#endif /* CONFIG_DEBUG_VM */
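
The dump_vma()/dump_mm() helpers above are the back end for the VM_BUG_ON_VMA()/VM_BUG_ON_MM() conversions seen in the later hunks of this series. As a rough sketch of how the two halves are expected to fit together (the real macro definitions live in include/linux/mmdebug.h and may differ in detail; this is illustrative, not quoted from the patch):

/*
 * Illustrative sketch of include/linux/mmdebug.h wiring: with
 * CONFIG_DEBUG_VM, a failing assertion dumps the whole VMA/mm state
 * (including the decoded vm_flags via dump_flags()) before BUG().
 * dump_vma()/dump_mm() are declared alongside these macros.
 */
#ifdef CONFIG_DEBUG_VM
#define VM_BUG_ON_VMA(cond, vma)					\
	do {								\
		if (unlikely(cond)) {					\
			dump_vma(vma);					\
			BUG();						\
		}							\
	} while (0)

#define VM_BUG_ON_MM(cond, mm)						\
	do {								\
		if (unlikely(cond)) {					\
			dump_mm(mm);					\
			BUG();						\
		}							\
	} while (0)
#else
#define VM_BUG_ON_VMA(cond, vma)	BUILD_BUG_ON_INVALID(cond)
#define VM_BUG_ON_MM(cond, mm)		BUILD_BUG_ON_INVALID(cond)
#endif

This is why the conversions below can replace bare VM_BUG_ON(cond) calls on VMAs and mm_structs with the typed variants: the crash report then carries the relevant object state instead of just the failed condition.
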
diff --git a/mm/dmapool.c b/mm/dmapool.c
index ba8019b063e1..fd5fe4342e93 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -62,6 +62,7 @@ struct dma_page { /* cacheable header for 'allocation' bytes */
62}; 62};
63 63
64static DEFINE_MUTEX(pools_lock); 64static DEFINE_MUTEX(pools_lock);
65static DEFINE_MUTEX(pools_reg_lock);
65 66
66static ssize_t 67static ssize_t
67show_pools(struct device *dev, struct device_attribute *attr, char *buf) 68show_pools(struct device *dev, struct device_attribute *attr, char *buf)
@@ -132,29 +133,27 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
132{ 133{
133 struct dma_pool *retval; 134 struct dma_pool *retval;
134 size_t allocation; 135 size_t allocation;
136 bool empty = false;
135 137
136 if (align == 0) { 138 if (align == 0)
137 align = 1; 139 align = 1;
138 } else if (align & (align - 1)) { 140 else if (align & (align - 1))
139 return NULL; 141 return NULL;
140 }
141 142
142 if (size == 0) { 143 if (size == 0)
143 return NULL; 144 return NULL;
144 } else if (size < 4) { 145 else if (size < 4)
145 size = 4; 146 size = 4;
146 }
147 147
148 if ((size % align) != 0) 148 if ((size % align) != 0)
149 size = ALIGN(size, align); 149 size = ALIGN(size, align);
150 150
151 allocation = max_t(size_t, size, PAGE_SIZE); 151 allocation = max_t(size_t, size, PAGE_SIZE);
152 152
153 if (!boundary) { 153 if (!boundary)
154 boundary = allocation; 154 boundary = allocation;
155 } else if ((boundary < size) || (boundary & (boundary - 1))) { 155 else if ((boundary < size) || (boundary & (boundary - 1)))
156 return NULL; 156 return NULL;
157 }
158 157
159 retval = kmalloc_node(sizeof(*retval), GFP_KERNEL, dev_to_node(dev)); 158 retval = kmalloc_node(sizeof(*retval), GFP_KERNEL, dev_to_node(dev));
160 if (!retval) 159 if (!retval)
@@ -172,15 +171,34 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
172 171
173 INIT_LIST_HEAD(&retval->pools); 172 INIT_LIST_HEAD(&retval->pools);
174 173
174 /*
175 * pools_lock ensures that the ->dma_pools list does not get corrupted.
176 * pools_reg_lock ensures that there is not a race between
177 * dma_pool_create() and dma_pool_destroy() or within dma_pool_create()
178 * when the first invocation of dma_pool_create() failed on
179 * device_create_file() and the second assumes that it has been done (I
180 * know it is a short window).
181 */
182 mutex_lock(&pools_reg_lock);
175 mutex_lock(&pools_lock); 183 mutex_lock(&pools_lock);
176 if (list_empty(&dev->dma_pools) && 184 if (list_empty(&dev->dma_pools))
177 device_create_file(dev, &dev_attr_pools)) { 185 empty = true;
178 kfree(retval); 186 list_add(&retval->pools, &dev->dma_pools);
179 retval = NULL;
180 } else
181 list_add(&retval->pools, &dev->dma_pools);
182 mutex_unlock(&pools_lock); 187 mutex_unlock(&pools_lock);
183 188 if (empty) {
189 int err;
190
191 err = device_create_file(dev, &dev_attr_pools);
192 if (err) {
193 mutex_lock(&pools_lock);
194 list_del(&retval->pools);
195 mutex_unlock(&pools_lock);
196 mutex_unlock(&pools_reg_lock);
197 kfree(retval);
198 return NULL;
199 }
200 }
201 mutex_unlock(&pools_reg_lock);
184 return retval; 202 return retval;
185} 203}
186EXPORT_SYMBOL(dma_pool_create); 204EXPORT_SYMBOL(dma_pool_create);
@@ -251,11 +269,17 @@ static void pool_free_page(struct dma_pool *pool, struct dma_page *page)
251 */ 269 */
252void dma_pool_destroy(struct dma_pool *pool) 270void dma_pool_destroy(struct dma_pool *pool)
253{ 271{
272 bool empty = false;
273
274 mutex_lock(&pools_reg_lock);
254 mutex_lock(&pools_lock); 275 mutex_lock(&pools_lock);
255 list_del(&pool->pools); 276 list_del(&pool->pools);
256 if (pool->dev && list_empty(&pool->dev->dma_pools)) 277 if (pool->dev && list_empty(&pool->dev->dma_pools))
257 device_remove_file(pool->dev, &dev_attr_pools); 278 empty = true;
258 mutex_unlock(&pools_lock); 279 mutex_unlock(&pools_lock);
280 if (empty)
281 device_remove_file(pool->dev, &dev_attr_pools);
282 mutex_unlock(&pools_reg_lock);
259 283
260 while (!list_empty(&pool->page_list)) { 284 while (!list_empty(&pool->page_list)) {
261 struct dma_page *page; 285 struct dma_page *page;
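
The race being closed above only matters on a device's first pool creation and last pool destruction, which register and remove the "pools" sysfs attribute. For context, a minimal driver-side use of the dma_pool API looks like the following sketch; the device pointer, pool name and sizes are placeholders, not taken from this patch:

#include <linux/dmapool.h>
#include <linux/device.h>

static int example_setup(struct device *dev)
{
	struct dma_pool *desc_pool;	/* hypothetical descriptor pool */
	void *vaddr;
	dma_addr_t dma;

	/* 64-byte descriptors, 64-byte aligned, no boundary-crossing rule */
	desc_pool = dma_pool_create("example-desc", dev, 64, 64, 0);
	if (!desc_pool)
		return -ENOMEM;

	vaddr = dma_pool_alloc(desc_pool, GFP_KERNEL, &dma);
	if (!vaddr) {
		dma_pool_destroy(desc_pool);
		return -ENOMEM;
	}

	/* ... program the hardware with 'dma', touch 'vaddr' from the CPU ... */

	dma_pool_free(desc_pool, vaddr, dma);
	dma_pool_destroy(desc_pool);
	return 0;
}

Concurrent callers of this kind on the same device are what pools_reg_lock serializes: device_create_file() now runs outside pools_lock, and a failure can be unwound (list_del + kfree) without ever calling sysfs code under the list lock.
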
diff --git a/mm/filemap.c b/mm/filemap.c
index 0ab0a3ea5721..14b4642279f1 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1753,7 +1753,7 @@ EXPORT_SYMBOL(generic_file_read_iter);
1753static int page_cache_read(struct file *file, pgoff_t offset) 1753static int page_cache_read(struct file *file, pgoff_t offset)
1754{ 1754{
1755 struct address_space *mapping = file->f_mapping; 1755 struct address_space *mapping = file->f_mapping;
1756 struct page *page; 1756 struct page *page;
1757 int ret; 1757 int ret;
1758 1758
1759 do { 1759 do {
@@ -1770,7 +1770,7 @@ static int page_cache_read(struct file *file, pgoff_t offset)
1770 page_cache_release(page); 1770 page_cache_release(page);
1771 1771
1772 } while (ret == AOP_TRUNCATED_PAGE); 1772 } while (ret == AOP_TRUNCATED_PAGE);
1773 1773
1774 return ret; 1774 return ret;
1775} 1775}
1776 1776
diff --git a/mm/gup.c b/mm/gup.c
index af7ea3e0826b..cd62c8c90d4a 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -10,6 +10,10 @@
10#include <linux/swap.h> 10#include <linux/swap.h>
11#include <linux/swapops.h> 11#include <linux/swapops.h>
12 12
13#include <linux/sched.h>
14#include <linux/rwsem.h>
15#include <asm/pgtable.h>
16
13#include "internal.h" 17#include "internal.h"
14 18
15static struct page *no_page_table(struct vm_area_struct *vma, 19static struct page *no_page_table(struct vm_area_struct *vma,
@@ -676,3 +680,353 @@ struct page *get_dump_page(unsigned long addr)
676 return page; 680 return page;
677} 681}
678#endif /* CONFIG_ELF_CORE */ 682#endif /* CONFIG_ELF_CORE */
683
684/*
685 * Generic RCU Fast GUP
686 *
687 * get_user_pages_fast attempts to pin user pages by walking the page
688 * tables directly and avoids taking locks. Thus the walker needs to be
689 * protected from page table pages being freed from under it, and should
690 * block any THP splits.
691 *
692 * One way to achieve this is to have the walker disable interrupts, and
693 * rely on IPIs from the TLB flushing code blocking before the page table
694 * pages are freed. This is unsuitable for architectures that do not need
695 * to broadcast an IPI when invalidating TLBs.
696 *
697 * Another way to achieve this is to batch up page table containing pages
698 * belonging to more than one mm_user, then rcu_sched a callback to free those
699 * pages. Disabling interrupts will allow the fast_gup walker to both block
700 * the rcu_sched callback, and an IPI that we broadcast for splitting THPs
701 * (which is a relatively rare event). The code below adopts this strategy.
702 *
703 * Before activating this code, please be aware that the following assumptions
704 * are currently made:
705 *
706 * *) HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table is used to free
707 * pages containing page tables.
708 *
709 * *) THP splits will broadcast an IPI, this can be achieved by overriding
710 * pmdp_splitting_flush.
711 *
712 * *) ptes can be read atomically by the architecture.
713 *
714 * *) access_ok is sufficient to validate userspace address ranges.
715 *
716 * The last two assumptions can be relaxed by the addition of helper functions.
717 *
718 * This code is based heavily on the PowerPC implementation by Nick Piggin.
719 */
720#ifdef CONFIG_HAVE_GENERIC_RCU_GUP
721
722#ifdef __HAVE_ARCH_PTE_SPECIAL
723static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
724 int write, struct page **pages, int *nr)
725{
726 pte_t *ptep, *ptem;
727 int ret = 0;
728
729 ptem = ptep = pte_offset_map(&pmd, addr);
730 do {
731 /*
732 * In the line below we are assuming that the pte can be read
733 * atomically. If this is not the case for your architecture,
734 * please wrap this in a helper function!
735 *
736 * for an example see gup_get_pte in arch/x86/mm/gup.c
737 */
738 pte_t pte = ACCESS_ONCE(*ptep);
739 struct page *page;
740
741 /*
742 * Similar to the PMD case below, NUMA hinting must take slow
743 * path
744 */
745 if (!pte_present(pte) || pte_special(pte) ||
746 pte_numa(pte) || (write && !pte_write(pte)))
747 goto pte_unmap;
748
749 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
750 page = pte_page(pte);
751
752 if (!page_cache_get_speculative(page))
753 goto pte_unmap;
754
755 if (unlikely(pte_val(pte) != pte_val(*ptep))) {
756 put_page(page);
757 goto pte_unmap;
758 }
759
760 pages[*nr] = page;
761 (*nr)++;
762
763 } while (ptep++, addr += PAGE_SIZE, addr != end);
764
765 ret = 1;
766
767pte_unmap:
768 pte_unmap(ptem);
769 return ret;
770}
771#else
772
773/*
774 * If we can't determine whether or not a pte is special, then fail immediately
775 * for ptes. Note, we can still pin HugeTLB and THP as these are guaranteed not
776 * to be special.
777 *
778 * For a futex to be placed on a THP tail page, get_futex_key requires a
779 * __get_user_pages_fast implementation that can pin pages. Thus it's still
780 * useful to have gup_huge_pmd even if we can't operate on ptes.
781 */
782static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
783 int write, struct page **pages, int *nr)
784{
785 return 0;
786}
787#endif /* __HAVE_ARCH_PTE_SPECIAL */
788
789static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
790 unsigned long end, int write, struct page **pages, int *nr)
791{
792 struct page *head, *page, *tail;
793 int refs;
794
795 if (write && !pmd_write(orig))
796 return 0;
797
798 refs = 0;
799 head = pmd_page(orig);
800 page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
801 tail = page;
802 do {
803 VM_BUG_ON_PAGE(compound_head(page) != head, page);
804 pages[*nr] = page;
805 (*nr)++;
806 page++;
807 refs++;
808 } while (addr += PAGE_SIZE, addr != end);
809
810 if (!page_cache_add_speculative(head, refs)) {
811 *nr -= refs;
812 return 0;
813 }
814
815 if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
816 *nr -= refs;
817 while (refs--)
818 put_page(head);
819 return 0;
820 }
821
822 /*
823 * Any tail pages need their mapcount reference taken before we
824 * return. (This allows the THP code to bump their ref count when
825 * they are split into base pages).
826 */
827 while (refs--) {
828 if (PageTail(tail))
829 get_huge_page_tail(tail);
830 tail++;
831 }
832
833 return 1;
834}
835
836static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
837 unsigned long end, int write, struct page **pages, int *nr)
838{
839 struct page *head, *page, *tail;
840 int refs;
841
842 if (write && !pud_write(orig))
843 return 0;
844
845 refs = 0;
846 head = pud_page(orig);
847 page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
848 tail = page;
849 do {
850 VM_BUG_ON_PAGE(compound_head(page) != head, page);
851 pages[*nr] = page;
852 (*nr)++;
853 page++;
854 refs++;
855 } while (addr += PAGE_SIZE, addr != end);
856
857 if (!page_cache_add_speculative(head, refs)) {
858 *nr -= refs;
859 return 0;
860 }
861
862 if (unlikely(pud_val(orig) != pud_val(*pudp))) {
863 *nr -= refs;
864 while (refs--)
865 put_page(head);
866 return 0;
867 }
868
869 while (refs--) {
870 if (PageTail(tail))
871 get_huge_page_tail(tail);
872 tail++;
873 }
874
875 return 1;
876}
877
878static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
879 int write, struct page **pages, int *nr)
880{
881 unsigned long next;
882 pmd_t *pmdp;
883
884 pmdp = pmd_offset(&pud, addr);
885 do {
886 pmd_t pmd = ACCESS_ONCE(*pmdp);
887
888 next = pmd_addr_end(addr, end);
889 if (pmd_none(pmd) || pmd_trans_splitting(pmd))
890 return 0;
891
892 if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) {
893 /*
894 * NUMA hinting faults need to be handled in the GUP
895 * slowpath for accounting purposes and so that they
896 * can be serialised against THP migration.
897 */
898 if (pmd_numa(pmd))
899 return 0;
900
901 if (!gup_huge_pmd(pmd, pmdp, addr, next, write,
902 pages, nr))
903 return 0;
904
905 } else if (!gup_pte_range(pmd, addr, next, write, pages, nr))
906 return 0;
907 } while (pmdp++, addr = next, addr != end);
908
909 return 1;
910}
911
912static int gup_pud_range(pgd_t *pgdp, unsigned long addr, unsigned long end,
913 int write, struct page **pages, int *nr)
914{
915 unsigned long next;
916 pud_t *pudp;
917
918 pudp = pud_offset(pgdp, addr);
919 do {
920 pud_t pud = ACCESS_ONCE(*pudp);
921
922 next = pud_addr_end(addr, end);
923 if (pud_none(pud))
924 return 0;
925 if (pud_huge(pud)) {
926 if (!gup_huge_pud(pud, pudp, addr, next, write,
927 pages, nr))
928 return 0;
929 } else if (!gup_pmd_range(pud, addr, next, write, pages, nr))
930 return 0;
931 } while (pudp++, addr = next, addr != end);
932
933 return 1;
934}
935
936/*
937 * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to
938 * the regular GUP. It will only return non-negative values.
939 */
940int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
941 struct page **pages)
942{
943 struct mm_struct *mm = current->mm;
944 unsigned long addr, len, end;
945 unsigned long next, flags;
946 pgd_t *pgdp;
947 int nr = 0;
948
949 start &= PAGE_MASK;
950 addr = start;
951 len = (unsigned long) nr_pages << PAGE_SHIFT;
952 end = start + len;
953
954 if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
955 start, len)))
956 return 0;
957
958 /*
959 * Disable interrupts. We use the nested form as we can already have
960 * interrupts disabled by get_futex_key.
961 *
962 * With interrupts disabled, we block page table pages from being
963 * freed from under us. See mmu_gather_tlb in asm-generic/tlb.h
964 * for more details.
965 *
966 * We do not adopt an rcu_read_lock(.) here as we also want to
967 * block IPIs that come from THPs splitting.
968 */
969
970 local_irq_save(flags);
971 pgdp = pgd_offset(mm, addr);
972 do {
973 next = pgd_addr_end(addr, end);
974 if (pgd_none(*pgdp))
975 break;
976 else if (!gup_pud_range(pgdp, addr, next, write, pages, &nr))
977 break;
978 } while (pgdp++, addr = next, addr != end);
979 local_irq_restore(flags);
980
981 return nr;
982}
983
984/**
985 * get_user_pages_fast() - pin user pages in memory
986 * @start: starting user address
987 * @nr_pages: number of pages from start to pin
988 * @write: whether pages will be written to
989 * @pages: array that receives pointers to the pages pinned.
990 * Should be at least nr_pages long.
991 *
992 * Attempt to pin user pages in memory without taking mm->mmap_sem.
993 * If not successful, it will fall back to taking the lock and
994 * calling get_user_pages().
995 *
996 * Returns number of pages pinned. This may be fewer than the number
997 * requested. If nr_pages is 0 or negative, returns 0. If no pages
998 * were pinned, returns -errno.
999 */
1000int get_user_pages_fast(unsigned long start, int nr_pages, int write,
1001 struct page **pages)
1002{
1003 struct mm_struct *mm = current->mm;
1004 int nr, ret;
1005
1006 start &= PAGE_MASK;
1007 nr = __get_user_pages_fast(start, nr_pages, write, pages);
1008 ret = nr;
1009
1010 if (nr < nr_pages) {
1011 /* Try to get the remaining pages with get_user_pages */
1012 start += nr << PAGE_SHIFT;
1013 pages += nr;
1014
1015 down_read(&mm->mmap_sem);
1016 ret = get_user_pages(current, mm, start,
1017 nr_pages - nr, write, 0, pages, NULL);
1018 up_read(&mm->mmap_sem);
1019
1020 /* Have to be a bit careful with return values */
1021 if (nr > 0) {
1022 if (ret < 0)
1023 ret = nr;
1024 else
1025 ret += nr;
1026 }
1027 }
1028
1029 return ret;
1030}
1031
1032#endif /* CONFIG_HAVE_GENERIC_RCU_GUP */
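
A sketch of how a caller typically drives get_user_pages_fast(): pin the pages, use them, then drop the references. The buffer, length and helper name below are illustrative; only the GUP calls and their return-value rules come from the interface documented above:

#include <linux/mm.h>
#include <linux/slab.h>

/* Pin the user buffer [uaddr, uaddr + len) for writing, then release it. */
static int example_pin_user_buffer(unsigned long uaddr, size_t len)
{
	int nr_pages = DIV_ROUND_UP(offset_in_page(uaddr) + len, PAGE_SIZE);
	struct page **pages;
	int i, pinned;

	pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	pinned = get_user_pages_fast(uaddr, nr_pages, 1 /* write */, pages);
	if (pinned < 0) {
		/* No pages were pinned; pinned is -errno. */
		kfree(pages);
		return pinned;
	}

	/* ... set up DMA or otherwise access the pinned pages here ... */

	for (i = 0; i < pinned; i++)
		put_page(pages[i]);
	kfree(pages);

	return pinned == nr_pages ? 0 : -EFAULT;
}

The lockless walk above is what __get_user_pages_fast() performs with interrupts disabled; only the pages it could not pin are retried through the mmap_sem-protected get_user_pages() slow path.
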
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f8ffd9412ec5..74c78aa8bc2f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1096,7 +1096,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1096 unsigned long mmun_end; /* For mmu_notifiers */ 1096 unsigned long mmun_end; /* For mmu_notifiers */
1097 1097
1098 ptl = pmd_lockptr(mm, pmd); 1098 ptl = pmd_lockptr(mm, pmd);
1099 VM_BUG_ON(!vma->anon_vma); 1099 VM_BUG_ON_VMA(!vma->anon_vma, vma);
1100 haddr = address & HPAGE_PMD_MASK; 1100 haddr = address & HPAGE_PMD_MASK;
1101 if (is_huge_zero_pmd(orig_pmd)) 1101 if (is_huge_zero_pmd(orig_pmd))
1102 goto alloc; 1102 goto alloc;
@@ -2048,7 +2048,7 @@ int __khugepaged_enter(struct mm_struct *mm)
2048 return -ENOMEM; 2048 return -ENOMEM;
2049 2049
2050 /* __khugepaged_exit() must not run from under us */ 2050 /* __khugepaged_exit() must not run from under us */
2051 VM_BUG_ON(khugepaged_test_exit(mm)); 2051 VM_BUG_ON_MM(khugepaged_test_exit(mm), mm);
2052 if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { 2052 if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
2053 free_mm_slot(mm_slot); 2053 free_mm_slot(mm_slot);
2054 return 0; 2054 return 0;
@@ -2083,7 +2083,7 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
2083 if (vma->vm_ops) 2083 if (vma->vm_ops)
2084 /* khugepaged not yet working on file or special mappings */ 2084 /* khugepaged not yet working on file or special mappings */
2085 return 0; 2085 return 0;
2086 VM_BUG_ON(vma->vm_flags & VM_NO_THP); 2086 VM_BUG_ON_VMA(vma->vm_flags & VM_NO_THP, vma);
2087 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; 2087 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
2088 hend = vma->vm_end & HPAGE_PMD_MASK; 2088 hend = vma->vm_end & HPAGE_PMD_MASK;
2089 if (hstart < hend) 2089 if (hstart < hend)
@@ -2322,23 +2322,17 @@ static struct page
2322 int node) 2322 int node)
2323{ 2323{
2324 VM_BUG_ON_PAGE(*hpage, *hpage); 2324 VM_BUG_ON_PAGE(*hpage, *hpage);
2325
2325 /* 2326 /*
2326 * Allocate the page while the vma is still valid and under 2327 * Before allocating the hugepage, release the mmap_sem read lock.
2327 * the mmap_sem read mode so there is no memory allocation 2328 * The allocation can take potentially a long time if it involves
2328 * later when we take the mmap_sem in write mode. This is more 2329 * sync compaction, and we do not need to hold the mmap_sem during
2329 * friendly behavior (OTOH it may actually hide bugs) to 2330 * that. We will recheck the vma after taking it again in write mode.
2330 * filesystems in userland with daemons allocating memory in
2331 * the userland I/O paths. Allocating memory with the
2332 * mmap_sem in read mode is good idea also to allow greater
2333 * scalability.
2334 */ 2331 */
2332 up_read(&mm->mmap_sem);
2333
2335 *hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask( 2334 *hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask(
2336 khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER); 2335 khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER);
2337 /*
2338 * After allocating the hugepage, release the mmap_sem read lock in
2339 * preparation for taking it in write mode.
2340 */
2341 up_read(&mm->mmap_sem);
2342 if (unlikely(!*hpage)) { 2336 if (unlikely(!*hpage)) {
2343 count_vm_event(THP_COLLAPSE_ALLOC_FAILED); 2337 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
2344 *hpage = ERR_PTR(-ENOMEM); 2338 *hpage = ERR_PTR(-ENOMEM);
@@ -2412,7 +2406,7 @@ static bool hugepage_vma_check(struct vm_area_struct *vma)
2412 return false; 2406 return false;
2413 if (is_vma_temporary_stack(vma)) 2407 if (is_vma_temporary_stack(vma))
2414 return false; 2408 return false;
2415 VM_BUG_ON(vma->vm_flags & VM_NO_THP); 2409 VM_BUG_ON_VMA(vma->vm_flags & VM_NO_THP, vma);
2416 return true; 2410 return true;
2417} 2411}
2418 2412
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index eeceeeb09019..9fd722769927 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -434,7 +434,7 @@ static inline struct resv_map *inode_resv_map(struct inode *inode)
434 434
435static struct resv_map *vma_resv_map(struct vm_area_struct *vma) 435static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
436{ 436{
437 VM_BUG_ON(!is_vm_hugetlb_page(vma)); 437 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
438 if (vma->vm_flags & VM_MAYSHARE) { 438 if (vma->vm_flags & VM_MAYSHARE) {
439 struct address_space *mapping = vma->vm_file->f_mapping; 439 struct address_space *mapping = vma->vm_file->f_mapping;
440 struct inode *inode = mapping->host; 440 struct inode *inode = mapping->host;
@@ -449,8 +449,8 @@ static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
449 449
450static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) 450static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
451{ 451{
452 VM_BUG_ON(!is_vm_hugetlb_page(vma)); 452 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
453 VM_BUG_ON(vma->vm_flags & VM_MAYSHARE); 453 VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
454 454
455 set_vma_private_data(vma, (get_vma_private_data(vma) & 455 set_vma_private_data(vma, (get_vma_private_data(vma) &
456 HPAGE_RESV_MASK) | (unsigned long)map); 456 HPAGE_RESV_MASK) | (unsigned long)map);
@@ -458,15 +458,15 @@ static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
458 458
459static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) 459static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
460{ 460{
461 VM_BUG_ON(!is_vm_hugetlb_page(vma)); 461 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
462 VM_BUG_ON(vma->vm_flags & VM_MAYSHARE); 462 VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
463 463
464 set_vma_private_data(vma, get_vma_private_data(vma) | flags); 464 set_vma_private_data(vma, get_vma_private_data(vma) | flags);
465} 465}
466 466
467static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) 467static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
468{ 468{
469 VM_BUG_ON(!is_vm_hugetlb_page(vma)); 469 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
470 470
471 return (get_vma_private_data(vma) & flag) != 0; 471 return (get_vma_private_data(vma) & flag) != 0;
472} 472}
@@ -474,7 +474,7 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
474/* Reset counters to 0 and clear all HPAGE_RESV_* flags */ 474/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
475void reset_vma_resv_huge_pages(struct vm_area_struct *vma) 475void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
476{ 476{
477 VM_BUG_ON(!is_vm_hugetlb_page(vma)); 477 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
478 if (!(vma->vm_flags & VM_MAYSHARE)) 478 if (!(vma->vm_flags & VM_MAYSHARE))
479 vma->vm_private_data = (void *)0; 479 vma->vm_private_data = (void *)0;
480} 480}
diff --git a/mm/internal.h b/mm/internal.h
index a1b651b11c5f..829304090b90 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -142,10 +142,10 @@ struct compact_control {
142 bool finished_update_migrate; 142 bool finished_update_migrate;
143 143
144 int order; /* order a direct compactor needs */ 144 int order; /* order a direct compactor needs */
145 int migratetype; /* MOVABLE, RECLAIMABLE etc */ 145 const gfp_t gfp_mask; /* gfp mask of a direct compactor */
146 struct zone *zone; 146 struct zone *zone;
147 bool contended; /* True if a lock was contended, or 147 int contended; /* Signal need_resched() or lock

148 * need_resched() true during async 148 * contention detected during
149 * compaction 149 * compaction
150 */ 150 */
151}; 151};
@@ -154,8 +154,8 @@ unsigned long
154isolate_freepages_range(struct compact_control *cc, 154isolate_freepages_range(struct compact_control *cc,
155 unsigned long start_pfn, unsigned long end_pfn); 155 unsigned long start_pfn, unsigned long end_pfn);
156unsigned long 156unsigned long
157isolate_migratepages_range(struct zone *zone, struct compact_control *cc, 157isolate_migratepages_range(struct compact_control *cc,
158 unsigned long low_pfn, unsigned long end_pfn, bool unevictable); 158 unsigned long low_pfn, unsigned long end_pfn);
159 159
160#endif 160#endif
161 161
@@ -164,7 +164,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
164 * general, page_zone(page)->lock must be held by the caller to prevent the 164 * general, page_zone(page)->lock must be held by the caller to prevent the
165 * page from being allocated in parallel and returning garbage as the order. 165 * page from being allocated in parallel and returning garbage as the order.
166 * If a caller does not hold page_zone(page)->lock, it must guarantee that the 166 * If a caller does not hold page_zone(page)->lock, it must guarantee that the
167 * page cannot be allocated or merged in parallel. 167 * page cannot be allocated or merged in parallel. Alternatively, it must
168 * handle invalid values gracefully, and use page_order_unsafe() below.
168 */ 169 */
169static inline unsigned long page_order(struct page *page) 170static inline unsigned long page_order(struct page *page)
170{ 171{
@@ -172,6 +173,19 @@ static inline unsigned long page_order(struct page *page)
172 return page_private(page); 173 return page_private(page);
173} 174}
174 175
176/*
177 * Like page_order(), but for callers who cannot afford to hold the zone lock.
178 * PageBuddy() should be checked first by the caller to minimize race window,
179 * and invalid values must be handled gracefully.
180 *
181 * ACCESS_ONCE is used so that if the caller assigns the result into a local
182 * variable and e.g. tests it for valid range before using, the compiler cannot
183 * decide to remove the variable and inline the page_private(page) multiple
184 * times, potentially observing different values in the tests and the actual
185 * use of the result.
186 */
187#define page_order_unsafe(page) ACCESS_ONCE(page_private(page))
188
175static inline bool is_cow_mapping(vm_flags_t flags) 189static inline bool is_cow_mapping(vm_flags_t flags)
176{ 190{
177 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 191 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
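
To make the page_order_unsafe() contract concrete, a lockless caller is expected to look roughly like the sketch below: snapshot the order once, then range-check the snapshot before trusting it. The surrounding function is hypothetical (the real user in this series is the compaction scanner, not shown here):

#include <linux/mm.h>
#include "internal.h"	/* mm-internal header patched above */

/* Illustrative: how many pages may be skipped past a buddy page. */
static unsigned long example_skip_buddy_range(struct page *page)
{
	if (PageBuddy(page)) {
		/*
		 * Racy read without zone->lock: the page can be allocated
		 * or merged while we look, so the value may be stale or
		 * garbage and must be validated before use.
		 */
		unsigned long order = page_order_unsafe(page);

		if (order > 0 && order < MAX_ORDER)
			return 1UL << order;
	}
	return 1;
}
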
diff --git a/mm/interval_tree.c b/mm/interval_tree.c
index 4a5822a586e6..8da581fa9060 100644
--- a/mm/interval_tree.c
+++ b/mm/interval_tree.c
@@ -34,7 +34,7 @@ void vma_interval_tree_insert_after(struct vm_area_struct *node,
34 struct vm_area_struct *parent; 34 struct vm_area_struct *parent;
35 unsigned long last = vma_last_pgoff(node); 35 unsigned long last = vma_last_pgoff(node);
36 36
37 VM_BUG_ON(vma_start_pgoff(node) != vma_start_pgoff(prev)); 37 VM_BUG_ON_VMA(vma_start_pgoff(node) != vma_start_pgoff(prev), node);
38 38
39 if (!prev->shared.linear.rb.rb_right) { 39 if (!prev->shared.linear.rb.rb_right) {
40 parent = prev; 40 parent = prev;
diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c
index fd814fd61319..cab58bb592d8 100644
--- a/mm/kmemcheck.c
+++ b/mm/kmemcheck.c
@@ -2,6 +2,7 @@
2#include <linux/mm_types.h> 2#include <linux/mm_types.h>
3#include <linux/mm.h> 3#include <linux/mm.h>
4#include <linux/slab.h> 4#include <linux/slab.h>
5#include "slab.h"
5#include <linux/kmemcheck.h> 6#include <linux/kmemcheck.h>
6 7
7void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node) 8void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node)
diff --git a/mm/ksm.c b/mm/ksm.c
index fb7590222706..6b2e337bc03c 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2310,7 +2310,7 @@ static int __init ksm_init(void)
2310 2310
2311 ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd"); 2311 ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
2312 if (IS_ERR(ksm_thread)) { 2312 if (IS_ERR(ksm_thread)) {
2313 printk(KERN_ERR "ksm: creating kthread failed\n"); 2313 pr_err("ksm: creating kthread failed\n");
2314 err = PTR_ERR(ksm_thread); 2314 err = PTR_ERR(ksm_thread);
2315 goto out_free; 2315 goto out_free;
2316 } 2316 }
@@ -2318,7 +2318,7 @@ static int __init ksm_init(void)
2318#ifdef CONFIG_SYSFS 2318#ifdef CONFIG_SYSFS
2319 err = sysfs_create_group(mm_kobj, &ksm_attr_group); 2319 err = sysfs_create_group(mm_kobj, &ksm_attr_group);
2320 if (err) { 2320 if (err) {
2321 printk(KERN_ERR "ksm: register sysfs failed\n"); 2321 pr_err("ksm: register sysfs failed\n");
2322 kthread_stop(ksm_thread); 2322 kthread_stop(ksm_thread);
2323 goto out_free; 2323 goto out_free;
2324 } 2324 }
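
The printk(KERN_ERR ...) to pr_err(...) conversions above are mechanical: the pr_* helpers expand to printk() at the matching KERN_* level and pick up a per-file prefix when pr_fmt is defined. A small illustration of the idiom (the "ksm: " prefix mirrors the strings in the hunk; defining pr_fmt this way is optional and is not part of this patch):

#define pr_fmt(fmt) "ksm: " fmt	/* must precede the includes */

#include <linux/printk.h>

static void example_report(void)
{
	/* Prints "ksm: creating kthread failed" at KERN_ERR level. */
	pr_err("creating kthread failed\n");
}
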
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 28928ce9b07f..23976fd885fd 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -318,9 +318,6 @@ struct mem_cgroup {
318 /* OOM-Killer disable */ 318 /* OOM-Killer disable */
319 int oom_kill_disable; 319 int oom_kill_disable;
320 320
321 /* set when res.limit == memsw.limit */
322 bool memsw_is_minimum;
323
324 /* protect arrays of thresholds */ 321 /* protect arrays of thresholds */
325 struct mutex thresholds_lock; 322 struct mutex thresholds_lock;
326 323
@@ -484,14 +481,6 @@ enum res_type {
484#define OOM_CONTROL (0) 481#define OOM_CONTROL (0)
485 482
486/* 483/*
487 * Reclaim flags for mem_cgroup_hierarchical_reclaim
488 */
489#define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0
490#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
491#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1
492#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
493
494/*
495 * The memcg_create_mutex will be held whenever a new cgroup is created. 484 * The memcg_create_mutex will be held whenever a new cgroup is created.
496 * As a consequence, any change that needs to protect against new child cgroups 485 * As a consequence, any change that needs to protect against new child cgroups
497 * appearing has to hold it as well. 486 * appearing has to hold it as well.
@@ -649,11 +638,13 @@ int memcg_limited_groups_array_size;
649struct static_key memcg_kmem_enabled_key; 638struct static_key memcg_kmem_enabled_key;
650EXPORT_SYMBOL(memcg_kmem_enabled_key); 639EXPORT_SYMBOL(memcg_kmem_enabled_key);
651 640
641static void memcg_free_cache_id(int id);
642
652static void disarm_kmem_keys(struct mem_cgroup *memcg) 643static void disarm_kmem_keys(struct mem_cgroup *memcg)
653{ 644{
654 if (memcg_kmem_is_active(memcg)) { 645 if (memcg_kmem_is_active(memcg)) {
655 static_key_slow_dec(&memcg_kmem_enabled_key); 646 static_key_slow_dec(&memcg_kmem_enabled_key);
656 ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id); 647 memcg_free_cache_id(memcg->kmemcg_id);
657 } 648 }
658 /* 649 /*
659 * This check can't live in kmem destruction function, 650 * This check can't live in kmem destruction function,
@@ -1806,42 +1797,6 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1806 NULL, "Memory cgroup out of memory"); 1797 NULL, "Memory cgroup out of memory");
1807} 1798}
1808 1799
1809static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
1810 gfp_t gfp_mask,
1811 unsigned long flags)
1812{
1813 unsigned long total = 0;
1814 bool noswap = false;
1815 int loop;
1816
1817 if (flags & MEM_CGROUP_RECLAIM_NOSWAP)
1818 noswap = true;
1819 if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum)
1820 noswap = true;
1821
1822 for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {
1823 if (loop)
1824 drain_all_stock_async(memcg);
1825 total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap);
1826 /*
1827 * Allow limit shrinkers, which are triggered directly
1828 * by userspace, to catch signals and stop reclaim
1829 * after minimal progress, regardless of the margin.
1830 */
1831 if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK))
1832 break;
1833 if (mem_cgroup_margin(memcg))
1834 break;
1835 /*
1836 * If nothing was reclaimed after two attempts, there
1837 * may be no reclaimable pages in this hierarchy.
1838 */
1839 if (loop && !total)
1840 break;
1841 }
1842 return total;
1843}
1844
1845/** 1800/**
1846 * test_mem_cgroup_node_reclaimable 1801 * test_mem_cgroup_node_reclaimable
1847 * @memcg: the target memcg 1802 * @memcg: the target memcg
@@ -2544,8 +2499,9 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2544 struct mem_cgroup *mem_over_limit; 2499 struct mem_cgroup *mem_over_limit;
2545 struct res_counter *fail_res; 2500 struct res_counter *fail_res;
2546 unsigned long nr_reclaimed; 2501 unsigned long nr_reclaimed;
2547 unsigned long flags = 0;
2548 unsigned long long size; 2502 unsigned long long size;
2503 bool may_swap = true;
2504 bool drained = false;
2549 int ret = 0; 2505 int ret = 0;
2550 2506
2551 if (mem_cgroup_is_root(memcg)) 2507 if (mem_cgroup_is_root(memcg))
@@ -2555,16 +2511,17 @@ retry:
2555 goto done; 2511 goto done;
2556 2512
2557 size = batch * PAGE_SIZE; 2513 size = batch * PAGE_SIZE;
2558 if (!res_counter_charge(&memcg->res, size, &fail_res)) { 2514 if (!do_swap_account ||
2559 if (!do_swap_account) 2515 !res_counter_charge(&memcg->memsw, size, &fail_res)) {
2560 goto done_restock; 2516 if (!res_counter_charge(&memcg->res, size, &fail_res))
2561 if (!res_counter_charge(&memcg->memsw, size, &fail_res))
2562 goto done_restock; 2517 goto done_restock;
2563 res_counter_uncharge(&memcg->res, size); 2518 if (do_swap_account)
2564 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); 2519 res_counter_uncharge(&memcg->memsw, size);
2565 flags |= MEM_CGROUP_RECLAIM_NOSWAP;
2566 } else
2567 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); 2520 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
2521 } else {
2522 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
2523 may_swap = false;
2524 }
2568 2525
2569 if (batch > nr_pages) { 2526 if (batch > nr_pages) {
2570 batch = nr_pages; 2527 batch = nr_pages;
@@ -2588,11 +2545,18 @@ retry:
2588 if (!(gfp_mask & __GFP_WAIT)) 2545 if (!(gfp_mask & __GFP_WAIT))
2589 goto nomem; 2546 goto nomem;
2590 2547
2591 nr_reclaimed = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); 2548 nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
2549 gfp_mask, may_swap);
2592 2550
2593 if (mem_cgroup_margin(mem_over_limit) >= nr_pages) 2551 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2594 goto retry; 2552 goto retry;
2595 2553
2554 if (!drained) {
2555 drain_all_stock_async(mem_over_limit);
2556 drained = true;
2557 goto retry;
2558 }
2559
2596 if (gfp_mask & __GFP_NORETRY) 2560 if (gfp_mask & __GFP_NORETRY)
2597 goto nomem; 2561 goto nomem;
2598 /* 2562 /*
@@ -2798,12 +2762,6 @@ static DEFINE_MUTEX(memcg_slab_mutex);
2798 2762
2799static DEFINE_MUTEX(activate_kmem_mutex); 2763static DEFINE_MUTEX(activate_kmem_mutex);
2800 2764
2801static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
2802{
2803 return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) &&
2804 memcg_kmem_is_active(memcg);
2805}
2806
2807/* 2765/*
2808 * This is a bit cumbersome, but it is rarely used and avoids a backpointer 2766 * This is a bit cumbersome, but it is rarely used and avoids a backpointer
2809 * in the memcg_cache_params struct. 2767 * in the memcg_cache_params struct.
@@ -2823,7 +2781,7 @@ static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v)
2823 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 2781 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
2824 struct memcg_cache_params *params; 2782 struct memcg_cache_params *params;
2825 2783
2826 if (!memcg_can_account_kmem(memcg)) 2784 if (!memcg_kmem_is_active(memcg))
2827 return -EIO; 2785 return -EIO;
2828 2786
2829 print_slabinfo_header(m); 2787 print_slabinfo_header(m);
@@ -2906,19 +2864,44 @@ int memcg_cache_id(struct mem_cgroup *memcg)
2906 return memcg ? memcg->kmemcg_id : -1; 2864 return memcg ? memcg->kmemcg_id : -1;
2907} 2865}
2908 2866
2909static size_t memcg_caches_array_size(int num_groups) 2867static int memcg_alloc_cache_id(void)
2910{ 2868{
2911 ssize_t size; 2869 int id, size;
2912 if (num_groups <= 0) 2870 int err;
2913 return 0; 2871
2872 id = ida_simple_get(&kmem_limited_groups,
2873 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
2874 if (id < 0)
2875 return id;
2876
2877 if (id < memcg_limited_groups_array_size)
2878 return id;
2879
2880 /*
2881 * There's no space for the new id in memcg_caches arrays,
2882 * so we have to grow them.
2883 */
2914 2884
2915 size = 2 * num_groups; 2885 size = 2 * (id + 1);
2916 if (size < MEMCG_CACHES_MIN_SIZE) 2886 if (size < MEMCG_CACHES_MIN_SIZE)
2917 size = MEMCG_CACHES_MIN_SIZE; 2887 size = MEMCG_CACHES_MIN_SIZE;
2918 else if (size > MEMCG_CACHES_MAX_SIZE) 2888 else if (size > MEMCG_CACHES_MAX_SIZE)
2919 size = MEMCG_CACHES_MAX_SIZE; 2889 size = MEMCG_CACHES_MAX_SIZE;
2920 2890
2921 return size; 2891 mutex_lock(&memcg_slab_mutex);
2892 err = memcg_update_all_caches(size);
2893 mutex_unlock(&memcg_slab_mutex);
2894
2895 if (err) {
2896 ida_simple_remove(&kmem_limited_groups, id);
2897 return err;
2898 }
2899 return id;
2900}
2901
2902static void memcg_free_cache_id(int id)
2903{
2904 ida_simple_remove(&kmem_limited_groups, id);
2922} 2905}
2923 2906
2924/* 2907/*
@@ -2928,97 +2911,7 @@ static size_t memcg_caches_array_size(int num_groups)
2928 */ 2911 */
2929void memcg_update_array_size(int num) 2912void memcg_update_array_size(int num)
2930{ 2913{
2931 if (num > memcg_limited_groups_array_size) 2914 memcg_limited_groups_array_size = num;
2932 memcg_limited_groups_array_size = memcg_caches_array_size(num);
2933}
2934
2935int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
2936{
2937 struct memcg_cache_params *cur_params = s->memcg_params;
2938
2939 VM_BUG_ON(!is_root_cache(s));
2940
2941 if (num_groups > memcg_limited_groups_array_size) {
2942 int i;
2943 struct memcg_cache_params *new_params;
2944 ssize_t size = memcg_caches_array_size(num_groups);
2945
2946 size *= sizeof(void *);
2947 size += offsetof(struct memcg_cache_params, memcg_caches);
2948
2949 new_params = kzalloc(size, GFP_KERNEL);
2950 if (!new_params)
2951 return -ENOMEM;
2952
2953 new_params->is_root_cache = true;
2954
2955 /*
2956 * There is the chance it will be bigger than
2957 * memcg_limited_groups_array_size, if we failed an allocation
2958 * in a cache, in which case all caches updated before it, will
2959 * have a bigger array.
2960 *
2961 * But if that is the case, the data after
2962 * memcg_limited_groups_array_size is certainly unused
2963 */
2964 for (i = 0; i < memcg_limited_groups_array_size; i++) {
2965 if (!cur_params->memcg_caches[i])
2966 continue;
2967 new_params->memcg_caches[i] =
2968 cur_params->memcg_caches[i];
2969 }
2970
2971 /*
2972 * Ideally, we would wait until all caches succeed, and only
2973 * then free the old one. But this is not worth the extra
2974 * pointer per-cache we'd have to have for this.
2975 *
2976 * It is not a big deal if some caches are left with a size
2977 * bigger than the others. And all updates will reset this
2978 * anyway.
2979 */
2980 rcu_assign_pointer(s->memcg_params, new_params);
2981 if (cur_params)
2982 kfree_rcu(cur_params, rcu_head);
2983 }
2984 return 0;
2985}
2986
2987int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
2988 struct kmem_cache *root_cache)
2989{
2990 size_t size;
2991
2992 if (!memcg_kmem_enabled())
2993 return 0;
2994
2995 if (!memcg) {
2996 size = offsetof(struct memcg_cache_params, memcg_caches);
2997 size += memcg_limited_groups_array_size * sizeof(void *);
2998 } else
2999 size = sizeof(struct memcg_cache_params);
3000
3001 s->memcg_params = kzalloc(size, GFP_KERNEL);
3002 if (!s->memcg_params)
3003 return -ENOMEM;
3004
3005 if (memcg) {
3006 s->memcg_params->memcg = memcg;
3007 s->memcg_params->root_cache = root_cache;
3008 css_get(&memcg->css);
3009 } else
3010 s->memcg_params->is_root_cache = true;
3011
3012 return 0;
3013}
3014
3015void memcg_free_cache_params(struct kmem_cache *s)
3016{
3017 if (!s->memcg_params)
3018 return;
3019 if (!s->memcg_params->is_root_cache)
3020 css_put(&s->memcg_params->memcg->css);
3021 kfree(s->memcg_params);
3022} 2915}
3023 2916
3024static void memcg_register_cache(struct mem_cgroup *memcg, 2917static void memcg_register_cache(struct mem_cgroup *memcg,
@@ -3051,6 +2944,7 @@ static void memcg_register_cache(struct mem_cgroup *memcg,
3051 if (!cachep) 2944 if (!cachep)
3052 return; 2945 return;
3053 2946
2947 css_get(&memcg->css);
3054 list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); 2948 list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
3055 2949
3056 /* 2950 /*
@@ -3084,6 +2978,9 @@ static void memcg_unregister_cache(struct kmem_cache *cachep)
3084 list_del(&cachep->memcg_params->list); 2978 list_del(&cachep->memcg_params->list);
3085 2979
3086 kmem_cache_destroy(cachep); 2980 kmem_cache_destroy(cachep);
2981
2982 /* drop the reference taken in memcg_register_cache */
2983 css_put(&memcg->css);
3087} 2984}
3088 2985
3089/* 2986/*
@@ -3261,7 +3158,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
3261 rcu_read_lock(); 3158 rcu_read_lock();
3262 memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner)); 3159 memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));
3263 3160
3264 if (!memcg_can_account_kmem(memcg)) 3161 if (!memcg_kmem_is_active(memcg))
3265 goto out; 3162 goto out;
3266 3163
3267 memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg)); 3164 memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
@@ -3346,7 +3243,7 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
3346 3243
3347 memcg = get_mem_cgroup_from_mm(current->mm); 3244 memcg = get_mem_cgroup_from_mm(current->mm);
3348 3245
3349 if (!memcg_can_account_kmem(memcg)) { 3246 if (!memcg_kmem_is_active(memcg)) {
3350 css_put(&memcg->css); 3247 css_put(&memcg->css);
3351 return true; 3248 return true;
3352 } 3249 }
@@ -3688,7 +3585,6 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3688 unsigned long long val) 3585 unsigned long long val)
3689{ 3586{
3690 int retry_count; 3587 int retry_count;
3691 u64 memswlimit, memlimit;
3692 int ret = 0; 3588 int ret = 0;
3693 int children = mem_cgroup_count_children(memcg); 3589 int children = mem_cgroup_count_children(memcg);
3694 u64 curusage, oldusage; 3590 u64 curusage, oldusage;
@@ -3715,31 +3611,23 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3715 * We have to guarantee memcg->res.limit <= memcg->memsw.limit. 3611 * We have to guarantee memcg->res.limit <= memcg->memsw.limit.
3716 */ 3612 */
3717 mutex_lock(&set_limit_mutex); 3613 mutex_lock(&set_limit_mutex);
3718 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3614 if (res_counter_read_u64(&memcg->memsw, RES_LIMIT) < val) {
3719 if (memswlimit < val) {
3720 ret = -EINVAL; 3615 ret = -EINVAL;
3721 mutex_unlock(&set_limit_mutex); 3616 mutex_unlock(&set_limit_mutex);
3722 break; 3617 break;
3723 } 3618 }
3724 3619
3725 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 3620 if (res_counter_read_u64(&memcg->res, RES_LIMIT) < val)
3726 if (memlimit < val)
3727 enlarge = 1; 3621 enlarge = 1;
3728 3622
3729 ret = res_counter_set_limit(&memcg->res, val); 3623 ret = res_counter_set_limit(&memcg->res, val);
3730 if (!ret) {
3731 if (memswlimit == val)
3732 memcg->memsw_is_minimum = true;
3733 else
3734 memcg->memsw_is_minimum = false;
3735 }
3736 mutex_unlock(&set_limit_mutex); 3624 mutex_unlock(&set_limit_mutex);
3737 3625
3738 if (!ret) 3626 if (!ret)
3739 break; 3627 break;
3740 3628
3741 mem_cgroup_reclaim(memcg, GFP_KERNEL, 3629 try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true);
3742 MEM_CGROUP_RECLAIM_SHRINK); 3630
3743 curusage = res_counter_read_u64(&memcg->res, RES_USAGE); 3631 curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
3744 /* Usage is reduced ? */ 3632 /* Usage is reduced ? */
3745 if (curusage >= oldusage) 3633 if (curusage >= oldusage)
@@ -3757,7 +3645,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
3757 unsigned long long val) 3645 unsigned long long val)
3758{ 3646{
3759 int retry_count; 3647 int retry_count;
3760 u64 memlimit, memswlimit, oldusage, curusage; 3648 u64 oldusage, curusage;
3761 int children = mem_cgroup_count_children(memcg); 3649 int children = mem_cgroup_count_children(memcg);
3762 int ret = -EBUSY; 3650 int ret = -EBUSY;
3763 int enlarge = 0; 3651 int enlarge = 0;
@@ -3776,30 +3664,21 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
3776 * We have to guarantee memcg->res.limit <= memcg->memsw.limit. 3664 * We have to guarantee memcg->res.limit <= memcg->memsw.limit.
3777 */ 3665 */
3778 mutex_lock(&set_limit_mutex); 3666 mutex_lock(&set_limit_mutex);
3779 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 3667 if (res_counter_read_u64(&memcg->res, RES_LIMIT) > val) {
3780 if (memlimit > val) {
3781 ret = -EINVAL; 3668 ret = -EINVAL;
3782 mutex_unlock(&set_limit_mutex); 3669 mutex_unlock(&set_limit_mutex);
3783 break; 3670 break;
3784 } 3671 }
3785 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3672 if (res_counter_read_u64(&memcg->memsw, RES_LIMIT) < val)
3786 if (memswlimit < val)
3787 enlarge = 1; 3673 enlarge = 1;
3788 ret = res_counter_set_limit(&memcg->memsw, val); 3674 ret = res_counter_set_limit(&memcg->memsw, val);
3789 if (!ret) {
3790 if (memlimit == val)
3791 memcg->memsw_is_minimum = true;
3792 else
3793 memcg->memsw_is_minimum = false;
3794 }
3795 mutex_unlock(&set_limit_mutex); 3675 mutex_unlock(&set_limit_mutex);
3796 3676
3797 if (!ret) 3677 if (!ret)
3798 break; 3678 break;
3799 3679
3800 mem_cgroup_reclaim(memcg, GFP_KERNEL, 3680 try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false);
3801 MEM_CGROUP_RECLAIM_NOSWAP | 3681
3802 MEM_CGROUP_RECLAIM_SHRINK);
3803 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 3682 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
3804 /* Usage is reduced ? */ 3683 /* Usage is reduced ? */
3805 if (curusage >= oldusage) 3684 if (curusage >= oldusage)
@@ -4048,8 +3927,8 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
4048 if (signal_pending(current)) 3927 if (signal_pending(current))
4049 return -EINTR; 3928 return -EINTR;
4050 3929
4051 progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL, 3930 progress = try_to_free_mem_cgroup_pages(memcg, 1,
4052 false); 3931 GFP_KERNEL, true);
4053 if (!progress) { 3932 if (!progress) {
4054 nr_retries--; 3933 nr_retries--;
4055 /* maybe some writeback is necessary */ 3934 /* maybe some writeback is necessary */
@@ -4214,23 +4093,12 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg,
4214 if (err) 4093 if (err)
4215 goto out; 4094 goto out;
4216 4095
4217 memcg_id = ida_simple_get(&kmem_limited_groups, 4096 memcg_id = memcg_alloc_cache_id();
4218 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
4219 if (memcg_id < 0) { 4097 if (memcg_id < 0) {
4220 err = memcg_id; 4098 err = memcg_id;
4221 goto out; 4099 goto out;
4222 } 4100 }
4223 4101
4224 /*
4225 * Make sure we have enough space for this cgroup in each root cache's
4226 * memcg_params.
4227 */
4228 mutex_lock(&memcg_slab_mutex);
4229 err = memcg_update_all_caches(memcg_id + 1);
4230 mutex_unlock(&memcg_slab_mutex);
4231 if (err)
4232 goto out_rmid;
4233
4234 memcg->kmemcg_id = memcg_id; 4102 memcg->kmemcg_id = memcg_id;
4235 INIT_LIST_HEAD(&memcg->memcg_slab_caches); 4103 INIT_LIST_HEAD(&memcg->memcg_slab_caches);
4236 4104
@@ -4251,10 +4119,6 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg,
4251out: 4119out:
4252 memcg_resume_kmem_account(); 4120 memcg_resume_kmem_account();
4253 return err; 4121 return err;
4254
4255out_rmid:
4256 ida_simple_remove(&kmem_limited_groups, memcg_id);
4257 goto out;
4258} 4122}
4259 4123
4260static int memcg_activate_kmem(struct mem_cgroup *memcg, 4124static int memcg_activate_kmem(struct mem_cgroup *memcg,
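
memcg_alloc_cache_id()/memcg_free_cache_id() introduced above are thin wrappers around the generic IDA allocator plus the array-resize step. For reference, the bare IDA pattern they build on looks like this sketch; the ida name and the upper bound are illustrative:

#include <linux/idr.h>
#include <linux/gfp.h>

static DEFINE_IDA(example_ida);

static int example_get_id(void)
{
	/* Allocate the smallest free id in [0, 1024). */
	return ida_simple_get(&example_ida, 0, 1024, GFP_KERNEL);
}

static void example_put_id(int id)
{
	ida_simple_remove(&example_ida, id);
}

What the memcg wrapper adds is the resize step: if the new id does not fit the current memcg_caches arrays, memcg_update_all_caches() grows them under memcg_slab_mutex before the id is returned, and the id is released again if that growth fails.
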
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 2ff8c2325e96..29d8693d0c61 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1307,7 +1307,7 @@ int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
1307/* 1307/*
1308 * Confirm all pages in a range [start, end) belong to the same zone. 1308 * Confirm all pages in a range [start, end) belong to the same zone.
1309 */ 1309 */
1310static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) 1310int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
1311{ 1311{
1312 unsigned long pfn; 1312 unsigned long pfn;
1313 struct zone *zone = NULL; 1313 struct zone *zone = NULL;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 8f5330d74f47..e58725aff7e9 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -123,25 +123,23 @@ static struct mempolicy default_policy = {
123 123
124static struct mempolicy preferred_node_policy[MAX_NUMNODES]; 124static struct mempolicy preferred_node_policy[MAX_NUMNODES];
125 125
126static struct mempolicy *get_task_policy(struct task_struct *p) 126struct mempolicy *get_task_policy(struct task_struct *p)
127{ 127{
128 struct mempolicy *pol = p->mempolicy; 128 struct mempolicy *pol = p->mempolicy;
129 int node;
129 130
130 if (!pol) { 131 if (pol)
131 int node = numa_node_id(); 132 return pol;
132 133
133 if (node != NUMA_NO_NODE) { 134 node = numa_node_id();
134 pol = &preferred_node_policy[node]; 135 if (node != NUMA_NO_NODE) {
135 /* 136 pol = &preferred_node_policy[node];
136 * preferred_node_policy is not initialised early in 137 /* preferred_node_policy is not initialised early in boot */
137 * boot 138 if (pol->mode)
138 */ 139 return pol;
139 if (!pol->mode)
140 pol = NULL;
141 }
142 } 140 }
143 141
144 return pol; 142 return &default_policy;
145} 143}
146 144
147static const struct mempolicy_operations { 145static const struct mempolicy_operations {
@@ -683,7 +681,9 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
683 } 681 }
684 682
685 if (flags & MPOL_MF_LAZY) { 683 if (flags & MPOL_MF_LAZY) {
686 change_prot_numa(vma, start, endvma); 684 /* Similar to task_numa_work, skip inaccessible VMAs */
685 if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
686 change_prot_numa(vma, start, endvma);
687 goto next; 687 goto next;
688 } 688 }
689 689
@@ -804,7 +804,6 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
804 nodemask_t *nodes) 804 nodemask_t *nodes)
805{ 805{
806 struct mempolicy *new, *old; 806 struct mempolicy *new, *old;
807 struct mm_struct *mm = current->mm;
808 NODEMASK_SCRATCH(scratch); 807 NODEMASK_SCRATCH(scratch);
809 int ret; 808 int ret;
810 809
@@ -816,20 +815,11 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
816 ret = PTR_ERR(new); 815 ret = PTR_ERR(new);
817 goto out; 816 goto out;
818 } 817 }
819 /* 818
820 * prevent changing our mempolicy while show_numa_maps()
821 * is using it.
822 * Note: do_set_mempolicy() can be called at init time
823 * with no 'mm'.
824 */
825 if (mm)
826 down_write(&mm->mmap_sem);
827 task_lock(current); 819 task_lock(current);
828 ret = mpol_set_nodemask(new, nodes, scratch); 820 ret = mpol_set_nodemask(new, nodes, scratch);
829 if (ret) { 821 if (ret) {
830 task_unlock(current); 822 task_unlock(current);
831 if (mm)
832 up_write(&mm->mmap_sem);
833 mpol_put(new); 823 mpol_put(new);
834 goto out; 824 goto out;
835 } 825 }
@@ -839,9 +829,6 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
839 nodes_weight(new->v.nodes)) 829 nodes_weight(new->v.nodes))
840 current->il_next = first_node(new->v.nodes); 830 current->il_next = first_node(new->v.nodes);
841 task_unlock(current); 831 task_unlock(current);
842 if (mm)
843 up_write(&mm->mmap_sem);
844
845 mpol_put(old); 832 mpol_put(old);
846 ret = 0; 833 ret = 0;
847out: 834out:
@@ -1605,32 +1592,14 @@ COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1605 1592
1606#endif 1593#endif
1607 1594
1608/* 1595struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1609 * get_vma_policy(@task, @vma, @addr) 1596 unsigned long addr)
1610 * @task: task for fallback if vma policy == default
1611 * @vma: virtual memory area whose policy is sought
1612 * @addr: address in @vma for shared policy lookup
1613 *
1614 * Returns effective policy for a VMA at specified address.
1615 * Falls back to @task or system default policy, as necessary.
1616 * Current or other task's task mempolicy and non-shared vma policies must be
1617 * protected by task_lock(task) by the caller.
1618 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1619 * count--added by the get_policy() vm_op, as appropriate--to protect against
1620 * freeing by another task. It is the caller's responsibility to free the
1621 * extra reference for shared policies.
1622 */
1623struct mempolicy *get_vma_policy(struct task_struct *task,
1624 struct vm_area_struct *vma, unsigned long addr)
1625{ 1597{
1626 struct mempolicy *pol = get_task_policy(task); 1598 struct mempolicy *pol = NULL;
1627 1599
1628 if (vma) { 1600 if (vma) {
1629 if (vma->vm_ops && vma->vm_ops->get_policy) { 1601 if (vma->vm_ops && vma->vm_ops->get_policy) {
1630 struct mempolicy *vpol = vma->vm_ops->get_policy(vma, 1602 pol = vma->vm_ops->get_policy(vma, addr);
1631 addr);
1632 if (vpol)
1633 pol = vpol;
1634 } else if (vma->vm_policy) { 1603 } else if (vma->vm_policy) {
1635 pol = vma->vm_policy; 1604 pol = vma->vm_policy;
1636 1605
@@ -1644,31 +1613,51 @@ struct mempolicy *get_vma_policy(struct task_struct *task,
1644 mpol_get(pol); 1613 mpol_get(pol);
1645 } 1614 }
1646 } 1615 }
1616
1617 return pol;
1618}
1619
1620/*
1621 * get_vma_policy(@vma, @addr)
1622 * @vma: virtual memory area whose policy is sought
1623 * @addr: address in @vma for shared policy lookup
1624 *
1625 * Returns effective policy for a VMA at specified address.
1626 * Falls back to current->mempolicy or system default policy, as necessary.
1627 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1628 * count--added by the get_policy() vm_op, as appropriate--to protect against
1629 * freeing by another task. It is the caller's responsibility to free the
1630 * extra reference for shared policies.
1631 */
1632static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1633 unsigned long addr)
1634{
1635 struct mempolicy *pol = __get_vma_policy(vma, addr);
1636
1647 if (!pol) 1637 if (!pol)
1648 pol = &default_policy; 1638 pol = get_task_policy(current);
1639
1649 return pol; 1640 return pol;
1650} 1641}
1651 1642
1652bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma) 1643bool vma_policy_mof(struct vm_area_struct *vma)
1653{ 1644{
1654 struct mempolicy *pol = get_task_policy(task); 1645 struct mempolicy *pol;
1655 if (vma) {
1656 if (vma->vm_ops && vma->vm_ops->get_policy) {
1657 bool ret = false;
1658 1646
1659 pol = vma->vm_ops->get_policy(vma, vma->vm_start); 1647 if (vma->vm_ops && vma->vm_ops->get_policy) {
1660 if (pol && (pol->flags & MPOL_F_MOF)) 1648 bool ret = false;
1661 ret = true;
1662 mpol_cond_put(pol);
1663 1649
1664 return ret; 1650 pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1665 } else if (vma->vm_policy) { 1651 if (pol && (pol->flags & MPOL_F_MOF))
1666 pol = vma->vm_policy; 1652 ret = true;
1667 } 1653 mpol_cond_put(pol);
1654
1655 return ret;
1668 } 1656 }
1669 1657
1658 pol = vma->vm_policy;
1670 if (!pol) 1659 if (!pol)
1671 return default_policy.flags & MPOL_F_MOF; 1660 pol = get_task_policy(current);
1672 1661
1673 return pol->flags & MPOL_F_MOF; 1662 return pol->flags & MPOL_F_MOF;
1674} 1663}
@@ -1874,7 +1863,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1874{ 1863{
1875 struct zonelist *zl; 1864 struct zonelist *zl;
1876 1865
1877 *mpol = get_vma_policy(current, vma, addr); 1866 *mpol = get_vma_policy(vma, addr);
1878 *nodemask = NULL; /* assume !MPOL_BIND */ 1867 *nodemask = NULL; /* assume !MPOL_BIND */
1879 1868
1880 if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) { 1869 if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
@@ -2029,7 +2018,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
2029 unsigned int cpuset_mems_cookie; 2018 unsigned int cpuset_mems_cookie;
2030 2019
2031retry_cpuset: 2020retry_cpuset:
2032 pol = get_vma_policy(current, vma, addr); 2021 pol = get_vma_policy(vma, addr);
2033 cpuset_mems_cookie = read_mems_allowed_begin(); 2022 cpuset_mems_cookie = read_mems_allowed_begin();
2034 2023
2035 if (unlikely(pol->mode == MPOL_INTERLEAVE)) { 2024 if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
@@ -2046,8 +2035,7 @@ retry_cpuset:
2046 page = __alloc_pages_nodemask(gfp, order, 2035 page = __alloc_pages_nodemask(gfp, order,
2047 policy_zonelist(gfp, pol, node), 2036 policy_zonelist(gfp, pol, node),
2048 policy_nodemask(gfp, pol)); 2037 policy_nodemask(gfp, pol));
2049 if (unlikely(mpol_needs_cond_ref(pol))) 2038 mpol_cond_put(pol);
2050 __mpol_put(pol);
2051 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) 2039 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2052 goto retry_cpuset; 2040 goto retry_cpuset;
2053 return page; 2041 return page;
@@ -2074,12 +2062,12 @@ retry_cpuset:
2074 */ 2062 */
2075struct page *alloc_pages_current(gfp_t gfp, unsigned order) 2063struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2076{ 2064{
2077 struct mempolicy *pol = get_task_policy(current); 2065 struct mempolicy *pol = &default_policy;
2078 struct page *page; 2066 struct page *page;
2079 unsigned int cpuset_mems_cookie; 2067 unsigned int cpuset_mems_cookie;
2080 2068
2081 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) 2069 if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2082 pol = &default_policy; 2070 pol = get_task_policy(current);
2083 2071
2084retry_cpuset: 2072retry_cpuset:
2085 cpuset_mems_cookie = read_mems_allowed_begin(); 2073 cpuset_mems_cookie = read_mems_allowed_begin();
@@ -2296,7 +2284,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
2296 2284
2297 BUG_ON(!vma); 2285 BUG_ON(!vma);
2298 2286
2299 pol = get_vma_policy(current, vma, addr); 2287 pol = get_vma_policy(vma, addr);
2300 if (!(pol->flags & MPOL_F_MOF)) 2288 if (!(pol->flags & MPOL_F_MOF))
2301 goto out; 2289 goto out;
2302 2290
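
The mempolicy hunks above drop the task argument from get_vma_policy() and replace the open-coded mpol_needs_cond_ref()/__mpol_put() pair with mpol_cond_put(), which only drops a reference when the policy is flagged as shared. Below is a minimal standalone sketch of that conditional-put pattern; the struct, flag value and function names are illustrative stand-ins, not the kernel's mempolicy API.

    /* Conditional put: only shared (refcounted) policies are released. */
    #include <stdio.h>
    #include <stdlib.h>

    #define POL_F_SHARED 0x1            /* stand-in for MPOL_F_SHARED */

    struct policy {
        int flags;
        int refcnt;
    };

    static void pol_cond_put(struct policy *pol)
    {
        if (pol && (pol->flags & POL_F_SHARED) && --pol->refcnt == 0) {
            printf("freeing shared policy\n");
            free(pol);
        }
    }

    int main(void)
    {
        static struct policy task_pol;          /* task/default policy: never freed */
        struct policy *shared = malloc(sizeof(*shared));

        shared->flags = POL_F_SHARED;
        shared->refcnt = 1;

        pol_cond_put(&task_pol);        /* no-op: not marked shared */
        pol_cond_put(shared);           /* drops the extra reference */
        return 0;
    }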
diff --git a/mm/migrate.c b/mm/migrate.c
index 2740360cd216..01439953abf5 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -876,7 +876,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
876 } 876 }
877 } 877 }
878 878
879 if (unlikely(balloon_page_movable(page))) { 879 if (unlikely(isolated_balloon_page(page))) {
880 /* 880 /*
881 * A ballooned page does not need any special attention from 881 * A ballooned page does not need any special attention from
882 * physical to virtual reverse mapping procedures. 882 * physical to virtual reverse mapping procedures.
@@ -955,17 +955,6 @@ static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page,
955 955
956 rc = __unmap_and_move(page, newpage, force, mode); 956 rc = __unmap_and_move(page, newpage, force, mode);
957 957
958 if (unlikely(rc == MIGRATEPAGE_BALLOON_SUCCESS)) {
959 /*
960 * A ballooned page has been migrated already.
961 * Now, it's the time to wrap-up counters,
962 * handle the page back to Buddy and return.
963 */
964 dec_zone_page_state(page, NR_ISOLATED_ANON +
965 page_is_file_cache(page));
966 balloon_page_free(page);
967 return MIGRATEPAGE_SUCCESS;
968 }
969out: 958out:
970 if (rc != -EAGAIN) { 959 if (rc != -EAGAIN) {
971 /* 960 /*
@@ -988,6 +977,9 @@ out:
988 if (rc != MIGRATEPAGE_SUCCESS && put_new_page) { 977 if (rc != MIGRATEPAGE_SUCCESS && put_new_page) {
989 ClearPageSwapBacked(newpage); 978 ClearPageSwapBacked(newpage);
990 put_new_page(newpage, private); 979 put_new_page(newpage, private);
980 } else if (unlikely(__is_movable_balloon_page(newpage))) {
981 /* drop our reference, page already in the balloon */
982 put_page(newpage);
991 } else 983 } else
992 putback_lru_page(newpage); 984 putback_lru_page(newpage);
993 985
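
The migrate.c hunks remove the special MIGRATEPAGE_BALLOON_SUCCESS unwind and instead handle a balloon newpage in the common release path: a page obtained from the caller's callback goes back through put_new_page() on failure, a page that ended up owned by the balloon just loses the extra reference, and everything else returns to the LRU. A standalone sketch of that release decision, with made-up types and stub helpers standing in for the kernel functions:

    #include <stdbool.h>
    #include <stdio.h>

    enum page_kind { PAGE_LRU, PAGE_BALLOON };

    struct page { enum page_kind kind; };

    static void put_new_page(struct page *p)     { (void)p; printf("returned via allocator callback\n"); }
    static void put_page(struct page *p)         { (void)p; printf("dropped our reference (balloon owns it)\n"); }
    static void putback_lru_page(struct page *p) { (void)p; printf("put back on the LRU\n"); }

    /* Mirrors the cleanup order in the hunk above. */
    static void release_new_page(struct page *newpage, bool migration_failed,
                                 bool have_put_callback)
    {
        if (migration_failed && have_put_callback)
            put_new_page(newpage);
        else if (newpage->kind == PAGE_BALLOON)
            put_page(newpage);
        else
            putback_lru_page(newpage);
    }

    int main(void)
    {
        struct page balloon = { PAGE_BALLOON };
        struct page regular = { PAGE_LRU };

        release_new_page(&balloon, false, false);
        release_new_page(&regular, true, false);   /* failed, but no callback */
        return 0;
    }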
diff --git a/mm/mlock.c b/mm/mlock.c
index ce84cb0b83ef..03aa8512723b 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -233,9 +233,9 @@ long __mlock_vma_pages_range(struct vm_area_struct *vma,
233 233
234 VM_BUG_ON(start & ~PAGE_MASK); 234 VM_BUG_ON(start & ~PAGE_MASK);
235 VM_BUG_ON(end & ~PAGE_MASK); 235 VM_BUG_ON(end & ~PAGE_MASK);
236 VM_BUG_ON(start < vma->vm_start); 236 VM_BUG_ON_VMA(start < vma->vm_start, vma);
237 VM_BUG_ON(end > vma->vm_end); 237 VM_BUG_ON_VMA(end > vma->vm_end, vma);
238 VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); 238 VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
239 239
240 gup_flags = FOLL_TOUCH | FOLL_MLOCK; 240 gup_flags = FOLL_TOUCH | FOLL_MLOCK;
241 /* 241 /*
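
The VM_BUG_ON() → VM_BUG_ON_VMA()/VM_BUG_ON_MM() conversions seen here and in later hunks make a failed assertion dump the VMA or mm it was checking before panicking. A standalone sketch of the same macro shape — check a condition, dump the object, then abort — using an invented range struct rather than the kernel's debug helpers:

    #include <stdio.h>
    #include <stdlib.h>

    struct range {
        unsigned long start;
        unsigned long end;
    };

    static void dump_range(const struct range *r)
    {
        fprintf(stderr, "range %p: start=%#lx end=%#lx\n",
                (const void *)r, r->start, r->end);
    }

    /* Like VM_BUG_ON_VMA(cond, vma): dump state, then report and abort. */
    #define BUG_ON_RANGE(cond, r)                                 \
        do {                                                      \
            if (cond) {                                           \
                dump_range(r);                                    \
                fprintf(stderr, "BUG at %s:%d: %s\n",             \
                        __FILE__, __LINE__, #cond);               \
                abort();                                          \
            }                                                     \
        } while (0)

    int main(void)
    {
        struct range r = { 0x1000, 0x2000 };

        BUG_ON_RANGE(r.start > r.end, &r);   /* holds, so nothing happens */
        printf("range sanity checks passed\n");
        return 0;
    }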
diff --git a/mm/mmap.c b/mm/mmap.c
index c0a3637cdb64..16d19b48e2ad 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -70,7 +70,7 @@ static void unmap_region(struct mm_struct *mm,
70 * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes 70 * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes
71 * w: (no) no w: (no) no w: (yes) yes w: (no) no 71 * w: (no) no w: (no) no w: (yes) yes w: (no) no
72 * x: (no) no x: (no) yes x: (no) yes x: (yes) yes 72 * x: (no) no x: (no) yes x: (no) yes x: (yes) yes
73 * 73 *
74 * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes 74 * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes
75 * w: (no) no w: (no) no w: (copy) copy w: (no) no 75 * w: (no) no w: (no) no w: (copy) copy w: (no) no
76 * x: (no) no x: (no) yes x: (no) yes x: (yes) yes 76 * x: (no) no x: (no) yes x: (no) yes x: (yes) yes
@@ -268,7 +268,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len);
268 268
269SYSCALL_DEFINE1(brk, unsigned long, brk) 269SYSCALL_DEFINE1(brk, unsigned long, brk)
270{ 270{
271 unsigned long rlim, retval; 271 unsigned long retval;
272 unsigned long newbrk, oldbrk; 272 unsigned long newbrk, oldbrk;
273 struct mm_struct *mm = current->mm; 273 struct mm_struct *mm = current->mm;
274 unsigned long min_brk; 274 unsigned long min_brk;
@@ -298,9 +298,8 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
298 * segment grow beyond its set limit in the case where the limit is 298
299 * not page aligned -Ram Gupta 299 * not page aligned -Ram Gupta
300 */ 300 */
301 rlim = rlimit(RLIMIT_DATA); 301 if (check_data_rlimit(rlimit(RLIMIT_DATA), brk, mm->start_brk,
302 if (rlim < RLIM_INFINITY && (brk - mm->start_brk) + 302 mm->end_data, mm->start_data))
303 (mm->end_data - mm->start_data) > rlim)
304 goto out; 303 goto out;
305 304
306 newbrk = PAGE_ALIGN(brk); 305 newbrk = PAGE_ALIGN(brk);
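
The brk() hunk above folds the open-coded RLIMIT_DATA comparison into a check_data_rlimit() helper. The call's argument order is taken from the diff; the body below is a plausible standalone reconstruction of what such a helper does (it is not copied from the kernel), returning non-zero when the combined brk and static-data size would exceed the limit:

    #include <stdio.h>

    #define RLIM_INFINITY (~0UL)

    static int check_data_rlimit(unsigned long rlim,
                                 unsigned long new_brk, unsigned long start_brk,
                                 unsigned long end_data, unsigned long start_data)
    {
        if (rlim < RLIM_INFINITY &&
            (new_brk - start_brk) + (end_data - start_data) > rlim)
            return -1;
        return 0;
    }

    int main(void)
    {
        /* 64 KiB limit, 48 KiB of heap requested on top of 32 KiB of data. */
        unsigned long start_brk = 0x100000, start_data = 0x80000;

        if (check_data_rlimit(64 * 1024, start_brk + 48 * 1024, start_brk,
                              start_data + 32 * 1024, start_data))
            printf("brk request rejected by RLIMIT_DATA\n");
        return 0;
    }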
@@ -369,16 +368,18 @@ static int browse_rb(struct rb_root *root)
369 struct vm_area_struct *vma; 368 struct vm_area_struct *vma;
370 vma = rb_entry(nd, struct vm_area_struct, vm_rb); 369 vma = rb_entry(nd, struct vm_area_struct, vm_rb);
371 if (vma->vm_start < prev) { 370 if (vma->vm_start < prev) {
372 pr_emerg("vm_start %lx prev %lx\n", vma->vm_start, prev); 371 pr_emerg("vm_start %lx < prev %lx\n",
372 vma->vm_start, prev);
373 bug = 1; 373 bug = 1;
374 } 374 }
375 if (vma->vm_start < pend) { 375 if (vma->vm_start < pend) {
376 pr_emerg("vm_start %lx pend %lx\n", vma->vm_start, pend); 376 pr_emerg("vm_start %lx < pend %lx\n",
377 vma->vm_start, pend);
377 bug = 1; 378 bug = 1;
378 } 379 }
379 if (vma->vm_start > vma->vm_end) { 380 if (vma->vm_start > vma->vm_end) {
380 pr_emerg("vm_end %lx < vm_start %lx\n", 381 pr_emerg("vm_start %lx > vm_end %lx\n",
381 vma->vm_end, vma->vm_start); 382 vma->vm_start, vma->vm_end);
382 bug = 1; 383 bug = 1;
383 } 384 }
384 if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) { 385 if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
@@ -409,8 +410,9 @@ static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore)
409 for (nd = rb_first(root); nd; nd = rb_next(nd)) { 410 for (nd = rb_first(root); nd; nd = rb_next(nd)) {
410 struct vm_area_struct *vma; 411 struct vm_area_struct *vma;
411 vma = rb_entry(nd, struct vm_area_struct, vm_rb); 412 vma = rb_entry(nd, struct vm_area_struct, vm_rb);
412 BUG_ON(vma != ignore && 413 VM_BUG_ON_VMA(vma != ignore &&
413 vma->rb_subtree_gap != vma_compute_subtree_gap(vma)); 414 vma->rb_subtree_gap != vma_compute_subtree_gap(vma),
415 vma);
414 } 416 }
415} 417}
416 418
@@ -420,8 +422,10 @@ static void validate_mm(struct mm_struct *mm)
420 int i = 0; 422 int i = 0;
421 unsigned long highest_address = 0; 423 unsigned long highest_address = 0;
422 struct vm_area_struct *vma = mm->mmap; 424 struct vm_area_struct *vma = mm->mmap;
425
423 while (vma) { 426 while (vma) {
424 struct anon_vma_chain *avc; 427 struct anon_vma_chain *avc;
428
425 vma_lock_anon_vma(vma); 429 vma_lock_anon_vma(vma);
426 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 430 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
427 anon_vma_interval_tree_verify(avc); 431 anon_vma_interval_tree_verify(avc);
@@ -436,15 +440,16 @@ static void validate_mm(struct mm_struct *mm)
436 } 440 }
437 if (highest_address != mm->highest_vm_end) { 441 if (highest_address != mm->highest_vm_end) {
438 pr_emerg("mm->highest_vm_end %lx, found %lx\n", 442 pr_emerg("mm->highest_vm_end %lx, found %lx\n",
439 mm->highest_vm_end, highest_address); 443 mm->highest_vm_end, highest_address);
440 bug = 1; 444 bug = 1;
441 } 445 }
442 i = browse_rb(&mm->mm_rb); 446 i = browse_rb(&mm->mm_rb);
443 if (i != mm->map_count) { 447 if (i != mm->map_count) {
444 pr_emerg("map_count %d rb %d\n", mm->map_count, i); 448 if (i != -1)
449 pr_emerg("map_count %d rb %d\n", mm->map_count, i);
445 bug = 1; 450 bug = 1;
446 } 451 }
447 BUG_ON(bug); 452 VM_BUG_ON_MM(bug, mm);
448} 453}
449#else 454#else
450#define validate_mm_rb(root, ignore) do { } while (0) 455#define validate_mm_rb(root, ignore) do { } while (0)
@@ -741,7 +746,7 @@ again: remove_next = 1 + (end > next->vm_end);
741 * split_vma inserting another: so it must be 746 * split_vma inserting another: so it must be
742 * mprotect case 4 shifting the boundary down. 747 * mprotect case 4 shifting the boundary down.
743 */ 748 */
744 adjust_next = - ((vma->vm_end - end) >> PAGE_SHIFT); 749 adjust_next = -((vma->vm_end - end) >> PAGE_SHIFT);
745 exporter = vma; 750 exporter = vma;
746 importer = next; 751 importer = next;
747 } 752 }
@@ -787,8 +792,8 @@ again: remove_next = 1 + (end > next->vm_end);
787 if (!anon_vma && adjust_next) 792 if (!anon_vma && adjust_next)
788 anon_vma = next->anon_vma; 793 anon_vma = next->anon_vma;
789 if (anon_vma) { 794 if (anon_vma) {
790 VM_BUG_ON(adjust_next && next->anon_vma && 795 VM_BUG_ON_VMA(adjust_next && next->anon_vma &&
791 anon_vma != next->anon_vma); 796 anon_vma != next->anon_vma, next);
792 anon_vma_lock_write(anon_vma); 797 anon_vma_lock_write(anon_vma);
793 anon_vma_interval_tree_pre_update_vma(vma); 798 anon_vma_interval_tree_pre_update_vma(vma);
794 if (adjust_next) 799 if (adjust_next)
@@ -1010,7 +1015,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
1010struct vm_area_struct *vma_merge(struct mm_struct *mm, 1015struct vm_area_struct *vma_merge(struct mm_struct *mm,
1011 struct vm_area_struct *prev, unsigned long addr, 1016 struct vm_area_struct *prev, unsigned long addr,
1012 unsigned long end, unsigned long vm_flags, 1017 unsigned long end, unsigned long vm_flags,
1013 struct anon_vma *anon_vma, struct file *file, 1018 struct anon_vma *anon_vma, struct file *file,
1014 pgoff_t pgoff, struct mempolicy *policy) 1019 pgoff_t pgoff, struct mempolicy *policy)
1015{ 1020{
1016 pgoff_t pglen = (end - addr) >> PAGE_SHIFT; 1021 pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
@@ -1036,7 +1041,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
1036 * Can it merge with the predecessor? 1041 * Can it merge with the predecessor?
1037 */ 1042 */
1038 if (prev && prev->vm_end == addr && 1043 if (prev && prev->vm_end == addr &&
1039 mpol_equal(vma_policy(prev), policy) && 1044 mpol_equal(vma_policy(prev), policy) &&
1040 can_vma_merge_after(prev, vm_flags, 1045 can_vma_merge_after(prev, vm_flags,
1041 anon_vma, file, pgoff)) { 1046 anon_vma, file, pgoff)) {
1042 /* 1047 /*
@@ -1064,7 +1069,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
1064 * Can this new request be merged in front of next? 1069 * Can this new request be merged in front of next?
1065 */ 1070 */
1066 if (next && end == next->vm_start && 1071 if (next && end == next->vm_start &&
1067 mpol_equal(policy, vma_policy(next)) && 1072 mpol_equal(policy, vma_policy(next)) &&
1068 can_vma_merge_before(next, vm_flags, 1073 can_vma_merge_before(next, vm_flags,
1069 anon_vma, file, pgoff+pglen)) { 1074 anon_vma, file, pgoff+pglen)) {
1070 if (prev && addr < prev->vm_end) /* case 4 */ 1075 if (prev && addr < prev->vm_end) /* case 4 */
@@ -1235,7 +1240,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1235 unsigned long flags, unsigned long pgoff, 1240 unsigned long flags, unsigned long pgoff,
1236 unsigned long *populate) 1241 unsigned long *populate)
1237{ 1242{
1238 struct mm_struct * mm = current->mm; 1243 struct mm_struct *mm = current->mm;
1239 vm_flags_t vm_flags; 1244 vm_flags_t vm_flags;
1240 1245
1241 *populate = 0; 1246 *populate = 0;
@@ -1263,7 +1268,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1263 1268
1264 /* offset overflow? */ 1269 /* offset overflow? */
1265 if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) 1270 if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
1266 return -EOVERFLOW; 1271 return -EOVERFLOW;
1267 1272
1268 /* Too many mappings? */ 1273 /* Too many mappings? */
1269 if (mm->map_count > sysctl_max_map_count) 1274 if (mm->map_count > sysctl_max_map_count)
@@ -1921,7 +1926,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
1921 info.align_mask = 0; 1926 info.align_mask = 0;
1922 return vm_unmapped_area(&info); 1927 return vm_unmapped_area(&info);
1923} 1928}
1924#endif 1929#endif
1925 1930
1926/* 1931/*
1927 * This mmap-allocator allocates new areas top-down from below the 1932 * This mmap-allocator allocates new areas top-down from below the
@@ -2321,13 +2326,13 @@ int expand_stack(struct vm_area_struct *vma, unsigned long address)
2321} 2326}
2322 2327
2323struct vm_area_struct * 2328struct vm_area_struct *
2324find_extend_vma(struct mm_struct * mm, unsigned long addr) 2329find_extend_vma(struct mm_struct *mm, unsigned long addr)
2325{ 2330{
2326 struct vm_area_struct * vma; 2331 struct vm_area_struct *vma;
2327 unsigned long start; 2332 unsigned long start;
2328 2333
2329 addr &= PAGE_MASK; 2334 addr &= PAGE_MASK;
2330 vma = find_vma(mm,addr); 2335 vma = find_vma(mm, addr);
2331 if (!vma) 2336 if (!vma)
2332 return NULL; 2337 return NULL;
2333 if (vma->vm_start <= addr) 2338 if (vma->vm_start <= addr)
@@ -2376,7 +2381,7 @@ static void unmap_region(struct mm_struct *mm,
2376 struct vm_area_struct *vma, struct vm_area_struct *prev, 2381 struct vm_area_struct *vma, struct vm_area_struct *prev,
2377 unsigned long start, unsigned long end) 2382 unsigned long start, unsigned long end)
2378{ 2383{
2379 struct vm_area_struct *next = prev? prev->vm_next: mm->mmap; 2384 struct vm_area_struct *next = prev ? prev->vm_next : mm->mmap;
2380 struct mmu_gather tlb; 2385 struct mmu_gather tlb;
2381 2386
2382 lru_add_drain(); 2387 lru_add_drain();
@@ -2423,7 +2428,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
2423 * __split_vma() bypasses sysctl_max_map_count checking. We use this on the 2428 * __split_vma() bypasses sysctl_max_map_count checking. We use this on the
2424 * munmap path where it doesn't make sense to fail. 2429 * munmap path where it doesn't make sense to fail.
2425 */ 2430 */
2426static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, 2431static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
2427 unsigned long addr, int new_below) 2432 unsigned long addr, int new_below)
2428{ 2433{
2429 struct vm_area_struct *new; 2434 struct vm_area_struct *new;
@@ -2512,7 +2517,8 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
2512 if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start) 2517 if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start)
2513 return -EINVAL; 2518 return -EINVAL;
2514 2519
2515 if ((len = PAGE_ALIGN(len)) == 0) 2520 len = PAGE_ALIGN(len);
2521 if (len == 0)
2516 return -EINVAL; 2522 return -EINVAL;
2517 2523
2518 /* Find the first overlapping VMA */ 2524 /* Find the first overlapping VMA */
@@ -2558,7 +2564,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
2558 if (error) 2564 if (error)
2559 return error; 2565 return error;
2560 } 2566 }
2561 vma = prev? prev->vm_next: mm->mmap; 2567 vma = prev ? prev->vm_next : mm->mmap;
2562 2568
2563 /* 2569 /*
2564 * unlock any mlock()ed ranges before detaching vmas 2570 * unlock any mlock()ed ranges before detaching vmas
@@ -2621,10 +2627,10 @@ static inline void verify_mm_writelocked(struct mm_struct *mm)
2621 */ 2627 */
2622static unsigned long do_brk(unsigned long addr, unsigned long len) 2628static unsigned long do_brk(unsigned long addr, unsigned long len)
2623{ 2629{
2624 struct mm_struct * mm = current->mm; 2630 struct mm_struct *mm = current->mm;
2625 struct vm_area_struct * vma, * prev; 2631 struct vm_area_struct *vma, *prev;
2626 unsigned long flags; 2632 unsigned long flags;
2627 struct rb_node ** rb_link, * rb_parent; 2633 struct rb_node **rb_link, *rb_parent;
2628 pgoff_t pgoff = addr >> PAGE_SHIFT; 2634 pgoff_t pgoff = addr >> PAGE_SHIFT;
2629 int error; 2635 int error;
2630 2636
@@ -2848,7 +2854,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2848 * safe. It is only safe to keep the vm_pgoff 2854 * safe. It is only safe to keep the vm_pgoff
2849 * linear if there are no pages mapped yet. 2855 * linear if there are no pages mapped yet.
2850 */ 2856 */
2851 VM_BUG_ON(faulted_in_anon_vma); 2857 VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma);
2852 *vmap = vma = new_vma; 2858 *vmap = vma = new_vma;
2853 } 2859 }
2854 *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); 2860 *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
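
One small but easy-to-miss change in the do_munmap() hunk above splits the old `if ((len = PAGE_ALIGN(len)) == 0)` into two statements. The zero check matters because rounding a length near ULONG_MAX up to a page boundary wraps to zero. A standalone demonstration, with PAGE_SIZE fixed at 4096 purely for the example:

    #include <stdio.h>
    #include <limits.h>

    #define PAGE_SIZE     4096UL
    #define PAGE_MASK     (~(PAGE_SIZE - 1))
    #define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & PAGE_MASK)

    int main(void)
    {
        unsigned long ok   = 5000;              /* rounds up to 8192 */
        unsigned long huge = ULONG_MAX - 100;   /* rounding wraps to 0 */

        printf("PAGE_ALIGN(%lu) = %lu\n", ok, PAGE_ALIGN(ok));
        printf("PAGE_ALIGN(%lu) = %lu\n", huge, PAGE_ALIGN(huge));
        return 0;
    }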
diff --git a/mm/mremap.c b/mm/mremap.c
index 05f1180e9f21..b147f66f4c40 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -21,8 +21,8 @@
21#include <linux/syscalls.h> 21#include <linux/syscalls.h>
22#include <linux/mmu_notifier.h> 22#include <linux/mmu_notifier.h>
23#include <linux/sched/sysctl.h> 23#include <linux/sched/sysctl.h>
24#include <linux/uaccess.h>
24 25
25#include <asm/uaccess.h>
26#include <asm/cacheflush.h> 26#include <asm/cacheflush.h>
27#include <asm/tlbflush.h> 27#include <asm/tlbflush.h>
28 28
@@ -195,7 +195,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
195 if (pmd_trans_huge(*old_pmd)) { 195 if (pmd_trans_huge(*old_pmd)) {
196 int err = 0; 196 int err = 0;
197 if (extent == HPAGE_PMD_SIZE) { 197 if (extent == HPAGE_PMD_SIZE) {
198 VM_BUG_ON(vma->vm_file || !vma->anon_vma); 198 VM_BUG_ON_VMA(vma->vm_file || !vma->anon_vma,
199 vma);
199 /* See comment in move_ptes() */ 200 /* See comment in move_ptes() */
200 if (need_rmap_locks) 201 if (need_rmap_locks)
201 anon_vma_lock_write(vma->anon_vma); 202 anon_vma_lock_write(vma->anon_vma);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 1e11df8fa7ec..bbf405a3a18f 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -565,7 +565,7 @@ bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_mask)
565 565
566 spin_lock(&zone_scan_lock); 566 spin_lock(&zone_scan_lock);
567 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) 567 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask))
568 if (zone_is_oom_locked(zone)) { 568 if (test_bit(ZONE_OOM_LOCKED, &zone->flags)) {
569 ret = false; 569 ret = false;
570 goto out; 570 goto out;
571 } 571 }
@@ -575,7 +575,7 @@ bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_mask)
575 * call to oom_zonelist_trylock() doesn't succeed when it shouldn't. 575 * call to oom_zonelist_trylock() doesn't succeed when it shouldn't.
576 */ 576 */
577 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) 577 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask))
578 zone_set_flag(zone, ZONE_OOM_LOCKED); 578 set_bit(ZONE_OOM_LOCKED, &zone->flags);
579 579
580out: 580out:
581 spin_unlock(&zone_scan_lock); 581 spin_unlock(&zone_scan_lock);
@@ -594,7 +594,7 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask)
594 594
595 spin_lock(&zone_scan_lock); 595 spin_lock(&zone_scan_lock);
596 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) 596 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask))
597 zone_clear_flag(zone, ZONE_OOM_LOCKED); 597 clear_bit(ZONE_OOM_LOCKED, &zone->flags);
598 spin_unlock(&zone_scan_lock); 598 spin_unlock(&zone_scan_lock);
599} 599}
600 600
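
The oom_kill.c hunks replace the zone_set_flag()/zone_clear_flag()/zone_is_oom_locked() wrappers with plain set_bit()/clear_bit()/test_bit() on zone->flags. The kernel bit operations are atomic; the standalone sketch below mirrors that with C11 atomics on a flags word (the struct and enum names are illustrative only):

    #include <stdatomic.h>
    #include <stdio.h>

    enum zone_flags { ZONE_OOM_LOCKED, ZONE_FAIR_DEPLETED };

    struct zone { atomic_ulong flags; };

    static void set_flag(struct zone *z, int bit)
    {
        atomic_fetch_or(&z->flags, 1UL << bit);
    }

    static void clear_flag(struct zone *z, int bit)
    {
        atomic_fetch_and(&z->flags, ~(1UL << bit));
    }

    static int test_flag(struct zone *z, int bit)
    {
        return (atomic_load(&z->flags) >> bit) & 1;
    }

    int main(void)
    {
        struct zone zone = { 0 };

        set_flag(&zone, ZONE_OOM_LOCKED);
        printf("oom locked: %d\n", test_flag(&zone, ZONE_OOM_LOCKED));
        clear_flag(&zone, ZONE_OOM_LOCKED);
        printf("oom locked: %d\n", test_flag(&zone, ZONE_OOM_LOCKED));
        return 0;
    }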
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 91d73ef1744d..35ca7102d421 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1075,13 +1075,13 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
1075 } 1075 }
1076 1076
1077 if (dirty < setpoint) { 1077 if (dirty < setpoint) {
1078 x = min(bdi->balanced_dirty_ratelimit, 1078 x = min3(bdi->balanced_dirty_ratelimit,
1079 min(balanced_dirty_ratelimit, task_ratelimit)); 1079 balanced_dirty_ratelimit, task_ratelimit);
1080 if (dirty_ratelimit < x) 1080 if (dirty_ratelimit < x)
1081 step = x - dirty_ratelimit; 1081 step = x - dirty_ratelimit;
1082 } else { 1082 } else {
1083 x = max(bdi->balanced_dirty_ratelimit, 1083 x = max3(bdi->balanced_dirty_ratelimit,
1084 max(balanced_dirty_ratelimit, task_ratelimit)); 1084 balanced_dirty_ratelimit, task_ratelimit);
1085 if (dirty_ratelimit > x) 1085 if (dirty_ratelimit > x)
1086 step = dirty_ratelimit - x; 1086 step = dirty_ratelimit - x;
1087 } 1087 }
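
The page-writeback.c hunk simply swaps nested min()/max() calls for the three-way min3()/max3() helpers. For readers unfamiliar with them, a trivial standalone equivalent (the kernel macros additionally type-check their arguments, which this demo version does not):

    #include <stdio.h>

    #define min(a, b)     ((a) < (b) ? (a) : (b))
    #define max(a, b)     ((a) > (b) ? (a) : (b))
    #define min3(a, b, c) min(min(a, b), c)
    #define max3(a, b, c) max(max(a, b), c)

    int main(void)
    {
        printf("min3(7,3,5) = %d, max3(7,3,5) = %d\n",
               min3(7, 3, 5), max3(7, 3, 5));
        return 0;
    }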
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index eee961958021..c9710c9bbee2 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -53,8 +53,6 @@
53#include <linux/kmemleak.h> 53#include <linux/kmemleak.h>
54#include <linux/compaction.h> 54#include <linux/compaction.h>
55#include <trace/events/kmem.h> 55#include <trace/events/kmem.h>
56#include <linux/ftrace_event.h>
57#include <linux/memcontrol.h>
58#include <linux/prefetch.h> 56#include <linux/prefetch.h>
59#include <linux/mm_inline.h> 57#include <linux/mm_inline.h>
60#include <linux/migrate.h> 58#include <linux/migrate.h>
@@ -85,6 +83,7 @@ EXPORT_PER_CPU_SYMBOL(numa_node);
85 */ 83 */
86DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ 84DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */
87EXPORT_PER_CPU_SYMBOL(_numa_mem_); 85EXPORT_PER_CPU_SYMBOL(_numa_mem_);
86int _node_numa_mem_[MAX_NUMNODES];
88#endif 87#endif
89 88
90/* 89/*
@@ -1014,7 +1013,7 @@ int move_freepages(struct zone *zone,
1014 * Remove at a later date when no bug reports exist related to 1013 * Remove at a later date when no bug reports exist related to
1015 * grouping pages by mobility 1014 * grouping pages by mobility
1016 */ 1015 */
1017 BUG_ON(page_zone(start_page) != page_zone(end_page)); 1016 VM_BUG_ON(page_zone(start_page) != page_zone(end_page));
1018#endif 1017#endif
1019 1018
1020 for (page = start_page; page <= end_page;) { 1019 for (page = start_page; page <= end_page;) {
@@ -1613,8 +1612,8 @@ again:
1613 1612
1614 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); 1613 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
1615 if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 && 1614 if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 &&
1616 !zone_is_fair_depleted(zone)) 1615 !test_bit(ZONE_FAIR_DEPLETED, &zone->flags))
1617 zone_set_flag(zone, ZONE_FAIR_DEPLETED); 1616 set_bit(ZONE_FAIR_DEPLETED, &zone->flags);
1618 1617
1619 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1618 __count_zone_vm_events(PGALLOC, zone, 1 << order);
1620 zone_statistics(preferred_zone, zone, gfp_flags); 1619 zone_statistics(preferred_zone, zone, gfp_flags);
@@ -1934,7 +1933,7 @@ static void reset_alloc_batches(struct zone *preferred_zone)
1934 mod_zone_page_state(zone, NR_ALLOC_BATCH, 1933 mod_zone_page_state(zone, NR_ALLOC_BATCH,
1935 high_wmark_pages(zone) - low_wmark_pages(zone) - 1934 high_wmark_pages(zone) - low_wmark_pages(zone) -
1936 atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); 1935 atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
1937 zone_clear_flag(zone, ZONE_FAIR_DEPLETED); 1936 clear_bit(ZONE_FAIR_DEPLETED, &zone->flags);
1938 } while (zone++ != preferred_zone); 1937 } while (zone++ != preferred_zone);
1939} 1938}
1940 1939
@@ -1985,7 +1984,7 @@ zonelist_scan:
1985 if (alloc_flags & ALLOC_FAIR) { 1984 if (alloc_flags & ALLOC_FAIR) {
1986 if (!zone_local(preferred_zone, zone)) 1985 if (!zone_local(preferred_zone, zone))
1987 break; 1986 break;
1988 if (zone_is_fair_depleted(zone)) { 1987 if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) {
1989 nr_fair_skipped++; 1988 nr_fair_skipped++;
1990 continue; 1989 continue;
1991 } 1990 }
@@ -2296,58 +2295,72 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2296 struct zonelist *zonelist, enum zone_type high_zoneidx, 2295 struct zonelist *zonelist, enum zone_type high_zoneidx,
2297 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2296 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2298 int classzone_idx, int migratetype, enum migrate_mode mode, 2297 int classzone_idx, int migratetype, enum migrate_mode mode,
2299 bool *contended_compaction, bool *deferred_compaction, 2298 int *contended_compaction, bool *deferred_compaction)
2300 unsigned long *did_some_progress)
2301{ 2299{
2302 if (!order) 2300 struct zone *last_compact_zone = NULL;
2303 return NULL; 2301 unsigned long compact_result;
2302 struct page *page;
2304 2303
2305 if (compaction_deferred(preferred_zone, order)) { 2304 if (!order)
2306 *deferred_compaction = true;
2307 return NULL; 2305 return NULL;
2308 }
2309 2306
2310 current->flags |= PF_MEMALLOC; 2307 current->flags |= PF_MEMALLOC;
2311 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, 2308 compact_result = try_to_compact_pages(zonelist, order, gfp_mask,
2312 nodemask, mode, 2309 nodemask, mode,
2313 contended_compaction); 2310 contended_compaction,
2311 &last_compact_zone);
2314 current->flags &= ~PF_MEMALLOC; 2312 current->flags &= ~PF_MEMALLOC;
2315 2313
2316 if (*did_some_progress != COMPACT_SKIPPED) { 2314 switch (compact_result) {
2317 struct page *page; 2315 case COMPACT_DEFERRED:
2316 *deferred_compaction = true;
2317 /* fall-through */
2318 case COMPACT_SKIPPED:
2319 return NULL;
2320 default:
2321 break;
2322 }
2318 2323
2319 /* Page migration frees to the PCP lists but we want merging */ 2324 /*
2320 drain_pages(get_cpu()); 2325 * At least in one zone compaction wasn't deferred or skipped, so let's
2321 put_cpu(); 2326 * count a compaction stall
2327 */
2328 count_vm_event(COMPACTSTALL);
2322 2329
2323 page = get_page_from_freelist(gfp_mask, nodemask, 2330 /* Page migration frees to the PCP lists but we want merging */
2324 order, zonelist, high_zoneidx, 2331 drain_pages(get_cpu());
2325 alloc_flags & ~ALLOC_NO_WATERMARKS, 2332 put_cpu();
2326 preferred_zone, classzone_idx, migratetype);
2327 if (page) {
2328 preferred_zone->compact_blockskip_flush = false;
2329 compaction_defer_reset(preferred_zone, order, true);
2330 count_vm_event(COMPACTSUCCESS);
2331 return page;
2332 }
2333 2333
2334 /* 2334 page = get_page_from_freelist(gfp_mask, nodemask,
2335 * It's bad if compaction run occurs and fails. 2335 order, zonelist, high_zoneidx,
2336 * The most likely reason is that pages exist, 2336 alloc_flags & ~ALLOC_NO_WATERMARKS,
2337 * but not enough to satisfy watermarks. 2337 preferred_zone, classzone_idx, migratetype);
2338 */
2339 count_vm_event(COMPACTFAIL);
2340 2338
2341 /* 2339 if (page) {
2342 * As async compaction considers a subset of pageblocks, only 2340 struct zone *zone = page_zone(page);
2343 * defer if the failure was a sync compaction failure.
2344 */
2345 if (mode != MIGRATE_ASYNC)
2346 defer_compaction(preferred_zone, order);
2347 2341
2348 cond_resched(); 2342 zone->compact_blockskip_flush = false;
2343 compaction_defer_reset(zone, order, true);
2344 count_vm_event(COMPACTSUCCESS);
2345 return page;
2349 } 2346 }
2350 2347
2348 /*
2349 * last_compact_zone is where try_to_compact_pages thought allocation
2350 * should succeed, so it did not defer compaction. But here we know
2351 * that it didn't succeed, so we do the defer.
2352 */
2353 if (last_compact_zone && mode != MIGRATE_ASYNC)
2354 defer_compaction(last_compact_zone, order);
2355
2356 /*
2357 * It's bad if compaction run occurs and fails. The most likely reason
2358 * is that pages exist, but not enough to satisfy watermarks.
2359 */
2360 count_vm_event(COMPACTFAIL);
2361
2362 cond_resched();
2363
2351 return NULL; 2364 return NULL;
2352} 2365}
2353#else 2366#else
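
__alloc_pages_direct_compact() above now switches on an explicit result code from try_to_compact_pages() instead of inferring progress from did_some_progress, so deferred and skipped compaction bail out before a stall is counted. A standalone sketch of that control flow; the enum values and helper are stand-ins, not the kernel's compact_result definitions:

    #include <stdbool.h>
    #include <stdio.h>

    enum compact_result { COMPACT_DEFERRED, COMPACT_SKIPPED, COMPACT_PARTIAL };

    /* Returns true when the caller should go on to try the freelists. */
    static bool compaction_made_progress(enum compact_result result,
                                         bool *deferred)
    {
        switch (result) {
        case COMPACT_DEFERRED:
            *deferred = true;
            /* fall through */
        case COMPACT_SKIPPED:
            return false;
        default:
            return true;        /* count a stall and attempt the allocation */
        }
    }

    int main(void)
    {
        bool deferred = false;
        bool progress = compaction_made_progress(COMPACT_DEFERRED, &deferred);

        printf("deferred -> progress=%d (deferred flag=%d)\n", progress, deferred);
        printf("partial  -> progress=%d\n",
               compaction_made_progress(COMPACT_PARTIAL, &deferred));
        return 0;
    }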
@@ -2355,9 +2368,8 @@ static inline struct page *
2355__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2368__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2356 struct zonelist *zonelist, enum zone_type high_zoneidx, 2369 struct zonelist *zonelist, enum zone_type high_zoneidx,
2357 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2370 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2358 int classzone_idx, int migratetype, 2371 int classzone_idx, int migratetype, enum migrate_mode mode,
2359 enum migrate_mode mode, bool *contended_compaction, 2372 int *contended_compaction, bool *deferred_compaction)
2360 bool *deferred_compaction, unsigned long *did_some_progress)
2361{ 2373{
2362 return NULL; 2374 return NULL;
2363} 2375}
@@ -2457,12 +2469,14 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
2457static void wake_all_kswapds(unsigned int order, 2469static void wake_all_kswapds(unsigned int order,
2458 struct zonelist *zonelist, 2470 struct zonelist *zonelist,
2459 enum zone_type high_zoneidx, 2471 enum zone_type high_zoneidx,
2460 struct zone *preferred_zone) 2472 struct zone *preferred_zone,
2473 nodemask_t *nodemask)
2461{ 2474{
2462 struct zoneref *z; 2475 struct zoneref *z;
2463 struct zone *zone; 2476 struct zone *zone;
2464 2477
2465 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) 2478 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2479 high_zoneidx, nodemask)
2466 wakeup_kswapd(zone, order, zone_idx(preferred_zone)); 2480 wakeup_kswapd(zone, order, zone_idx(preferred_zone));
2467} 2481}
2468 2482
@@ -2509,7 +2523,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
2509 alloc_flags |= ALLOC_NO_WATERMARKS; 2523 alloc_flags |= ALLOC_NO_WATERMARKS;
2510 } 2524 }
2511#ifdef CONFIG_CMA 2525#ifdef CONFIG_CMA
2512 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) 2526 if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
2513 alloc_flags |= ALLOC_CMA; 2527 alloc_flags |= ALLOC_CMA;
2514#endif 2528#endif
2515 return alloc_flags; 2529 return alloc_flags;
@@ -2533,7 +2547,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2533 unsigned long did_some_progress; 2547 unsigned long did_some_progress;
2534 enum migrate_mode migration_mode = MIGRATE_ASYNC; 2548 enum migrate_mode migration_mode = MIGRATE_ASYNC;
2535 bool deferred_compaction = false; 2549 bool deferred_compaction = false;
2536 bool contended_compaction = false; 2550 int contended_compaction = COMPACT_CONTENDED_NONE;
2537 2551
2538 /* 2552 /*
2539 * In the slowpath, we sanity check order to avoid ever trying to 2553 * In the slowpath, we sanity check order to avoid ever trying to
@@ -2560,7 +2574,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2560 2574
2561restart: 2575restart:
2562 if (!(gfp_mask & __GFP_NO_KSWAPD)) 2576 if (!(gfp_mask & __GFP_NO_KSWAPD))
2563 wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone); 2577 wake_all_kswapds(order, zonelist, high_zoneidx,
2578 preferred_zone, nodemask);
2564 2579
2565 /* 2580 /*
2566 * OK, we're below the kswapd watermark and have kicked background 2581 * OK, we're below the kswapd watermark and have kicked background
@@ -2633,20 +2648,40 @@ rebalance:
2633 preferred_zone, 2648 preferred_zone,
2634 classzone_idx, migratetype, 2649 classzone_idx, migratetype,
2635 migration_mode, &contended_compaction, 2650 migration_mode, &contended_compaction,
2636 &deferred_compaction, 2651 &deferred_compaction);
2637 &did_some_progress);
2638 if (page) 2652 if (page)
2639 goto got_pg; 2653 goto got_pg;
2640 2654
2641 /* 2655 /* Checks for THP-specific high-order allocations */
2642 * If compaction is deferred for high-order allocations, it is because 2656 if ((gfp_mask & GFP_TRANSHUGE) == GFP_TRANSHUGE) {
2643 * sync compaction recently failed. In this is the case and the caller 2657 /*
2644 * requested a movable allocation that does not heavily disrupt the 2658 * If compaction is deferred for high-order allocations, it is
2645 * system then fail the allocation instead of entering direct reclaim. 2659 * because sync compaction recently failed. If this is the case
2646 */ 2660 * and the caller requested a THP allocation, we do not want
2647 if ((deferred_compaction || contended_compaction) && 2661 * to heavily disrupt the system, so we fail the allocation
2648 (gfp_mask & __GFP_NO_KSWAPD)) 2662 * instead of entering direct reclaim.
2649 goto nopage; 2663 */
2664 if (deferred_compaction)
2665 goto nopage;
2666
2667 /*
2668 * In all zones where compaction was attempted (and not
2669 * deferred or skipped), lock contention has been detected.
2670 * For THP allocation we do not want to disrupt the others
2671 * so we fallback to base pages instead.
2672 */
2673 if (contended_compaction == COMPACT_CONTENDED_LOCK)
2674 goto nopage;
2675
2676 /*
2677 * If compaction was aborted due to need_resched(), we do not
2678 * want to further increase allocation latency, unless it is
2679 * khugepaged trying to collapse.
2680 */
2681 if (contended_compaction == COMPACT_CONTENDED_SCHED
2682 && !(current->flags & PF_KTHREAD))
2683 goto nopage;
2684 }
2650 2685
2651 /* 2686 /*
2652 * It can become very expensive to allocate transparent hugepages at 2687 * It can become very expensive to allocate transparent hugepages at
@@ -2726,8 +2761,7 @@ rebalance:
2726 preferred_zone, 2761 preferred_zone,
2727 classzone_idx, migratetype, 2762 classzone_idx, migratetype,
2728 migration_mode, &contended_compaction, 2763 migration_mode, &contended_compaction,
2729 &deferred_compaction, 2764 &deferred_compaction);
2730 &did_some_progress);
2731 if (page) 2765 if (page)
2732 goto got_pg; 2766 goto got_pg;
2733 } 2767 }
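
The slow-path hunk above replaces the single `deferred || contended` bailout with THP-specific checks: a transparent-hugepage allocation falls back to base pages when compaction was deferred, when it was contended on locks, or when it was aborted by need_resched() unless the caller is a kernel thread such as khugepaged. A standalone sketch of just that decision, with illustrative constants in place of the kernel's COMPACT_CONTENDED_* values:

    #include <stdbool.h>
    #include <stdio.h>

    enum contended { CONTENDED_NONE, CONTENDED_LOCK, CONTENDED_SCHED };

    static bool thp_should_fallback(bool deferred, enum contended contended,
                                    bool is_kthread)
    {
        if (deferred)
            return true;                /* sync compaction failed recently */
        if (contended == CONTENDED_LOCK)
            return true;                /* don't disrupt other tasks */
        if (contended == CONTENDED_SCHED && !is_kthread)
            return true;                /* khugepaged may keep trying */
        return false;
    }

    int main(void)
    {
        printf("user task, sched-contended: fallback=%d\n",
               thp_should_fallback(false, CONTENDED_SCHED, false));
        printf("khugepaged, sched-contended: fallback=%d\n",
               thp_should_fallback(false, CONTENDED_SCHED, true));
        return 0;
    }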
@@ -2753,7 +2787,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2753 struct zone *preferred_zone; 2787 struct zone *preferred_zone;
2754 struct zoneref *preferred_zoneref; 2788 struct zoneref *preferred_zoneref;
2755 struct page *page = NULL; 2789 struct page *page = NULL;
2756 int migratetype = allocflags_to_migratetype(gfp_mask); 2790 int migratetype = gfpflags_to_migratetype(gfp_mask);
2757 unsigned int cpuset_mems_cookie; 2791 unsigned int cpuset_mems_cookie;
2758 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; 2792 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
2759 int classzone_idx; 2793 int classzone_idx;
@@ -2775,6 +2809,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2775 if (unlikely(!zonelist->_zonerefs->zone)) 2809 if (unlikely(!zonelist->_zonerefs->zone))
2776 return NULL; 2810 return NULL;
2777 2811
2812 if (IS_ENABLED(CONFIG_CMA) && migratetype == MIGRATE_MOVABLE)
2813 alloc_flags |= ALLOC_CMA;
2814
2778retry_cpuset: 2815retry_cpuset:
2779 cpuset_mems_cookie = read_mems_allowed_begin(); 2816 cpuset_mems_cookie = read_mems_allowed_begin();
2780 2817
@@ -2786,10 +2823,6 @@ retry_cpuset:
2786 goto out; 2823 goto out;
2787 classzone_idx = zonelist_zone_idx(preferred_zoneref); 2824 classzone_idx = zonelist_zone_idx(preferred_zoneref);
2788 2825
2789#ifdef CONFIG_CMA
2790 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
2791 alloc_flags |= ALLOC_CMA;
2792#endif
2793 /* First allocation attempt */ 2826 /* First allocation attempt */
2794 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 2827 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
2795 zonelist, high_zoneidx, alloc_flags, 2828 zonelist, high_zoneidx, alloc_flags,
@@ -3579,68 +3612,30 @@ static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
3579 zonelist->_zonerefs[pos].zone_idx = 0; 3612 zonelist->_zonerefs[pos].zone_idx = 0;
3580} 3613}
3581 3614
3615#if defined(CONFIG_64BIT)
3616/*
3617 * Devices that require DMA32/DMA are relatively rare and do not justify a
3618 * penalty to every machine in case the specialised case applies. Default
3619 * to Node-ordering on 64-bit NUMA machines
3620 */
3621static int default_zonelist_order(void)
3622{
3623 return ZONELIST_ORDER_NODE;
3624}
3625#else
3626/*
3627 * On 32-bit, the Normal zone needs to be preserved for allocations accessible
3628 * by the kernel. If processes running on node 0 deplete the low memory zone
3629 * then reclaim will occur more frequently, increasing stalls, and potentially
3630 * make it easier to OOM if a large percentage of the zone is under writeback or
3631 * dirty. The problem is significantly worse if CONFIG_HIGHPTE is not set.
3632 * Hence, default to zone ordering on 32-bit.
3633 */
3582static int default_zonelist_order(void) 3634static int default_zonelist_order(void)
3583{ 3635{
3584 int nid, zone_type;
3585 unsigned long low_kmem_size, total_size;
3586 struct zone *z;
3587 int average_size;
3588 /*
3589 * ZONE_DMA and ZONE_DMA32 can be very small area in the system.
3590 * If they are really small and used heavily, the system can fall
3591 * into OOM very easily.
3592 * This function detect ZONE_DMA/DMA32 size and configures zone order.
3593 */
3594 /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */
3595 low_kmem_size = 0;
3596 total_size = 0;
3597 for_each_online_node(nid) {
3598 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
3599 z = &NODE_DATA(nid)->node_zones[zone_type];
3600 if (populated_zone(z)) {
3601 if (zone_type < ZONE_NORMAL)
3602 low_kmem_size += z->managed_pages;
3603 total_size += z->managed_pages;
3604 } else if (zone_type == ZONE_NORMAL) {
3605 /*
3606 * If any node has only lowmem, then node order
3607 * is preferred to allow kernel allocations
3608 * locally; otherwise, they can easily infringe
3609 * on other nodes when there is an abundance of
3610 * lowmem available to allocate from.
3611 */
3612 return ZONELIST_ORDER_NODE;
3613 }
3614 }
3615 }
3616 if (!low_kmem_size || /* there are no DMA area. */
3617 low_kmem_size > total_size/2) /* DMA/DMA32 is big. */
3618 return ZONELIST_ORDER_NODE;
3619 /*
3620 * look into each node's config.
3621 * If there is a node whose DMA/DMA32 memory is very big area on
3622 * local memory, NODE_ORDER may be suitable.
3623 */
3624 average_size = total_size /
3625 (nodes_weight(node_states[N_MEMORY]) + 1);
3626 for_each_online_node(nid) {
3627 low_kmem_size = 0;
3628 total_size = 0;
3629 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
3630 z = &NODE_DATA(nid)->node_zones[zone_type];
3631 if (populated_zone(z)) {
3632 if (zone_type < ZONE_NORMAL)
3633 low_kmem_size += z->present_pages;
3634 total_size += z->present_pages;
3635 }
3636 }
3637 if (low_kmem_size &&
3638 total_size > average_size && /* ignore small node */
3639 low_kmem_size > total_size * 70/100)
3640 return ZONELIST_ORDER_NODE;
3641 }
3642 return ZONELIST_ORDER_ZONE; 3636 return ZONELIST_ORDER_ZONE;
3643} 3637}
3638#endif /* CONFIG_64BIT */
3644 3639
3645static void set_zonelist_order(void) 3640static void set_zonelist_order(void)
3646{ 3641{
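
default_zonelist_order() above is reduced to a compile-time choice: node order on 64-bit, zone order on 32-bit where lowmem must be preserved. The standalone sketch below prints the fallback order each policy produces for an invented two-node, two-zone machine, purely to illustrate the difference; it is not how the kernel actually builds zonelists.

    #include <stdio.h>

    #define NR_NODES 2
    #define NR_ZONES 2      /* index 0 = DMA32-like lowmem, 1 = Normal */

    static const char *const zone_name[NR_ZONES] = { "DMA32", "Normal" };

    static void print_node_order(int local)
    {
        printf("node order (64-bit default):");
        for (int n = 0; n < NR_NODES; n++)
            for (int z = NR_ZONES - 1; z >= 0; z--)
                printf(" node%d/%s", (local + n) % NR_NODES, zone_name[z]);
        printf("\n");
    }

    static void print_zone_order(int local)
    {
        printf("zone order (32-bit default):");
        for (int z = NR_ZONES - 1; z >= 0; z--)
            for (int n = 0; n < NR_NODES; n++)
                printf(" node%d/%s", (local + n) % NR_NODES, zone_name[z]);
        printf("\n");
    }

    int main(void)
    {
        print_node_order(0);    /* all of the local node first */
        print_zone_order(0);    /* lowmem zones saved for last */
        return 0;
    }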
@@ -6277,8 +6272,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
6277 6272
6278 if (list_empty(&cc->migratepages)) { 6273 if (list_empty(&cc->migratepages)) {
6279 cc->nr_migratepages = 0; 6274 cc->nr_migratepages = 0;
6280 pfn = isolate_migratepages_range(cc->zone, cc, 6275 pfn = isolate_migratepages_range(cc, pfn, end);
6281 pfn, end, true);
6282 if (!pfn) { 6276 if (!pfn) {
6283 ret = -EINTR; 6277 ret = -EINTR;
6284 break; 6278 break;
@@ -6554,97 +6548,3 @@ bool is_free_buddy_page(struct page *page)
6554 return order < MAX_ORDER; 6548 return order < MAX_ORDER;
6555} 6549}
6556#endif 6550#endif
6557
6558static const struct trace_print_flags pageflag_names[] = {
6559 {1UL << PG_locked, "locked" },
6560 {1UL << PG_error, "error" },
6561 {1UL << PG_referenced, "referenced" },
6562 {1UL << PG_uptodate, "uptodate" },
6563 {1UL << PG_dirty, "dirty" },
6564 {1UL << PG_lru, "lru" },
6565 {1UL << PG_active, "active" },
6566 {1UL << PG_slab, "slab" },
6567 {1UL << PG_owner_priv_1, "owner_priv_1" },
6568 {1UL << PG_arch_1, "arch_1" },
6569 {1UL << PG_reserved, "reserved" },
6570 {1UL << PG_private, "private" },
6571 {1UL << PG_private_2, "private_2" },
6572 {1UL << PG_writeback, "writeback" },
6573#ifdef CONFIG_PAGEFLAGS_EXTENDED
6574 {1UL << PG_head, "head" },
6575 {1UL << PG_tail, "tail" },
6576#else
6577 {1UL << PG_compound, "compound" },
6578#endif
6579 {1UL << PG_swapcache, "swapcache" },
6580 {1UL << PG_mappedtodisk, "mappedtodisk" },
6581 {1UL << PG_reclaim, "reclaim" },
6582 {1UL << PG_swapbacked, "swapbacked" },
6583 {1UL << PG_unevictable, "unevictable" },
6584#ifdef CONFIG_MMU
6585 {1UL << PG_mlocked, "mlocked" },
6586#endif
6587#ifdef CONFIG_ARCH_USES_PG_UNCACHED
6588 {1UL << PG_uncached, "uncached" },
6589#endif
6590#ifdef CONFIG_MEMORY_FAILURE
6591 {1UL << PG_hwpoison, "hwpoison" },
6592#endif
6593#ifdef CONFIG_TRANSPARENT_HUGEPAGE
6594 {1UL << PG_compound_lock, "compound_lock" },
6595#endif
6596};
6597
6598static void dump_page_flags(unsigned long flags)
6599{
6600 const char *delim = "";
6601 unsigned long mask;
6602 int i;
6603
6604 BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
6605
6606 printk(KERN_ALERT "page flags: %#lx(", flags);
6607
6608 /* remove zone id */
6609 flags &= (1UL << NR_PAGEFLAGS) - 1;
6610
6611 for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) {
6612
6613 mask = pageflag_names[i].mask;
6614 if ((flags & mask) != mask)
6615 continue;
6616
6617 flags &= ~mask;
6618 printk("%s%s", delim, pageflag_names[i].name);
6619 delim = "|";
6620 }
6621
6622 /* check for left over flags */
6623 if (flags)
6624 printk("%s%#lx", delim, flags);
6625
6626 printk(")\n");
6627}
6628
6629void dump_page_badflags(struct page *page, const char *reason,
6630 unsigned long badflags)
6631{
6632 printk(KERN_ALERT
6633 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
6634 page, atomic_read(&page->_count), page_mapcount(page),
6635 page->mapping, page->index);
6636 dump_page_flags(page->flags);
6637 if (reason)
6638 pr_alert("page dumped because: %s\n", reason);
6639 if (page->flags & badflags) {
6640 pr_alert("bad because of flags:\n");
6641 dump_page_flags(page->flags & badflags);
6642 }
6643 mem_cgroup_print_bad_page(page);
6644}
6645
6646void dump_page(struct page *page, const char *reason)
6647{
6648 dump_page_badflags(page, reason, 0);
6649}
6650EXPORT_SYMBOL(dump_page);
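
The large removal above takes dump_page(), dump_page_badflags() and the pageflag_names table out of page_alloc.c; in this series the debugging helpers are presumably relocated next to the new VM_BUG_ON_VMA()/VM_BUG_ON_MM() infrastructure rather than dropped (the rest of the series is not shown here). The flag-decoding loop itself is a generally useful pattern, so here is a standalone sketch of it with a made-up three-entry flag table:

    #include <stdio.h>

    struct flag_name {
        unsigned long mask;
        const char *name;
    };

    static const struct flag_name names[] = {
        { 1UL << 0, "locked" },
        { 1UL << 1, "dirty"  },
        { 1UL << 2, "lru"    },
    };

    static void dump_flags(unsigned long flags)
    {
        const char *delim = "";

        printf("flags: %#lx(", flags);
        for (unsigned int i = 0; i < sizeof(names) / sizeof(names[0]) && flags; i++) {
            if ((flags & names[i].mask) != names[i].mask)
                continue;
            flags &= ~names[i].mask;
            printf("%s%s", delim, names[i].name);
            delim = "|";
        }
        if (flags)
            printf("%s%#lx", delim, flags);     /* leftover, unnamed bits */
        printf(")\n");
    }

    int main(void)
    {
        dump_flags((1UL << 0) | (1UL << 2) | (1UL << 7));
        return 0;
    }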
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 2beeabf502c5..ad83195521f2 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -177,7 +177,7 @@ int walk_page_range(unsigned long addr, unsigned long end,
177 if (!walk->mm) 177 if (!walk->mm)
178 return -EINVAL; 178 return -EINVAL;
179 179
180 VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem)); 180 VM_BUG_ON_MM(!rwsem_is_locked(&walk->mm->mmap_sem), walk->mm);
181 181
182 pgd = pgd_offset(walk->mm, addr); 182 pgd = pgd_offset(walk->mm, addr);
183 do { 183 do {
diff --git a/mm/rmap.c b/mm/rmap.c
index bc74e0012809..116a5053415b 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -527,7 +527,7 @@ vma_address(struct page *page, struct vm_area_struct *vma)
527 unsigned long address = __vma_address(page, vma); 527 unsigned long address = __vma_address(page, vma);
528 528
529 /* page should be within @vma mapping range */ 529 /* page should be within @vma mapping range */
530 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); 530 VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
531 531
532 return address; 532 return address;
533} 533}
@@ -897,7 +897,7 @@ void page_move_anon_rmap(struct page *page,
897 struct anon_vma *anon_vma = vma->anon_vma; 897 struct anon_vma *anon_vma = vma->anon_vma;
898 898
899 VM_BUG_ON_PAGE(!PageLocked(page), page); 899 VM_BUG_ON_PAGE(!PageLocked(page), page);
900 VM_BUG_ON(!anon_vma); 900 VM_BUG_ON_VMA(!anon_vma, vma);
901 VM_BUG_ON_PAGE(page->index != linear_page_index(vma, address), page); 901 VM_BUG_ON_PAGE(page->index != linear_page_index(vma, address), page);
902 902
903 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; 903 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
@@ -1024,7 +1024,7 @@ void do_page_add_anon_rmap(struct page *page,
1024void page_add_new_anon_rmap(struct page *page, 1024void page_add_new_anon_rmap(struct page *page,
1025 struct vm_area_struct *vma, unsigned long address) 1025 struct vm_area_struct *vma, unsigned long address)
1026{ 1026{
1027 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); 1027 VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
1028 SetPageSwapBacked(page); 1028 SetPageSwapBacked(page);
1029 atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ 1029 atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
1030 if (PageTransHuge(page)) 1030 if (PageTransHuge(page))
@@ -1670,7 +1670,7 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
1670 * structure at mapping cannot be freed and reused yet, 1670 * structure at mapping cannot be freed and reused yet,
1671 * so we can safely take mapping->i_mmap_mutex. 1671 * so we can safely take mapping->i_mmap_mutex.
1672 */ 1672 */
1673 VM_BUG_ON(!PageLocked(page)); 1673 VM_BUG_ON_PAGE(!PageLocked(page), page);
1674 1674
1675 if (!mapping) 1675 if (!mapping)
1676 return ret; 1676 return ret;
diff --git a/mm/shmem.c b/mm/shmem.c
index 469f90d56051..4fad61bb41e5 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -3077,7 +3077,9 @@ static const struct address_space_operations shmem_aops = {
3077 .write_begin = shmem_write_begin, 3077 .write_begin = shmem_write_begin,
3078 .write_end = shmem_write_end, 3078 .write_end = shmem_write_end,
3079#endif 3079#endif
3080#ifdef CONFIG_MIGRATION
3080 .migratepage = migrate_page, 3081 .migratepage = migrate_page,
3082#endif
3081 .error_remove_page = generic_error_remove_page, 3083 .error_remove_page = generic_error_remove_page,
3082}; 3084};
3083 3085
diff --git a/mm/slab.c b/mm/slab.c
index 7c52b3890d25..154aac8411c5 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -237,11 +237,10 @@ struct arraycache_init {
237/* 237/*
238 * Need this for bootstrapping a per node allocator. 238 * Need this for bootstrapping a per node allocator.
239 */ 239 */
240#define NUM_INIT_LISTS (3 * MAX_NUMNODES) 240#define NUM_INIT_LISTS (2 * MAX_NUMNODES)
241static struct kmem_cache_node __initdata init_kmem_cache_node[NUM_INIT_LISTS]; 241static struct kmem_cache_node __initdata init_kmem_cache_node[NUM_INIT_LISTS];
242#define CACHE_CACHE 0 242#define CACHE_CACHE 0
243#define SIZE_AC MAX_NUMNODES 243#define SIZE_NODE (MAX_NUMNODES)
244#define SIZE_NODE (2 * MAX_NUMNODES)
245 244
246static int drain_freelist(struct kmem_cache *cache, 245static int drain_freelist(struct kmem_cache *cache,
247 struct kmem_cache_node *n, int tofree); 246 struct kmem_cache_node *n, int tofree);
@@ -253,7 +252,6 @@ static void cache_reap(struct work_struct *unused);
253 252
254static int slab_early_init = 1; 253static int slab_early_init = 1;
255 254
256#define INDEX_AC kmalloc_index(sizeof(struct arraycache_init))
257#define INDEX_NODE kmalloc_index(sizeof(struct kmem_cache_node)) 255#define INDEX_NODE kmalloc_index(sizeof(struct kmem_cache_node))
258 256
259static void kmem_cache_node_init(struct kmem_cache_node *parent) 257static void kmem_cache_node_init(struct kmem_cache_node *parent)
@@ -458,9 +456,6 @@ static inline unsigned int obj_to_index(const struct kmem_cache *cache,
458 return reciprocal_divide(offset, cache->reciprocal_buffer_size); 456 return reciprocal_divide(offset, cache->reciprocal_buffer_size);
459} 457}
460 458
461static struct arraycache_init initarray_generic =
462 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
463
464/* internal cache of cache description objs */ 459/* internal cache of cache description objs */
465static struct kmem_cache kmem_cache_boot = { 460static struct kmem_cache kmem_cache_boot = {
466 .batchcount = 1, 461 .batchcount = 1,
@@ -476,7 +471,7 @@ static DEFINE_PER_CPU(struct delayed_work, slab_reap_work);
476 471
477static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) 472static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
478{ 473{
479 return cachep->array[smp_processor_id()]; 474 return this_cpu_ptr(cachep->cpu_cache);
480} 475}
481 476
482static size_t calculate_freelist_size(int nr_objs, size_t align) 477static size_t calculate_freelist_size(int nr_objs, size_t align)
@@ -785,8 +780,8 @@ static inline void *ac_get_obj(struct kmem_cache *cachep,
785 return objp; 780 return objp;
786} 781}
787 782
788static void *__ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac, 783static noinline void *__ac_put_obj(struct kmem_cache *cachep,
789 void *objp) 784 struct array_cache *ac, void *objp)
790{ 785{
791 if (unlikely(pfmemalloc_active)) { 786 if (unlikely(pfmemalloc_active)) {
792 /* Some pfmemalloc slabs exist, check if this is one */ 787 /* Some pfmemalloc slabs exist, check if this is one */
@@ -984,46 +979,50 @@ static void drain_alien_cache(struct kmem_cache *cachep,
984 } 979 }
985} 980}
986 981
987static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) 982static int __cache_free_alien(struct kmem_cache *cachep, void *objp,
983 int node, int page_node)
988{ 984{
989 int nodeid = page_to_nid(virt_to_page(objp));
990 struct kmem_cache_node *n; 985 struct kmem_cache_node *n;
991 struct alien_cache *alien = NULL; 986 struct alien_cache *alien = NULL;
992 struct array_cache *ac; 987 struct array_cache *ac;
993 int node;
994 LIST_HEAD(list); 988 LIST_HEAD(list);
995 989
996 node = numa_mem_id();
997
998 /*
999 * Make sure we are not freeing a object from another node to the array
1000 * cache on this cpu.
1001 */
1002 if (likely(nodeid == node))
1003 return 0;
1004
1005 n = get_node(cachep, node); 990 n = get_node(cachep, node);
1006 STATS_INC_NODEFREES(cachep); 991 STATS_INC_NODEFREES(cachep);
1007 if (n->alien && n->alien[nodeid]) { 992 if (n->alien && n->alien[page_node]) {
1008 alien = n->alien[nodeid]; 993 alien = n->alien[page_node];
1009 ac = &alien->ac; 994 ac = &alien->ac;
1010 spin_lock(&alien->lock); 995 spin_lock(&alien->lock);
1011 if (unlikely(ac->avail == ac->limit)) { 996 if (unlikely(ac->avail == ac->limit)) {
1012 STATS_INC_ACOVERFLOW(cachep); 997 STATS_INC_ACOVERFLOW(cachep);
1013 __drain_alien_cache(cachep, ac, nodeid, &list); 998 __drain_alien_cache(cachep, ac, page_node, &list);
1014 } 999 }
1015 ac_put_obj(cachep, ac, objp); 1000 ac_put_obj(cachep, ac, objp);
1016 spin_unlock(&alien->lock); 1001 spin_unlock(&alien->lock);
1017 slabs_destroy(cachep, &list); 1002 slabs_destroy(cachep, &list);
1018 } else { 1003 } else {
1019 n = get_node(cachep, nodeid); 1004 n = get_node(cachep, page_node);
1020 spin_lock(&n->list_lock); 1005 spin_lock(&n->list_lock);
1021 free_block(cachep, &objp, 1, nodeid, &list); 1006 free_block(cachep, &objp, 1, page_node, &list);
1022 spin_unlock(&n->list_lock); 1007 spin_unlock(&n->list_lock);
1023 slabs_destroy(cachep, &list); 1008 slabs_destroy(cachep, &list);
1024 } 1009 }
1025 return 1; 1010 return 1;
1026} 1011}
1012
1013static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1014{
1015 int page_node = page_to_nid(virt_to_page(objp));
1016 int node = numa_mem_id();
1017 /*
1018 * Make sure we are not freeing a object from another node to the array
1019 * cache on this cpu.
1020 */
1021 if (likely(node == page_node))
1022 return 0;
1023
1024 return __cache_free_alien(cachep, objp, node, page_node);
1025}
1027#endif 1026#endif
1028 1027
1029/* 1028/*
@@ -1092,24 +1091,25 @@ static void cpuup_canceled(long cpu)
1092 struct alien_cache **alien; 1091 struct alien_cache **alien;
1093 LIST_HEAD(list); 1092 LIST_HEAD(list);
1094 1093
1095 /* cpu is dead; no one can alloc from it. */
1096 nc = cachep->array[cpu];
1097 cachep->array[cpu] = NULL;
1098 n = get_node(cachep, node); 1094 n = get_node(cachep, node);
1099
1100 if (!n) 1095 if (!n)
1101 goto free_array_cache; 1096 continue;
1102 1097
1103 spin_lock_irq(&n->list_lock); 1098 spin_lock_irq(&n->list_lock);
1104 1099
1105 /* Free limit for this kmem_cache_node */ 1100 /* Free limit for this kmem_cache_node */
1106 n->free_limit -= cachep->batchcount; 1101 n->free_limit -= cachep->batchcount;
1107 if (nc) 1102
1103 /* cpu is dead; no one can alloc from it. */
1104 nc = per_cpu_ptr(cachep->cpu_cache, cpu);
1105 if (nc) {
1108 free_block(cachep, nc->entry, nc->avail, node, &list); 1106 free_block(cachep, nc->entry, nc->avail, node, &list);
1107 nc->avail = 0;
1108 }
1109 1109
1110 if (!cpumask_empty(mask)) { 1110 if (!cpumask_empty(mask)) {
1111 spin_unlock_irq(&n->list_lock); 1111 spin_unlock_irq(&n->list_lock);
1112 goto free_array_cache; 1112 goto free_slab;
1113 } 1113 }
1114 1114
1115 shared = n->shared; 1115 shared = n->shared;
@@ -1129,9 +1129,9 @@ static void cpuup_canceled(long cpu)
1129 drain_alien_cache(cachep, alien); 1129 drain_alien_cache(cachep, alien);
1130 free_alien_cache(alien); 1130 free_alien_cache(alien);
1131 } 1131 }
1132free_array_cache: 1132
1133free_slab:
1133 slabs_destroy(cachep, &list); 1134 slabs_destroy(cachep, &list);
1134 kfree(nc);
1135 } 1135 }
1136 /* 1136 /*
1137 * In the previous loop, all the objects were freed to 1137 * In the previous loop, all the objects were freed to
@@ -1168,32 +1168,23 @@ static int cpuup_prepare(long cpu)
1168 * array caches 1168 * array caches
1169 */ 1169 */
1170 list_for_each_entry(cachep, &slab_caches, list) { 1170 list_for_each_entry(cachep, &slab_caches, list) {
1171 struct array_cache *nc;
1172 struct array_cache *shared = NULL; 1171 struct array_cache *shared = NULL;
1173 struct alien_cache **alien = NULL; 1172 struct alien_cache **alien = NULL;
1174 1173
1175 nc = alloc_arraycache(node, cachep->limit,
1176 cachep->batchcount, GFP_KERNEL);
1177 if (!nc)
1178 goto bad;
1179 if (cachep->shared) { 1174 if (cachep->shared) {
1180 shared = alloc_arraycache(node, 1175 shared = alloc_arraycache(node,
1181 cachep->shared * cachep->batchcount, 1176 cachep->shared * cachep->batchcount,
1182 0xbaadf00d, GFP_KERNEL); 1177 0xbaadf00d, GFP_KERNEL);
1183 if (!shared) { 1178 if (!shared)
1184 kfree(nc);
1185 goto bad; 1179 goto bad;
1186 }
1187 } 1180 }
1188 if (use_alien_caches) { 1181 if (use_alien_caches) {
1189 alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL); 1182 alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL);
1190 if (!alien) { 1183 if (!alien) {
1191 kfree(shared); 1184 kfree(shared);
1192 kfree(nc);
1193 goto bad; 1185 goto bad;
1194 } 1186 }
1195 } 1187 }
1196 cachep->array[cpu] = nc;
1197 n = get_node(cachep, node); 1188 n = get_node(cachep, node);
1198 BUG_ON(!n); 1189 BUG_ON(!n);
1199 1190
@@ -1385,15 +1376,6 @@ static void __init set_up_node(struct kmem_cache *cachep, int index)
1385} 1376}
1386 1377
1387/* 1378/*
1388 * The memory after the last cpu cache pointer is used for the
1389 * the node pointer.
1390 */
1391static void setup_node_pointer(struct kmem_cache *cachep)
1392{
1393 cachep->node = (struct kmem_cache_node **)&cachep->array[nr_cpu_ids];
1394}
1395
1396/*
1397 * Initialisation. Called after the page allocator have been initialised and 1379 * Initialisation. Called after the page allocator have been initialised and
1398 * before smp_init(). 1380 * before smp_init().
1399 */ 1381 */
@@ -1404,7 +1386,6 @@ void __init kmem_cache_init(void)
1404 BUILD_BUG_ON(sizeof(((struct page *)NULL)->lru) < 1386 BUILD_BUG_ON(sizeof(((struct page *)NULL)->lru) <
1405 sizeof(struct rcu_head)); 1387 sizeof(struct rcu_head));
1406 kmem_cache = &kmem_cache_boot; 1388 kmem_cache = &kmem_cache_boot;
1407 setup_node_pointer(kmem_cache);
1408 1389
1409 if (num_possible_nodes() == 1) 1390 if (num_possible_nodes() == 1)
1410 use_alien_caches = 0; 1391 use_alien_caches = 0;
@@ -1412,8 +1393,6 @@ void __init kmem_cache_init(void)
1412 for (i = 0; i < NUM_INIT_LISTS; i++) 1393 for (i = 0; i < NUM_INIT_LISTS; i++)
1413 kmem_cache_node_init(&init_kmem_cache_node[i]); 1394 kmem_cache_node_init(&init_kmem_cache_node[i]);
1414 1395
1415 set_up_node(kmem_cache, CACHE_CACHE);
1416
1417 /* 1396 /*
1418 * Fragmentation resistance on low memory - only use bigger 1397 * Fragmentation resistance on low memory - only use bigger
1419 * page orders on machines with more than 32MB of memory if 1398 * page orders on machines with more than 32MB of memory if
@@ -1448,49 +1427,22 @@ void __init kmem_cache_init(void)
1448 * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids 1427 * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
1449 */ 1428 */
1450 create_boot_cache(kmem_cache, "kmem_cache", 1429 create_boot_cache(kmem_cache, "kmem_cache",
1451 offsetof(struct kmem_cache, array[nr_cpu_ids]) + 1430 offsetof(struct kmem_cache, node) +
1452 nr_node_ids * sizeof(struct kmem_cache_node *), 1431 nr_node_ids * sizeof(struct kmem_cache_node *),
1453 SLAB_HWCACHE_ALIGN); 1432 SLAB_HWCACHE_ALIGN);
1454 list_add(&kmem_cache->list, &slab_caches); 1433 list_add(&kmem_cache->list, &slab_caches);
1455 1434 slab_state = PARTIAL;
1456 /* 2+3) create the kmalloc caches */
1457 1435
1458 /* 1436 /*
1459 * Initialize the caches that provide memory for the array cache and the 1437 * Initialize the caches that provide memory for the kmem_cache_node
1460 * kmem_cache_node structures first. Without this, further allocations will 1438 * structures first. Without this, further allocations will bug.
1461 * bug.
1462 */ 1439 */
1463 1440 kmalloc_caches[INDEX_NODE] = create_kmalloc_cache("kmalloc-node",
1464 kmalloc_caches[INDEX_AC] = create_kmalloc_cache("kmalloc-ac",
1465 kmalloc_size(INDEX_AC), ARCH_KMALLOC_FLAGS);
1466
1467 if (INDEX_AC != INDEX_NODE)
1468 kmalloc_caches[INDEX_NODE] =
1469 create_kmalloc_cache("kmalloc-node",
1470 kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS); 1441 kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS);
1442 slab_state = PARTIAL_NODE;
1471 1443
1472 slab_early_init = 0; 1444 slab_early_init = 0;
1473 1445
1474 /* 4) Replace the bootstrap head arrays */
1475 {
1476 struct array_cache *ptr;
1477
1478 ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
1479
1480 memcpy(ptr, cpu_cache_get(kmem_cache),
1481 sizeof(struct arraycache_init));
1482
1483 kmem_cache->array[smp_processor_id()] = ptr;
1484
1485 ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
1486
1487 BUG_ON(cpu_cache_get(kmalloc_caches[INDEX_AC])
1488 != &initarray_generic.cache);
1489 memcpy(ptr, cpu_cache_get(kmalloc_caches[INDEX_AC]),
1490 sizeof(struct arraycache_init));
1491
1492 kmalloc_caches[INDEX_AC]->array[smp_processor_id()] = ptr;
1493 }
1494 /* 5) Replace the bootstrap kmem_cache_node */ 1446 /* 5) Replace the bootstrap kmem_cache_node */
1495 { 1447 {
1496 int nid; 1448 int nid;
@@ -1498,13 +1450,8 @@ void __init kmem_cache_init(void)
1498 for_each_online_node(nid) { 1450 for_each_online_node(nid) {
1499 init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid); 1451 init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid);
1500 1452
1501 init_list(kmalloc_caches[INDEX_AC], 1453 init_list(kmalloc_caches[INDEX_NODE],
1502 &init_kmem_cache_node[SIZE_AC + nid], nid);
1503
1504 if (INDEX_AC != INDEX_NODE) {
1505 init_list(kmalloc_caches[INDEX_NODE],
1506 &init_kmem_cache_node[SIZE_NODE + nid], nid); 1454 &init_kmem_cache_node[SIZE_NODE + nid], nid);
1507 }
1508 } 1455 }
1509 } 1456 }
1510 1457
@@ -2037,56 +1984,53 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
2037 return left_over; 1984 return left_over;
2038} 1985}
2039 1986
1987static struct array_cache __percpu *alloc_kmem_cache_cpus(
1988 struct kmem_cache *cachep, int entries, int batchcount)
1989{
1990 int cpu;
1991 size_t size;
1992 struct array_cache __percpu *cpu_cache;
1993
1994 size = sizeof(void *) * entries + sizeof(struct array_cache);
1995 cpu_cache = __alloc_percpu(size, 0);
1996
1997 if (!cpu_cache)
1998 return NULL;
1999
2000 for_each_possible_cpu(cpu) {
2001 init_arraycache(per_cpu_ptr(cpu_cache, cpu),
2002 entries, batchcount);
2003 }
2004
2005 return cpu_cache;
2006}
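For readers skimming the hunk above: the new alloc_kmem_cache_cpus() sizes each per-CPU block as an array_cache header followed by a flexible array of object pointers. Below is a minimal user-space sketch of that layout only; alloc_cpu_cache() and the reduced field set are illustrative stand-ins, not the kernel structures or the real __alloc_percpu() path.

	/* Illustrative model: one block per CPU holding the header plus
	 * 'entries' object slots, mirroring the size computed as
	 * sizeof(void *) * entries + sizeof(struct array_cache). */
	#include <stdio.h>
	#include <stdlib.h>

	struct array_cache {
		unsigned int avail;		/* objects currently cached */
		unsigned int limit;		/* capacity of entry[] */
		unsigned int batchcount;
		void *entry[];			/* flexible array of cached objects */
	};

	static struct array_cache *alloc_cpu_cache(int entries, int batchcount)
	{
		size_t size = sizeof(struct array_cache) + sizeof(void *) * entries;
		struct array_cache *ac = calloc(1, size);

		if (!ac)
			return NULL;
		ac->limit = entries;
		ac->batchcount = batchcount;
		return ac;
	}

	int main(void)
	{
		struct array_cache *caches[4];
		int cpu;

		for (cpu = 0; cpu < 4; cpu++)
			caches[cpu] = alloc_cpu_cache(16, 8);
		printf("limit=%u batch=%u\n", caches[0]->limit, caches[0]->batchcount);
		for (cpu = 0; cpu < 4; cpu++)
			free(caches[cpu]);
		return 0;
	}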
2007
2040static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) 2008static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2041{ 2009{
2042 if (slab_state >= FULL) 2010 if (slab_state >= FULL)
2043 return enable_cpucache(cachep, gfp); 2011 return enable_cpucache(cachep, gfp);
2044 2012
2013 cachep->cpu_cache = alloc_kmem_cache_cpus(cachep, 1, 1);
2014 if (!cachep->cpu_cache)
2015 return 1;
2016
2045 if (slab_state == DOWN) { 2017 if (slab_state == DOWN) {
2046 /* 2018 /* Creation of first cache (kmem_cache). */
2047 * Note: Creation of first cache (kmem_cache). 2019 set_up_node(kmem_cache, CACHE_CACHE);
2048 * The setup_node is taken care
2049 * of by the caller of __kmem_cache_create
2050 */
2051 cachep->array[smp_processor_id()] = &initarray_generic.cache;
2052 slab_state = PARTIAL;
2053 } else if (slab_state == PARTIAL) { 2020 } else if (slab_state == PARTIAL) {
2054 /* 2021 /* For kmem_cache_node */
2055 * Note: the second kmem_cache_create must create the cache 2022 set_up_node(cachep, SIZE_NODE);
2056 * that's used by kmalloc(24), otherwise the creation of
2057 * further caches will BUG().
2058 */
2059 cachep->array[smp_processor_id()] = &initarray_generic.cache;
2060
2061 /*
2062 * If the cache that's used by kmalloc(sizeof(kmem_cache_node)) is
2063 * the second cache, then we need to set up all its node/,
2064 * otherwise the creation of further caches will BUG().
2065 */
2066 set_up_node(cachep, SIZE_AC);
2067 if (INDEX_AC == INDEX_NODE)
2068 slab_state = PARTIAL_NODE;
2069 else
2070 slab_state = PARTIAL_ARRAYCACHE;
2071 } else { 2023 } else {
2072 /* Remaining boot caches */ 2024 int node;
2073 cachep->array[smp_processor_id()] =
2074 kmalloc(sizeof(struct arraycache_init), gfp);
2075 2025
2076 if (slab_state == PARTIAL_ARRAYCACHE) { 2026 for_each_online_node(node) {
2077 set_up_node(cachep, SIZE_NODE); 2027 cachep->node[node] = kmalloc_node(
2078 slab_state = PARTIAL_NODE; 2028 sizeof(struct kmem_cache_node), gfp, node);
2079 } else { 2029 BUG_ON(!cachep->node[node]);
2080 int node; 2030 kmem_cache_node_init(cachep->node[node]);
2081 for_each_online_node(node) {
2082 cachep->node[node] =
2083 kmalloc_node(sizeof(struct kmem_cache_node),
2084 gfp, node);
2085 BUG_ON(!cachep->node[node]);
2086 kmem_cache_node_init(cachep->node[node]);
2087 }
2088 } 2031 }
2089 } 2032 }
2033
2090 cachep->node[numa_mem_id()]->next_reap = 2034 cachep->node[numa_mem_id()]->next_reap =
2091 jiffies + REAPTIMEOUT_NODE + 2035 jiffies + REAPTIMEOUT_NODE +
2092 ((unsigned long)cachep) % REAPTIMEOUT_NODE; 2036 ((unsigned long)cachep) % REAPTIMEOUT_NODE;
@@ -2100,6 +2044,32 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2100 return 0; 2044 return 0;
2101} 2045}
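As a rough aid to the hunk above, the simplified bootstrap now passes through only two intermediate states before the generic path takes over. The toy walk-through below assumes only that the state names match enum slab_state; the comments compress what kmem_cache_init() and setup_cpu_cache() do and everything else is invented.

	#include <stdio.h>

	enum slab_state { DOWN, PARTIAL, PARTIAL_NODE, UP, FULL };

	int main(void)
	{
		enum slab_state state = DOWN;

		/* 1) create_boot_cache(kmem_cache): while still DOWN,
		 *    set_up_node() wires the static per-node data, so the
		 *    cache of caches becomes usable. */
		state = PARTIAL;

		/* 2) create the kmalloc-node cache: kmem_cache_node
		 *    structures can now be kmalloc'ed for later caches. */
		state = PARTIAL_NODE;

		/* 3) remaining kmalloc caches, then per-cpu caches are
		 *    enabled for everyone. */
		state = UP;
		state = FULL;

		printf("final slab_state = %d\n", state);
		return 0;
	}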
2102 2046
2047unsigned long kmem_cache_flags(unsigned long object_size,
2048 unsigned long flags, const char *name,
2049 void (*ctor)(void *))
2050{
2051 return flags;
2052}
2053
2054struct kmem_cache *
2055__kmem_cache_alias(const char *name, size_t size, size_t align,
2056 unsigned long flags, void (*ctor)(void *))
2057{
2058 struct kmem_cache *cachep;
2059
2060 cachep = find_mergeable(size, align, flags, name, ctor);
2061 if (cachep) {
2062 cachep->refcount++;
2063
2064 /*
2065 * Adjust the object sizes so that we clear
2066 * the complete object on kzalloc.
2067 */
2068 cachep->object_size = max_t(int, cachep->object_size, size);
2069 }
2070 return cachep;
2071}
2072
2103/** 2073/**
2104 * __kmem_cache_create - Create a cache. 2074 * __kmem_cache_create - Create a cache.
2105 * @cachep: cache management descriptor 2075 * @cachep: cache management descriptor
@@ -2183,7 +2153,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2183 else 2153 else
2184 gfp = GFP_NOWAIT; 2154 gfp = GFP_NOWAIT;
2185 2155
2186 setup_node_pointer(cachep);
2187#if DEBUG 2156#if DEBUG
2188 2157
2189 /* 2158 /*
@@ -2440,8 +2409,7 @@ int __kmem_cache_shutdown(struct kmem_cache *cachep)
2440 if (rc) 2409 if (rc)
2441 return rc; 2410 return rc;
2442 2411
2443 for_each_online_cpu(i) 2412 free_percpu(cachep->cpu_cache);
2444 kfree(cachep->array[i]);
2445 2413
2446 /* NUMA: free the node structures */ 2414 /* NUMA: free the node structures */
2447 for_each_kmem_cache_node(cachep, i, n) { 2415 for_each_kmem_cache_node(cachep, i, n) {
@@ -3399,7 +3367,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp,
3399 if (nr_online_nodes > 1 && cache_free_alien(cachep, objp)) 3367 if (nr_online_nodes > 1 && cache_free_alien(cachep, objp))
3400 return; 3368 return;
3401 3369
3402 if (likely(ac->avail < ac->limit)) { 3370 if (ac->avail < ac->limit) {
3403 STATS_INC_FREEHIT(cachep); 3371 STATS_INC_FREEHIT(cachep);
3404 } else { 3372 } else {
3405 STATS_INC_FREEMISS(cachep); 3373 STATS_INC_FREEMISS(cachep);
@@ -3496,7 +3464,6 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)
3496 return kmem_cache_alloc_node_trace(cachep, flags, node, size); 3464 return kmem_cache_alloc_node_trace(cachep, flags, node, size);
3497} 3465}
3498 3466
3499#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
3500void *__kmalloc_node(size_t size, gfp_t flags, int node) 3467void *__kmalloc_node(size_t size, gfp_t flags, int node)
3501{ 3468{
3502 return __do_kmalloc_node(size, flags, node, _RET_IP_); 3469 return __do_kmalloc_node(size, flags, node, _RET_IP_);
@@ -3509,13 +3476,6 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
3509 return __do_kmalloc_node(size, flags, node, caller); 3476 return __do_kmalloc_node(size, flags, node, caller);
3510} 3477}
3511EXPORT_SYMBOL(__kmalloc_node_track_caller); 3478EXPORT_SYMBOL(__kmalloc_node_track_caller);
3512#else
3513void *__kmalloc_node(size_t size, gfp_t flags, int node)
3514{
3515 return __do_kmalloc_node(size, flags, node, 0);
3516}
3517EXPORT_SYMBOL(__kmalloc_node);
3518#endif /* CONFIG_DEBUG_SLAB || CONFIG_TRACING */
3519#endif /* CONFIG_NUMA */ 3479#endif /* CONFIG_NUMA */
3520 3480
3521/** 3481/**
@@ -3541,8 +3501,6 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3541 return ret; 3501 return ret;
3542} 3502}
3543 3503
3544
3545#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
3546void *__kmalloc(size_t size, gfp_t flags) 3504void *__kmalloc(size_t size, gfp_t flags)
3547{ 3505{
3548 return __do_kmalloc(size, flags, _RET_IP_); 3506 return __do_kmalloc(size, flags, _RET_IP_);
@@ -3555,14 +3513,6 @@ void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller)
3555} 3513}
3556EXPORT_SYMBOL(__kmalloc_track_caller); 3514EXPORT_SYMBOL(__kmalloc_track_caller);
3557 3515
3558#else
3559void *__kmalloc(size_t size, gfp_t flags)
3560{
3561 return __do_kmalloc(size, flags, 0);
3562}
3563EXPORT_SYMBOL(__kmalloc);
3564#endif
3565
3566/** 3516/**
3567 * kmem_cache_free - Deallocate an object 3517 * kmem_cache_free - Deallocate an object
3568 * @cachep: The cache the allocation was from. 3518 * @cachep: The cache the allocation was from.
@@ -3707,72 +3657,45 @@ fail:
3707 return -ENOMEM; 3657 return -ENOMEM;
3708} 3658}
3709 3659
3710struct ccupdate_struct {
3711 struct kmem_cache *cachep;
3712 struct array_cache *new[0];
3713};
3714
3715static void do_ccupdate_local(void *info)
3716{
3717 struct ccupdate_struct *new = info;
3718 struct array_cache *old;
3719
3720 check_irq_off();
3721 old = cpu_cache_get(new->cachep);
3722
3723 new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()];
3724 new->new[smp_processor_id()] = old;
3725}
3726
3727/* Always called with the slab_mutex held */ 3660/* Always called with the slab_mutex held */
3728static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, 3661static int __do_tune_cpucache(struct kmem_cache *cachep, int limit,
3729 int batchcount, int shared, gfp_t gfp) 3662 int batchcount, int shared, gfp_t gfp)
3730{ 3663{
3731 struct ccupdate_struct *new; 3664 struct array_cache __percpu *cpu_cache, *prev;
3732 int i; 3665 int cpu;
3733 3666
3734 new = kzalloc(sizeof(*new) + nr_cpu_ids * sizeof(struct array_cache *), 3667 cpu_cache = alloc_kmem_cache_cpus(cachep, limit, batchcount);
3735 gfp); 3668 if (!cpu_cache)
3736 if (!new)
3737 return -ENOMEM; 3669 return -ENOMEM;
3738 3670
3739 for_each_online_cpu(i) { 3671 prev = cachep->cpu_cache;
3740 new->new[i] = alloc_arraycache(cpu_to_mem(i), limit, 3672 cachep->cpu_cache = cpu_cache;
3741 batchcount, gfp); 3673 kick_all_cpus_sync();
3742 if (!new->new[i]) {
3743 for (i--; i >= 0; i--)
3744 kfree(new->new[i]);
3745 kfree(new);
3746 return -ENOMEM;
3747 }
3748 }
3749 new->cachep = cachep;
3750
3751 on_each_cpu(do_ccupdate_local, (void *)new, 1);
3752 3674
3753 check_irq_on(); 3675 check_irq_on();
3754 cachep->batchcount = batchcount; 3676 cachep->batchcount = batchcount;
3755 cachep->limit = limit; 3677 cachep->limit = limit;
3756 cachep->shared = shared; 3678 cachep->shared = shared;
3757 3679
3758 for_each_online_cpu(i) { 3680 if (!prev)
3681 goto alloc_node;
3682
3683 for_each_online_cpu(cpu) {
3759 LIST_HEAD(list); 3684 LIST_HEAD(list);
3760 struct array_cache *ccold = new->new[i];
3761 int node; 3685 int node;
3762 struct kmem_cache_node *n; 3686 struct kmem_cache_node *n;
3687 struct array_cache *ac = per_cpu_ptr(prev, cpu);
3763 3688
3764 if (!ccold) 3689 node = cpu_to_mem(cpu);
3765 continue;
3766
3767 node = cpu_to_mem(i);
3768 n = get_node(cachep, node); 3690 n = get_node(cachep, node);
3769 spin_lock_irq(&n->list_lock); 3691 spin_lock_irq(&n->list_lock);
3770 free_block(cachep, ccold->entry, ccold->avail, node, &list); 3692 free_block(cachep, ac->entry, ac->avail, node, &list);
3771 spin_unlock_irq(&n->list_lock); 3693 spin_unlock_irq(&n->list_lock);
3772 slabs_destroy(cachep, &list); 3694 slabs_destroy(cachep, &list);
3773 kfree(ccold);
3774 } 3695 }
3775 kfree(new); 3696 free_percpu(prev);
3697
3698alloc_node:
3776 return alloc_kmem_cache_node(cachep, gfp); 3699 return alloc_kmem_cache_node(cachep, gfp);
3777} 3700}
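A user-space model of the ordering the rewritten __do_tune_cpucache() relies on: publish the new per-CPU caches first, wait for concurrent users to stop touching the old ones, then drain and free them. The types and the retune()/drain() helpers are invented for illustration, and the kernel's kick_all_cpus_sync() is only mimicked by a comment.

	#include <stdio.h>
	#include <stdlib.h>

	#define NR_CPUS 4

	struct cpu_cache { int avail; };

	static struct cpu_cache *current_caches;	/* stands in for cachep->cpu_cache */

	static void drain(struct cpu_cache *old)
	{
		for (int cpu = 0; cpu < NR_CPUS; cpu++)
			printf("cpu%d: freeing %d cached objects\n", cpu, old[cpu].avail);
	}

	static int retune(void)
	{
		struct cpu_cache *new = calloc(NR_CPUS, sizeof(*new));
		struct cpu_cache *prev;

		if (!new)
			return -1;
		prev = current_caches;
		current_caches = new;	/* publish the new caches first */
		/* the kernel runs kick_all_cpus_sync() here so every CPU has
		 * stopped using 'prev' before it is drained and freed */
		if (prev) {
			drain(prev);
			free(prev);
		}
		return 0;
	}

	int main(void)
	{
		retune();
		retune();
		free(current_caches);
		return 0;
	}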
3778 3701
@@ -4255,19 +4178,15 @@ static const struct seq_operations slabstats_op = {
4255 4178
4256static int slabstats_open(struct inode *inode, struct file *file) 4179static int slabstats_open(struct inode *inode, struct file *file)
4257{ 4180{
4258 unsigned long *n = kzalloc(PAGE_SIZE, GFP_KERNEL); 4181 unsigned long *n;
4259 int ret = -ENOMEM; 4182
4260 if (n) { 4183 n = __seq_open_private(file, &slabstats_op, PAGE_SIZE);
4261 ret = seq_open(file, &slabstats_op); 4184 if (!n)
4262 if (!ret) { 4185 return -ENOMEM;
4263 struct seq_file *m = file->private_data; 4186
4264 *n = PAGE_SIZE / (2 * sizeof(unsigned long)); 4187 *n = PAGE_SIZE / (2 * sizeof(unsigned long));
4265 m->private = n; 4188
4266 n = NULL; 4189 return 0;
4267 }
4268 kfree(n);
4269 }
4270 return ret;
4271} 4190}
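The same refactor recurs later for /proc/vmallocinfo: the seq_file helper both opens the file and allocates the private buffer, leaving the caller one error path instead of the old alloc/open/attach/free-on-error dance. A hedged user-space analogue of that shape (struct session and open_session_private() are made-up names, not a kernel API):

	#include <stdlib.h>

	struct session { void *private; };

	static void *open_session_private(struct session *s, size_t psize)
	{
		void *priv = calloc(1, psize);

		if (!priv)
			return NULL;
		s->private = priv;	/* attached before the caller ever sees it */
		return priv;
	}

	int main(void)
	{
		struct session s = { 0 };
		unsigned long *n = open_session_private(&s, sizeof(*n));

		if (!n)
			return 1;	/* the only error path the caller needs */
		*n = 4096 / (2 * sizeof(unsigned long));
		free(s.private);
		return 0;
	}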
4272 4191
4273static const struct file_operations proc_slabstats_operations = { 4192static const struct file_operations proc_slabstats_operations = {
diff --git a/mm/slab.h b/mm/slab.h
index 0e0fdd365840..ab019e63e3c2 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -4,6 +4,41 @@
4 * Internal slab definitions 4 * Internal slab definitions
5 */ 5 */
6 6
7#ifdef CONFIG_SLOB
8/*
9 * Common fields provided in kmem_cache by all slab allocators
10 * This struct is either used directly by the allocator (SLOB)
11 * or the allocator must include definitions for all fields
12 * provided in kmem_cache_common in their definition of kmem_cache.
13 *
 14 * Once we can do anonymous structs (C11 standard) we could put an
 15 * anonymous struct definition in these allocators so that the
 16 * separate allocations in the kmem_cache structure of SLAB and
 17 * SLUB are no longer needed.
18 */
19struct kmem_cache {
20 unsigned int object_size;/* The original size of the object */
21 unsigned int size; /* The aligned/padded/added on size */
22 unsigned int align; /* Alignment as calculated */
23 unsigned long flags; /* Active flags on the slab */
24 const char *name; /* Slab name for sysfs */
25 int refcount; /* Use counter */
26 void (*ctor)(void *); /* Called on object slot creation */
27 struct list_head list; /* List of all slab caches on the system */
28};
29
30#endif /* CONFIG_SLOB */
31
32#ifdef CONFIG_SLAB
33#include <linux/slab_def.h>
34#endif
35
36#ifdef CONFIG_SLUB
37#include <linux/slub_def.h>
38#endif
39
40#include <linux/memcontrol.h>
41
7/* 42/*
8 * State of the slab allocator. 43 * State of the slab allocator.
9 * 44 *
@@ -15,7 +50,6 @@
15enum slab_state { 50enum slab_state {
16 DOWN, /* No slab functionality yet */ 51 DOWN, /* No slab functionality yet */
17 PARTIAL, /* SLUB: kmem_cache_node available */ 52 PARTIAL, /* SLUB: kmem_cache_node available */
18 PARTIAL_ARRAYCACHE, /* SLAB: kmalloc size for arraycache available */
19 PARTIAL_NODE, /* SLAB: kmalloc size for node struct available */ 53 PARTIAL_NODE, /* SLAB: kmalloc size for node struct available */
20 UP, /* Slab caches usable but not all extras yet */ 54 UP, /* Slab caches usable but not all extras yet */
21 FULL /* Everything is working */ 55 FULL /* Everything is working */
@@ -53,15 +87,30 @@ extern void create_boot_cache(struct kmem_cache *, const char *name,
53 size_t size, unsigned long flags); 87 size_t size, unsigned long flags);
54 88
55struct mem_cgroup; 89struct mem_cgroup;
56#ifdef CONFIG_SLUB 90
91int slab_unmergeable(struct kmem_cache *s);
92struct kmem_cache *find_mergeable(size_t size, size_t align,
93 unsigned long flags, const char *name, void (*ctor)(void *));
94#ifndef CONFIG_SLOB
57struct kmem_cache * 95struct kmem_cache *
58__kmem_cache_alias(const char *name, size_t size, size_t align, 96__kmem_cache_alias(const char *name, size_t size, size_t align,
59 unsigned long flags, void (*ctor)(void *)); 97 unsigned long flags, void (*ctor)(void *));
98
99unsigned long kmem_cache_flags(unsigned long object_size,
100 unsigned long flags, const char *name,
101 void (*ctor)(void *));
60#else 102#else
61static inline struct kmem_cache * 103static inline struct kmem_cache *
62__kmem_cache_alias(const char *name, size_t size, size_t align, 104__kmem_cache_alias(const char *name, size_t size, size_t align,
63 unsigned long flags, void (*ctor)(void *)) 105 unsigned long flags, void (*ctor)(void *))
64{ return NULL; } 106{ return NULL; }
107
108static inline unsigned long kmem_cache_flags(unsigned long object_size,
109 unsigned long flags, const char *name,
110 void (*ctor)(void *))
111{
112 return flags;
113}
65#endif 114#endif
66 115
67 116
@@ -303,8 +352,8 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
303 * a kmem_cache_node structure allocated (which is true for all online nodes) 352 * a kmem_cache_node structure allocated (which is true for all online nodes)
304 */ 353 */
305#define for_each_kmem_cache_node(__s, __node, __n) \ 354#define for_each_kmem_cache_node(__s, __node, __n) \
306 for (__node = 0; __n = get_node(__s, __node), __node < nr_node_ids; __node++) \ 355 for (__node = 0; __node < nr_node_ids; __node++) \
307 if (__n) 356 if ((__n = get_node(__s, __node)))
308 357
309#endif 358#endif
310 359
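The slab.h hunk above also rewrites the per-node iterator so the bound check happens before the node pointer is fetched, and NULL entries are skipped inside the loop. A small self-contained model of the corrected idiom; the macro and table names are invented, only the shape matches for_each_kmem_cache_node():

	#include <stdio.h>

	#define MAX_NODES 4

	static int *node_table[MAX_NODES];	/* some entries may legitimately be NULL */

	#define for_each_node_ptr(idx, ptr)				\
		for ((idx) = 0; (idx) < MAX_NODES; (idx)++)		\
			if (((ptr) = node_table[idx]))

	int main(void)
	{
		int a = 1, c = 3;
		int idx, *ptr;

		node_table[0] = &a;
		node_table[2] = &c;
		for_each_node_ptr(idx, ptr)
			printf("node %d -> %d\n", idx, *ptr);
		return 0;
	}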
diff --git a/mm/slab_common.c b/mm/slab_common.c
index d319502b2403..3a6e0cfdf03a 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -30,6 +30,43 @@ LIST_HEAD(slab_caches);
30DEFINE_MUTEX(slab_mutex); 30DEFINE_MUTEX(slab_mutex);
31struct kmem_cache *kmem_cache; 31struct kmem_cache *kmem_cache;
32 32
33/*
34 * Set of flags that will prevent slab merging
35 */
36#define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
37 SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \
38 SLAB_FAILSLAB)
39
40#define SLAB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
41 SLAB_CACHE_DMA | SLAB_NOTRACK)
42
43/*
44 * Merge control. If this is set then no merging of slab caches will occur.
45 * (Could be removed. This was introduced to pacify the merge skeptics.)
46 */
47static int slab_nomerge;
48
49static int __init setup_slab_nomerge(char *str)
50{
51 slab_nomerge = 1;
52 return 1;
53}
54
55#ifdef CONFIG_SLUB
56__setup_param("slub_nomerge", slub_nomerge, setup_slab_nomerge, 0);
57#endif
58
59__setup("slab_nomerge", setup_slab_nomerge);
60
61/*
62 * Determine the size of a slab object
63 */
64unsigned int kmem_cache_size(struct kmem_cache *s)
65{
66 return s->object_size;
67}
68EXPORT_SYMBOL(kmem_cache_size);
69
33#ifdef CONFIG_DEBUG_VM 70#ifdef CONFIG_DEBUG_VM
34static int kmem_cache_sanity_check(const char *name, size_t size) 71static int kmem_cache_sanity_check(const char *name, size_t size)
35{ 72{
@@ -79,6 +116,65 @@ static inline int kmem_cache_sanity_check(const char *name, size_t size)
79#endif 116#endif
80 117
81#ifdef CONFIG_MEMCG_KMEM 118#ifdef CONFIG_MEMCG_KMEM
119static int memcg_alloc_cache_params(struct mem_cgroup *memcg,
120 struct kmem_cache *s, struct kmem_cache *root_cache)
121{
122 size_t size;
123
124 if (!memcg_kmem_enabled())
125 return 0;
126
127 if (!memcg) {
128 size = offsetof(struct memcg_cache_params, memcg_caches);
129 size += memcg_limited_groups_array_size * sizeof(void *);
130 } else
131 size = sizeof(struct memcg_cache_params);
132
133 s->memcg_params = kzalloc(size, GFP_KERNEL);
134 if (!s->memcg_params)
135 return -ENOMEM;
136
137 if (memcg) {
138 s->memcg_params->memcg = memcg;
139 s->memcg_params->root_cache = root_cache;
140 } else
141 s->memcg_params->is_root_cache = true;
142
143 return 0;
144}
145
146static void memcg_free_cache_params(struct kmem_cache *s)
147{
148 kfree(s->memcg_params);
149}
150
151static int memcg_update_cache_params(struct kmem_cache *s, int num_memcgs)
152{
153 int size;
154 struct memcg_cache_params *new_params, *cur_params;
155
156 BUG_ON(!is_root_cache(s));
157
158 size = offsetof(struct memcg_cache_params, memcg_caches);
159 size += num_memcgs * sizeof(void *);
160
161 new_params = kzalloc(size, GFP_KERNEL);
162 if (!new_params)
163 return -ENOMEM;
164
165 cur_params = s->memcg_params;
166 memcpy(new_params->memcg_caches, cur_params->memcg_caches,
167 memcg_limited_groups_array_size * sizeof(void *));
168
169 new_params->is_root_cache = true;
170
171 rcu_assign_pointer(s->memcg_params, new_params);
172 if (cur_params)
173 kfree_rcu(cur_params, rcu_head);
174
175 return 0;
176}
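For the array-growing helper just added, the ordering is the point: fill the larger copy completely before publishing it, and defer freeing the old one until readers are done. A loose user-space sketch with C11 atomics standing in for rcu_assign_pointer()/kfree_rcu(); struct params and grow_params() are illustrative only.

	#include <stdatomic.h>
	#include <stdlib.h>
	#include <string.h>

	struct params {
		int nr;
		void *caches[];
	};

	static _Atomic(struct params *) current_params;

	static int grow_params(int new_nr)
	{
		struct params *old = atomic_load(&current_params);
		struct params *new = calloc(1, sizeof(*new) + new_nr * sizeof(void *));

		if (!new)
			return -1;
		new->nr = new_nr;
		if (old)
			memcpy(new->caches, old->caches, old->nr * sizeof(void *));
		/* readers always see a fully initialised array */
		atomic_store_explicit(&current_params, new, memory_order_release);
		/* the kernel defers the free with kfree_rcu(); a user-space
		 * port would need its own grace period before: free(old); */
		return 0;
	}

	int main(void)
	{
		if (grow_params(4) || grow_params(8))
			return 1;
		free(atomic_load(&current_params));
		return 0;
	}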
177
82int memcg_update_all_caches(int num_memcgs) 178int memcg_update_all_caches(int num_memcgs)
83{ 179{
84 struct kmem_cache *s; 180 struct kmem_cache *s;
@@ -89,9 +185,8 @@ int memcg_update_all_caches(int num_memcgs)
89 if (!is_root_cache(s)) 185 if (!is_root_cache(s))
90 continue; 186 continue;
91 187
92 ret = memcg_update_cache_size(s, num_memcgs); 188 ret = memcg_update_cache_params(s, num_memcgs);
93 /* 189 /*
94 * See comment in memcontrol.c, memcg_update_cache_size:
95 * Instead of freeing the memory, we'll just leave the caches 190 * Instead of freeing the memory, we'll just leave the caches
96 * up to this point in an updated state. 191 * up to this point in an updated state.
97 */ 192 */
@@ -104,7 +199,80 @@ out:
104 mutex_unlock(&slab_mutex); 199 mutex_unlock(&slab_mutex);
105 return ret; 200 return ret;
106} 201}
107#endif 202#else
203static inline int memcg_alloc_cache_params(struct mem_cgroup *memcg,
204 struct kmem_cache *s, struct kmem_cache *root_cache)
205{
206 return 0;
207}
208
209static inline void memcg_free_cache_params(struct kmem_cache *s)
210{
211}
212#endif /* CONFIG_MEMCG_KMEM */
213
214/*
215 * Find a mergeable slab cache
216 */
217int slab_unmergeable(struct kmem_cache *s)
218{
219 if (slab_nomerge || (s->flags & SLAB_NEVER_MERGE))
220 return 1;
221
222 if (!is_root_cache(s))
223 return 1;
224
225 if (s->ctor)
226 return 1;
227
228 /*
229 * We may have set a slab to be unmergeable during bootstrap.
230 */
231 if (s->refcount < 0)
232 return 1;
233
234 return 0;
235}
236
237struct kmem_cache *find_mergeable(size_t size, size_t align,
238 unsigned long flags, const char *name, void (*ctor)(void *))
239{
240 struct kmem_cache *s;
241
242 if (slab_nomerge || (flags & SLAB_NEVER_MERGE))
243 return NULL;
244
245 if (ctor)
246 return NULL;
247
248 size = ALIGN(size, sizeof(void *));
249 align = calculate_alignment(flags, align, size);
250 size = ALIGN(size, align);
251 flags = kmem_cache_flags(size, flags, name, NULL);
252
253 list_for_each_entry(s, &slab_caches, list) {
254 if (slab_unmergeable(s))
255 continue;
256
257 if (size > s->size)
258 continue;
259
260 if ((flags & SLAB_MERGE_SAME) != (s->flags & SLAB_MERGE_SAME))
261 continue;
262 /*
263 * Check if alignment is compatible.
264 * Courtesy of Adrian Drzewiecki
265 */
266 if ((s->size & ~(align - 1)) != s->size)
267 continue;
268
269 if (s->size - size >= sizeof(void *))
270 continue;
271
272 return s;
273 }
274 return NULL;
275}
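A compact user-space model of the merge test that slab_common.c now provides for both allocators. The flag masks below are placeholders and mergeable_into() is an invented name; only the shape of the checks is intended to mirror find_mergeable().

	#include <stdbool.h>
	#include <stddef.h>
	#include <stdio.h>

	#define NEVER_MERGE	0x1UL	/* e.g. poisoning/debugging flags */
	#define MERGE_SAME	0x2UL	/* flags that must match exactly */

	struct cache {
		size_t size;		/* object size after padding */
		size_t align;
		unsigned long flags;
		bool has_ctor;
	};

	static bool mergeable_into(const struct cache *existing,
				   size_t size, size_t align, unsigned long flags)
	{
		if ((existing->flags | flags) & NEVER_MERGE)
			return false;
		if (existing->has_ctor)
			return false;
		if (size > existing->size)			/* must fit */
			return false;
		if ((flags & MERGE_SAME) != (existing->flags & MERGE_SAME))
			return false;
		if (existing->size & (align - 1))		/* alignment compatible */
			return false;
		if (existing->size - size >= sizeof(void *))	/* not too wasteful */
			return false;
		return true;
	}

	int main(void)
	{
		struct cache c = { .size = 64, .align = 8, .flags = 0, .has_ctor = false };

		printf("%d %d\n", mergeable_into(&c, 60, 8, 0),
				  mergeable_into(&c, 40, 8, 0));	/* 1 0 */
		return 0;
	}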
108 276
109/* 277/*
110 * Figure out what the alignment of the objects will be given a set of 278 * Figure out what the alignment of the objects will be given a set of
@@ -211,8 +379,10 @@ kmem_cache_create(const char *name, size_t size, size_t align,
211 mutex_lock(&slab_mutex); 379 mutex_lock(&slab_mutex);
212 380
213 err = kmem_cache_sanity_check(name, size); 381 err = kmem_cache_sanity_check(name, size);
214 if (err) 382 if (err) {
383 s = NULL; /* suppress uninit var warning */
215 goto out_unlock; 384 goto out_unlock;
385 }
216 386
217 /* 387 /*
 218 * Some allocators will constrain the set of valid flags to a subset 388 * Some allocators will constrain the set of valid flags to a subset
diff --git a/mm/slob.c b/mm/slob.c
index 21980e0f39a8..96a86206a26b 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -468,7 +468,6 @@ void *__kmalloc(size_t size, gfp_t gfp)
468} 468}
469EXPORT_SYMBOL(__kmalloc); 469EXPORT_SYMBOL(__kmalloc);
470 470
471#ifdef CONFIG_TRACING
472void *__kmalloc_track_caller(size_t size, gfp_t gfp, unsigned long caller) 471void *__kmalloc_track_caller(size_t size, gfp_t gfp, unsigned long caller)
473{ 472{
474 return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, caller); 473 return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, caller);
@@ -481,7 +480,6 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfp,
481 return __do_kmalloc_node(size, gfp, node, caller); 480 return __do_kmalloc_node(size, gfp, node, caller);
482} 481}
483#endif 482#endif
484#endif
485 483
486void kfree(const void *block) 484void kfree(const void *block)
487{ 485{
diff --git a/mm/slub.c b/mm/slub.c
index 3e8afcc07a76..ae7b9f1ad394 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -169,16 +169,6 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
169 */ 169 */
170#define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) 170#define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
171 171
172/*
173 * Set of flags that will prevent slab merging
174 */
175#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
176 SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \
177 SLAB_FAILSLAB)
178
179#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
180 SLAB_CACHE_DMA | SLAB_NOTRACK)
181
182#define OO_SHIFT 16 172#define OO_SHIFT 16
183#define OO_MASK ((1 << OO_SHIFT) - 1) 173#define OO_MASK ((1 << OO_SHIFT) - 1)
184#define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */ 174#define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */
@@ -1176,7 +1166,7 @@ out:
1176 1166
1177__setup("slub_debug", setup_slub_debug); 1167__setup("slub_debug", setup_slub_debug);
1178 1168
1179static unsigned long kmem_cache_flags(unsigned long object_size, 1169unsigned long kmem_cache_flags(unsigned long object_size,
1180 unsigned long flags, const char *name, 1170 unsigned long flags, const char *name,
1181 void (*ctor)(void *)) 1171 void (*ctor)(void *))
1182{ 1172{
@@ -1208,7 +1198,7 @@ static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
1208 struct page *page) {} 1198 struct page *page) {}
1209static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, 1199static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n,
1210 struct page *page) {} 1200 struct page *page) {}
1211static inline unsigned long kmem_cache_flags(unsigned long object_size, 1201unsigned long kmem_cache_flags(unsigned long object_size,
1212 unsigned long flags, const char *name, 1202 unsigned long flags, const char *name,
1213 void (*ctor)(void *)) 1203 void (*ctor)(void *))
1214{ 1204{
@@ -1699,7 +1689,12 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
1699 struct kmem_cache_cpu *c) 1689 struct kmem_cache_cpu *c)
1700{ 1690{
1701 void *object; 1691 void *object;
1702 int searchnode = (node == NUMA_NO_NODE) ? numa_mem_id() : node; 1692 int searchnode = node;
1693
1694 if (node == NUMA_NO_NODE)
1695 searchnode = numa_mem_id();
1696 else if (!node_present_pages(node))
1697 searchnode = node_to_mem_node(node);
1703 1698
1704 object = get_partial_node(s, get_node(s, searchnode), c, flags); 1699 object = get_partial_node(s, get_node(s, searchnode), c, flags);
1705 if (object || node != NUMA_NO_NODE) 1700 if (object || node != NUMA_NO_NODE)
@@ -2280,11 +2275,18 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
2280redo: 2275redo:
2281 2276
2282 if (unlikely(!node_match(page, node))) { 2277 if (unlikely(!node_match(page, node))) {
2283 stat(s, ALLOC_NODE_MISMATCH); 2278 int searchnode = node;
2284 deactivate_slab(s, page, c->freelist); 2279
2285 c->page = NULL; 2280 if (node != NUMA_NO_NODE && !node_present_pages(node))
2286 c->freelist = NULL; 2281 searchnode = node_to_mem_node(node);
2287 goto new_slab; 2282
2283 if (unlikely(!node_match(page, searchnode))) {
2284 stat(s, ALLOC_NODE_MISMATCH);
2285 deactivate_slab(s, page, c->freelist);
2286 c->page = NULL;
2287 c->freelist = NULL;
2288 goto new_slab;
2289 }
2288 } 2290 }
2289 2291
2290 /* 2292 /*
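Both SLUB hunks above add the same fallback for memoryless NUMA nodes: if the requested node has no memory of its own, search the nearest node that does. A stand-alone sketch of the idea, with an invented two-socket topology table standing in for node_present_pages()/node_to_mem_node():

	#include <stdio.h>

	#define NR_NODES 4
	#define NO_NODE  (-1)

	static int node_has_pages[NR_NODES] = { 1, 0, 1, 0 };
	static int nearest_mem_node[NR_NODES] = { 0, 0, 2, 2 };

	static int pick_search_node(int requested, int local_node)
	{
		if (requested == NO_NODE)
			return local_node;		/* no preference: stay local */
		if (!node_has_pages[requested])
			return nearest_mem_node[requested];
		return requested;
	}

	int main(void)
	{
		printf("%d %d %d\n",
		       pick_search_node(NO_NODE, 0),	/* 0 */
		       pick_search_node(1, 0),		/* node 1 is memoryless -> 0 */
		       pick_search_node(2, 0));		/* 2 */
		return 0;
	}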
@@ -2707,12 +2709,6 @@ static int slub_max_order = PAGE_ALLOC_COSTLY_ORDER;
2707static int slub_min_objects; 2709static int slub_min_objects;
2708 2710
2709/* 2711/*
2710 * Merge control. If this is set then no merging of slab caches will occur.
2711 * (Could be removed. This was introduced to pacify the merge skeptics.)
2712 */
2713static int slub_nomerge;
2714
2715/*
2716 * Calculate the order of allocation given an slab object size. 2712 * Calculate the order of allocation given an slab object size.
2717 * 2713 *
2718 * The order of allocation has significant impact on performance and other 2714 * The order of allocation has significant impact on performance and other
@@ -3240,14 +3236,6 @@ static int __init setup_slub_min_objects(char *str)
3240 3236
3241__setup("slub_min_objects=", setup_slub_min_objects); 3237__setup("slub_min_objects=", setup_slub_min_objects);
3242 3238
3243static int __init setup_slub_nomerge(char *str)
3244{
3245 slub_nomerge = 1;
3246 return 1;
3247}
3248
3249__setup("slub_nomerge", setup_slub_nomerge);
3250
3251void *__kmalloc(size_t size, gfp_t flags) 3239void *__kmalloc(size_t size, gfp_t flags)
3252{ 3240{
3253 struct kmem_cache *s; 3241 struct kmem_cache *s;
@@ -3625,69 +3613,6 @@ void __init kmem_cache_init_late(void)
3625{ 3613{
3626} 3614}
3627 3615
3628/*
3629 * Find a mergeable slab cache
3630 */
3631static int slab_unmergeable(struct kmem_cache *s)
3632{
3633 if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE))
3634 return 1;
3635
3636 if (!is_root_cache(s))
3637 return 1;
3638
3639 if (s->ctor)
3640 return 1;
3641
3642 /*
3643 * We may have set a slab to be unmergeable during bootstrap.
3644 */
3645 if (s->refcount < 0)
3646 return 1;
3647
3648 return 0;
3649}
3650
3651static struct kmem_cache *find_mergeable(size_t size, size_t align,
3652 unsigned long flags, const char *name, void (*ctor)(void *))
3653{
3654 struct kmem_cache *s;
3655
3656 if (slub_nomerge || (flags & SLUB_NEVER_MERGE))
3657 return NULL;
3658
3659 if (ctor)
3660 return NULL;
3661
3662 size = ALIGN(size, sizeof(void *));
3663 align = calculate_alignment(flags, align, size);
3664 size = ALIGN(size, align);
3665 flags = kmem_cache_flags(size, flags, name, NULL);
3666
3667 list_for_each_entry(s, &slab_caches, list) {
3668 if (slab_unmergeable(s))
3669 continue;
3670
3671 if (size > s->size)
3672 continue;
3673
3674 if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME))
3675 continue;
3676 /*
3677 * Check if alignment is compatible.
3678 * Courtesy of Adrian Drzewiecki
3679 */
3680 if ((s->size & ~(align - 1)) != s->size)
3681 continue;
3682
3683 if (s->size - size >= sizeof(void *))
3684 continue;
3685
3686 return s;
3687 }
3688 return NULL;
3689}
3690
3691struct kmem_cache * 3616struct kmem_cache *
3692__kmem_cache_alias(const char *name, size_t size, size_t align, 3617__kmem_cache_alias(const char *name, size_t size, size_t align,
3693 unsigned long flags, void (*ctor)(void *)) 3618 unsigned long flags, void (*ctor)(void *))
@@ -4604,6 +4529,14 @@ static ssize_t trace_show(struct kmem_cache *s, char *buf)
4604static ssize_t trace_store(struct kmem_cache *s, const char *buf, 4529static ssize_t trace_store(struct kmem_cache *s, const char *buf,
4605 size_t length) 4530 size_t length)
4606{ 4531{
4532 /*
4533 * Tracing a merged cache is going to give confusing results
4534 * as well as cause other issues like converting a mergeable
 4535 * cache into an unmergeable one.
4536 */
4537 if (s->refcount > 1)
4538 return -EINVAL;
4539
4607 s->flags &= ~SLAB_TRACE; 4540 s->flags &= ~SLAB_TRACE;
4608 if (buf[0] == '1') { 4541 if (buf[0] == '1') {
4609 s->flags &= ~__CMPXCHG_DOUBLE; 4542 s->flags &= ~__CMPXCHG_DOUBLE;
@@ -4721,6 +4654,9 @@ static ssize_t failslab_show(struct kmem_cache *s, char *buf)
4721static ssize_t failslab_store(struct kmem_cache *s, const char *buf, 4654static ssize_t failslab_store(struct kmem_cache *s, const char *buf,
4722 size_t length) 4655 size_t length)
4723{ 4656{
4657 if (s->refcount > 1)
4658 return -EINVAL;
4659
4724 s->flags &= ~SLAB_FAILSLAB; 4660 s->flags &= ~SLAB_FAILSLAB;
4725 if (buf[0] == '1') 4661 if (buf[0] == '1')
4726 s->flags |= SLAB_FAILSLAB; 4662 s->flags |= SLAB_FAILSLAB;
diff --git a/mm/swap.c b/mm/swap.c
index 6b2dc3897cd5..8a12b33936b4 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -887,18 +887,14 @@ void lru_add_drain_all(void)
887 mutex_unlock(&lock); 887 mutex_unlock(&lock);
888} 888}
889 889
890/* 890/**
891 * Batched page_cache_release(). Decrement the reference count on all the 891 * release_pages - batched page_cache_release()
892 * passed pages. If it fell to zero then remove the page from the LRU and 892 * @pages: array of pages to release
893 * free it. 893 * @nr: number of pages
894 * 894 * @cold: whether the pages are cache cold
895 * Avoid taking zone->lru_lock if possible, but if it is taken, retain it
896 * for the remainder of the operation.
897 * 895 *
898 * The locking in this function is against shrink_inactive_list(): we recheck 896 * Decrement the reference count on all the pages in @pages. If it
899 * the page count inside the lock to see whether shrink_inactive_list() 897 * fell to zero, remove the page from the LRU and free it.
900 * grabbed the page via the LRU. If it did, give up: shrink_inactive_list()
901 * will free it.
902 */ 898 */
903void release_pages(struct page **pages, int nr, bool cold) 899void release_pages(struct page **pages, int nr, bool cold)
904{ 900{
@@ -907,6 +903,7 @@ void release_pages(struct page **pages, int nr, bool cold)
907 struct zone *zone = NULL; 903 struct zone *zone = NULL;
908 struct lruvec *lruvec; 904 struct lruvec *lruvec;
909 unsigned long uninitialized_var(flags); 905 unsigned long uninitialized_var(flags);
906 unsigned int uninitialized_var(lock_batch);
910 907
911 for (i = 0; i < nr; i++) { 908 for (i = 0; i < nr; i++) {
912 struct page *page = pages[i]; 909 struct page *page = pages[i];
@@ -920,6 +917,16 @@ void release_pages(struct page **pages, int nr, bool cold)
920 continue; 917 continue;
921 } 918 }
922 919
920 /*
921 * Make sure the IRQ-safe lock-holding time does not get
922 * excessive with a continuous string of pages from the
923 * same zone. The lock is held only if zone != NULL.
924 */
925 if (zone && ++lock_batch == SWAP_CLUSTER_MAX) {
926 spin_unlock_irqrestore(&zone->lru_lock, flags);
927 zone = NULL;
928 }
929
923 if (!put_page_testzero(page)) 930 if (!put_page_testzero(page))
924 continue; 931 continue;
925 932
@@ -930,6 +937,7 @@ void release_pages(struct page **pages, int nr, bool cold)
930 if (zone) 937 if (zone)
931 spin_unlock_irqrestore(&zone->lru_lock, 938 spin_unlock_irqrestore(&zone->lru_lock,
932 flags); 939 flags);
940 lock_batch = 0;
933 zone = pagezone; 941 zone = pagezone;
934 spin_lock_irqsave(&zone->lru_lock, flags); 942 spin_lock_irqsave(&zone->lru_lock, flags);
935 } 943 }
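A self-contained model of the batching added to release_pages(), with a pthread mutex standing in for the IRQ-safe zone->lru_lock and BATCH playing the role of SWAP_CLUSTER_MAX: a long run of pages from one zone can no longer keep the lock, and with it interrupts, held for an unbounded stretch.

	#include <pthread.h>
	#include <stdio.h>

	#define BATCH 32

	static pthread_mutex_t zone_lock = PTHREAD_MUTEX_INITIALIZER;

	static void release_items(int nr)
	{
		int locked = 0, lock_batch = 0;

		for (int i = 0; i < nr; i++) {
			if (locked && ++lock_batch == BATCH) {
				pthread_mutex_unlock(&zone_lock);	/* bound hold time */
				locked = 0;
			}
			if (!locked) {
				pthread_mutex_lock(&zone_lock);
				locked = 1;
				lock_batch = 0;
			}
			/* ...free item i back to its zone under the lock... */
		}
		if (locked)
			pthread_mutex_unlock(&zone_lock);
		printf("released %d items\n", nr);
	}

	int main(void)
	{
		release_items(100);
		return 0;
	}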
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 3e0ec83d000c..154444918685 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -28,7 +28,9 @@
28static const struct address_space_operations swap_aops = { 28static const struct address_space_operations swap_aops = {
29 .writepage = swap_writepage, 29 .writepage = swap_writepage,
30 .set_page_dirty = swap_set_page_dirty, 30 .set_page_dirty = swap_set_page_dirty,
31#ifdef CONFIG_MIGRATION
31 .migratepage = migrate_page, 32 .migratepage = migrate_page,
33#endif
32}; 34};
33 35
34static struct backing_dev_info swap_backing_dev_info = { 36static struct backing_dev_info swap_backing_dev_info = {
@@ -263,18 +265,12 @@ void free_page_and_swap_cache(struct page *page)
263void free_pages_and_swap_cache(struct page **pages, int nr) 265void free_pages_and_swap_cache(struct page **pages, int nr)
264{ 266{
265 struct page **pagep = pages; 267 struct page **pagep = pages;
268 int i;
266 269
267 lru_add_drain(); 270 lru_add_drain();
268 while (nr) { 271 for (i = 0; i < nr; i++)
269 int todo = min(nr, PAGEVEC_SIZE); 272 free_swap_cache(pagep[i]);
270 int i; 273 release_pages(pagep, nr, false);
271
272 for (i = 0; i < todo; i++)
273 free_swap_cache(pagep[i]);
274 release_pages(pagep, todo, false);
275 pagep += todo;
276 nr -= todo;
277 }
278} 274}
279 275
280/* 276/*
diff --git a/mm/util.c b/mm/util.c
index 093c973f1697..fec39d4509a9 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -170,32 +170,25 @@ static int vm_is_stack_for_task(struct task_struct *t,
170/* 170/*
171 * Check if the vma is being used as a stack. 171 * Check if the vma is being used as a stack.
172 * If is_group is non-zero, check in the entire thread group or else 172 * If is_group is non-zero, check in the entire thread group or else
173 * just check in the current task. Returns the pid of the task that 173 * just check in the current task. Returns the task_struct of the task
174 * the vma is stack for. 174 * that the vma is stack for. Must be called under rcu_read_lock().
175 */ 175 */
176pid_t vm_is_stack(struct task_struct *task, 176struct task_struct *task_of_stack(struct task_struct *task,
177 struct vm_area_struct *vma, int in_group) 177 struct vm_area_struct *vma, bool in_group)
178{ 178{
179 pid_t ret = 0;
180
181 if (vm_is_stack_for_task(task, vma)) 179 if (vm_is_stack_for_task(task, vma))
182 return task->pid; 180 return task;
183 181
184 if (in_group) { 182 if (in_group) {
185 struct task_struct *t; 183 struct task_struct *t;
186 184
187 rcu_read_lock();
188 for_each_thread(task, t) { 185 for_each_thread(task, t) {
189 if (vm_is_stack_for_task(t, vma)) { 186 if (vm_is_stack_for_task(t, vma))
190 ret = t->pid; 187 return t;
191 goto done;
192 }
193 } 188 }
194done:
195 rcu_read_unlock();
196 } 189 }
197 190
198 return ret; 191 return NULL;
199} 192}
200 193
201#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) 194#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 2b0aa5486092..90520af7f186 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2646,21 +2646,11 @@ static const struct seq_operations vmalloc_op = {
2646 2646
2647static int vmalloc_open(struct inode *inode, struct file *file) 2647static int vmalloc_open(struct inode *inode, struct file *file)
2648{ 2648{
2649 unsigned int *ptr = NULL; 2649 if (IS_ENABLED(CONFIG_NUMA))
2650 int ret; 2650 return seq_open_private(file, &vmalloc_op,
2651 2651 nr_node_ids * sizeof(unsigned int));
2652 if (IS_ENABLED(CONFIG_NUMA)) { 2652 else
2653 ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); 2653 return seq_open(file, &vmalloc_op);
2654 if (ptr == NULL)
2655 return -ENOMEM;
2656 }
2657 ret = seq_open(file, &vmalloc_op);
2658 if (!ret) {
2659 struct seq_file *m = file->private_data;
2660 m->private = ptr;
2661 } else
2662 kfree(ptr);
2663 return ret;
2664} 2654}
2665 2655
2666static const struct file_operations proc_vmalloc_operations = { 2656static const struct file_operations proc_vmalloc_operations = {
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2836b5373b2e..dcb47074ae03 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -920,7 +920,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
920 /* Case 1 above */ 920 /* Case 1 above */
921 if (current_is_kswapd() && 921 if (current_is_kswapd() &&
922 PageReclaim(page) && 922 PageReclaim(page) &&
923 zone_is_reclaim_writeback(zone)) { 923 test_bit(ZONE_WRITEBACK, &zone->flags)) {
924 nr_immediate++; 924 nr_immediate++;
925 goto keep_locked; 925 goto keep_locked;
926 926
@@ -1002,7 +1002,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
1002 */ 1002 */
1003 if (page_is_file_cache(page) && 1003 if (page_is_file_cache(page) &&
1004 (!current_is_kswapd() || 1004 (!current_is_kswapd() ||
1005 !zone_is_reclaim_dirty(zone))) { 1005 !test_bit(ZONE_DIRTY, &zone->flags))) {
1006 /* 1006 /*
1007 * Immediately reclaim when written back. 1007 * Immediately reclaim when written back.
 1008 * Similar in principle to deactivate_page() 1008 * Similar in principle to deactivate_page()
@@ -1563,7 +1563,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1563 * are encountered in the nr_immediate check below. 1563 * are encountered in the nr_immediate check below.
1564 */ 1564 */
1565 if (nr_writeback && nr_writeback == nr_taken) 1565 if (nr_writeback && nr_writeback == nr_taken)
1566 zone_set_flag(zone, ZONE_WRITEBACK); 1566 set_bit(ZONE_WRITEBACK, &zone->flags);
1567 1567
1568 /* 1568 /*
1569 * memcg will stall in page writeback so only consider forcibly 1569 * memcg will stall in page writeback so only consider forcibly
@@ -1575,16 +1575,16 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1575 * backed by a congested BDI and wait_iff_congested will stall. 1575 * backed by a congested BDI and wait_iff_congested will stall.
1576 */ 1576 */
1577 if (nr_dirty && nr_dirty == nr_congested) 1577 if (nr_dirty && nr_dirty == nr_congested)
1578 zone_set_flag(zone, ZONE_CONGESTED); 1578 set_bit(ZONE_CONGESTED, &zone->flags);
1579 1579
1580 /* 1580 /*
1581 * If dirty pages are scanned that are not queued for IO, it 1581 * If dirty pages are scanned that are not queued for IO, it
1582 * implies that flushers are not keeping up. In this case, flag 1582 * implies that flushers are not keeping up. In this case, flag
1583 * the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing 1583 * the zone ZONE_DIRTY and kswapd will start writing pages from
1584 * pages from reclaim context. 1584 * reclaim context.
1585 */ 1585 */
1586 if (nr_unqueued_dirty == nr_taken) 1586 if (nr_unqueued_dirty == nr_taken)
1587 zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY); 1587 set_bit(ZONE_DIRTY, &zone->flags);
1588 1588
1589 /* 1589 /*
 1590 * If kswapd scans pages marked for immediate 1590 * If kswapd scans pages marked for immediate
@@ -2315,7 +2315,10 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc)
2315 return reclaimable; 2315 return reclaimable;
2316} 2316}
2317 2317
2318/* Returns true if compaction should go ahead for a high-order request */ 2318/*
2319 * Returns true if compaction should go ahead for a high-order request, or
2320 * the high-order allocation would succeed without compaction.
2321 */
2319static inline bool compaction_ready(struct zone *zone, int order) 2322static inline bool compaction_ready(struct zone *zone, int order)
2320{ 2323{
2321 unsigned long balance_gap, watermark; 2324 unsigned long balance_gap, watermark;
@@ -2339,8 +2342,11 @@ static inline bool compaction_ready(struct zone *zone, int order)
2339 if (compaction_deferred(zone, order)) 2342 if (compaction_deferred(zone, order))
2340 return watermark_ok; 2343 return watermark_ok;
2341 2344
2342 /* If compaction is not ready to start, keep reclaiming */ 2345 /*
2343 if (!compaction_suitable(zone, order)) 2346 * If compaction is not ready to start and allocation is not likely
2347 * to succeed without it, then keep reclaiming.
2348 */
2349 if (compaction_suitable(zone, order) == COMPACT_SKIPPED)
2344 return false; 2350 return false;
2345 2351
2346 return watermark_ok; 2352 return watermark_ok;
@@ -2753,21 +2759,22 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
2753} 2759}
2754 2760
2755unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, 2761unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
2762 unsigned long nr_pages,
2756 gfp_t gfp_mask, 2763 gfp_t gfp_mask,
2757 bool noswap) 2764 bool may_swap)
2758{ 2765{
2759 struct zonelist *zonelist; 2766 struct zonelist *zonelist;
2760 unsigned long nr_reclaimed; 2767 unsigned long nr_reclaimed;
2761 int nid; 2768 int nid;
2762 struct scan_control sc = { 2769 struct scan_control sc = {
2763 .nr_to_reclaim = SWAP_CLUSTER_MAX, 2770 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
2764 .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2771 .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2765 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), 2772 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
2766 .target_mem_cgroup = memcg, 2773 .target_mem_cgroup = memcg,
2767 .priority = DEF_PRIORITY, 2774 .priority = DEF_PRIORITY,
2768 .may_writepage = !laptop_mode, 2775 .may_writepage = !laptop_mode,
2769 .may_unmap = 1, 2776 .may_unmap = 1,
2770 .may_swap = !noswap, 2777 .may_swap = may_swap,
2771 }; 2778 };
2772 2779
2773 /* 2780 /*
@@ -2818,7 +2825,7 @@ static bool zone_balanced(struct zone *zone, int order,
2818 return false; 2825 return false;
2819 2826
2820 if (IS_ENABLED(CONFIG_COMPACTION) && order && 2827 if (IS_ENABLED(CONFIG_COMPACTION) && order &&
2821 !compaction_suitable(zone, order)) 2828 compaction_suitable(zone, order) == COMPACT_SKIPPED)
2822 return false; 2829 return false;
2823 2830
2824 return true; 2831 return true;
@@ -2978,7 +2985,7 @@ static bool kswapd_shrink_zone(struct zone *zone,
2978 /* Account for the number of pages attempted to reclaim */ 2985 /* Account for the number of pages attempted to reclaim */
2979 *nr_attempted += sc->nr_to_reclaim; 2986 *nr_attempted += sc->nr_to_reclaim;
2980 2987
2981 zone_clear_flag(zone, ZONE_WRITEBACK); 2988 clear_bit(ZONE_WRITEBACK, &zone->flags);
2982 2989
2983 /* 2990 /*
2984 * If a zone reaches its high watermark, consider it to be no longer 2991 * If a zone reaches its high watermark, consider it to be no longer
@@ -2988,8 +2995,8 @@ static bool kswapd_shrink_zone(struct zone *zone,
2988 */ 2995 */
2989 if (zone_reclaimable(zone) && 2996 if (zone_reclaimable(zone) &&
2990 zone_balanced(zone, testorder, 0, classzone_idx)) { 2997 zone_balanced(zone, testorder, 0, classzone_idx)) {
2991 zone_clear_flag(zone, ZONE_CONGESTED); 2998 clear_bit(ZONE_CONGESTED, &zone->flags);
2992 zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY); 2999 clear_bit(ZONE_DIRTY, &zone->flags);
2993 } 3000 }
2994 3001
2995 return sc->nr_scanned >= sc->nr_to_reclaim; 3002 return sc->nr_scanned >= sc->nr_to_reclaim;
@@ -3080,8 +3087,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
3080 * If balanced, clear the dirty and congested 3087 * If balanced, clear the dirty and congested
3081 * flags 3088 * flags
3082 */ 3089 */
3083 zone_clear_flag(zone, ZONE_CONGESTED); 3090 clear_bit(ZONE_CONGESTED, &zone->flags);
3084 zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY); 3091 clear_bit(ZONE_DIRTY, &zone->flags);
3085 } 3092 }
3086 } 3093 }
3087 3094
@@ -3708,11 +3715,11 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3708 if (node_state(node_id, N_CPU) && node_id != numa_node_id()) 3715 if (node_state(node_id, N_CPU) && node_id != numa_node_id())
3709 return ZONE_RECLAIM_NOSCAN; 3716 return ZONE_RECLAIM_NOSCAN;
3710 3717
3711 if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED)) 3718 if (test_and_set_bit(ZONE_RECLAIM_LOCKED, &zone->flags))
3712 return ZONE_RECLAIM_NOSCAN; 3719 return ZONE_RECLAIM_NOSCAN;
3713 3720
3714 ret = __zone_reclaim(zone, gfp_mask, order); 3721 ret = __zone_reclaim(zone, gfp_mask, order);
3715 zone_clear_flag(zone, ZONE_RECLAIM_LOCKED); 3722 clear_bit(ZONE_RECLAIM_LOCKED, &zone->flags);
3716 3723
3717 if (!ret) 3724 if (!ret)
3718 count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED); 3725 count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
@@ -3791,66 +3798,3 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages)
3791 } 3798 }
3792} 3799}
3793#endif /* CONFIG_SHMEM */ 3800#endif /* CONFIG_SHMEM */
3794
3795static void warn_scan_unevictable_pages(void)
3796{
3797 printk_once(KERN_WARNING
3798 "%s: The scan_unevictable_pages sysctl/node-interface has been "
3799 "disabled for lack of a legitimate use case. If you have "
3800 "one, please send an email to linux-mm@kvack.org.\n",
3801 current->comm);
3802}
3803
3804/*
3805 * scan_unevictable_pages [vm] sysctl handler. On demand re-scan of
3806 * all nodes' unevictable lists for evictable pages
3807 */
3808unsigned long scan_unevictable_pages;
3809
3810int scan_unevictable_handler(struct ctl_table *table, int write,
3811 void __user *buffer,
3812 size_t *length, loff_t *ppos)
3813{
3814 warn_scan_unevictable_pages();
3815 proc_doulongvec_minmax(table, write, buffer, length, ppos);
3816 scan_unevictable_pages = 0;
3817 return 0;
3818}
3819
3820#ifdef CONFIG_NUMA
3821/*
3822 * per node 'scan_unevictable_pages' attribute. On demand re-scan of
3823 * a specified node's per zone unevictable lists for evictable pages.
3824 */
3825
3826static ssize_t read_scan_unevictable_node(struct device *dev,
3827 struct device_attribute *attr,
3828 char *buf)
3829{
3830 warn_scan_unevictable_pages();
3831 return sprintf(buf, "0\n"); /* always zero; should fit... */
3832}
3833
3834static ssize_t write_scan_unevictable_node(struct device *dev,
3835 struct device_attribute *attr,
3836 const char *buf, size_t count)
3837{
3838 warn_scan_unevictable_pages();
3839 return 1;
3840}
3841
3842
3843static DEVICE_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR,
3844 read_scan_unevictable_node,
3845 write_scan_unevictable_node);
3846
3847int scan_unevictable_register_node(struct node *node)
3848{
3849 return device_create_file(&node->dev, &dev_attr_scan_unevictable_pages);
3850}
3851
3852void scan_unevictable_unregister_node(struct node *node)
3853{
3854 device_remove_file(&node->dev, &dev_attr_scan_unevictable_pages);
3855}
3856#endif
diff --git a/mm/vmstat.c b/mm/vmstat.c
index e9ab104b956f..1b12d390dc68 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -7,6 +7,7 @@
7 * zoned VM statistics 7 * zoned VM statistics
8 * Copyright (C) 2006 Silicon Graphics, Inc., 8 * Copyright (C) 2006 Silicon Graphics, Inc.,
9 * Christoph Lameter <christoph@lameter.com> 9 * Christoph Lameter <christoph@lameter.com>
10 * Copyright (C) 2008-2014 Christoph Lameter
10 */ 11 */
11#include <linux/fs.h> 12#include <linux/fs.h>
12#include <linux/mm.h> 13#include <linux/mm.h>
@@ -14,6 +15,7 @@
14#include <linux/module.h> 15#include <linux/module.h>
15#include <linux/slab.h> 16#include <linux/slab.h>
16#include <linux/cpu.h> 17#include <linux/cpu.h>
18#include <linux/cpumask.h>
17#include <linux/vmstat.h> 19#include <linux/vmstat.h>
18#include <linux/sched.h> 20#include <linux/sched.h>
19#include <linux/math64.h> 21#include <linux/math64.h>
@@ -419,13 +421,22 @@ void dec_zone_page_state(struct page *page, enum zone_stat_item item)
419EXPORT_SYMBOL(dec_zone_page_state); 421EXPORT_SYMBOL(dec_zone_page_state);
420#endif 422#endif
421 423
422static inline void fold_diff(int *diff) 424
425/*
426 * Fold a differential into the global counters.
427 * Returns the number of counters updated.
428 */
429static int fold_diff(int *diff)
423{ 430{
424 int i; 431 int i;
432 int changes = 0;
425 433
426 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) 434 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
427 if (diff[i]) 435 if (diff[i]) {
428 atomic_long_add(diff[i], &vm_stat[i]); 436 atomic_long_add(diff[i], &vm_stat[i]);
437 changes++;
438 }
439 return changes;
429} 440}
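A tiny sketch of the new fold_diff() contract: report how many global counters were actually touched, which is what lets the caller decide whether the workload is still generating updates. NR_ITEMS and the arrays below are invented placeholders for the vm_stat machinery.

	#include <stdio.h>

	#define NR_ITEMS 8

	static long global_stat[NR_ITEMS];

	static int fold_diff(int *diff)
	{
		int changes = 0;

		for (int i = 0; i < NR_ITEMS; i++)
			if (diff[i]) {
				global_stat[i] += diff[i];
				changes++;
			}
		return changes;
	}

	int main(void)
	{
		int diff[NR_ITEMS] = { 0, 3, 0, -1 };

		printf("updated %d counters\n", fold_diff(diff));	/* 2 */
		return 0;
	}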
430 441
431/* 442/*
@@ -441,12 +452,15 @@ static inline void fold_diff(int *diff)
441 * statistics in the remote zone struct as well as the global cachelines 452 * statistics in the remote zone struct as well as the global cachelines
442 * with the global counters. These could cause remote node cache line 453 * with the global counters. These could cause remote node cache line
443 * bouncing and will have to be only done when necessary. 454 * bouncing and will have to be only done when necessary.
455 *
456 * The function returns the number of global counters updated.
444 */ 457 */
445static void refresh_cpu_vm_stats(void) 458static int refresh_cpu_vm_stats(void)
446{ 459{
447 struct zone *zone; 460 struct zone *zone;
448 int i; 461 int i;
449 int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; 462 int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
463 int changes = 0;
450 464
451 for_each_populated_zone(zone) { 465 for_each_populated_zone(zone) {
452 struct per_cpu_pageset __percpu *p = zone->pageset; 466 struct per_cpu_pageset __percpu *p = zone->pageset;
@@ -486,15 +500,17 @@ static void refresh_cpu_vm_stats(void)
486 continue; 500 continue;
487 } 501 }
488 502
489
490 if (__this_cpu_dec_return(p->expire)) 503 if (__this_cpu_dec_return(p->expire))
491 continue; 504 continue;
492 505
493 if (__this_cpu_read(p->pcp.count)) 506 if (__this_cpu_read(p->pcp.count)) {
494 drain_zone_pages(zone, this_cpu_ptr(&p->pcp)); 507 drain_zone_pages(zone, this_cpu_ptr(&p->pcp));
508 changes++;
509 }
495#endif 510#endif
496 } 511 }
497 fold_diff(global_diff); 512 changes += fold_diff(global_diff);
513 return changes;
498} 514}
499 515
500/* 516/*
@@ -735,7 +751,7 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
735 TEXT_FOR_HIGHMEM(xx) xx "_movable", 751 TEXT_FOR_HIGHMEM(xx) xx "_movable",
736 752
737const char * const vmstat_text[] = { 753const char * const vmstat_text[] = {
 738 /* Zoned VM counters */ 754 /* enum zone_stat_item counters */
739 "nr_free_pages", 755 "nr_free_pages",
740 "nr_alloc_batch", 756 "nr_alloc_batch",
741 "nr_inactive_anon", 757 "nr_inactive_anon",
@@ -778,10 +794,13 @@ const char * const vmstat_text[] = {
778 "workingset_nodereclaim", 794 "workingset_nodereclaim",
779 "nr_anon_transparent_hugepages", 795 "nr_anon_transparent_hugepages",
780 "nr_free_cma", 796 "nr_free_cma",
797
798 /* enum writeback_stat_item counters */
781 "nr_dirty_threshold", 799 "nr_dirty_threshold",
782 "nr_dirty_background_threshold", 800 "nr_dirty_background_threshold",
783 801
784#ifdef CONFIG_VM_EVENT_COUNTERS 802#ifdef CONFIG_VM_EVENT_COUNTERS
803 /* enum vm_event_item counters */
785 "pgpgin", 804 "pgpgin",
786 "pgpgout", 805 "pgpgout",
787 "pswpin", 806 "pswpin",
@@ -860,6 +879,13 @@ const char * const vmstat_text[] = {
860 "thp_zero_page_alloc", 879 "thp_zero_page_alloc",
861 "thp_zero_page_alloc_failed", 880 "thp_zero_page_alloc_failed",
862#endif 881#endif
882#ifdef CONFIG_MEMORY_BALLOON
883 "balloon_inflate",
884 "balloon_deflate",
885#ifdef CONFIG_BALLOON_COMPACTION
886 "balloon_migrate",
887#endif
888#endif /* CONFIG_MEMORY_BALLOON */
863#ifdef CONFIG_DEBUG_TLBFLUSH 889#ifdef CONFIG_DEBUG_TLBFLUSH
864#ifdef CONFIG_SMP 890#ifdef CONFIG_SMP
865 "nr_tlb_remote_flush", 891 "nr_tlb_remote_flush",
@@ -1229,20 +1255,108 @@ static const struct file_operations proc_vmstat_file_operations = {
1229#ifdef CONFIG_SMP 1255#ifdef CONFIG_SMP
1230static DEFINE_PER_CPU(struct delayed_work, vmstat_work); 1256static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
1231int sysctl_stat_interval __read_mostly = HZ; 1257int sysctl_stat_interval __read_mostly = HZ;
1258static cpumask_var_t cpu_stat_off;
1232 1259
1233static void vmstat_update(struct work_struct *w) 1260static void vmstat_update(struct work_struct *w)
1234{ 1261{
1235 refresh_cpu_vm_stats(); 1262 if (refresh_cpu_vm_stats())
1236 schedule_delayed_work(this_cpu_ptr(&vmstat_work), 1263 /*
1264 * Counters were updated so we expect more updates
1265 * to occur in the future. Keep on running the
1266 * update worker thread.
1267 */
1268 schedule_delayed_work(this_cpu_ptr(&vmstat_work),
1269 round_jiffies_relative(sysctl_stat_interval));
1270 else {
1271 /*
1272 * We did not update any counters so the app may be in
1273 * a mode where it does not cause counter updates.
1274 * We may be uselessly running vmstat_update.
1275 * Defer the checking for differentials to the
1276 * shepherd thread on a different processor.
1277 */
1278 int r;
1279 /*
 1280 * The shepherd work thread does not race with us since it
 1281 * never changes the bit if it is zero, but the CPU
 1282 * online / offline code may race if
 1283 * worker threads are still allowed during
 1284 * shutdown / startup.
1285 */
1286 r = cpumask_test_and_set_cpu(smp_processor_id(),
1287 cpu_stat_off);
1288 VM_BUG_ON(r);
1289 }
1290}
1291
1292/*
1293 * Check if the diffs for a certain cpu indicate that
1294 * an update is needed.
1295 */
1296static bool need_update(int cpu)
1297{
1298 struct zone *zone;
1299
1300 for_each_populated_zone(zone) {
1301 struct per_cpu_pageset *p = per_cpu_ptr(zone->pageset, cpu);
1302
1303 BUILD_BUG_ON(sizeof(p->vm_stat_diff[0]) != 1);
1304 /*
1305 * The fast way of checking if there are any vmstat diffs.
1306 * This works because the diffs are byte sized items.
1307 */
1308 if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS))
1309 return true;
1310
1311 }
1312 return false;
1313}
1314
1315
1316/*
 1317 * Shepherd worker thread that checks the
 1318 * differentials of processors whose vmstat worker
 1319 * threads have been disabled because of
 1320 * inactivity.
1321 */
1322static void vmstat_shepherd(struct work_struct *w);
1323
1324static DECLARE_DELAYED_WORK(shepherd, vmstat_shepherd);
1325
1326static void vmstat_shepherd(struct work_struct *w)
1327{
1328 int cpu;
1329
1330 get_online_cpus();
1331 /* Check processors whose vmstat worker threads have been disabled */
1332 for_each_cpu(cpu, cpu_stat_off)
1333 if (need_update(cpu) &&
1334 cpumask_test_and_clear_cpu(cpu, cpu_stat_off))
1335
1336 schedule_delayed_work_on(cpu, &per_cpu(vmstat_work, cpu),
1337 __round_jiffies_relative(sysctl_stat_interval, cpu));
1338
1339 put_online_cpus();
1340
1341 schedule_delayed_work(&shepherd,
1237 round_jiffies_relative(sysctl_stat_interval)); 1342 round_jiffies_relative(sysctl_stat_interval));
1343
1238} 1344}
1239 1345
1240static void start_cpu_timer(int cpu) 1346static void __init start_shepherd_timer(void)
1241{ 1347{
1242 struct delayed_work *work = &per_cpu(vmstat_work, cpu); 1348 int cpu;
1349
1350 for_each_possible_cpu(cpu)
1351 INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
1352 vmstat_update);
1353
1354 if (!alloc_cpumask_var(&cpu_stat_off, GFP_KERNEL))
1355 BUG();
1356 cpumask_copy(cpu_stat_off, cpu_online_mask);
1243 1357
1244 INIT_DEFERRABLE_WORK(work, vmstat_update); 1358 schedule_delayed_work(&shepherd,
1245 schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu)); 1359 round_jiffies_relative(sysctl_stat_interval));
1246} 1360}
1247 1361
1248static void vmstat_cpu_dead(int node) 1362static void vmstat_cpu_dead(int node)
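
need_update() can take the fast path because every per-cpu delta in vm_stat_diff[] is a single byte, so one memchr_inv() pass over the array tells whether anything needs folding. memchr_inv() is kernel-internal; the sketch below reproduces the same check in userspace with memcmp() against a zero block, with NR_ITEMS standing in for NR_VM_ZONE_STAT_ITEMS:

/* Sketch of need_update()'s fast check: any nonzero byte in the diff array? */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define NR_ITEMS 41			/* stand-in for NR_VM_ZONE_STAT_ITEMS */

static bool diffs_pending(const signed char *diff)
{
	static const signed char zeroes[NR_ITEMS];

	/* Works only because each delta is a single byte. */
	return memcmp(diff, zeroes, NR_ITEMS) != 0;
}

int main(void)
{
	signed char diff[NR_ITEMS] = { 0 };

	printf("pending: %d\n", diffs_pending(diff));	/* 0 */
	diff[7] = -3;
	printf("pending: %d\n", diffs_pending(diff));	/* 1 */
	return 0;
}
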
@@ -1273,17 +1387,17 @@ static int vmstat_cpuup_callback(struct notifier_block *nfb,
1273 case CPU_ONLINE: 1387 case CPU_ONLINE:
1274 case CPU_ONLINE_FROZEN: 1388 case CPU_ONLINE_FROZEN:
1275 refresh_zone_stat_thresholds(); 1389 refresh_zone_stat_thresholds();
1276 start_cpu_timer(cpu);
1277 node_set_state(cpu_to_node(cpu), N_CPU); 1390 node_set_state(cpu_to_node(cpu), N_CPU);
1391 cpumask_set_cpu(cpu, cpu_stat_off);
1278 break; 1392 break;
1279 case CPU_DOWN_PREPARE: 1393 case CPU_DOWN_PREPARE:
1280 case CPU_DOWN_PREPARE_FROZEN: 1394 case CPU_DOWN_PREPARE_FROZEN:
1281 cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu)); 1395 cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
1282 per_cpu(vmstat_work, cpu).work.func = NULL; 1396 cpumask_clear_cpu(cpu, cpu_stat_off);
1283 break; 1397 break;
1284 case CPU_DOWN_FAILED: 1398 case CPU_DOWN_FAILED:
1285 case CPU_DOWN_FAILED_FROZEN: 1399 case CPU_DOWN_FAILED_FROZEN:
1286 start_cpu_timer(cpu); 1400 cpumask_set_cpu(cpu, cpu_stat_off);
1287 break; 1401 break;
1288 case CPU_DEAD: 1402 case CPU_DEAD:
1289 case CPU_DEAD_FROZEN: 1403 case CPU_DEAD_FROZEN:
@@ -1303,15 +1417,10 @@ static struct notifier_block vmstat_notifier =
1303static int __init setup_vmstat(void) 1417static int __init setup_vmstat(void)
1304{ 1418{
1305#ifdef CONFIG_SMP 1419#ifdef CONFIG_SMP
1306 int cpu;
1307
1308 cpu_notifier_register_begin(); 1420 cpu_notifier_register_begin();
1309 __register_cpu_notifier(&vmstat_notifier); 1421 __register_cpu_notifier(&vmstat_notifier);
1310 1422
1311 for_each_online_cpu(cpu) { 1423 start_shepherd_timer();
1312 start_cpu_timer(cpu);
1313 node_set_state(cpu_to_node(cpu), N_CPU);
1314 }
1315 cpu_notifier_register_done(); 1424 cpu_notifier_register_done();
1316#endif 1425#endif
1317#ifdef CONFIG_PROC_FS 1426#ifdef CONFIG_PROC_FS
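
Taken together, the changes above replace unconditionally rescheduled per-cpu timers with workers that park themselves in cpu_stat_off when they find nothing to fold, plus a single shepherd that re-arms only the CPUs whose diffs have become nonzero. A compressed, single-threaded sketch of that hand-off, using a plain bitmask in place of the cpumask and ignoring the delayed-workqueue machinery (illustrative names only):

/* Single-threaded sketch of the vmstat worker / shepherd hand-off. */
#include <stdio.h>

#define NR_CPUS 4

static unsigned int cpu_stat_off;	/* bit set: worker parked on that cpu */
static int pending[NR_CPUS];		/* stand-in for per-cpu vm_stat_diff */

static void vmstat_tick(int cpu)
{
	if (pending[cpu]) {
		pending[cpu] = 0;	/* fold diffs and stay scheduled */
		printf("cpu%d: folded diffs, keeps its timer\n", cpu);
	} else {
		cpu_stat_off |= 1u << cpu;	/* park; shepherd takes over */
		printf("cpu%d: idle, parked\n", cpu);
	}
}

static void shepherd_tick(void)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		if ((cpu_stat_off & (1u << cpu)) && pending[cpu]) {
			cpu_stat_off &= ~(1u << cpu);	/* re-arm that worker */
			printf("shepherd: restarting cpu%d\n", cpu);
		}
}

int main(void)
{
	pending[2] = 1;
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		vmstat_tick(cpu);	/* cpus 0, 1, 3 park; cpu2 keeps running */
	pending[1] = 1;			/* new activity on a parked cpu */
	shepherd_tick();		/* only cpu1 is restarted */
	return 0;
}
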
diff --git a/mm/zbud.c b/mm/zbud.c
index f26e7fcc7fa2..ecf1dbef6983 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -60,15 +60,17 @@
60 * NCHUNKS_ORDER determines the internal allocation granularity, effectively 60 * NCHUNKS_ORDER determines the internal allocation granularity, effectively
61 * adjusting internal fragmentation. It also determines the number of 61 * adjusting internal fragmentation. It also determines the number of
62 * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the 62 * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the
63 * allocation granularity will be in chunks of size PAGE_SIZE/64, and there 63 * allocation granularity will be in chunks of size PAGE_SIZE/64. As one chunk
 64 * will be 64 freelists per pool. 64 * in each allocated page is occupied by the zbud header, NCHUNKS works out to
 65 * 63, the maximum number of free chunks in a zbud page; there will also be
66 * 63 freelists per pool.
65 */ 67 */
66#define NCHUNKS_ORDER 6 68#define NCHUNKS_ORDER 6
67 69
68#define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER) 70#define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER)
69#define CHUNK_SIZE (1 << CHUNK_SHIFT) 71#define CHUNK_SIZE (1 << CHUNK_SHIFT)
70#define NCHUNKS (PAGE_SIZE >> CHUNK_SHIFT)
71#define ZHDR_SIZE_ALIGNED CHUNK_SIZE 72#define ZHDR_SIZE_ALIGNED CHUNK_SIZE
73#define NCHUNKS ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT)
72 74
73/** 75/**
74 * struct zbud_pool - stores metadata for each zbud pool 76 * struct zbud_pool - stores metadata for each zbud pool
@@ -268,10 +270,9 @@ static int num_free_chunks(struct zbud_header *zhdr)
268{ 270{
269 /* 271 /*
270 * Rather than branch for different situations, just use the fact that 272 * Rather than branch for different situations, just use the fact that
271 * free buddies have a length of zero to simplify everything. -1 at the 273 * free buddies have a length of zero to simplify everything.
272 * end for the zbud header.
273 */ 274 */
274 return NCHUNKS - zhdr->first_chunks - zhdr->last_chunks - 1; 275 return NCHUNKS - zhdr->first_chunks - zhdr->last_chunks;
275} 276}
276 277
277/***************** 278/*****************
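
For a 4 KiB page the corrected arithmetic works out as follows: CHUNK_SHIFT = 12 - 6 = 6, so CHUNK_SIZE and ZHDR_SIZE_ALIGNED are both 64 bytes, and NCHUNKS = (4096 - 64) >> 6 = 63, which is why num_free_chunks() no longer subtracts 1 for the header. A trivial sketch that prints the derivation:

/* Worked example of the zbud chunk arithmetic for a 4 KiB page. */
#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define NCHUNKS_ORDER	6
#define CHUNK_SHIFT	(PAGE_SHIFT - NCHUNKS_ORDER)
#define CHUNK_SIZE	(1UL << CHUNK_SHIFT)
#define ZHDR_SIZE_ALIGNED CHUNK_SIZE
#define NCHUNKS		((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT)

int main(void)
{
	printf("chunk size: %lu bytes\n", CHUNK_SIZE);			/* 64 */
	printf("chunks per page incl. header: %lu\n",
	       PAGE_SIZE >> CHUNK_SHIFT);				/* 64 */
	printf("NCHUNKS (free chunks / freelists): %lu\n", NCHUNKS);	/* 63 */
	return 0;
}
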
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 94f38fac5e81..839a48c3ca27 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -175,7 +175,7 @@ enum fullness_group {
175 * n <= N / f, where 175 * n <= N / f, where
176 * n = number of allocated objects 176 * n = number of allocated objects
177 * N = total number of objects zspage can store 177 * N = total number of objects zspage can store
178 * f = 1/fullness_threshold_frac 178 * f = fullness_threshold_frac
179 * 179 *
180 * Similarly, we assign zspage to: 180 * Similarly, we assign zspage to:
181 * ZS_ALMOST_FULL when n > N / f 181 * ZS_ALMOST_FULL when n > N / f
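
With f defined as fullness_threshold_frac itself, the classification reads: a zspage holding n of its N possible objects is ZS_ALMOST_EMPTY when n <= N/f and ZS_ALMOST_FULL when n > N/f, with empty and completely full pages handled separately. A hedged sketch of that check, assuming the customary fullness_threshold_frac of 4; the names below are illustrative, not zsmalloc's internal API:

/* Sketch of the fullness classification described above (f = 4 assumed). */
#include <stdio.h>

enum fullness_group { ZS_ALMOST_FULL, ZS_ALMOST_EMPTY, ZS_EMPTY, ZS_FULL };

static const int fullness_threshold_frac = 4;	/* assumed default */

static enum fullness_group classify(int inuse, int max_objects)
{
	if (inuse == 0)
		return ZS_EMPTY;
	if (inuse == max_objects)
		return ZS_FULL;
	if (inuse <= max_objects / fullness_threshold_frac)
		return ZS_ALMOST_EMPTY;
	return ZS_ALMOST_FULL;
}

int main(void)
{
	/* With N = 100 and f = 4: n <= 25 is almost-empty, n > 25 almost-full. */
	printf("%d %d %d\n", classify(10, 100), classify(25, 100),
	       classify(26, 100));
	return 0;
}
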
@@ -199,9 +199,6 @@ struct size_class {
199 199
200 spinlock_t lock; 200 spinlock_t lock;
201 201
202 /* stats */
203 u64 pages_allocated;
204
205 struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS]; 202 struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
206}; 203};
207 204
@@ -220,6 +217,7 @@ struct zs_pool {
220 struct size_class size_class[ZS_SIZE_CLASSES]; 217 struct size_class size_class[ZS_SIZE_CLASSES];
221 218
222 gfp_t flags; /* allocation flags used when growing pool */ 219 gfp_t flags; /* allocation flags used when growing pool */
220 atomic_long_t pages_allocated;
223}; 221};
224 222
225/* 223/*
@@ -299,7 +297,7 @@ static void zs_zpool_unmap(void *pool, unsigned long handle)
299 297
300static u64 zs_zpool_total_size(void *pool) 298static u64 zs_zpool_total_size(void *pool)
301{ 299{
302 return zs_get_total_size_bytes(pool); 300 return zs_get_total_pages(pool) << PAGE_SHIFT;
303} 301}
304 302
305static struct zpool_driver zs_zpool_driver = { 303static struct zpool_driver zs_zpool_driver = {
@@ -630,7 +628,7 @@ static void init_zspage(struct page *first_page, struct size_class *class)
630 while (page) { 628 while (page) {
631 struct page *next_page; 629 struct page *next_page;
632 struct link_free *link; 630 struct link_free *link;
633 unsigned int i, objs_on_page; 631 unsigned int i = 1;
634 632
635 /* 633 /*
636 * page->index stores offset of first object starting 634 * page->index stores offset of first object starting
@@ -643,14 +641,10 @@ static void init_zspage(struct page *first_page, struct size_class *class)
643 641
644 link = (struct link_free *)kmap_atomic(page) + 642 link = (struct link_free *)kmap_atomic(page) +
645 off / sizeof(*link); 643 off / sizeof(*link);
646 objs_on_page = (PAGE_SIZE - off) / class->size;
647 644
648 for (i = 1; i <= objs_on_page; i++) { 645 while ((off += class->size) < PAGE_SIZE) {
649 off += class->size; 646 link->next = obj_location_to_handle(page, i++);
650 if (off < PAGE_SIZE) { 647 link += class->size / sizeof(*link);
651 link->next = obj_location_to_handle(page, i);
652 link += class->size / sizeof(*link);
653 }
654 } 648 }
655 649
656 /* 650 /*
@@ -662,7 +656,7 @@ static void init_zspage(struct page *first_page, struct size_class *class)
662 link->next = obj_location_to_handle(next_page, 0); 656 link->next = obj_location_to_handle(next_page, 0);
663 kunmap_atomic(link); 657 kunmap_atomic(link);
664 page = next_page; 658 page = next_page;
665 off = (off + class->size) % PAGE_SIZE; 659 off %= PAGE_SIZE;
666 } 660 }
667} 661}
668 662
@@ -1028,8 +1022,9 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
1028 return 0; 1022 return 0;
1029 1023
1030 set_zspage_mapping(first_page, class->index, ZS_EMPTY); 1024 set_zspage_mapping(first_page, class->index, ZS_EMPTY);
1025 atomic_long_add(class->pages_per_zspage,
1026 &pool->pages_allocated);
1031 spin_lock(&class->lock); 1027 spin_lock(&class->lock);
1032 class->pages_allocated += class->pages_per_zspage;
1033 } 1028 }
1034 1029
1035 obj = (unsigned long)first_page->freelist; 1030 obj = (unsigned long)first_page->freelist;
@@ -1082,14 +1077,13 @@ void zs_free(struct zs_pool *pool, unsigned long obj)
1082 1077
1083 first_page->inuse--; 1078 first_page->inuse--;
1084 fullness = fix_fullness_group(pool, first_page); 1079 fullness = fix_fullness_group(pool, first_page);
1085
1086 if (fullness == ZS_EMPTY)
1087 class->pages_allocated -= class->pages_per_zspage;
1088
1089 spin_unlock(&class->lock); 1080 spin_unlock(&class->lock);
1090 1081
1091 if (fullness == ZS_EMPTY) 1082 if (fullness == ZS_EMPTY) {
1083 atomic_long_sub(class->pages_per_zspage,
1084 &pool->pages_allocated);
1092 free_zspage(first_page); 1085 free_zspage(first_page);
1086 }
1093} 1087}
1094EXPORT_SYMBOL_GPL(zs_free); 1088EXPORT_SYMBOL_GPL(zs_free);
1095 1089
@@ -1183,17 +1177,11 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
1183} 1177}
1184EXPORT_SYMBOL_GPL(zs_unmap_object); 1178EXPORT_SYMBOL_GPL(zs_unmap_object);
1185 1179
1186u64 zs_get_total_size_bytes(struct zs_pool *pool) 1180unsigned long zs_get_total_pages(struct zs_pool *pool)
1187{ 1181{
1188 int i; 1182 return atomic_long_read(&pool->pages_allocated);
1189 u64 npages = 0;
1190
1191 for (i = 0; i < ZS_SIZE_CLASSES; i++)
1192 npages += pool->size_class[i].pages_allocated;
1193
1194 return npages << PAGE_SHIFT;
1195} 1183}
1196EXPORT_SYMBOL_GPL(zs_get_total_size_bytes); 1184EXPORT_SYMBOL_GPL(zs_get_total_pages);
1197 1185
1198module_init(zs_init); 1186module_init(zs_init);
1199module_exit(zs_exit); 1187module_exit(zs_exit);
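
The per-class u64 pages_allocated, which was only consistent under each class lock, becomes one pool-wide atomic counter, so zs_get_total_pages() is a lock-free read and the zpool size callback merely shifts it by PAGE_SHIFT. A userspace sketch of the same pattern using C11 atomics in place of the kernel's atomic_long_t (function and struct names are illustrative):

/* Sketch: pool-wide atomic page counter replacing per-class locked counters. */
#include <stdatomic.h>
#include <stdio.h>

#define PAGE_SHIFT 12

struct pool {
	atomic_long pages_allocated;
};

static void alloc_zspage(struct pool *p, long pages_per_zspage)
{
	atomic_fetch_add(&p->pages_allocated, pages_per_zspage);
}

static void free_zspage(struct pool *p, long pages_per_zspage)
{
	atomic_fetch_sub(&p->pages_allocated, pages_per_zspage);
}

static unsigned long total_size_bytes(struct pool *p)
{
	/* Lock-free read, then convert pages to bytes as the zpool hook does. */
	return (unsigned long)atomic_load(&p->pages_allocated) << PAGE_SHIFT;
}

int main(void)
{
	struct pool p;

	atomic_init(&p.pages_allocated, 0);
	alloc_zspage(&p, 4);
	alloc_zspage(&p, 4);
	free_zspage(&p, 4);
	printf("%lu bytes\n", total_size_bytes(&p));	/* 4 pages -> 16384 */
	return 0;
}
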
diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
index 3f94e1afd6cf..4c4b1f631ecf 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -3,6 +3,7 @@
3CC = $(CROSS_COMPILE)gcc 3CC = $(CROSS_COMPILE)gcc
4CFLAGS = -Wall 4CFLAGS = -Wall
5BINARIES = hugepage-mmap hugepage-shm map_hugetlb thuge-gen hugetlbfstest 5BINARIES = hugepage-mmap hugepage-shm map_hugetlb thuge-gen hugetlbfstest
6BINARIES += transhuge-stress
6 7
7all: $(BINARIES) 8all: $(BINARIES)
8%: %.c 9%: %.c
diff --git a/tools/testing/selftests/vm/transhuge-stress.c b/tools/testing/selftests/vm/transhuge-stress.c
new file mode 100644
index 000000000000..fd7f1b4a96f9
--- /dev/null
+++ b/tools/testing/selftests/vm/transhuge-stress.c
@@ -0,0 +1,144 @@
1/*
2 * Stress test for transparent huge pages, memory compaction and migration.
3 *
4 * Authors: Konstantin Khlebnikov <koct9i@gmail.com>
5 *
6 * This is free and unencumbered software released into the public domain.
7 */
8
9#include <stdlib.h>
10#include <stdio.h>
11#include <stdint.h>
12#include <err.h>
13#include <time.h>
14#include <unistd.h>
15#include <fcntl.h>
16#include <string.h>
17#include <sys/mman.h>
18
19#define PAGE_SHIFT 12
20#define HPAGE_SHIFT 21
21
22#define PAGE_SIZE (1 << PAGE_SHIFT)
23#define HPAGE_SIZE (1 << HPAGE_SHIFT)
24
25#define PAGEMAP_PRESENT(ent) (((ent) & (1ull << 63)) != 0)
26#define PAGEMAP_PFN(ent) ((ent) & ((1ull << 55) - 1))
27
28int pagemap_fd;
29
30int64_t allocate_transhuge(void *ptr)
31{
32 uint64_t ent[2];
33
34 /* drop pmd */
35 if (mmap(ptr, HPAGE_SIZE, PROT_READ | PROT_WRITE,
36 MAP_FIXED | MAP_ANONYMOUS |
37 MAP_NORESERVE | MAP_PRIVATE, -1, 0) != ptr)
38 errx(2, "mmap transhuge");
39
40 if (madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE))
41 err(2, "MADV_HUGEPAGE");
42
43 /* allocate transparent huge page */
44 *(volatile void **)ptr = ptr;
45
46 if (pread(pagemap_fd, ent, sizeof(ent),
47 (uintptr_t)ptr >> (PAGE_SHIFT - 3)) != sizeof(ent))
48 err(2, "read pagemap");
49
50 if (PAGEMAP_PRESENT(ent[0]) && PAGEMAP_PRESENT(ent[1]) &&
51 PAGEMAP_PFN(ent[0]) + 1 == PAGEMAP_PFN(ent[1]) &&
52 !(PAGEMAP_PFN(ent[0]) & ((1 << (HPAGE_SHIFT - PAGE_SHIFT)) - 1)))
53 return PAGEMAP_PFN(ent[0]);
54
55 return -1;
56}
57
58int main(int argc, char **argv)
59{
60 size_t ram, len;
61 void *ptr, *p;
62 struct timespec a, b;
63 double s;
64 uint8_t *map;
65 size_t map_len;
66
67 ram = sysconf(_SC_PHYS_PAGES);
68 if (ram > SIZE_MAX / sysconf(_SC_PAGESIZE) / 4)
69 ram = SIZE_MAX / 4;
70 else
71 ram *= sysconf(_SC_PAGESIZE);
72
73 if (argc == 1)
74 len = ram;
75 else if (!strcmp(argv[1], "-h"))
76 errx(1, "usage: %s [size in MiB]", argv[0]);
77 else
78 len = atoll(argv[1]) << 20;
79
80 warnx("allocate %zd transhuge pages, using %zd MiB virtual memory"
81 " and %zd MiB of ram", len >> HPAGE_SHIFT, len >> 20,
82 len >> (20 + HPAGE_SHIFT - PAGE_SHIFT - 1));
83
84 pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
85 if (pagemap_fd < 0)
86 err(2, "open pagemap");
87
88 len -= len % HPAGE_SIZE;
89 ptr = mmap(NULL, len + HPAGE_SIZE, PROT_READ | PROT_WRITE,
90 MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE, -1, 0);
91 if (ptr == MAP_FAILED)
92 err(2, "initial mmap");
93 ptr += HPAGE_SIZE - (uintptr_t)ptr % HPAGE_SIZE;
94
95 if (madvise(ptr, len, MADV_HUGEPAGE))
96 err(2, "MADV_HUGEPAGE");
97
98 map_len = ram >> (HPAGE_SHIFT - 1);
99 map = malloc(map_len);
100 if (!map)
101 errx(2, "map malloc");
102
103 while (1) {
104 int nr_succeed = 0, nr_failed = 0, nr_pages = 0;
105
106 memset(map, 0, map_len);
107
108 clock_gettime(CLOCK_MONOTONIC, &a);
109 for (p = ptr; p < ptr + len; p += HPAGE_SIZE) {
110 int64_t pfn;
111
112 pfn = allocate_transhuge(p);
113
114 if (pfn < 0) {
115 nr_failed++;
116 } else {
117 size_t idx = pfn >> (HPAGE_SHIFT - PAGE_SHIFT);
118
119 nr_succeed++;
120 if (idx >= map_len) {
121 map = realloc(map, idx + 1);
122 if (!map)
123 errx(2, "map realloc");
124 memset(map + map_len, 0, idx + 1 - map_len);
125 map_len = idx + 1;
126 }
127 if (!map[idx])
128 nr_pages++;
129 map[idx] = 1;
130 }
131
132 /* split transhuge page, keep last page */
133 if (madvise(p, HPAGE_SIZE - PAGE_SIZE, MADV_DONTNEED))
134 err(2, "MADV_DONTNEED");
135 }
136 clock_gettime(CLOCK_MONOTONIC, &b);
137 s = b.tv_sec - a.tv_sec + (b.tv_nsec - a.tv_nsec) / 1000000000.;
138
139 warnx("%.3f s/loop, %.3f ms/page, %10.3f MiB/s\t"
140 "%4d succeed, %4d failed, %4d different pages",
141 s, s * 1000 / (len >> HPAGE_SHIFT), len / s / (1 << 20),
142 nr_succeed, nr_failed, nr_pages);
143 }
144}
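
transhuge-stress decides whether a huge page was really assembled by reading /proc/self/pagemap: bit 63 of each 64-bit entry is the present flag and bits 0-54 hold the PFN, so a 2 MiB THP appears as 512 consecutive, 2 MiB-aligned PFNs. A stand-alone sketch that decodes the entry for a single touched page follows; note that unprivileged reads may report a PFN of zero on later kernels:

/* Decode one /proc/self/pagemap entry: present bit 63, PFN in bits 0-54. */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define PAGE_SHIFT 12

int main(void)
{
	uint64_t ent;
	volatile char *buf = malloc(1 << PAGE_SHIFT);
	int fd = open("/proc/self/pagemap", O_RDONLY);

	if (!buf || fd < 0)
		return 1;
	buf[0] = 1;		/* fault the page in so it is present */

	off_t off = ((uintptr_t)buf >> PAGE_SHIFT) * sizeof(ent);
	if (pread(fd, &ent, sizeof(ent), off) != sizeof(ent))
		return 1;

	printf("present=%d pfn=%llu\n",
	       (int)((ent >> 63) & 1),
	       (unsigned long long)(ent & ((1ull << 55) - 1)));
	return 0;
}

The test itself is built with the rest of the vm selftests via the Makefile change above and optionally takes a size in MiB on the command line, per its usage string.
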
diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c
index c4d6d2e20e0d..264fbc297e0b 100644
--- a/tools/vm/page-types.c
+++ b/tools/vm/page-types.c
@@ -132,6 +132,7 @@ static const char * const page_flag_names[] = {
132 [KPF_NOPAGE] = "n:nopage", 132 [KPF_NOPAGE] = "n:nopage",
133 [KPF_KSM] = "x:ksm", 133 [KPF_KSM] = "x:ksm",
134 [KPF_THP] = "t:thp", 134 [KPF_THP] = "t:thp",
135 [KPF_BALLOON] = "o:balloon",
135 136
136 [KPF_RESERVED] = "r:reserved", 137 [KPF_RESERVED] = "r:reserved",
137 [KPF_MLOCKED] = "m:mlocked", 138 [KPF_MLOCKED] = "m:mlocked",