Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig | 100
-rw-r--r--  mm/Kconfig.debug | 20
-rw-r--r--  mm/Makefile | 17
-rw-r--r--  mm/allocpercpu.c | 30
-rw-r--r--  mm/backing-dev.c | 465
-rw-r--r--  mm/bootmem.c | 60
-rw-r--r--  mm/bounce.c | 10
-rw-r--r--  mm/dmapool.c | 2
-rw-r--r--  mm/fadvise.c | 2
-rw-r--r--  mm/failslab.c | 1
-rw-r--r--  mm/filemap.c | 382
-rw-r--r--  mm/filemap_xip.c | 6
-rw-r--r--  mm/highmem.c | 18
-rw-r--r--  mm/hugetlb.c | 385
-rw-r--r--  mm/hwpoison-inject.c | 41
-rw-r--r--  mm/init-mm.c | 20
-rw-r--r--  mm/internal.h | 43
-rw-r--r--  mm/kmemcheck.c | 122
-rw-r--r--  mm/kmemleak-test.c | 111
-rw-r--r--  mm/kmemleak.c | 1689
-rw-r--r--  mm/ksm.c | 1710
-rw-r--r--  mm/maccess.c | 2
-rw-r--r--  mm/madvise.c | 93
-rw-r--r--  mm/memcontrol.c | 1563
-rw-r--r--  mm/memory-failure.c | 835
-rw-r--r--  mm/memory.c | 630
-rw-r--r--  mm/memory_hotplug.c | 43
-rw-r--r--  mm/mempolicy.c | 192
-rw-r--r--  mm/mempool.c | 9
-rw-r--r--  mm/migrate.c | 42
-rw-r--r--  mm/mlock.c | 201
-rw-r--r--  mm/mmap.c | 81
-rw-r--r--  mm/mmu_context.c | 58
-rw-r--r--  mm/mmu_notifier.c | 20
-rw-r--r--  mm/mmzone.c | 15
-rw-r--r--  mm/mprotect.c | 2
-rw-r--r--  mm/mremap.c | 18
-rw-r--r--  mm/nommu.c | 239
-rw-r--r--  mm/oom_kill.c | 153
-rw-r--r--  mm/page-writeback.c | 287
-rw-r--r--  mm/page_alloc.c | 1211
-rw-r--r--  mm/page_cgroup.c | 70
-rw-r--r--  mm/page_io.c | 2
-rw-r--r--  mm/pdflush.c | 251
-rw-r--r--  mm/percpu.c | 1728
-rw-r--r--  mm/quicklist.c | 5
-rw-r--r--  mm/readahead.c | 185
-rw-r--r--  mm/rmap.c | 188
-rw-r--r--  mm/shmem.c | 91
-rw-r--r--  mm/shmem_acl.c | 40
-rw-r--r--  mm/slab.c | 349
-rw-r--r--  mm/slob.c | 60
-rw-r--r--  mm/slub.c | 357
-rw-r--r--  mm/sparse-vmemmap.c | 8
-rw-r--r--  mm/sparse.c | 9
-rw-r--r--  mm/swap.c | 58
-rw-r--r--  mm/swap_state.c | 153
-rw-r--r--  mm/swapfile.c | 311
-rw-r--r--  mm/thrash.c | 32
-rw-r--r--  mm/truncate.c | 186
-rw-r--r--  mm/util.c | 39
-rw-r--r--  mm/vmalloc.c | 643
-rw-r--r--  mm/vmscan.c | 680
-rw-r--r--  mm/vmstat.c | 48
64 files changed, 12516 insertions, 3905 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index b53427ad30a3..44cf6f0a3a6d 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -67,7 +67,7 @@ config DISCONTIGMEM
67 67
68config SPARSEMEM 68config SPARSEMEM
69 def_bool y 69 def_bool y
70 depends on SPARSEMEM_MANUAL 70 depends on (!SELECT_MEMORY_MODEL && ARCH_SPARSEMEM_ENABLE) || SPARSEMEM_MANUAL
71 71
72config FLATMEM 72config FLATMEM
73 def_bool y 73 def_bool y
@@ -128,11 +128,8 @@ config SPARSEMEM_VMEMMAP
128config MEMORY_HOTPLUG 128config MEMORY_HOTPLUG
129 bool "Allow for memory hot-add" 129 bool "Allow for memory hot-add"
130 depends on SPARSEMEM || X86_64_ACPI_NUMA 130 depends on SPARSEMEM || X86_64_ACPI_NUMA
131 depends on HOTPLUG && !HIBERNATION && ARCH_ENABLE_MEMORY_HOTPLUG 131 depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG
132 depends on (IA64 || X86 || PPC64 || SUPERH || S390) 132 depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390)
133
134comment "Memory hotplug is currently incompatible with Software Suspend"
135 depends on SPARSEMEM && HOTPLUG && HIBERNATION
136 133
137config MEMORY_HOTPLUG_SPARSE 134config MEMORY_HOTPLUG_SPARSE
138 def_bool y 135 def_bool y
@@ -153,7 +150,7 @@ config MEMORY_HOTREMOVE
153# 150#
154config PAGEFLAGS_EXTENDED 151config PAGEFLAGS_EXTENDED
155 def_bool y 152 def_bool y
156 depends on 64BIT || SPARSEMEM_VMEMMAP || !NUMA || !SPARSEMEM 153 depends on 64BIT || SPARSEMEM_VMEMMAP || !SPARSEMEM
157 154
158# Heavily threaded applications may benefit from splitting the mm-wide 155# Heavily threaded applications may benefit from splitting the mm-wide
159# page_table_lock, so that faults on different parts of the user address 156# page_table_lock, so that faults on different parts of the user address
@@ -203,23 +200,90 @@ config VIRT_TO_BUS
203 def_bool y 200 def_bool y
204 depends on !ARCH_NO_VIRT_TO_BUS 201 depends on !ARCH_NO_VIRT_TO_BUS
205 202
206config UNEVICTABLE_LRU
207 bool "Add LRU list to track non-evictable pages"
208 default y
209 help
210 Keeps unevictable pages off of the active and inactive pageout
211 lists, so kswapd will not waste CPU time or have its balancing
212 algorithms thrown off by scanning these pages. Selecting this
213 will use one page flag and increase the code size a little,
214 say Y unless you know what you are doing.
215
216config HAVE_MLOCK 203config HAVE_MLOCK
217 bool 204 bool
218 default y if MMU=y 205 default y if MMU=y
219 206
220config HAVE_MLOCKED_PAGE_BIT 207config HAVE_MLOCKED_PAGE_BIT
221 bool 208 bool
222 default y if HAVE_MLOCK=y && UNEVICTABLE_LRU=y 209 default y if HAVE_MLOCK=y
223 210
224config MMU_NOTIFIER 211config MMU_NOTIFIER
225 bool 212 bool
213
214config KSM
215 bool "Enable KSM for page merging"
216 depends on MMU
217 help
218 Enable Kernel Samepage Merging: KSM periodically scans those areas
219 of an application's address space that an app has advised may be
220 mergeable. When it finds pages of identical content, it replaces
221 the many instances by a single resident page with that content, so
222 saving memory until one or another app needs to modify the content.
223 Recommended for use with KVM, or with other duplicative applications.
224 See Documentation/vm/ksm.txt for more information: KSM is inactive
225 until a program has madvised that an area is MADV_MERGEABLE, and
226 root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set).
227
228config DEFAULT_MMAP_MIN_ADDR
229 int "Low address space to protect from user allocation"
230 default 4096
231 help
232 This is the portion of low virtual memory which should be protected
233 from userspace allocation. Keeping a user from writing to low pages
234 can help reduce the impact of kernel NULL pointer bugs.
235
236 For most ia64, ppc64 and x86 users with lots of address space
237 a value of 65536 is reasonable and should cause no problems.
238 On arm and other archs it should not be higher than 32768.
239 Programs which use vm86 functionality or have some need to map
240 this low address space will need CAP_SYS_RAWIO or disable this
241 protection by setting the value to 0.
242
243 This value can be changed after boot using the
244 /proc/sys/vm/mmap_min_addr tunable.
245
246config ARCH_SUPPORTS_MEMORY_FAILURE
247 bool
248
249config MEMORY_FAILURE
250 depends on MMU
251 depends on ARCH_SUPPORTS_MEMORY_FAILURE
252 bool "Enable recovery from hardware memory errors"
253 help
254 Enables code to recover from some memory failures on systems
255 with MCA recovery. This allows a system to continue running
256 even when some of its memory has uncorrected errors. This requires
257 special hardware support and typically ECC memory.
258
259config HWPOISON_INJECT
260 tristate "Poison pages injector"
261 depends on MEMORY_FAILURE && DEBUG_KERNEL
262
263config NOMMU_INITIAL_TRIM_EXCESS
264 int "Turn on mmap() excess space trimming before booting"
265 depends on !MMU
266 default 1
267 help
268 The NOMMU mmap() frequently needs to allocate large contiguous chunks
269 of memory on which to store mappings, but it can only ask the system
270 allocator for chunks in 2^N*PAGE_SIZE amounts - which is frequently
271 more than it requires. To deal with this, mmap() is able to trim off
272 the excess and return it to the allocator.
273
274 If trimming is enabled, the excess is trimmed off and returned to the
275 system allocator, which can cause extra fragmentation, particularly
276 if there are a lot of transient processes.
277
278 If trimming is disabled, the excess is kept, but not used, which for
279 long-term mappings means that the space is wasted.
280
281 Trimming can be dynamically controlled through a sysctl option
282 (/proc/sys/vm/nr_trim_pages) which specifies the minimum number of
283 excess pages there must be before trimming should occur, or zero if
284 no trimming is to occur.
285
286 This option specifies the initial value of this option. The default
287 of 1 says that all excess pages should be trimmed.
288
289 See Documentation/nommu-mmap.txt for more information.
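
The KSM help text above describes the activation flow: merging only covers ranges an application has advised as mergeable, and scanning starts only once root enables it through sysfs. The following userspace sketch is illustrative only (not part of this patch) and assumes MADV_MERGEABLE carries the value 12 used by this kernel series.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_MERGEABLE
#define MADV_MERGEABLE 12	/* assumed value for this kernel series */
#endif

int main(void)
{
	size_t len = 64 * 4096;
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Identical page contents give the scanner something to merge. */
	memset(buf, 0x5a, len);

	/* Mark the range as mergeable; scanning is still off by default. */
	if (madvise(buf, len, MADV_MERGEABLE))
		perror("madvise(MADV_MERGEABLE)");

	/* Merging starts only after root runs:
	 *   echo 1 > /sys/kernel/mm/ksm/run
	 * Progress is visible in /sys/kernel/mm/ksm/pages_shared.
	 */
	pause();
	return 0;
}

After enough scan passes, identical pages in the advised region are expected to collapse to a single shared copy, as described in Documentation/vm/ksm.txt.
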
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index c8d62d49a44e..af7cfb43d2f0 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -1,3 +1,13 @@
1config DEBUG_PAGEALLOC
2 bool "Debug page memory allocations"
3 depends on DEBUG_KERNEL && ARCH_SUPPORTS_DEBUG_PAGEALLOC
4 depends on !HIBERNATION || !PPC && !SPARC
5 depends on !KMEMCHECK
6 ---help---
7 Unmap pages from the kernel linear mapping after free_pages().
8 This results in a large slowdown, but helps to find certain types
9 of memory corruption.
10
1config WANT_PAGE_DEBUG_FLAGS 11config WANT_PAGE_DEBUG_FLAGS
2 bool 12 bool
3 13
@@ -7,11 +17,11 @@ config PAGE_POISONING
7 depends on !HIBERNATION 17 depends on !HIBERNATION
8 select DEBUG_PAGEALLOC 18 select DEBUG_PAGEALLOC
9 select WANT_PAGE_DEBUG_FLAGS 19 select WANT_PAGE_DEBUG_FLAGS
10 help 20 ---help---
11 Fill the pages with poison patterns after free_pages() and verify 21 Fill the pages with poison patterns after free_pages() and verify
12 the patterns before alloc_pages(). This results in a large slowdown, 22 the patterns before alloc_pages(). This results in a large slowdown,
13 but helps to find certain types of memory corruptions. 23 but helps to find certain types of memory corruption.
14 24
15 This option cannot enalbe with hibernation. Otherwise, it will get 25 This option cannot be enabled in combination with hibernation as
16 wrong messages for memory corruption because the free pages are not 26 that would result in incorrect warnings of memory corruption after
17 saved to the suspend image. 27 a resume because free pages are not saved to the suspend image.
diff --git a/mm/Makefile b/mm/Makefile
index ec73c68b6015..ebf849042ed3 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -5,15 +5,16 @@
5mmu-y := nommu.o 5mmu-y := nommu.o
6mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ 6mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
7 mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ 7 mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
8 vmalloc.o 8 vmalloc.o pagewalk.o
9 9
10obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ 10obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
11 maccess.o page_alloc.o page-writeback.o pdflush.o \ 11 maccess.o page_alloc.o page-writeback.o \
12 readahead.o swap.o truncate.o vmscan.o shmem.o \ 12 readahead.o swap.o truncate.o vmscan.o shmem.o \
13 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ 13 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
14 page_isolation.o mm_init.o $(mmu-y) 14 page_isolation.o mm_init.o mmu_context.o \
15 $(mmu-y)
16obj-y += init-mm.o
15 17
16obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o
17obj-$(CONFIG_BOUNCE) += bounce.o 18obj-$(CONFIG_BOUNCE) += bounce.o
18obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o 19obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o
19obj-$(CONFIG_HAS_DMA) += dmapool.o 20obj-$(CONFIG_HAS_DMA) += dmapool.o
@@ -24,17 +25,23 @@ obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
24obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o 25obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
25obj-$(CONFIG_SLOB) += slob.o 26obj-$(CONFIG_SLOB) += slob.o
26obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o 27obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
28obj-$(CONFIG_KSM) += ksm.o
27obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o 29obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
28obj-$(CONFIG_SLAB) += slab.o 30obj-$(CONFIG_SLAB) += slab.o
29obj-$(CONFIG_SLUB) += slub.o 31obj-$(CONFIG_SLUB) += slub.o
32obj-$(CONFIG_KMEMCHECK) += kmemcheck.o
30obj-$(CONFIG_FAILSLAB) += failslab.o 33obj-$(CONFIG_FAILSLAB) += failslab.o
31obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o 34obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
32obj-$(CONFIG_FS_XIP) += filemap_xip.o 35obj-$(CONFIG_FS_XIP) += filemap_xip.o
33obj-$(CONFIG_MIGRATION) += migrate.o 36obj-$(CONFIG_MIGRATION) += migrate.o
34ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA 37ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA
35obj-$(CONFIG_SMP) += percpu.o 38obj-$(CONFIG_SMP) += percpu.o
36else 39else
37obj-$(CONFIG_SMP) += allocpercpu.o 40obj-$(CONFIG_SMP) += allocpercpu.o
38endif 41endif
39obj-$(CONFIG_QUICKLIST) += quicklist.o 42obj-$(CONFIG_QUICKLIST) += quicklist.o
40obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o 43obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
44obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
45obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
46obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
47obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index 139d5b7b6621..df34ceae0c67 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -5,6 +5,8 @@
5 */ 5 */
6#include <linux/mm.h> 6#include <linux/mm.h>
7#include <linux/module.h> 7#include <linux/module.h>
8#include <linux/bootmem.h>
9#include <asm/sections.h>
8 10
9#ifndef cache_line_size 11#ifndef cache_line_size
10#define cache_line_size() L1_CACHE_BYTES 12#define cache_line_size() L1_CACHE_BYTES
@@ -31,7 +33,7 @@ static void percpu_depopulate(void *__pdata, int cpu)
31 * @__pdata: per-cpu data to depopulate 33 * @__pdata: per-cpu data to depopulate
32 * @mask: depopulate per-cpu data for cpu's selected through mask bits 34 * @mask: depopulate per-cpu data for cpu's selected through mask bits
33 */ 35 */
34static void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask) 36static void __percpu_depopulate_mask(void *__pdata, const cpumask_t *mask)
35{ 37{
36 int cpu; 38 int cpu;
37 for_each_cpu_mask_nr(cpu, *mask) 39 for_each_cpu_mask_nr(cpu, *mask)
@@ -147,3 +149,29 @@ void free_percpu(void *__pdata)
147 kfree(__percpu_disguise(__pdata)); 149 kfree(__percpu_disguise(__pdata));
148} 150}
149EXPORT_SYMBOL_GPL(free_percpu); 151EXPORT_SYMBOL_GPL(free_percpu);
152
153/*
154 * Generic percpu area setup.
155 */
156#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
157unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
158
159EXPORT_SYMBOL(__per_cpu_offset);
160
161void __init setup_per_cpu_areas(void)
162{
163 unsigned long size, i;
164 char *ptr;
165 unsigned long nr_possible_cpus = num_possible_cpus();
166
167 /* Copy section for each CPU (we discard the original) */
168 size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE);
169 ptr = alloc_bootmem_pages(size * nr_possible_cpus);
170
171 for_each_possible_cpu(i) {
172 __per_cpu_offset[i] = ptr - __per_cpu_start;
173 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
174 ptr += size;
175 }
176}
177#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
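
The generic setup_per_cpu_areas() added above copies the bootstrap per-cpu section once per possible CPU and records each copy's displacement in __per_cpu_offset[]. The userspace model below is purely illustrative, with made-up names; it only demonstrates the resulting address arithmetic, where a per-cpu access is the template address plus the offset recorded for the chosen CPU.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NR_CPUS 4

/* Stand-in for one DEFINE_PER_CPU variable in the "template" section. */
static long template_counter;

static long per_cpu_offset[NR_CPUS];	/* models __per_cpu_offset[] */

static long *example_per_cpu_ptr(long *template_addr, int cpu)
{
	/* per-cpu access = template address + that CPU's section offset */
	return (long *)((char *)template_addr + per_cpu_offset[cpu]);
}

int main(void)
{
	char *template_start = (char *)&template_counter;
	size_t section_size = sizeof(template_counter);
	char *area = malloc(section_size * NR_CPUS);
	int cpu;

	/* "setup_per_cpu_areas": one copy of the section per possible CPU. */
	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		memcpy(area + cpu * section_size, template_start, section_size);
		per_cpu_offset[cpu] = (area + cpu * section_size) - template_start;
	}

	/* Each CPU now owns an independent copy of the counter. */
	for (cpu = 0; cpu < NR_CPUS; cpu++)
		*example_per_cpu_ptr(&template_counter, cpu) = 100 * cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		printf("cpu%d: %ld\n", cpu,
		       *example_per_cpu_ptr(&template_counter, cpu));

	free(area);
	return 0;
}

In the kernel the same arithmetic sits behind the per_cpu()/per_cpu_ptr() accessors, with __per_cpu_start acting as the template base that the offsets are measured from.
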
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index be68c956a660..0e8ca0347707 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -1,8 +1,11 @@
1 1
2#include <linux/wait.h> 2#include <linux/wait.h>
3#include <linux/backing-dev.h> 3#include <linux/backing-dev.h>
4#include <linux/kthread.h>
5#include <linux/freezer.h>
4#include <linux/fs.h> 6#include <linux/fs.h>
5#include <linux/pagemap.h> 7#include <linux/pagemap.h>
8#include <linux/mm.h>
6#include <linux/sched.h> 9#include <linux/sched.h>
7#include <linux/module.h> 10#include <linux/module.h>
8#include <linux/writeback.h> 11#include <linux/writeback.h>
@@ -14,6 +17,7 @@ void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
14EXPORT_SYMBOL(default_unplug_io_fn); 17EXPORT_SYMBOL(default_unplug_io_fn);
15 18
16struct backing_dev_info default_backing_dev_info = { 19struct backing_dev_info default_backing_dev_info = {
20 .name = "default",
17 .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, 21 .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
18 .state = 0, 22 .state = 0,
19 .capabilities = BDI_CAP_MAP_COPY, 23 .capabilities = BDI_CAP_MAP_COPY,
@@ -23,6 +27,24 @@ EXPORT_SYMBOL_GPL(default_backing_dev_info);
23 27
24static struct class *bdi_class; 28static struct class *bdi_class;
25 29
30/*
31 * bdi_lock protects updates to bdi_list and bdi_pending_list, as well as
32 * reader side protection for bdi_pending_list. bdi_list has RCU reader side
33 * locking.
34 */
35DEFINE_SPINLOCK(bdi_lock);
36LIST_HEAD(bdi_list);
37LIST_HEAD(bdi_pending_list);
38
39static struct task_struct *sync_supers_tsk;
40static struct timer_list sync_supers_timer;
41
42static int bdi_sync_supers(void *);
43static void sync_supers_timer_fn(unsigned long);
44static void arm_supers_timer(void);
45
46static void bdi_add_default_flusher_task(struct backing_dev_info *bdi);
47
26#ifdef CONFIG_DEBUG_FS 48#ifdef CONFIG_DEBUG_FS
27#include <linux/debugfs.h> 49#include <linux/debugfs.h>
28#include <linux/seq_file.h> 50#include <linux/seq_file.h>
@@ -37,9 +59,29 @@ static void bdi_debug_init(void)
37static int bdi_debug_stats_show(struct seq_file *m, void *v) 59static int bdi_debug_stats_show(struct seq_file *m, void *v)
38{ 60{
39 struct backing_dev_info *bdi = m->private; 61 struct backing_dev_info *bdi = m->private;
62 struct bdi_writeback *wb;
40 unsigned long background_thresh; 63 unsigned long background_thresh;
41 unsigned long dirty_thresh; 64 unsigned long dirty_thresh;
42 unsigned long bdi_thresh; 65 unsigned long bdi_thresh;
66 unsigned long nr_dirty, nr_io, nr_more_io, nr_wb;
67 struct inode *inode;
68
69 /*
70 * inode lock is enough here, the bdi->wb_list is protected by
71 * RCU on the reader side
72 */
73 nr_wb = nr_dirty = nr_io = nr_more_io = 0;
74 spin_lock(&inode_lock);
75 list_for_each_entry(wb, &bdi->wb_list, list) {
76 nr_wb++;
77 list_for_each_entry(inode, &wb->b_dirty, i_list)
78 nr_dirty++;
79 list_for_each_entry(inode, &wb->b_io, i_list)
80 nr_io++;
81 list_for_each_entry(inode, &wb->b_more_io, i_list)
82 nr_more_io++;
83 }
84 spin_unlock(&inode_lock);
43 85
44 get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi); 86 get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi);
45 87
@@ -49,12 +91,22 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
49 "BdiReclaimable: %8lu kB\n" 91 "BdiReclaimable: %8lu kB\n"
50 "BdiDirtyThresh: %8lu kB\n" 92 "BdiDirtyThresh: %8lu kB\n"
51 "DirtyThresh: %8lu kB\n" 93 "DirtyThresh: %8lu kB\n"
52 "BackgroundThresh: %8lu kB\n", 94 "BackgroundThresh: %8lu kB\n"
95 "WritebackThreads: %8lu\n"
96 "b_dirty: %8lu\n"
97 "b_io: %8lu\n"
98 "b_more_io: %8lu\n"
99 "bdi_list: %8u\n"
100 "state: %8lx\n"
101 "wb_mask: %8lx\n"
102 "wb_list: %8u\n"
103 "wb_cnt: %8u\n",
53 (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), 104 (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
54 (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), 105 (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
55 K(bdi_thresh), 106 K(bdi_thresh), K(dirty_thresh),
56 K(dirty_thresh), 107 K(background_thresh), nr_wb, nr_dirty, nr_io, nr_more_io,
57 K(background_thresh)); 108 !list_empty(&bdi->bdi_list), bdi->state, bdi->wb_mask,
109 !list_empty(&bdi->wb_list), bdi->wb_cnt);
58#undef K 110#undef K
59 111
60 return 0; 112 return 0;
@@ -185,6 +237,13 @@ static int __init default_bdi_init(void)
185{ 237{
186 int err; 238 int err;
187 239
240 sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers");
241 BUG_ON(IS_ERR(sync_supers_tsk));
242
243 init_timer(&sync_supers_timer);
244 setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0);
245 arm_supers_timer();
246
188 err = bdi_init(&default_backing_dev_info); 247 err = bdi_init(&default_backing_dev_info);
189 if (!err) 248 if (!err)
190 bdi_register(&default_backing_dev_info, NULL, "default"); 249 bdi_register(&default_backing_dev_info, NULL, "default");
@@ -193,6 +252,279 @@ static int __init default_bdi_init(void)
193} 252}
194subsys_initcall(default_bdi_init); 253subsys_initcall(default_bdi_init);
195 254
255static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
256{
257 memset(wb, 0, sizeof(*wb));
258
259 wb->bdi = bdi;
260 wb->last_old_flush = jiffies;
261 INIT_LIST_HEAD(&wb->b_dirty);
262 INIT_LIST_HEAD(&wb->b_io);
263 INIT_LIST_HEAD(&wb->b_more_io);
264}
265
266static void bdi_task_init(struct backing_dev_info *bdi,
267 struct bdi_writeback *wb)
268{
269 struct task_struct *tsk = current;
270
271 spin_lock(&bdi->wb_lock);
272 list_add_tail_rcu(&wb->list, &bdi->wb_list);
273 spin_unlock(&bdi->wb_lock);
274
275 tsk->flags |= PF_FLUSHER | PF_SWAPWRITE;
276 set_freezable();
277
278 /*
279 * Our parent may run at a different priority, just set us to normal
280 */
281 set_user_nice(tsk, 0);
282}
283
284static int bdi_start_fn(void *ptr)
285{
286 struct bdi_writeback *wb = ptr;
287 struct backing_dev_info *bdi = wb->bdi;
288 int ret;
289
290 /*
291 * Add us to the active bdi_list
292 */
293 spin_lock_bh(&bdi_lock);
294 list_add_rcu(&bdi->bdi_list, &bdi_list);
295 spin_unlock_bh(&bdi_lock);
296
297 bdi_task_init(bdi, wb);
298
299 /*
300 * Clear pending bit and wakeup anybody waiting to tear us down
301 */
302 clear_bit(BDI_pending, &bdi->state);
303 smp_mb__after_clear_bit();
304 wake_up_bit(&bdi->state, BDI_pending);
305
306 ret = bdi_writeback_task(wb);
307
308 /*
309 * Remove us from the list
310 */
311 spin_lock(&bdi->wb_lock);
312 list_del_rcu(&wb->list);
313 spin_unlock(&bdi->wb_lock);
314
315 /*
316 * Flush any work that raced with us exiting. No new work
317 * will be added, since this bdi isn't discoverable anymore.
318 */
319 if (!list_empty(&bdi->work_list))
320 wb_do_writeback(wb, 1);
321
322 wb->task = NULL;
323 return ret;
324}
325
326int bdi_has_dirty_io(struct backing_dev_info *bdi)
327{
328 return wb_has_dirty_io(&bdi->wb);
329}
330
331static void bdi_flush_io(struct backing_dev_info *bdi)
332{
333 struct writeback_control wbc = {
334 .bdi = bdi,
335 .sync_mode = WB_SYNC_NONE,
336 .older_than_this = NULL,
337 .range_cyclic = 1,
338 .nr_to_write = 1024,
339 };
340
341 writeback_inodes_wbc(&wbc);
342}
343
344/*
345 * kupdated() used to do this. We cannot do it from the bdi_forker_task()
346 * or we risk deadlocking on ->s_umount. The longer term solution would be
347 * to implement sync_supers_bdi() or similar and simply do it from the
348 * bdi writeback tasks individually.
349 */
350static int bdi_sync_supers(void *unused)
351{
352 set_user_nice(current, 0);
353
354 while (!kthread_should_stop()) {
355 set_current_state(TASK_INTERRUPTIBLE);
356 schedule();
357
358 /*
359 * Do this periodically, like kupdated() did before.
360 */
361 sync_supers();
362 }
363
364 return 0;
365}
366
367static void arm_supers_timer(void)
368{
369 unsigned long next;
370
371 next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies;
372 mod_timer(&sync_supers_timer, round_jiffies_up(next));
373}
374
375static void sync_supers_timer_fn(unsigned long unused)
376{
377 wake_up_process(sync_supers_tsk);
378 arm_supers_timer();
379}
380
381static int bdi_forker_task(void *ptr)
382{
383 struct bdi_writeback *me = ptr;
384
385 bdi_task_init(me->bdi, me);
386
387 for (;;) {
388 struct backing_dev_info *bdi, *tmp;
389 struct bdi_writeback *wb;
390
391 /*
392 * Temporary measure, we want to make sure we don't see
393 * dirty data on the default backing_dev_info
394 */
395 if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list))
396 wb_do_writeback(me, 0);
397
398 spin_lock_bh(&bdi_lock);
399
400 /*
401 * Check if any existing bdi's have dirty data without
402 * a thread registered. If so, set that up.
403 */
404 list_for_each_entry_safe(bdi, tmp, &bdi_list, bdi_list) {
405 if (bdi->wb.task)
406 continue;
407 if (list_empty(&bdi->work_list) &&
408 !bdi_has_dirty_io(bdi))
409 continue;
410
411 bdi_add_default_flusher_task(bdi);
412 }
413
414 set_current_state(TASK_INTERRUPTIBLE);
415
416 if (list_empty(&bdi_pending_list)) {
417 unsigned long wait;
418
419 spin_unlock_bh(&bdi_lock);
420 wait = msecs_to_jiffies(dirty_writeback_interval * 10);
421 schedule_timeout(wait);
422 try_to_freeze();
423 continue;
424 }
425
426 __set_current_state(TASK_RUNNING);
427
428 /*
429 * This is our real job - check for pending entries in
430 * bdi_pending_list, and create the tasks that got added
431 */
432 bdi = list_entry(bdi_pending_list.next, struct backing_dev_info,
433 bdi_list);
434 list_del_init(&bdi->bdi_list);
435 spin_unlock_bh(&bdi_lock);
436
437 wb = &bdi->wb;
438 wb->task = kthread_run(bdi_start_fn, wb, "flush-%s",
439 dev_name(bdi->dev));
440 /*
441 * If task creation fails, then readd the bdi to
442 * the pending list and force writeout of the bdi
443 * from this forker thread. That will free some memory
444 * and we can try again.
445 */
446 if (IS_ERR(wb->task)) {
447 wb->task = NULL;
448
449 /*
450 * Add this 'bdi' to the back, so we get
451 * a chance to flush other bdi's to free
452 * memory.
453 */
454 spin_lock_bh(&bdi_lock);
455 list_add_tail(&bdi->bdi_list, &bdi_pending_list);
456 spin_unlock_bh(&bdi_lock);
457
458 bdi_flush_io(bdi);
459 }
460 }
461
462 return 0;
463}
464
465static void bdi_add_to_pending(struct rcu_head *head)
466{
467 struct backing_dev_info *bdi;
468
469 bdi = container_of(head, struct backing_dev_info, rcu_head);
470 INIT_LIST_HEAD(&bdi->bdi_list);
471
472 spin_lock(&bdi_lock);
473 list_add_tail(&bdi->bdi_list, &bdi_pending_list);
474 spin_unlock(&bdi_lock);
475
476 /*
477 * We are now on the pending list, wake up bdi_forker_task()
478 * to finish the job and add us back to the active bdi_list
479 */
480 wake_up_process(default_backing_dev_info.wb.task);
481}
482
483/*
484 * Add the default flusher task that gets created for any bdi
485 * that has dirty data pending writeout
486 */
487void static bdi_add_default_flusher_task(struct backing_dev_info *bdi)
488{
489 if (!bdi_cap_writeback_dirty(bdi))
490 return;
491
492 if (WARN_ON(!test_bit(BDI_registered, &bdi->state))) {
493 printk(KERN_ERR "bdi %p/%s is not registered!\n",
494 bdi, bdi->name);
495 return;
496 }
497
498 /*
499 * Check with the helper whether to proceed adding a task. Will only
 500 * abort if two or more simultaneous calls to
 501 * bdi_add_default_flusher_task() occurred, further additions will block
502 * waiting for previous additions to finish.
503 */
504 if (!test_and_set_bit(BDI_pending, &bdi->state)) {
505 list_del_rcu(&bdi->bdi_list);
506
507 /*
508 * We must wait for the current RCU period to end before
509 * moving to the pending list. So schedule that operation
510 * from an RCU callback.
511 */
512 call_rcu(&bdi->rcu_head, bdi_add_to_pending);
513 }
514}
515
516/*
517 * Remove bdi from bdi_list, and ensure that it is no longer visible
518 */
519static void bdi_remove_from_list(struct backing_dev_info *bdi)
520{
521 spin_lock_bh(&bdi_lock);
522 list_del_rcu(&bdi->bdi_list);
523 spin_unlock_bh(&bdi_lock);
524
525 synchronize_rcu();
526}
527
196int bdi_register(struct backing_dev_info *bdi, struct device *parent, 528int bdi_register(struct backing_dev_info *bdi, struct device *parent,
197 const char *fmt, ...) 529 const char *fmt, ...)
198{ 530{
@@ -211,9 +543,33 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
211 goto exit; 543 goto exit;
212 } 544 }
213 545
546 spin_lock_bh(&bdi_lock);
547 list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
548 spin_unlock_bh(&bdi_lock);
549
214 bdi->dev = dev; 550 bdi->dev = dev;
215 bdi_debug_register(bdi, dev_name(dev));
216 551
552 /*
553 * Just start the forker thread for our default backing_dev_info,
554 * and add other bdi's to the list. They will get a thread created
555 * on-demand when they need it.
556 */
557 if (bdi_cap_flush_forker(bdi)) {
558 struct bdi_writeback *wb = &bdi->wb;
559
560 wb->task = kthread_run(bdi_forker_task, wb, "bdi-%s",
561 dev_name(dev));
562 if (IS_ERR(wb->task)) {
563 wb->task = NULL;
564 ret = -ENOMEM;
565
566 bdi_remove_from_list(bdi);
567 goto exit;
568 }
569 }
570
571 bdi_debug_register(bdi, dev_name(dev));
572 set_bit(BDI_registered, &bdi->state);
217exit: 573exit:
218 return ret; 574 return ret;
219} 575}
@@ -225,9 +581,61 @@ int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
225} 581}
226EXPORT_SYMBOL(bdi_register_dev); 582EXPORT_SYMBOL(bdi_register_dev);
227 583
584/*
585 * Remove bdi from the global list and shutdown any threads we have running
586 */
587static void bdi_wb_shutdown(struct backing_dev_info *bdi)
588{
589 struct bdi_writeback *wb;
590
591 if (!bdi_cap_writeback_dirty(bdi))
592 return;
593
594 /*
595 * If setup is pending, wait for that to complete first
596 */
597 wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait,
598 TASK_UNINTERRUPTIBLE);
599
600 /*
601 * Make sure nobody finds us on the bdi_list anymore
602 */
603 bdi_remove_from_list(bdi);
604
605 /*
606 * Finally, kill the kernel threads. We don't need to be RCU
607 * safe anymore, since the bdi is gone from visibility. Force
608 * unfreeze of the thread before calling kthread_stop(), otherwise
 609 * it would never exit if it is currently stuck in the refrigerator.
610 */
611 list_for_each_entry(wb, &bdi->wb_list, list) {
612 thaw_process(wb->task);
613 kthread_stop(wb->task);
614 }
615}
616
617/*
618 * This bdi is going away now, make sure that no super_blocks point to it
619 */
620static void bdi_prune_sb(struct backing_dev_info *bdi)
621{
622 struct super_block *sb;
623
624 spin_lock(&sb_lock);
625 list_for_each_entry(sb, &super_blocks, s_list) {
626 if (sb->s_bdi == bdi)
627 sb->s_bdi = NULL;
628 }
629 spin_unlock(&sb_lock);
630}
631
228void bdi_unregister(struct backing_dev_info *bdi) 632void bdi_unregister(struct backing_dev_info *bdi)
229{ 633{
230 if (bdi->dev) { 634 if (bdi->dev) {
635 bdi_prune_sb(bdi);
636
637 if (!bdi_cap_flush_forker(bdi))
638 bdi_wb_shutdown(bdi);
231 bdi_debug_unregister(bdi); 639 bdi_debug_unregister(bdi);
232 device_unregister(bdi->dev); 640 device_unregister(bdi->dev);
233 bdi->dev = NULL; 641 bdi->dev = NULL;
@@ -237,14 +645,26 @@ EXPORT_SYMBOL(bdi_unregister);
237 645
238int bdi_init(struct backing_dev_info *bdi) 646int bdi_init(struct backing_dev_info *bdi)
239{ 647{
240 int i; 648 int i, err;
241 int err;
242 649
243 bdi->dev = NULL; 650 bdi->dev = NULL;
244 651
245 bdi->min_ratio = 0; 652 bdi->min_ratio = 0;
246 bdi->max_ratio = 100; 653 bdi->max_ratio = 100;
247 bdi->max_prop_frac = PROP_FRAC_BASE; 654 bdi->max_prop_frac = PROP_FRAC_BASE;
655 spin_lock_init(&bdi->wb_lock);
656 INIT_RCU_HEAD(&bdi->rcu_head);
657 INIT_LIST_HEAD(&bdi->bdi_list);
658 INIT_LIST_HEAD(&bdi->wb_list);
659 INIT_LIST_HEAD(&bdi->work_list);
660
661 bdi_wb_init(&bdi->wb, bdi);
662
663 /*
664 * Just one thread support for now, hard code mask and count
665 */
666 bdi->wb_mask = 1;
667 bdi->wb_cnt = 1;
248 668
249 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { 669 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
250 err = percpu_counter_init(&bdi->bdi_stat[i], 0); 670 err = percpu_counter_init(&bdi->bdi_stat[i], 0);
@@ -269,6 +689,20 @@ void bdi_destroy(struct backing_dev_info *bdi)
269{ 689{
270 int i; 690 int i;
271 691
692 /*
693 * Splice our entries to the default_backing_dev_info, if this
694 * bdi disappears
695 */
696 if (bdi_has_dirty_io(bdi)) {
697 struct bdi_writeback *dst = &default_backing_dev_info.wb;
698
699 spin_lock(&inode_lock);
700 list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
701 list_splice(&bdi->wb.b_io, &dst->b_io);
702 list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
703 spin_unlock(&inode_lock);
704 }
705
272 bdi_unregister(bdi); 706 bdi_unregister(bdi);
273 707
274 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) 708 for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
@@ -283,13 +717,12 @@ static wait_queue_head_t congestion_wqh[2] = {
283 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) 717 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
284 }; 718 };
285 719
286 720void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
287void clear_bdi_congested(struct backing_dev_info *bdi, int rw)
288{ 721{
289 enum bdi_state bit; 722 enum bdi_state bit;
290 wait_queue_head_t *wqh = &congestion_wqh[rw]; 723 wait_queue_head_t *wqh = &congestion_wqh[sync];
291 724
292 bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested; 725 bit = sync ? BDI_sync_congested : BDI_async_congested;
293 clear_bit(bit, &bdi->state); 726 clear_bit(bit, &bdi->state);
294 smp_mb__after_clear_bit(); 727 smp_mb__after_clear_bit();
295 if (waitqueue_active(wqh)) 728 if (waitqueue_active(wqh))
@@ -297,29 +730,29 @@ void clear_bdi_congested(struct backing_dev_info *bdi, int rw)
297} 730}
298EXPORT_SYMBOL(clear_bdi_congested); 731EXPORT_SYMBOL(clear_bdi_congested);
299 732
300void set_bdi_congested(struct backing_dev_info *bdi, int rw) 733void set_bdi_congested(struct backing_dev_info *bdi, int sync)
301{ 734{
302 enum bdi_state bit; 735 enum bdi_state bit;
303 736
304 bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested; 737 bit = sync ? BDI_sync_congested : BDI_async_congested;
305 set_bit(bit, &bdi->state); 738 set_bit(bit, &bdi->state);
306} 739}
307EXPORT_SYMBOL(set_bdi_congested); 740EXPORT_SYMBOL(set_bdi_congested);
308 741
309/** 742/**
310 * congestion_wait - wait for a backing_dev to become uncongested 743 * congestion_wait - wait for a backing_dev to become uncongested
311 * @rw: READ or WRITE 744 * @sync: SYNC or ASYNC IO
312 * @timeout: timeout in jiffies 745 * @timeout: timeout in jiffies
313 * 746 *
314 * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit 747 * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit
315 * write congestion. If no backing_devs are congested then just wait for the 748 * write congestion. If no backing_devs are congested then just wait for the
316 * next write to be completed. 749 * next write to be completed.
317 */ 750 */
318long congestion_wait(int rw, long timeout) 751long congestion_wait(int sync, long timeout)
319{ 752{
320 long ret; 753 long ret;
321 DEFINE_WAIT(wait); 754 DEFINE_WAIT(wait);
322 wait_queue_head_t *wqh = &congestion_wqh[rw]; 755 wait_queue_head_t *wqh = &congestion_wqh[sync];
323 756
324 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); 757 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
325 ret = io_schedule_timeout(timeout); 758 ret = io_schedule_timeout(timeout);
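
The congestion hunks above switch the interface from a READ/WRITE direction to a sync/async one. As a hypothetical caller, shown for illustration only and not taken from this patch, a throttling loop might now look like the sketch below; BLK_RW_ASYNC is assumed to be the async-direction constant of this kernel series.

#include <linux/backing-dev.h>
#include <linux/jiffies.h>

/*
 * Illustrative throttling helper: back off while the device is congested
 * for async (regular writeback) I/O, sleeping up to 100ms per iteration.
 */
static void example_throttle_on_congestion(struct backing_dev_info *bdi)
{
	while (bdi_write_congested(bdi))
		congestion_wait(BLK_RW_ASYNC, HZ / 10);
}
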
diff --git a/mm/bootmem.c b/mm/bootmem.c
index daf92713f7de..d1dc23cc7f10 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -12,6 +12,7 @@
12#include <linux/pfn.h> 12#include <linux/pfn.h>
13#include <linux/bootmem.h> 13#include <linux/bootmem.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/kmemleak.h>
15 16
16#include <asm/bug.h> 17#include <asm/bug.h>
17#include <asm/io.h> 18#include <asm/io.h>
@@ -142,6 +143,30 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
142 return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages); 143 return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages);
143} 144}
144 145
146/*
147 * free_bootmem_late - free bootmem pages directly to page allocator
148 * @addr: starting address of the range
149 * @size: size of the range in bytes
150 *
151 * This is only useful when the bootmem allocator has already been torn
152 * down, but we are still initializing the system. Pages are given directly
153 * to the page allocator, no bootmem metadata is updated because it is gone.
154 */
155void __init free_bootmem_late(unsigned long addr, unsigned long size)
156{
157 unsigned long cursor, end;
158
159 kmemleak_free_part(__va(addr), size);
160
161 cursor = PFN_UP(addr);
162 end = PFN_DOWN(addr + size);
163
164 for (; cursor < end; cursor++) {
165 __free_pages_bootmem(pfn_to_page(cursor), 0);
166 totalram_pages++;
167 }
168}
169
145static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) 170static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
146{ 171{
147 int aligned; 172 int aligned;
@@ -335,6 +360,8 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
335{ 360{
336 unsigned long start, end; 361 unsigned long start, end;
337 362
363 kmemleak_free_part(__va(physaddr), size);
364
338 start = PFN_UP(physaddr); 365 start = PFN_UP(physaddr);
339 end = PFN_DOWN(physaddr + size); 366 end = PFN_DOWN(physaddr + size);
340 367
@@ -354,6 +381,8 @@ void __init free_bootmem(unsigned long addr, unsigned long size)
354{ 381{
355 unsigned long start, end; 382 unsigned long start, end;
356 383
384 kmemleak_free_part(__va(addr), size);
385
357 start = PFN_UP(addr); 386 start = PFN_UP(addr);
358 end = PFN_DOWN(addr + size); 387 end = PFN_DOWN(addr + size);
359 388
@@ -516,6 +545,11 @@ find_block:
516 region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) + 545 region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) +
517 start_off); 546 start_off);
518 memset(region, 0, size); 547 memset(region, 0, size);
548 /*
549 * The min_count is set to 0 so that bootmem allocated blocks
550 * are never reported as leaks.
551 */
552 kmemleak_alloc(region, size, 0, 0);
519 return region; 553 return region;
520 } 554 }
521 555
@@ -532,12 +566,19 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata,
532 unsigned long size, unsigned long align, 566 unsigned long size, unsigned long align,
533 unsigned long goal, unsigned long limit) 567 unsigned long goal, unsigned long limit)
534{ 568{
535#ifdef CONFIG_HAVE_ARCH_BOOTMEM 569 if (WARN_ON_ONCE(slab_is_available()))
536 bootmem_data_t *p_bdata; 570 return kzalloc(size, GFP_NOWAIT);
537 571
538 p_bdata = bootmem_arch_preferred_node(bdata, size, align, goal, limit); 572#ifdef CONFIG_HAVE_ARCH_BOOTMEM
539 if (p_bdata) 573 {
540 return alloc_bootmem_core(p_bdata, size, align, goal, limit); 574 bootmem_data_t *p_bdata;
575
576 p_bdata = bootmem_arch_preferred_node(bdata, size, align,
577 goal, limit);
578 if (p_bdata)
579 return alloc_bootmem_core(p_bdata, size, align,
580 goal, limit);
581 }
541#endif 582#endif
542 return NULL; 583 return NULL;
543} 584}
@@ -662,6 +703,9 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
662void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, 703void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
663 unsigned long align, unsigned long goal) 704 unsigned long align, unsigned long goal)
664{ 705{
706 if (WARN_ON_ONCE(slab_is_available()))
707 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
708
665 return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); 709 return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0);
666} 710}
667 711
@@ -693,6 +737,9 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
693{ 737{
694 void *ptr; 738 void *ptr;
695 739
740 if (WARN_ON_ONCE(slab_is_available()))
741 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
742
696 ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0); 743 ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0);
697 if (ptr) 744 if (ptr)
698 return ptr; 745 return ptr;
@@ -745,6 +792,9 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
745void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, 792void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
746 unsigned long align, unsigned long goal) 793 unsigned long align, unsigned long goal)
747{ 794{
795 if (WARN_ON_ONCE(slab_is_available()))
796 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
797
748 return ___alloc_bootmem_node(pgdat->bdata, size, align, 798 return ___alloc_bootmem_node(pgdat->bdata, size, align,
749 goal, ARCH_LOW_ADDRESS_LIMIT); 799 goal, ARCH_LOW_ADDRESS_LIMIT);
750} 800}
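
free_bootmem_late(), added above, covers the narrow window after the bootmem allocator has been dismantled but before boot has finished. A hypothetical caller might look like the sketch below; the buffer address, size, and function name are made up for illustration.

#include <linux/bootmem.h>
#include <linux/init.h>

#define EXAMPLE_EARLY_BUF_PHYS	0x01000000UL	/* assumed physical address */
#define EXAMPLE_EARLY_BUF_SIZE	(512 * 1024)

static int __init example_release_early_buffer(void)
{
	/*
	 * The bootmem metadata is already gone at this point, so hand the
	 * reserved range straight to the page allocator instead of using
	 * free_bootmem().
	 */
	free_bootmem_late(EXAMPLE_EARLY_BUF_PHYS, EXAMPLE_EARLY_BUF_SIZE);
	return 0;
}
early_initcall(example_release_early_buffer);
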
diff --git a/mm/bounce.c b/mm/bounce.c
index e590272fe7a8..a2b76a588e34 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -13,17 +13,15 @@
13#include <linux/init.h> 13#include <linux/init.h>
14#include <linux/hash.h> 14#include <linux/hash.h>
15#include <linux/highmem.h> 15#include <linux/highmem.h>
16#include <linux/blktrace_api.h>
17#include <trace/block.h>
18#include <asm/tlbflush.h> 16#include <asm/tlbflush.h>
19 17
18#include <trace/events/block.h>
19
20#define POOL_SIZE 64 20#define POOL_SIZE 64
21#define ISA_POOL_SIZE 16 21#define ISA_POOL_SIZE 16
22 22
23static mempool_t *page_pool, *isa_page_pool; 23static mempool_t *page_pool, *isa_page_pool;
24 24
25DEFINE_TRACE(block_bio_bounce);
26
27#ifdef CONFIG_HIGHMEM 25#ifdef CONFIG_HIGHMEM
28static __init int init_emergency_pool(void) 26static __init int init_emergency_pool(void)
29{ 27{
@@ -192,7 +190,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
192 /* 190 /*
193 * is destination page below bounce pfn? 191 * is destination page below bounce pfn?
194 */ 192 */
195 if (page_to_pfn(page) <= q->bounce_pfn) 193 if (page_to_pfn(page) <= queue_bounce_pfn(q))
196 continue; 194 continue;
197 195
198 /* 196 /*
@@ -284,7 +282,7 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
284 * don't waste time iterating over bio segments 282 * don't waste time iterating over bio segments
285 */ 283 */
286 if (!(q->bounce_gfp & GFP_DMA)) { 284 if (!(q->bounce_gfp & GFP_DMA)) {
287 if (q->bounce_pfn >= blk_max_pfn) 285 if (queue_bounce_pfn(q) >= blk_max_pfn)
288 return; 286 return;
289 pool = page_pool; 287 pool = page_pool;
290 } else { 288 } else {
diff --git a/mm/dmapool.c b/mm/dmapool.c
index b1f0885dda22..3df063706f53 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -86,10 +86,12 @@ show_pools(struct device *dev, struct device_attribute *attr, char *buf)
86 unsigned pages = 0; 86 unsigned pages = 0;
87 unsigned blocks = 0; 87 unsigned blocks = 0;
88 88
89 spin_lock_irq(&pool->lock);
89 list_for_each_entry(page, &pool->page_list, page_list) { 90 list_for_each_entry(page, &pool->page_list, page_list) {
90 pages++; 91 pages++;
91 blocks += page->in_use; 92 blocks += page->in_use;
92 } 93 }
94 spin_unlock_irq(&pool->lock);
93 95
94 /* per-pool info, no real statistics yet */ 96 /* per-pool info, no real statistics yet */
95 temp = scnprintf(next, size, "%-16s %4u %4Zu %4Zu %2u\n", 97 temp = scnprintf(next, size, "%-16s %4u %4Zu %4Zu %2u\n",
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 54a0f8040afa..e43359214f6f 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -101,7 +101,7 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
101 101
102 ret = force_page_cache_readahead(mapping, file, 102 ret = force_page_cache_readahead(mapping, file,
103 start_index, 103 start_index,
104 max_sane_readahead(nrpages)); 104 nrpages);
105 if (ret > 0) 105 if (ret > 0)
106 ret = 0; 106 ret = 0;
107 break; 107 break;
diff --git a/mm/failslab.c b/mm/failslab.c
index 7c6ea6493f80..9339de5f0a91 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -1,4 +1,5 @@
1#include <linux/fault-inject.h> 1#include <linux/fault-inject.h>
2#include <linux/gfp.h>
2 3
3static struct { 4static struct {
4 struct fault_attr attr; 5 struct fault_attr attr;
diff --git a/mm/filemap.c b/mm/filemap.c
index 126d3973b3d1..ef169f37156d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -39,11 +39,10 @@
39/* 39/*
40 * FIXME: remove all knowledge of the buffer layer from the core VM 40 * FIXME: remove all knowledge of the buffer layer from the core VM
41 */ 41 */
42#include <linux/buffer_head.h> /* for generic_osync_inode */ 42#include <linux/buffer_head.h> /* for try_to_free_buffers */
43 43
44#include <asm/mman.h> 44#include <asm/mman.h>
45 45
46
47/* 46/*
48 * Shared mappings implemented 30.11.1994. It's not fully working yet, 47 * Shared mappings implemented 30.11.1994. It's not fully working yet,
49 * though. 48 * though.
@@ -59,7 +58,7 @@
59/* 58/*
60 * Lock ordering: 59 * Lock ordering:
61 * 60 *
62 * ->i_mmap_lock (vmtruncate) 61 * ->i_mmap_lock (truncate_pagecache)
63 * ->private_lock (__free_pte->__set_page_dirty_buffers) 62 * ->private_lock (__free_pte->__set_page_dirty_buffers)
64 * ->swap_lock (exclusive_swap_page, others) 63 * ->swap_lock (exclusive_swap_page, others)
65 * ->mapping->tree_lock 64 * ->mapping->tree_lock
@@ -105,6 +104,10 @@
105 * 104 *
106 * ->task->proc_lock 105 * ->task->proc_lock
107 * ->dcache_lock (proc_pid_lookup) 106 * ->dcache_lock (proc_pid_lookup)
107 *
108 * (code doesn't rely on that order, so you could switch it around)
109 * ->tasklist_lock (memory_failure, collect_procs_ao)
110 * ->i_mmap_lock
108 */ 111 */
109 112
110/* 113/*
@@ -120,8 +123,9 @@ void __remove_from_page_cache(struct page *page)
120 page->mapping = NULL; 123 page->mapping = NULL;
121 mapping->nrpages--; 124 mapping->nrpages--;
122 __dec_zone_page_state(page, NR_FILE_PAGES); 125 __dec_zone_page_state(page, NR_FILE_PAGES);
126 if (PageSwapBacked(page))
127 __dec_zone_page_state(page, NR_SHMEM);
123 BUG_ON(page_mapped(page)); 128 BUG_ON(page_mapped(page));
124 mem_cgroup_uncharge_cache_page(page);
125 129
126 /* 130 /*
127 * Some filesystems seem to re-dirty the page even after 131 * Some filesystems seem to re-dirty the page even after
@@ -145,6 +149,7 @@ void remove_from_page_cache(struct page *page)
145 spin_lock_irq(&mapping->tree_lock); 149 spin_lock_irq(&mapping->tree_lock);
146 __remove_from_page_cache(page); 150 __remove_from_page_cache(page);
147 spin_unlock_irq(&mapping->tree_lock); 151 spin_unlock_irq(&mapping->tree_lock);
152 mem_cgroup_uncharge_cache_page(page);
148} 153}
149 154
150static int sync_page(void *word) 155static int sync_page(void *word)
@@ -307,68 +312,24 @@ int wait_on_page_writeback_range(struct address_space *mapping,
307} 312}
308 313
309/** 314/**
310 * sync_page_range - write and wait on all pages in the passed range 315 * filemap_fdatawait_range - wait for all under-writeback pages to complete in a given range
311 * @inode: target inode 316 * @mapping: address space structure to wait for
312 * @mapping: target address_space 317 * @start: offset in bytes where the range starts
313 * @pos: beginning offset in pages to write 318 * @end: offset in bytes where the range ends (inclusive)
314 * @count: number of bytes to write
315 *
316 * Write and wait upon all the pages in the passed range. This is a "data
317 * integrity" operation. It waits upon in-flight writeout before starting and
318 * waiting upon new writeout. If there was an IO error, return it.
319 * 319 *
320 * We need to re-take i_mutex during the generic_osync_inode list walk because 320 * Walk the list of under-writeback pages of the given address space
321 * it is otherwise livelockable. 321 * in the given range and wait for all of them.
322 */
323int sync_page_range(struct inode *inode, struct address_space *mapping,
324 loff_t pos, loff_t count)
325{
326 pgoff_t start = pos >> PAGE_CACHE_SHIFT;
327 pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
328 int ret;
329
330 if (!mapping_cap_writeback_dirty(mapping) || !count)
331 return 0;
332 ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
333 if (ret == 0) {
334 mutex_lock(&inode->i_mutex);
335 ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
336 mutex_unlock(&inode->i_mutex);
337 }
338 if (ret == 0)
339 ret = wait_on_page_writeback_range(mapping, start, end);
340 return ret;
341}
342EXPORT_SYMBOL(sync_page_range);
343
344/**
345 * sync_page_range_nolock - write & wait on all pages in the passed range without locking
346 * @inode: target inode
347 * @mapping: target address_space
348 * @pos: beginning offset in pages to write
349 * @count: number of bytes to write
350 * 322 *
351 * Note: Holding i_mutex across sync_page_range_nolock() is not a good idea 323 * This is just a simple wrapper so that callers don't have to convert offsets
352 * as it forces O_SYNC writers to different parts of the same file 324 * to page indexes themselves
353 * to be serialised right until io completion.
354 */ 325 */
355int sync_page_range_nolock(struct inode *inode, struct address_space *mapping, 326int filemap_fdatawait_range(struct address_space *mapping, loff_t start,
356 loff_t pos, loff_t count) 327 loff_t end)
357{ 328{
358 pgoff_t start = pos >> PAGE_CACHE_SHIFT; 329 return wait_on_page_writeback_range(mapping, start >> PAGE_CACHE_SHIFT,
359 pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; 330 end >> PAGE_CACHE_SHIFT);
360 int ret;
361
362 if (!mapping_cap_writeback_dirty(mapping) || !count)
363 return 0;
364 ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
365 if (ret == 0)
366 ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
367 if (ret == 0)
368 ret = wait_on_page_writeback_range(mapping, start, end);
369 return ret;
370} 331}
371EXPORT_SYMBOL(sync_page_range_nolock); 332EXPORT_SYMBOL(filemap_fdatawait_range);
372 333
373/** 334/**
374 * filemap_fdatawait - wait for all under-writeback pages to complete 335 * filemap_fdatawait - wait for all under-writeback pages to complete
@@ -441,6 +402,7 @@ int filemap_write_and_wait_range(struct address_space *mapping,
441 } 402 }
442 return err; 403 return err;
443} 404}
405EXPORT_SYMBOL(filemap_write_and_wait_range);
444 406
445/** 407/**
446 * add_to_page_cache_locked - add a locked page to the pagecache 408 * add_to_page_cache_locked - add a locked page to the pagecache
@@ -475,13 +437,15 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
475 if (likely(!error)) { 437 if (likely(!error)) {
476 mapping->nrpages++; 438 mapping->nrpages++;
477 __inc_zone_page_state(page, NR_FILE_PAGES); 439 __inc_zone_page_state(page, NR_FILE_PAGES);
440 if (PageSwapBacked(page))
441 __inc_zone_page_state(page, NR_SHMEM);
442 spin_unlock_irq(&mapping->tree_lock);
478 } else { 443 } else {
479 page->mapping = NULL; 444 page->mapping = NULL;
445 spin_unlock_irq(&mapping->tree_lock);
480 mem_cgroup_uncharge_cache_page(page); 446 mem_cgroup_uncharge_cache_page(page);
481 page_cache_release(page); 447 page_cache_release(page);
482 } 448 }
483
484 spin_unlock_irq(&mapping->tree_lock);
485 radix_tree_preload_end(); 449 radix_tree_preload_end();
486 } else 450 } else
487 mem_cgroup_uncharge_cache_page(page); 451 mem_cgroup_uncharge_cache_page(page);
@@ -513,13 +477,14 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
513 } 477 }
514 return ret; 478 return ret;
515} 479}
480EXPORT_SYMBOL_GPL(add_to_page_cache_lru);
516 481
517#ifdef CONFIG_NUMA 482#ifdef CONFIG_NUMA
518struct page *__page_cache_alloc(gfp_t gfp) 483struct page *__page_cache_alloc(gfp_t gfp)
519{ 484{
520 if (cpuset_do_page_mem_spread()) { 485 if (cpuset_do_page_mem_spread()) {
521 int n = cpuset_mem_spread_node(); 486 int n = cpuset_mem_spread_node();
522 return alloc_pages_node(n, gfp, 0); 487 return alloc_pages_exact_node(n, gfp, 0);
523 } 488 }
524 return alloc_pages(gfp, 0); 489 return alloc_pages(gfp, 0);
525} 490}
@@ -565,6 +530,24 @@ void wait_on_page_bit(struct page *page, int bit_nr)
565EXPORT_SYMBOL(wait_on_page_bit); 530EXPORT_SYMBOL(wait_on_page_bit);
566 531
567/** 532/**
533 * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
534 * @page: Page defining the wait queue of interest
535 * @waiter: Waiter to add to the queue
536 *
537 * Add an arbitrary @waiter to the wait queue for the nominated @page.
538 */
539void add_page_wait_queue(struct page *page, wait_queue_t *waiter)
540{
541 wait_queue_head_t *q = page_waitqueue(page);
542 unsigned long flags;
543
544 spin_lock_irqsave(&q->lock, flags);
545 __add_wait_queue(q, waiter);
546 spin_unlock_irqrestore(&q->lock, flags);
547}
548EXPORT_SYMBOL_GPL(add_page_wait_queue);
549
550/**
568 * unlock_page - unlock a locked page 551 * unlock_page - unlock a locked page
569 * @page: the page 552 * @page: the page
570 * 553 *
@@ -627,6 +610,7 @@ int __lock_page_killable(struct page *page)
627 return __wait_on_bit_lock(page_waitqueue(page), &wait, 610 return __wait_on_bit_lock(page_waitqueue(page), &wait,
628 sync_page_killable, TASK_KILLABLE); 611 sync_page_killable, TASK_KILLABLE);
629} 612}
613EXPORT_SYMBOL_GPL(__lock_page_killable);
630 614
631/** 615/**
632 * __lock_page_nosync - get a lock on the page, without calling sync_page() 616 * __lock_page_nosync - get a lock on the page, without calling sync_page()
@@ -983,9 +967,6 @@ EXPORT_SYMBOL(grab_cache_page_nowait);
983static void shrink_readahead_size_eio(struct file *filp, 967static void shrink_readahead_size_eio(struct file *filp,
984 struct file_ra_state *ra) 968 struct file_ra_state *ra)
985{ 969{
986 if (!ra->ra_pages)
987 return;
988
989 ra->ra_pages /= 4; 970 ra->ra_pages /= 4;
990} 971}
991 972
@@ -1369,8 +1350,7 @@ do_readahead(struct address_space *mapping, struct file *filp,
1369 if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) 1350 if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
1370 return -EINVAL; 1351 return -EINVAL;
1371 1352
1372 force_page_cache_readahead(mapping, filp, index, 1353 force_page_cache_readahead(mapping, filp, index, nr);
1373 max_sane_readahead(nr));
1374 return 0; 1354 return 0;
1375} 1355}
1376 1356
@@ -1436,6 +1416,73 @@ static int page_cache_read(struct file *file, pgoff_t offset)
1436 1416
1437#define MMAP_LOTSAMISS (100) 1417#define MMAP_LOTSAMISS (100)
1438 1418
1419/*
1420 * Synchronous readahead happens when we don't even find
1421 * a page in the page cache at all.
1422 */
1423static void do_sync_mmap_readahead(struct vm_area_struct *vma,
1424 struct file_ra_state *ra,
1425 struct file *file,
1426 pgoff_t offset)
1427{
1428 unsigned long ra_pages;
1429 struct address_space *mapping = file->f_mapping;
1430
1431 /* If we don't want any read-ahead, don't bother */
1432 if (VM_RandomReadHint(vma))
1433 return;
1434
1435 if (VM_SequentialReadHint(vma) ||
1436 offset - 1 == (ra->prev_pos >> PAGE_CACHE_SHIFT)) {
1437 page_cache_sync_readahead(mapping, ra, file, offset,
1438 ra->ra_pages);
1439 return;
1440 }
1441
1442 if (ra->mmap_miss < INT_MAX)
1443 ra->mmap_miss++;
1444
1445 /*
1446 * Do we miss much more than hit in this file? If so,
1447 * stop bothering with read-ahead. It will only hurt.
1448 */
1449 if (ra->mmap_miss > MMAP_LOTSAMISS)
1450 return;
1451
1452 /*
1453 * mmap read-around
1454 */
1455 ra_pages = max_sane_readahead(ra->ra_pages);
1456 if (ra_pages) {
1457 ra->start = max_t(long, 0, offset - ra_pages/2);
1458 ra->size = ra_pages;
1459 ra->async_size = 0;
1460 ra_submit(ra, mapping, file);
1461 }
1462}
1463
1464/*
1465 * Asynchronous readahead happens when we find the page and PG_readahead,
1466 * so we want to possibly extend the readahead further..
1467 */
1468static void do_async_mmap_readahead(struct vm_area_struct *vma,
1469 struct file_ra_state *ra,
1470 struct file *file,
1471 struct page *page,
1472 pgoff_t offset)
1473{
1474 struct address_space *mapping = file->f_mapping;
1475
1476 /* If we don't want any read-ahead, don't bother */
1477 if (VM_RandomReadHint(vma))
1478 return;
1479 if (ra->mmap_miss > 0)
1480 ra->mmap_miss--;
1481 if (PageReadahead(page))
1482 page_cache_async_readahead(mapping, ra, file,
1483 page, offset, ra->ra_pages);
1484}
1485
1439/** 1486/**
1440 * filemap_fault - read in file data for page fault handling 1487 * filemap_fault - read in file data for page fault handling
1441 * @vma: vma in which the fault was taken 1488 * @vma: vma in which the fault was taken
@@ -1455,78 +1502,44 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1455 struct address_space *mapping = file->f_mapping; 1502 struct address_space *mapping = file->f_mapping;
1456 struct file_ra_state *ra = &file->f_ra; 1503 struct file_ra_state *ra = &file->f_ra;
1457 struct inode *inode = mapping->host; 1504 struct inode *inode = mapping->host;
1505 pgoff_t offset = vmf->pgoff;
1458 struct page *page; 1506 struct page *page;
1459 pgoff_t size; 1507 pgoff_t size;
1460 int did_readaround = 0;
1461 int ret = 0; 1508 int ret = 0;
1462 1509
1463 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1510 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1464 if (vmf->pgoff >= size) 1511 if (offset >= size)
1465 return VM_FAULT_SIGBUS; 1512 return VM_FAULT_SIGBUS;
1466 1513
1467 /* If we don't want any read-ahead, don't bother */
1468 if (VM_RandomReadHint(vma))
1469 goto no_cached_page;
1470
1471 /* 1514 /*
1472 * Do we have something in the page cache already? 1515 * Do we have something in the page cache already?
1473 */ 1516 */
1474retry_find: 1517 page = find_get_page(mapping, offset);
1475 page = find_lock_page(mapping, vmf->pgoff); 1518 if (likely(page)) {
1476 /*
1477 * For sequential accesses, we use the generic readahead logic.
1478 */
1479 if (VM_SequentialReadHint(vma)) {
1480 if (!page) {
1481 page_cache_sync_readahead(mapping, ra, file,
1482 vmf->pgoff, 1);
1483 page = find_lock_page(mapping, vmf->pgoff);
1484 if (!page)
1485 goto no_cached_page;
1486 }
1487 if (PageReadahead(page)) {
1488 page_cache_async_readahead(mapping, ra, file, page,
1489 vmf->pgoff, 1);
1490 }
1491 }
1492
1493 if (!page) {
1494 unsigned long ra_pages;
1495
1496 ra->mmap_miss++;
1497
1498 /* 1519 /*
1499 * Do we miss much more than hit in this file? If so, 1520 * We found the page, so try async readahead before
1500 * stop bothering with read-ahead. It will only hurt. 1521 * waiting for the lock.
1501 */ 1522 */
1502 if (ra->mmap_miss > MMAP_LOTSAMISS) 1523 do_async_mmap_readahead(vma, ra, file, page, offset);
1503 goto no_cached_page; 1524 lock_page(page);
1504 1525
1505 /* 1526 /* Did it get truncated? */
1506 * To keep the pgmajfault counter straight, we need to 1527 if (unlikely(page->mapping != mapping)) {
1507 * check did_readaround, as this is an inner loop. 1528 unlock_page(page);
1508 */ 1529 put_page(page);
1509 if (!did_readaround) { 1530 goto no_cached_page;
1510 ret = VM_FAULT_MAJOR;
1511 count_vm_event(PGMAJFAULT);
1512 }
1513 did_readaround = 1;
1514 ra_pages = max_sane_readahead(file->f_ra.ra_pages);
1515 if (ra_pages) {
1516 pgoff_t start = 0;
1517
1518 if (vmf->pgoff > ra_pages / 2)
1519 start = vmf->pgoff - ra_pages / 2;
1520 do_page_cache_readahead(mapping, file, start, ra_pages);
1521 } 1531 }
1522 page = find_lock_page(mapping, vmf->pgoff); 1532 } else {
1533 /* No page in the page cache at all */
1534 do_sync_mmap_readahead(vma, ra, file, offset);
1535 count_vm_event(PGMAJFAULT);
1536 ret = VM_FAULT_MAJOR;
1537retry_find:
1538 page = find_lock_page(mapping, offset);
1523 if (!page) 1539 if (!page)
1524 goto no_cached_page; 1540 goto no_cached_page;
1525 } 1541 }
1526 1542
1527 if (!did_readaround)
1528 ra->mmap_miss--;
1529
1530 /* 1543 /*
1531 * We have a locked page in the page cache, now we need to check 1544 * We have a locked page in the page cache, now we need to check
1532 * that it's up-to-date. If not, it is going to be due to an error. 1545 * that it's up-to-date. If not, it is going to be due to an error.
@@ -1534,18 +1547,18 @@ retry_find:
1534 if (unlikely(!PageUptodate(page))) 1547 if (unlikely(!PageUptodate(page)))
1535 goto page_not_uptodate; 1548 goto page_not_uptodate;
1536 1549
1537 /* Must recheck i_size under page lock */ 1550 /*
1551 * Found the page and have a reference on it.
1552 * We must recheck i_size under page lock.
1553 */
1538 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1554 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1539 if (unlikely(vmf->pgoff >= size)) { 1555 if (unlikely(offset >= size)) {
1540 unlock_page(page); 1556 unlock_page(page);
1541 page_cache_release(page); 1557 page_cache_release(page);
1542 return VM_FAULT_SIGBUS; 1558 return VM_FAULT_SIGBUS;
1543 } 1559 }
1544 1560
1545 /* 1561 ra->prev_pos = (loff_t)offset << PAGE_CACHE_SHIFT;
1546 * Found the page and have a reference on it.
1547 */
1548 ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT;
1549 vmf->page = page; 1562 vmf->page = page;
1550 return ret | VM_FAULT_LOCKED; 1563 return ret | VM_FAULT_LOCKED;
1551 1564
@@ -1554,7 +1567,7 @@ no_cached_page:
1554 * We're only likely to ever get here if MADV_RANDOM is in 1567 * We're only likely to ever get here if MADV_RANDOM is in
1555 * effect. 1568 * effect.
1556 */ 1569 */
1557 error = page_cache_read(file, vmf->pgoff); 1570 error = page_cache_read(file, offset);
1558 1571
1559 /* 1572 /*
1560 * The page we want has now been added to the page cache. 1573 * The page we want has now been added to the page cache.
@@ -1574,12 +1587,6 @@ no_cached_page:
1574 return VM_FAULT_SIGBUS; 1587 return VM_FAULT_SIGBUS;
1575 1588
1576page_not_uptodate: 1589page_not_uptodate:
1577 /* IO error path */
1578 if (!did_readaround) {
1579 ret = VM_FAULT_MAJOR;
1580 count_vm_event(PGMAJFAULT);
1581 }
1582
1583 /* 1590 /*
1584 * Umm, take care of errors if the page isn't up-to-date. 1591 * Umm, take care of errors if the page isn't up-to-date.
1585 * Try to re-read it _once_. We do this synchronously, 1592 * Try to re-read it _once_. We do this synchronously,
@@ -1604,7 +1611,7 @@ page_not_uptodate:
1604} 1611}
1605EXPORT_SYMBOL(filemap_fault); 1612EXPORT_SYMBOL(filemap_fault);
1606 1613
1607struct vm_operations_struct generic_file_vm_ops = { 1614const struct vm_operations_struct generic_file_vm_ops = {
1608 .fault = filemap_fault, 1615 .fault = filemap_fault,
1609}; 1616};
1610 1617
@@ -2123,20 +2130,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
2123 } 2130 }
2124 *ppos = end; 2131 *ppos = end;
2125 } 2132 }
2126
2127 /*
2128 * Sync the fs metadata but not the minor inode changes and
2129 * of course not the data as we did direct DMA for the IO.
2130 * i_mutex is held, which protects generic_osync_inode() from
2131 * livelocking. AIO O_DIRECT ops attempt to sync metadata here.
2132 */
2133out: 2133out:
2134 if ((written >= 0 || written == -EIOCBQUEUED) &&
2135 ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2136 int err = generic_osync_inode(inode, mapping, OSYNC_METADATA);
2137 if (err < 0)
2138 written = err;
2139 }
2140 return written; 2134 return written;
2141} 2135}
2142EXPORT_SYMBOL(generic_file_direct_write); 2136EXPORT_SYMBOL(generic_file_direct_write);
@@ -2228,6 +2222,7 @@ again:
2228 pagefault_enable(); 2222 pagefault_enable();
2229 flush_dcache_page(page); 2223 flush_dcache_page(page);
2230 2224
2225 mark_page_accessed(page);
2231 status = a_ops->write_end(file, mapping, pos, bytes, copied, 2226 status = a_ops->write_end(file, mapping, pos, bytes, copied,
2232 page, fsdata); 2227 page, fsdata);
2233 if (unlikely(status < 0)) 2228 if (unlikely(status < 0))
@@ -2267,8 +2262,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2267{ 2262{
2268 struct file *file = iocb->ki_filp; 2263 struct file *file = iocb->ki_filp;
2269 struct address_space *mapping = file->f_mapping; 2264 struct address_space *mapping = file->f_mapping;
2270 const struct address_space_operations *a_ops = mapping->a_ops;
2271 struct inode *inode = mapping->host;
2272 ssize_t status; 2265 ssize_t status;
2273 struct iov_iter i; 2266 struct iov_iter i;
2274 2267
@@ -2278,16 +2271,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2278 if (likely(status >= 0)) { 2271 if (likely(status >= 0)) {
2279 written += status; 2272 written += status;
2280 *ppos = pos + status; 2273 *ppos = pos + status;
2281
2282 /*
2283 * For now, when the user asks for O_SYNC, we'll actually give
2284 * O_DSYNC
2285 */
2286 if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2287 if (!a_ops->writepage || !is_sync_kiocb(iocb))
2288 status = generic_osync_inode(inode, mapping,
2289 OSYNC_METADATA|OSYNC_DATA);
2290 }
2291 } 2274 }
2292 2275
2293 /* 2276 /*
@@ -2303,9 +2286,27 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2303} 2286}
2304EXPORT_SYMBOL(generic_file_buffered_write); 2287EXPORT_SYMBOL(generic_file_buffered_write);
2305 2288
2306static ssize_t 2289/**
2307__generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, 2290 * __generic_file_aio_write - write data to a file
2308 unsigned long nr_segs, loff_t *ppos) 2291 * @iocb: IO state structure (file, offset, etc.)
2292 * @iov: vector with data to write
2293 * @nr_segs: number of segments in the vector
2294 * @ppos: position where to write
2295 *
2296 * This function does all the work needed for actually writing data to a
2297 * file. It does all basic checks, removes SUID from the file, updates
2298 * modification times and calls proper subroutines depending on whether we
2299 * do direct IO or a standard buffered write.
2300 *
2301 * It expects i_mutex to be grabbed unless we work on a block device or similar
2302 * object which does not need locking at all.
2303 *
2304 * This function does *not* take care of syncing data in case of O_SYNC write.
2305 * A caller has to handle it. This is mainly due to the fact that we want to
2306 * avoid syncing under i_mutex.
2307 */
2308ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2309 unsigned long nr_segs, loff_t *ppos)
2309{ 2310{
2310 struct file *file = iocb->ki_filp; 2311 struct file *file = iocb->ki_filp;
2311 struct address_space * mapping = file->f_mapping; 2312 struct address_space * mapping = file->f_mapping;
@@ -2402,51 +2403,37 @@ out:
2402 current->backing_dev_info = NULL; 2403 current->backing_dev_info = NULL;
2403 return written ? written : err; 2404 return written ? written : err;
2404} 2405}
2406EXPORT_SYMBOL(__generic_file_aio_write);
2405 2407
2406ssize_t generic_file_aio_write_nolock(struct kiocb *iocb, 2408/**
2407 const struct iovec *iov, unsigned long nr_segs, loff_t pos) 2409 * generic_file_aio_write - write data to a file
2408{ 2410 * @iocb: IO state structure
2409 struct file *file = iocb->ki_filp; 2411 * @iov: vector with data to write
2410 struct address_space *mapping = file->f_mapping; 2412 * @nr_segs: number of segments in the vector
2411 struct inode *inode = mapping->host; 2413 * @pos: position in file where to write
2412 ssize_t ret; 2414 *
2413 2415 * This is a wrapper around __generic_file_aio_write() to be used by most
2414 BUG_ON(iocb->ki_pos != pos); 2416 * filesystems. It takes care of syncing the file in case of O_SYNC file
2415 2417 * and acquires i_mutex as needed.
2416 ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, 2418 */
2417 &iocb->ki_pos);
2418
2419 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2420 ssize_t err;
2421
2422 err = sync_page_range_nolock(inode, mapping, pos, ret);
2423 if (err < 0)
2424 ret = err;
2425 }
2426 return ret;
2427}
2428EXPORT_SYMBOL(generic_file_aio_write_nolock);
2429
2430ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, 2419ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2431 unsigned long nr_segs, loff_t pos) 2420 unsigned long nr_segs, loff_t pos)
2432{ 2421{
2433 struct file *file = iocb->ki_filp; 2422 struct file *file = iocb->ki_filp;
2434 struct address_space *mapping = file->f_mapping; 2423 struct inode *inode = file->f_mapping->host;
2435 struct inode *inode = mapping->host;
2436 ssize_t ret; 2424 ssize_t ret;
2437 2425
2438 BUG_ON(iocb->ki_pos != pos); 2426 BUG_ON(iocb->ki_pos != pos);
2439 2427
2440 mutex_lock(&inode->i_mutex); 2428 mutex_lock(&inode->i_mutex);
2441 ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, 2429 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
2442 &iocb->ki_pos);
2443 mutex_unlock(&inode->i_mutex); 2430 mutex_unlock(&inode->i_mutex);
2444 2431
2445 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { 2432 if (ret > 0 || ret == -EIOCBQUEUED) {
2446 ssize_t err; 2433 ssize_t err;
2447 2434
2448 err = sync_page_range(inode, mapping, pos, ret); 2435 err = generic_write_sync(file, pos, ret);
2449 if (err < 0) 2436 if (err < 0 && ret > 0)
2450 ret = err; 2437 ret = err;
2451 } 2438 }
2452 return ret; 2439 return ret;
@@ -2463,6 +2450,9 @@ EXPORT_SYMBOL(generic_file_aio_write);
2463 * (presumably at page->private). If the release was successful, return `1'. 2450 * (presumably at page->private). If the release was successful, return `1'.
2464 * Otherwise return zero. 2451 * Otherwise return zero.
2465 * 2452 *
2453 * This may also be called if PG_fscache is set on a page, indicating that the
2454 * page is known to the local caching routines.
2455 *
2466 * The @gfp_mask argument specifies whether I/O may be performed to release 2456 * The @gfp_mask argument specifies whether I/O may be performed to release
2467 * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS). 2457 * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS).
2468 * 2458 *
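
The filemap_fault() rework above splits mmap readahead into do_sync_mmap_readahead() and do_async_mmap_readahead(), with the synchronous path centring a read-around window of ra_pages pages on the faulting offset. A minimal userspace sketch of that window arithmetic only (the struct and helper names below are illustrative, not kernel API):

/* Sketch of the read-around window set up by do_sync_mmap_readahead():
 * centre a window of ra_pages pages on the faulting offset, clamped at
 * file offset 0.  Illustrative userspace code, not the kernel fault path. */
#include <stdio.h>

struct ra_window { long start, size, async_size; };

static struct ra_window mmap_readaround(long offset, long ra_pages)
{
	struct ra_window w;

	w.start = offset - ra_pages / 2;
	if (w.start < 0)
		w.start = 0;		/* max_t(long, 0, ...) in the kernel code */
	w.size = ra_pages;
	w.async_size = 0;		/* read-around is purely synchronous */
	return w;
}

int main(void)
{
	struct ra_window w = mmap_readaround(3, 32);

	printf("start=%ld size=%ld async=%ld\n", w.start, w.size, w.async_size);
	return 0;
}
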
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 0c04615651b7..1888b2d71bb8 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -89,8 +89,8 @@ do_xip_mapping_read(struct address_space *mapping,
89 } 89 }
90 } 90 }
91 nr = nr - offset; 91 nr = nr - offset;
92 if (nr > len) 92 if (nr > len - copied)
93 nr = len; 93 nr = len - copied;
94 94
95 error = mapping->a_ops->get_xip_mem(mapping, index, 0, 95 error = mapping->a_ops->get_xip_mem(mapping, index, 0,
96 &xip_mem, &xip_pfn); 96 &xip_mem, &xip_pfn);
@@ -296,7 +296,7 @@ out:
296 } 296 }
297} 297}
298 298
299static struct vm_operations_struct xip_file_vm_ops = { 299static const struct vm_operations_struct xip_file_vm_ops = {
300 .fault = xip_file_fault, 300 .fault = xip_file_fault,
301}; 301};
302 302
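
The filemap_xip.c change clamps each page-sized chunk to the bytes still outstanding in the request (len - copied) instead of to the whole request length. A small userspace sketch of the fixed loop shape, with PAGE_SIZE and the loop itself chosen only for illustration:

/* Sketch of the copy-loop clamp fixed above: each page contributes at most
 * the bytes remaining in the request, so a multi-page read stops exactly
 * at len bytes.  Not the XIP read path itself. */
#include <stdio.h>

#define PAGE_SIZE 4096L

static long copy_loop(long pos, long len)
{
	long copied = 0;

	while (copied < len) {
		long offset = (pos + copied) & (PAGE_SIZE - 1);
		long nr = PAGE_SIZE - offset;	/* bytes left in this page */

		if (nr > len - copied)		/* the fix: was "if (nr > len)" */
			nr = len - copied;
		copied += nr;
	}
	return copied;
}

int main(void)
{
	/* a request spanning two pages returns exactly len bytes */
	printf("copied %ld of 6000\n", copy_loop(100, 6000));
	return 0;
}
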
diff --git a/mm/highmem.c b/mm/highmem.c
index 68eb1d9b63fa..9c1e627f282e 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -26,7 +26,6 @@
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/hash.h> 27#include <linux/hash.h>
28#include <linux/highmem.h> 28#include <linux/highmem.h>
29#include <linux/blktrace_api.h>
30#include <asm/tlbflush.h> 29#include <asm/tlbflush.h>
31 30
32/* 31/*
@@ -427,16 +426,21 @@ void __init page_address_init(void)
427 426
428void debug_kmap_atomic(enum km_type type) 427void debug_kmap_atomic(enum km_type type)
429{ 428{
430 static unsigned warn_count = 10; 429 static int warn_count = 10;
431 430
432 if (unlikely(warn_count == 0)) 431 if (unlikely(warn_count < 0))
433 return; 432 return;
434 433
435 if (unlikely(in_interrupt())) { 434 if (unlikely(in_interrupt())) {
436 if (in_irq()) { 435 if (in_nmi()) {
436 if (type != KM_NMI && type != KM_NMI_PTE) {
437 WARN_ON(1);
438 warn_count--;
439 }
440 } else if (in_irq()) {
437 if (type != KM_IRQ0 && type != KM_IRQ1 && 441 if (type != KM_IRQ0 && type != KM_IRQ1 &&
438 type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ && 442 type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ &&
439 type != KM_BOUNCE_READ) { 443 type != KM_BOUNCE_READ && type != KM_IRQ_PTE) {
440 WARN_ON(1); 444 WARN_ON(1);
441 warn_count--; 445 warn_count--;
442 } 446 }
@@ -453,7 +457,9 @@ void debug_kmap_atomic(enum km_type type)
453 } 457 }
454 458
455 if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ || 459 if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ ||
456 type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ) { 460 type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ ||
461 type == KM_IRQ_PTE || type == KM_NMI ||
 462 type == KM_NMI_PTE) {
457 if (!irqs_disabled()) { 463 if (!irqs_disabled()) {
458 WARN_ON(1); 464 WARN_ON(1);
459 warn_count--; 465 warn_count--;
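
The debug_kmap_atomic() change also turns warn_count into a signed int tested with "< 0". One hazard with an unsigned counter that is decremented from more than one branch, shown as a userspace demo: once it is decremented past zero it wraps to a huge value, so an "== 0" guard may never trip again and the warnings are no longer rate-limited.

/* Demonstration of unsigned wraparound versus a signed "< 0" check. */
#include <stdio.h>

int main(void)
{
	unsigned int u = 0;
	int s = 0;

	u--;			/* wraps to UINT_MAX */
	s--;			/* becomes -1 */

	printf("unsigned after decrement: %u (== 0? %d)\n", u, u == 0);
	printf("signed   after decrement: %d (< 0?  %d)\n", s, s < 0);
	return 0;
}
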
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 28c655ba9353..5d7601b02874 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -234,6 +234,7 @@ unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
234 234
235 return 1UL << (hstate->order + PAGE_SHIFT); 235 return 1UL << (hstate->order + PAGE_SHIFT);
236} 236}
237EXPORT_SYMBOL_GPL(vma_kernel_pagesize);
237 238
238/* 239/*
239 * Return the page size being used by the MMU to back a VMA. In the majority 240 * Return the page size being used by the MMU to back a VMA. In the majority
@@ -316,7 +317,7 @@ static void resv_map_release(struct kref *ref)
316static struct resv_map *vma_resv_map(struct vm_area_struct *vma) 317static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
317{ 318{
318 VM_BUG_ON(!is_vm_hugetlb_page(vma)); 319 VM_BUG_ON(!is_vm_hugetlb_page(vma));
319 if (!(vma->vm_flags & VM_SHARED)) 320 if (!(vma->vm_flags & VM_MAYSHARE))
320 return (struct resv_map *)(get_vma_private_data(vma) & 321 return (struct resv_map *)(get_vma_private_data(vma) &
321 ~HPAGE_RESV_MASK); 322 ~HPAGE_RESV_MASK);
322 return NULL; 323 return NULL;
@@ -325,7 +326,7 @@ static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
325static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) 326static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
326{ 327{
327 VM_BUG_ON(!is_vm_hugetlb_page(vma)); 328 VM_BUG_ON(!is_vm_hugetlb_page(vma));
328 VM_BUG_ON(vma->vm_flags & VM_SHARED); 329 VM_BUG_ON(vma->vm_flags & VM_MAYSHARE);
329 330
330 set_vma_private_data(vma, (get_vma_private_data(vma) & 331 set_vma_private_data(vma, (get_vma_private_data(vma) &
331 HPAGE_RESV_MASK) | (unsigned long)map); 332 HPAGE_RESV_MASK) | (unsigned long)map);
@@ -334,7 +335,7 @@ static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
334static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) 335static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
335{ 336{
336 VM_BUG_ON(!is_vm_hugetlb_page(vma)); 337 VM_BUG_ON(!is_vm_hugetlb_page(vma));
337 VM_BUG_ON(vma->vm_flags & VM_SHARED); 338 VM_BUG_ON(vma->vm_flags & VM_MAYSHARE);
338 339
339 set_vma_private_data(vma, get_vma_private_data(vma) | flags); 340 set_vma_private_data(vma, get_vma_private_data(vma) | flags);
340} 341}
@@ -353,7 +354,7 @@ static void decrement_hugepage_resv_vma(struct hstate *h,
353 if (vma->vm_flags & VM_NORESERVE) 354 if (vma->vm_flags & VM_NORESERVE)
354 return; 355 return;
355 356
356 if (vma->vm_flags & VM_SHARED) { 357 if (vma->vm_flags & VM_MAYSHARE) {
357 /* Shared mappings always use reserves */ 358 /* Shared mappings always use reserves */
358 h->resv_huge_pages--; 359 h->resv_huge_pages--;
359 } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { 360 } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
@@ -369,14 +370,14 @@ static void decrement_hugepage_resv_vma(struct hstate *h,
369void reset_vma_resv_huge_pages(struct vm_area_struct *vma) 370void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
370{ 371{
371 VM_BUG_ON(!is_vm_hugetlb_page(vma)); 372 VM_BUG_ON(!is_vm_hugetlb_page(vma));
372 if (!(vma->vm_flags & VM_SHARED)) 373 if (!(vma->vm_flags & VM_MAYSHARE))
373 vma->vm_private_data = (void *)0; 374 vma->vm_private_data = (void *)0;
374} 375}
375 376
376/* Returns true if the VMA has associated reserve pages */ 377/* Returns true if the VMA has associated reserve pages */
377static int vma_has_reserves(struct vm_area_struct *vma) 378static int vma_has_reserves(struct vm_area_struct *vma)
378{ 379{
379 if (vma->vm_flags & VM_SHARED) 380 if (vma->vm_flags & VM_MAYSHARE)
380 return 1; 381 return 1;
381 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) 382 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER))
382 return 1; 383 return 1;
@@ -455,24 +456,6 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
455 h->free_huge_pages_node[nid]++; 456 h->free_huge_pages_node[nid]++;
456} 457}
457 458
458static struct page *dequeue_huge_page(struct hstate *h)
459{
460 int nid;
461 struct page *page = NULL;
462
463 for (nid = 0; nid < MAX_NUMNODES; ++nid) {
464 if (!list_empty(&h->hugepage_freelists[nid])) {
465 page = list_entry(h->hugepage_freelists[nid].next,
466 struct page, lru);
467 list_del(&page->lru);
468 h->free_huge_pages--;
469 h->free_huge_pages_node[nid]--;
470 break;
471 }
472 }
473 return page;
474}
475
476static struct page *dequeue_huge_page_vma(struct hstate *h, 459static struct page *dequeue_huge_page_vma(struct hstate *h,
477 struct vm_area_struct *vma, 460 struct vm_area_struct *vma,
478 unsigned long address, int avoid_reserve) 461 unsigned long address, int avoid_reserve)
@@ -578,41 +561,6 @@ static void free_huge_page(struct page *page)
578 hugetlb_put_quota(mapping, 1); 561 hugetlb_put_quota(mapping, 1);
579} 562}
580 563
581/*
582 * Increment or decrement surplus_huge_pages. Keep node-specific counters
583 * balanced by operating on them in a round-robin fashion.
584 * Returns 1 if an adjustment was made.
585 */
586static int adjust_pool_surplus(struct hstate *h, int delta)
587{
588 static int prev_nid;
589 int nid = prev_nid;
590 int ret = 0;
591
592 VM_BUG_ON(delta != -1 && delta != 1);
593 do {
594 nid = next_node(nid, node_online_map);
595 if (nid == MAX_NUMNODES)
596 nid = first_node(node_online_map);
597
598 /* To shrink on this node, there must be a surplus page */
599 if (delta < 0 && !h->surplus_huge_pages_node[nid])
600 continue;
601 /* Surplus cannot exceed the total number of pages */
602 if (delta > 0 && h->surplus_huge_pages_node[nid] >=
603 h->nr_huge_pages_node[nid])
604 continue;
605
606 h->surplus_huge_pages += delta;
607 h->surplus_huge_pages_node[nid] += delta;
608 ret = 1;
609 break;
610 } while (nid != prev_nid);
611
612 prev_nid = nid;
613 return ret;
614}
615
616static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) 564static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
617{ 565{
618 set_compound_page_dtor(page, free_huge_page); 566 set_compound_page_dtor(page, free_huge_page);
@@ -623,6 +571,34 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
623 put_page(page); /* free it into the hugepage allocator */ 571 put_page(page); /* free it into the hugepage allocator */
624} 572}
625 573
574static void prep_compound_gigantic_page(struct page *page, unsigned long order)
575{
576 int i;
577 int nr_pages = 1 << order;
578 struct page *p = page + 1;
579
580 /* we rely on prep_new_huge_page to set the destructor */
581 set_compound_order(page, order);
582 __SetPageHead(page);
583 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
584 __SetPageTail(p);
585 p->first_page = page;
586 }
587}
588
589int PageHuge(struct page *page)
590{
591 compound_page_dtor *dtor;
592
593 if (!PageCompound(page))
594 return 0;
595
596 page = compound_head(page);
597 dtor = get_compound_page_dtor(page);
598
599 return dtor == free_huge_page;
600}
601
626static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) 602static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
627{ 603{
628 struct page *page; 604 struct page *page;
@@ -630,7 +606,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
630 if (h->order >= MAX_ORDER) 606 if (h->order >= MAX_ORDER)
631 return NULL; 607 return NULL;
632 608
633 page = alloc_pages_node(nid, 609 page = alloc_pages_exact_node(nid,
634 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| 610 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
635 __GFP_REPEAT|__GFP_NOWARN, 611 __GFP_REPEAT|__GFP_NOWARN,
636 huge_page_order(h)); 612 huge_page_order(h));
@@ -647,22 +623,22 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
647 623
648/* 624/*
649 * Use a helper variable to find the next node and then 625 * Use a helper variable to find the next node and then
650 * copy it back to hugetlb_next_nid afterwards: 626 * copy it back to next_nid_to_alloc afterwards:
651 * otherwise there's a window in which a racer might 627 * otherwise there's a window in which a racer might
652 * pass invalid nid MAX_NUMNODES to alloc_pages_node. 628 * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node.
653 * But we don't need to use a spin_lock here: it really 629 * But we don't need to use a spin_lock here: it really
654 * doesn't matter if occasionally a racer chooses the 630 * doesn't matter if occasionally a racer chooses the
655 * same nid as we do. Move nid forward in the mask even 631 * same nid as we do. Move nid forward in the mask even
656 * if we just successfully allocated a hugepage so that 632 * if we just successfully allocated a hugepage so that
657 * the next caller gets hugepages on the next node. 633 * the next caller gets hugepages on the next node.
658 */ 634 */
659static int hstate_next_node(struct hstate *h) 635static int hstate_next_node_to_alloc(struct hstate *h)
660{ 636{
661 int next_nid; 637 int next_nid;
662 next_nid = next_node(h->hugetlb_next_nid, node_online_map); 638 next_nid = next_node(h->next_nid_to_alloc, node_online_map);
663 if (next_nid == MAX_NUMNODES) 639 if (next_nid == MAX_NUMNODES)
664 next_nid = first_node(node_online_map); 640 next_nid = first_node(node_online_map);
665 h->hugetlb_next_nid = next_nid; 641 h->next_nid_to_alloc = next_nid;
666 return next_nid; 642 return next_nid;
667} 643}
668 644
@@ -673,14 +649,15 @@ static int alloc_fresh_huge_page(struct hstate *h)
673 int next_nid; 649 int next_nid;
674 int ret = 0; 650 int ret = 0;
675 651
676 start_nid = h->hugetlb_next_nid; 652 start_nid = h->next_nid_to_alloc;
653 next_nid = start_nid;
677 654
678 do { 655 do {
679 page = alloc_fresh_huge_page_node(h, h->hugetlb_next_nid); 656 page = alloc_fresh_huge_page_node(h, next_nid);
680 if (page) 657 if (page)
681 ret = 1; 658 ret = 1;
682 next_nid = hstate_next_node(h); 659 next_nid = hstate_next_node_to_alloc(h);
683 } while (!page && h->hugetlb_next_nid != start_nid); 660 } while (!page && next_nid != start_nid);
684 661
685 if (ret) 662 if (ret)
686 count_vm_event(HTLB_BUDDY_PGALLOC); 663 count_vm_event(HTLB_BUDDY_PGALLOC);
@@ -690,6 +667,61 @@ static int alloc_fresh_huge_page(struct hstate *h)
690 return ret; 667 return ret;
691} 668}
692 669
670/*
671 * helper for free_pool_huge_page() - find next node
672 * from which to free a huge page
673 */
674static int hstate_next_node_to_free(struct hstate *h)
675{
676 int next_nid;
677 next_nid = next_node(h->next_nid_to_free, node_online_map);
678 if (next_nid == MAX_NUMNODES)
679 next_nid = first_node(node_online_map);
680 h->next_nid_to_free = next_nid;
681 return next_nid;
682}
683
684/*
685 * Free huge page from pool from next node to free.
686 * Attempt to keep persistent huge pages more or less
687 * balanced over allowed nodes.
688 * Called with hugetlb_lock locked.
689 */
690static int free_pool_huge_page(struct hstate *h, bool acct_surplus)
691{
692 int start_nid;
693 int next_nid;
694 int ret = 0;
695
696 start_nid = h->next_nid_to_free;
697 next_nid = start_nid;
698
699 do {
700 /*
701 * If we're returning unused surplus pages, only examine
702 * nodes with surplus pages.
703 */
704 if ((!acct_surplus || h->surplus_huge_pages_node[next_nid]) &&
705 !list_empty(&h->hugepage_freelists[next_nid])) {
706 struct page *page =
707 list_entry(h->hugepage_freelists[next_nid].next,
708 struct page, lru);
709 list_del(&page->lru);
710 h->free_huge_pages--;
711 h->free_huge_pages_node[next_nid]--;
712 if (acct_surplus) {
713 h->surplus_huge_pages--;
714 h->surplus_huge_pages_node[next_nid]--;
715 }
716 update_and_free_page(h, page);
717 ret = 1;
718 }
719 next_nid = hstate_next_node_to_free(h);
720 } while (!ret && next_nid != start_nid);
721
722 return ret;
723}
724
693static struct page *alloc_buddy_huge_page(struct hstate *h, 725static struct page *alloc_buddy_huge_page(struct hstate *h,
694 struct vm_area_struct *vma, unsigned long address) 726 struct vm_area_struct *vma, unsigned long address)
695{ 727{
@@ -861,22 +893,13 @@ free:
861 * When releasing a hugetlb pool reservation, any surplus pages that were 893 * When releasing a hugetlb pool reservation, any surplus pages that were
862 * allocated to satisfy the reservation must be explicitly freed if they were 894 * allocated to satisfy the reservation must be explicitly freed if they were
863 * never used. 895 * never used.
896 * Called with hugetlb_lock held.
864 */ 897 */
865static void return_unused_surplus_pages(struct hstate *h, 898static void return_unused_surplus_pages(struct hstate *h,
866 unsigned long unused_resv_pages) 899 unsigned long unused_resv_pages)
867{ 900{
868 static int nid = -1;
869 struct page *page;
870 unsigned long nr_pages; 901 unsigned long nr_pages;
871 902
872 /*
873 * We want to release as many surplus pages as possible, spread
874 * evenly across all nodes. Iterate across all nodes until we
875 * can no longer free unreserved surplus pages. This occurs when
876 * the nodes with surplus pages have no free pages.
877 */
878 unsigned long remaining_iterations = num_online_nodes();
879
880 /* Uncommit the reservation */ 903 /* Uncommit the reservation */
881 h->resv_huge_pages -= unused_resv_pages; 904 h->resv_huge_pages -= unused_resv_pages;
882 905
@@ -886,26 +909,17 @@ static void return_unused_surplus_pages(struct hstate *h,
886 909
887 nr_pages = min(unused_resv_pages, h->surplus_huge_pages); 910 nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
888 911
889 while (remaining_iterations-- && nr_pages) { 912 /*
890 nid = next_node(nid, node_online_map); 913 * We want to release as many surplus pages as possible, spread
891 if (nid == MAX_NUMNODES) 914 * evenly across all nodes. Iterate across all nodes until we
892 nid = first_node(node_online_map); 915 * can no longer free unreserved surplus pages. This occurs when
893 916 * the nodes with surplus pages have no free pages.
 894 if (!h->surplus_huge_pages_node[nid]) 917 * free_pool_huge_page() will balance the frees across the
895 continue; 918 * on-line nodes for us and will handle the hstate accounting.
896 919 */
897 if (!list_empty(&h->hugepage_freelists[nid])) { 920 while (nr_pages--) {
898 page = list_entry(h->hugepage_freelists[nid].next, 921 if (!free_pool_huge_page(h, 1))
899 struct page, lru); 922 break;
900 list_del(&page->lru);
901 update_and_free_page(h, page);
902 h->free_huge_pages--;
903 h->free_huge_pages_node[nid]--;
904 h->surplus_huge_pages--;
905 h->surplus_huge_pages_node[nid]--;
906 nr_pages--;
907 remaining_iterations = num_online_nodes();
908 }
909 } 923 }
910} 924}
911 925
@@ -924,7 +938,7 @@ static long vma_needs_reservation(struct hstate *h,
924 struct address_space *mapping = vma->vm_file->f_mapping; 938 struct address_space *mapping = vma->vm_file->f_mapping;
925 struct inode *inode = mapping->host; 939 struct inode *inode = mapping->host;
926 940
927 if (vma->vm_flags & VM_SHARED) { 941 if (vma->vm_flags & VM_MAYSHARE) {
928 pgoff_t idx = vma_hugecache_offset(h, vma, addr); 942 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
929 return region_chg(&inode->i_mapping->private_list, 943 return region_chg(&inode->i_mapping->private_list,
930 idx, idx + 1); 944 idx, idx + 1);
@@ -949,7 +963,7 @@ static void vma_commit_reservation(struct hstate *h,
949 struct address_space *mapping = vma->vm_file->f_mapping; 963 struct address_space *mapping = vma->vm_file->f_mapping;
950 struct inode *inode = mapping->host; 964 struct inode *inode = mapping->host;
951 965
952 if (vma->vm_flags & VM_SHARED) { 966 if (vma->vm_flags & VM_MAYSHARE) {
953 pgoff_t idx = vma_hugecache_offset(h, vma, addr); 967 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
954 region_add(&inode->i_mapping->private_list, idx, idx + 1); 968 region_add(&inode->i_mapping->private_list, idx, idx + 1);
955 969
@@ -1014,9 +1028,10 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
1014 void *addr; 1028 void *addr;
1015 1029
1016 addr = __alloc_bootmem_node_nopanic( 1030 addr = __alloc_bootmem_node_nopanic(
1017 NODE_DATA(h->hugetlb_next_nid), 1031 NODE_DATA(h->next_nid_to_alloc),
1018 huge_page_size(h), huge_page_size(h), 0); 1032 huge_page_size(h), huge_page_size(h), 0);
1019 1033
1034 hstate_next_node_to_alloc(h);
1020 if (addr) { 1035 if (addr) {
1021 /* 1036 /*
1022 * Use the beginning of the huge page to store the 1037 * Use the beginning of the huge page to store the
@@ -1026,7 +1041,6 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
1026 m = addr; 1041 m = addr;
1027 goto found; 1042 goto found;
1028 } 1043 }
1029 hstate_next_node(h);
1030 nr_nodes--; 1044 nr_nodes--;
1031 } 1045 }
1032 return 0; 1046 return 0;
@@ -1140,6 +1154,53 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count)
1140} 1154}
1141#endif 1155#endif
1142 1156
1157/*
1158 * Increment or decrement surplus_huge_pages. Keep node-specific counters
1159 * balanced by operating on them in a round-robin fashion.
1160 * Returns 1 if an adjustment was made.
1161 */
1162static int adjust_pool_surplus(struct hstate *h, int delta)
1163{
1164 int start_nid, next_nid;
1165 int ret = 0;
1166
1167 VM_BUG_ON(delta != -1 && delta != 1);
1168
1169 if (delta < 0)
1170 start_nid = h->next_nid_to_alloc;
1171 else
1172 start_nid = h->next_nid_to_free;
1173 next_nid = start_nid;
1174
1175 do {
1176 int nid = next_nid;
1177 if (delta < 0) {
1178 next_nid = hstate_next_node_to_alloc(h);
1179 /*
1180 * To shrink on this node, there must be a surplus page
1181 */
1182 if (!h->surplus_huge_pages_node[nid])
1183 continue;
1184 }
1185 if (delta > 0) {
1186 next_nid = hstate_next_node_to_free(h);
1187 /*
1188 * Surplus cannot exceed the total number of pages
1189 */
1190 if (h->surplus_huge_pages_node[nid] >=
1191 h->nr_huge_pages_node[nid])
1192 continue;
1193 }
1194
1195 h->surplus_huge_pages += delta;
1196 h->surplus_huge_pages_node[nid] += delta;
1197 ret = 1;
1198 break;
1199 } while (next_nid != start_nid);
1200
1201 return ret;
1202}
1203
1143#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) 1204#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
1144static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) 1205static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
1145{ 1206{
@@ -1198,10 +1259,8 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
1198 min_count = max(count, min_count); 1259 min_count = max(count, min_count);
1199 try_to_free_low(h, min_count); 1260 try_to_free_low(h, min_count);
1200 while (min_count < persistent_huge_pages(h)) { 1261 while (min_count < persistent_huge_pages(h)) {
1201 struct page *page = dequeue_huge_page(h); 1262 if (!free_pool_huge_page(h, 0))
1202 if (!page)
1203 break; 1263 break;
1204 update_and_free_page(h, page);
1205 } 1264 }
1206 while (count < persistent_huge_pages(h)) { 1265 while (count < persistent_huge_pages(h)) {
1207 if (!adjust_pool_surplus(h, 1)) 1266 if (!adjust_pool_surplus(h, 1))
@@ -1413,7 +1472,8 @@ void __init hugetlb_add_hstate(unsigned order)
1413 h->free_huge_pages = 0; 1472 h->free_huge_pages = 0;
1414 for (i = 0; i < MAX_NUMNODES; ++i) 1473 for (i = 0; i < MAX_NUMNODES; ++i)
1415 INIT_LIST_HEAD(&h->hugepage_freelists[i]); 1474 INIT_LIST_HEAD(&h->hugepage_freelists[i]);
1416 h->hugetlb_next_nid = first_node(node_online_map); 1475 h->next_nid_to_alloc = first_node(node_online_map);
1476 h->next_nid_to_free = first_node(node_online_map);
1417 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", 1477 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
1418 huge_page_size(h)/1024); 1478 huge_page_size(h)/1024);
1419 1479
@@ -1477,7 +1537,7 @@ static unsigned int cpuset_mems_nr(unsigned int *array)
1477 1537
1478#ifdef CONFIG_SYSCTL 1538#ifdef CONFIG_SYSCTL
1479int hugetlb_sysctl_handler(struct ctl_table *table, int write, 1539int hugetlb_sysctl_handler(struct ctl_table *table, int write,
1480 struct file *file, void __user *buffer, 1540 void __user *buffer,
1481 size_t *length, loff_t *ppos) 1541 size_t *length, loff_t *ppos)
1482{ 1542{
1483 struct hstate *h = &default_hstate; 1543 struct hstate *h = &default_hstate;
@@ -1488,7 +1548,7 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write,
1488 1548
1489 table->data = &tmp; 1549 table->data = &tmp;
1490 table->maxlen = sizeof(unsigned long); 1550 table->maxlen = sizeof(unsigned long);
1491 proc_doulongvec_minmax(table, write, file, buffer, length, ppos); 1551 proc_doulongvec_minmax(table, write, buffer, length, ppos);
1492 1552
1493 if (write) 1553 if (write)
1494 h->max_huge_pages = set_max_huge_pages(h, tmp); 1554 h->max_huge_pages = set_max_huge_pages(h, tmp);
@@ -1497,10 +1557,10 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write,
1497} 1557}
1498 1558
1499int hugetlb_treat_movable_handler(struct ctl_table *table, int write, 1559int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
1500 struct file *file, void __user *buffer, 1560 void __user *buffer,
1501 size_t *length, loff_t *ppos) 1561 size_t *length, loff_t *ppos)
1502{ 1562{
1503 proc_dointvec(table, write, file, buffer, length, ppos); 1563 proc_dointvec(table, write, buffer, length, ppos);
1504 if (hugepages_treat_as_movable) 1564 if (hugepages_treat_as_movable)
1505 htlb_alloc_mask = GFP_HIGHUSER_MOVABLE; 1565 htlb_alloc_mask = GFP_HIGHUSER_MOVABLE;
1506 else 1566 else
@@ -1509,7 +1569,7 @@ int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
1509} 1569}
1510 1570
1511int hugetlb_overcommit_handler(struct ctl_table *table, int write, 1571int hugetlb_overcommit_handler(struct ctl_table *table, int write,
1512 struct file *file, void __user *buffer, 1572 void __user *buffer,
1513 size_t *length, loff_t *ppos) 1573 size_t *length, loff_t *ppos)
1514{ 1574{
1515 struct hstate *h = &default_hstate; 1575 struct hstate *h = &default_hstate;
@@ -1520,7 +1580,7 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
1520 1580
1521 table->data = &tmp; 1581 table->data = &tmp;
1522 table->maxlen = sizeof(unsigned long); 1582 table->maxlen = sizeof(unsigned long);
1523 proc_doulongvec_minmax(table, write, file, buffer, length, ppos); 1583 proc_doulongvec_minmax(table, write, buffer, length, ppos);
1524 1584
1525 if (write) { 1585 if (write) {
1526 spin_lock(&hugetlb_lock); 1586 spin_lock(&hugetlb_lock);
@@ -1661,7 +1721,7 @@ static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1661 return 0; 1721 return 0;
1662} 1722}
1663 1723
1664struct vm_operations_struct hugetlb_vm_ops = { 1724const struct vm_operations_struct hugetlb_vm_ops = {
1665 .fault = hugetlb_vm_op_fault, 1725 .fault = hugetlb_vm_op_fault,
1666 .open = hugetlb_vm_op_open, 1726 .open = hugetlb_vm_op_open,
1667 .close = hugetlb_vm_op_close, 1727 .close = hugetlb_vm_op_close,
@@ -1893,7 +1953,7 @@ retry_avoidcopy:
1893 * at the time of fork() could consume its reserves on COW instead 1953 * at the time of fork() could consume its reserves on COW instead
1894 * of the full address range. 1954 * of the full address range.
1895 */ 1955 */
1896 if (!(vma->vm_flags & VM_SHARED) && 1956 if (!(vma->vm_flags & VM_MAYSHARE) &&
1897 is_vma_resv_set(vma, HPAGE_RESV_OWNER) && 1957 is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
1898 old_page != pagecache_page) 1958 old_page != pagecache_page)
1899 outside_reserve = 1; 1959 outside_reserve = 1;
@@ -1956,8 +2016,28 @@ static struct page *hugetlbfs_pagecache_page(struct hstate *h,
1956 return find_lock_page(mapping, idx); 2016 return find_lock_page(mapping, idx);
1957} 2017}
1958 2018
2019/*
 2020 * Return whether there is a pagecache page to back the given address within the VMA.
2021 * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page.
2022 */
2023static bool hugetlbfs_pagecache_present(struct hstate *h,
2024 struct vm_area_struct *vma, unsigned long address)
2025{
2026 struct address_space *mapping;
2027 pgoff_t idx;
2028 struct page *page;
2029
2030 mapping = vma->vm_file->f_mapping;
2031 idx = vma_hugecache_offset(h, vma, address);
2032
2033 page = find_get_page(mapping, idx);
2034 if (page)
2035 put_page(page);
2036 return page != NULL;
2037}
2038
1959static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, 2039static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
1960 unsigned long address, pte_t *ptep, int write_access) 2040 unsigned long address, pte_t *ptep, unsigned int flags)
1961{ 2041{
1962 struct hstate *h = hstate_vma(vma); 2042 struct hstate *h = hstate_vma(vma);
1963 int ret = VM_FAULT_SIGBUS; 2043 int ret = VM_FAULT_SIGBUS;
@@ -2000,7 +2080,7 @@ retry:
2000 clear_huge_page(page, address, huge_page_size(h)); 2080 clear_huge_page(page, address, huge_page_size(h));
2001 __SetPageUptodate(page); 2081 __SetPageUptodate(page);
2002 2082
2003 if (vma->vm_flags & VM_SHARED) { 2083 if (vma->vm_flags & VM_MAYSHARE) {
2004 int err; 2084 int err;
2005 struct inode *inode = mapping->host; 2085 struct inode *inode = mapping->host;
2006 2086
@@ -2025,7 +2105,7 @@ retry:
2025 * any allocations necessary to record that reservation occur outside 2105 * any allocations necessary to record that reservation occur outside
2026 * the spinlock. 2106 * the spinlock.
2027 */ 2107 */
2028 if (write_access && !(vma->vm_flags & VM_SHARED)) 2108 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED))
2029 if (vma_needs_reservation(h, vma, address) < 0) { 2109 if (vma_needs_reservation(h, vma, address) < 0) {
2030 ret = VM_FAULT_OOM; 2110 ret = VM_FAULT_OOM;
2031 goto backout_unlocked; 2111 goto backout_unlocked;
@@ -2044,7 +2124,7 @@ retry:
2044 && (vma->vm_flags & VM_SHARED))); 2124 && (vma->vm_flags & VM_SHARED)));
2045 set_huge_pte_at(mm, address, ptep, new_pte); 2125 set_huge_pte_at(mm, address, ptep, new_pte);
2046 2126
2047 if (write_access && !(vma->vm_flags & VM_SHARED)) { 2127 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
2048 /* Optimization, do the COW without a second fault */ 2128 /* Optimization, do the COW without a second fault */
2049 ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page); 2129 ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page);
2050 } 2130 }
@@ -2063,7 +2143,7 @@ backout_unlocked:
2063} 2143}
2064 2144
2065int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, 2145int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2066 unsigned long address, int write_access) 2146 unsigned long address, unsigned int flags)
2067{ 2147{
2068 pte_t *ptep; 2148 pte_t *ptep;
2069 pte_t entry; 2149 pte_t entry;
@@ -2084,7 +2164,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2084 mutex_lock(&hugetlb_instantiation_mutex); 2164 mutex_lock(&hugetlb_instantiation_mutex);
2085 entry = huge_ptep_get(ptep); 2165 entry = huge_ptep_get(ptep);
2086 if (huge_pte_none(entry)) { 2166 if (huge_pte_none(entry)) {
2087 ret = hugetlb_no_page(mm, vma, address, ptep, write_access); 2167 ret = hugetlb_no_page(mm, vma, address, ptep, flags);
2088 goto out_mutex; 2168 goto out_mutex;
2089 } 2169 }
2090 2170
@@ -2098,13 +2178,13 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2098 * page now as it is used to determine if a reservation has been 2178 * page now as it is used to determine if a reservation has been
2099 * consumed. 2179 * consumed.
2100 */ 2180 */
2101 if (write_access && !pte_write(entry)) { 2181 if ((flags & FAULT_FLAG_WRITE) && !pte_write(entry)) {
2102 if (vma_needs_reservation(h, vma, address) < 0) { 2182 if (vma_needs_reservation(h, vma, address) < 0) {
2103 ret = VM_FAULT_OOM; 2183 ret = VM_FAULT_OOM;
2104 goto out_mutex; 2184 goto out_mutex;
2105 } 2185 }
2106 2186
2107 if (!(vma->vm_flags & VM_SHARED)) 2187 if (!(vma->vm_flags & VM_MAYSHARE))
2108 pagecache_page = hugetlbfs_pagecache_page(h, 2188 pagecache_page = hugetlbfs_pagecache_page(h,
2109 vma, address); 2189 vma, address);
2110 } 2190 }
@@ -2115,7 +2195,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2115 goto out_page_table_lock; 2195 goto out_page_table_lock;
2116 2196
2117 2197
2118 if (write_access) { 2198 if (flags & FAULT_FLAG_WRITE) {
2119 if (!pte_write(entry)) { 2199 if (!pte_write(entry)) {
2120 ret = hugetlb_cow(mm, vma, address, ptep, entry, 2200 ret = hugetlb_cow(mm, vma, address, ptep, entry,
2121 pagecache_page); 2201 pagecache_page);
@@ -2124,7 +2204,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2124 entry = pte_mkdirty(entry); 2204 entry = pte_mkdirty(entry);
2125 } 2205 }
2126 entry = pte_mkyoung(entry); 2206 entry = pte_mkyoung(entry);
2127 if (huge_ptep_set_access_flags(vma, address, ptep, entry, write_access)) 2207 if (huge_ptep_set_access_flags(vma, address, ptep, entry,
2208 flags & FAULT_FLAG_WRITE))
2128 update_mmu_cache(vma, address, entry); 2209 update_mmu_cache(vma, address, entry);
2129 2210
2130out_page_table_lock: 2211out_page_table_lock:
@@ -2150,54 +2231,55 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
2150 return NULL; 2231 return NULL;
2151} 2232}
2152 2233
2153static int huge_zeropage_ok(pte_t *ptep, int write, int shared)
2154{
2155 if (!ptep || write || shared)
2156 return 0;
2157 else
2158 return huge_pte_none(huge_ptep_get(ptep));
2159}
2160
2161int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, 2234int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
2162 struct page **pages, struct vm_area_struct **vmas, 2235 struct page **pages, struct vm_area_struct **vmas,
2163 unsigned long *position, int *length, int i, 2236 unsigned long *position, int *length, int i,
2164 int write) 2237 unsigned int flags)
2165{ 2238{
2166 unsigned long pfn_offset; 2239 unsigned long pfn_offset;
2167 unsigned long vaddr = *position; 2240 unsigned long vaddr = *position;
2168 int remainder = *length; 2241 int remainder = *length;
2169 struct hstate *h = hstate_vma(vma); 2242 struct hstate *h = hstate_vma(vma);
2170 int zeropage_ok = 0;
2171 int shared = vma->vm_flags & VM_SHARED;
2172 2243
2173 spin_lock(&mm->page_table_lock); 2244 spin_lock(&mm->page_table_lock);
2174 while (vaddr < vma->vm_end && remainder) { 2245 while (vaddr < vma->vm_end && remainder) {
2175 pte_t *pte; 2246 pte_t *pte;
2247 int absent;
2176 struct page *page; 2248 struct page *page;
2177 2249
2178 /* 2250 /*
2179 * Some archs (sparc64, sh*) have multiple pte_ts to 2251 * Some archs (sparc64, sh*) have multiple pte_ts to
2180 * each hugepage. We have to make * sure we get the 2252 * each hugepage. We have to make sure we get the
2181 * first, for the page indexing below to work. 2253 * first, for the page indexing below to work.
2182 */ 2254 */
2183 pte = huge_pte_offset(mm, vaddr & huge_page_mask(h)); 2255 pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
2184 if (huge_zeropage_ok(pte, write, shared)) 2256 absent = !pte || huge_pte_none(huge_ptep_get(pte));
2185 zeropage_ok = 1; 2257
2258 /*
2259 * When coredumping, it suits get_dump_page if we just return
2260 * an error where there's an empty slot with no huge pagecache
2261 * to back it. This way, we avoid allocating a hugepage, and
2262 * the sparse dumpfile avoids allocating disk blocks, but its
2263 * huge holes still show up with zeroes where they need to be.
2264 */
2265 if (absent && (flags & FOLL_DUMP) &&
2266 !hugetlbfs_pagecache_present(h, vma, vaddr)) {
2267 remainder = 0;
2268 break;
2269 }
2186 2270
2187 if (!pte || 2271 if (absent ||
2188 (huge_pte_none(huge_ptep_get(pte)) && !zeropage_ok) || 2272 ((flags & FOLL_WRITE) && !pte_write(huge_ptep_get(pte)))) {
2189 (write && !pte_write(huge_ptep_get(pte)))) {
2190 int ret; 2273 int ret;
2191 2274
2192 spin_unlock(&mm->page_table_lock); 2275 spin_unlock(&mm->page_table_lock);
2193 ret = hugetlb_fault(mm, vma, vaddr, write); 2276 ret = hugetlb_fault(mm, vma, vaddr,
2277 (flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0);
2194 spin_lock(&mm->page_table_lock); 2278 spin_lock(&mm->page_table_lock);
2195 if (!(ret & VM_FAULT_ERROR)) 2279 if (!(ret & VM_FAULT_ERROR))
2196 continue; 2280 continue;
2197 2281
2198 remainder = 0; 2282 remainder = 0;
2199 if (!i)
2200 i = -EFAULT;
2201 break; 2283 break;
2202 } 2284 }
2203 2285
@@ -2205,10 +2287,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
2205 page = pte_page(huge_ptep_get(pte)); 2287 page = pte_page(huge_ptep_get(pte));
2206same_page: 2288same_page:
2207 if (pages) { 2289 if (pages) {
2208 if (zeropage_ok) 2290 pages[i] = mem_map_offset(page, pfn_offset);
2209 pages[i] = ZERO_PAGE(0);
2210 else
2211 pages[i] = mem_map_offset(page, pfn_offset);
2212 get_page(pages[i]); 2291 get_page(pages[i]);
2213 } 2292 }
2214 2293
@@ -2232,7 +2311,7 @@ same_page:
2232 *length = remainder; 2311 *length = remainder;
2233 *position = vaddr; 2312 *position = vaddr;
2234 2313
2235 return i; 2314 return i ? i : -EFAULT;
2236} 2315}
2237 2316
2238void hugetlb_change_protection(struct vm_area_struct *vma, 2317void hugetlb_change_protection(struct vm_area_struct *vma,
@@ -2289,7 +2368,7 @@ int hugetlb_reserve_pages(struct inode *inode,
2289 * to reserve the full area even if read-only as mprotect() may be 2368 * to reserve the full area even if read-only as mprotect() may be
2290 * called to make the mapping read-write. Assume !vma is a shm mapping 2369 * called to make the mapping read-write. Assume !vma is a shm mapping
2291 */ 2370 */
2292 if (!vma || vma->vm_flags & VM_SHARED) 2371 if (!vma || vma->vm_flags & VM_MAYSHARE)
2293 chg = region_chg(&inode->i_mapping->private_list, from, to); 2372 chg = region_chg(&inode->i_mapping->private_list, from, to);
2294 else { 2373 else {
2295 struct resv_map *resv_map = resv_map_alloc(); 2374 struct resv_map *resv_map = resv_map_alloc();
@@ -2330,7 +2409,7 @@ int hugetlb_reserve_pages(struct inode *inode,
2330 * consumed reservations are stored in the map. Hence, nothing 2409 * consumed reservations are stored in the map. Hence, nothing
2331 * else has to be done for private mappings here 2410 * else has to be done for private mappings here
2332 */ 2411 */
2333 if (!vma || vma->vm_flags & VM_SHARED) 2412 if (!vma || vma->vm_flags & VM_MAYSHARE)
2334 region_add(&inode->i_mapping->private_list, from, to); 2413 region_add(&inode->i_mapping->private_list, from, to);
2335 return 0; 2414 return 0;
2336} 2415}
@@ -2341,7 +2420,7 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
2341 long chg = region_truncate(&inode->i_mapping->private_list, offset); 2420 long chg = region_truncate(&inode->i_mapping->private_list, offset);
2342 2421
2343 spin_lock(&inode->i_lock); 2422 spin_lock(&inode->i_lock);
2344 inode->i_blocks -= blocks_per_huge_page(h); 2423 inode->i_blocks -= (blocks_per_huge_page(h) * freed);
2345 spin_unlock(&inode->i_lock); 2424 spin_unlock(&inode->i_lock);
2346 2425
2347 hugetlb_put_quota(inode->i_mapping, (chg - freed)); 2426 hugetlb_put_quota(inode->i_mapping, (chg - freed));
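
Several of the hugetlb changes above (hstate_next_node_to_alloc(), hstate_next_node_to_free(), free_pool_huge_page() and the reworked adjust_pool_surplus()) share one pattern: walk the online nodes round-robin from a saved cursor, wrap at the end of the mask, and give up after one full lap with no work done. A userspace sketch of that pattern under simplifying assumptions (a fixed node count and a per-node boolean standing in for the real free lists):

/* Round-robin walk over nodes: free one page from the next node that has
 * one, keep the cursor moving so the frees stay balanced, and return -1
 * after a full lap with nothing to free. */
#include <stdio.h>
#include <stdbool.h>

#define MAX_NODES 4

static int next_node(int nid)
{
	return (nid + 1) % MAX_NODES;	/* wrap, like first_node(node_online_map) */
}

static int free_one_page(int *cursor, bool has_page[MAX_NODES])
{
	int start = *cursor, nid = start;

	do {
		if (has_page[nid]) {
			has_page[nid] = false;		/* "free" a page from this node */
			*cursor = next_node(nid);	/* keep the walk balanced */
			return nid;
		}
		nid = next_node(nid);
	} while (nid != start);

	return -1;				/* no node had a free page */
}

int main(void)
{
	bool has_page[MAX_NODES] = { false, true, false, true };
	int cursor = 0;

	printf("freed from node %d\n", free_one_page(&cursor, has_page));
	printf("freed from node %d\n", free_one_page(&cursor, has_page));
	printf("freed from node %d\n", free_one_page(&cursor, has_page));
	return 0;
}
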
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
new file mode 100644
index 000000000000..e1d85137f086
--- /dev/null
+++ b/mm/hwpoison-inject.c
@@ -0,0 +1,41 @@
 1/* Inject a hwpoison memory failure on an arbitrary pfn */
2#include <linux/module.h>
3#include <linux/debugfs.h>
4#include <linux/kernel.h>
5#include <linux/mm.h>
6
7static struct dentry *hwpoison_dir, *corrupt_pfn;
8
9static int hwpoison_inject(void *data, u64 val)
10{
11 if (!capable(CAP_SYS_ADMIN))
12 return -EPERM;
13 printk(KERN_INFO "Injecting memory failure at pfn %Lx\n", val);
14 return __memory_failure(val, 18, 0);
15}
16
17DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n");
18
19static void pfn_inject_exit(void)
20{
21 if (hwpoison_dir)
22 debugfs_remove_recursive(hwpoison_dir);
23}
24
25static int pfn_inject_init(void)
26{
27 hwpoison_dir = debugfs_create_dir("hwpoison", NULL);
28 if (hwpoison_dir == NULL)
29 return -ENOMEM;
30 corrupt_pfn = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir,
31 NULL, &hwpoison_fops);
32 if (corrupt_pfn == NULL) {
33 pfn_inject_exit();
34 return -ENOMEM;
35 }
36 return 0;
37}
38
39module_init(pfn_inject_init);
40module_exit(pfn_inject_exit);
41MODULE_LICENSE("GPL");
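
The new hwpoison-inject module exposes a single debugfs file; writing a pfn to it makes the module call __memory_failure() on that frame. A sketch of driving it from userspace, assuming debugfs is mounted at /sys/kernel/debug, that the caller has CAP_SYS_ADMIN, and using a placeholder pfn:

/* Write a (placeholder) pfn to the corrupt-pfn file created above. */
#include <stdio.h>

int main(void)
{
	const char *path = "/sys/kernel/debug/hwpoison/corrupt-pfn";
	unsigned long long pfn = 0x12345;	/* placeholder pfn, not a real target */
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return 1;
	}
	fprintf(f, "%llu\n", pfn);
	fclose(f);
	return 0;
}
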
diff --git a/mm/init-mm.c b/mm/init-mm.c
new file mode 100644
index 000000000000..57aba0da9668
--- /dev/null
+++ b/mm/init-mm.c
@@ -0,0 +1,20 @@
1#include <linux/mm_types.h>
2#include <linux/rbtree.h>
3#include <linux/rwsem.h>
4#include <linux/spinlock.h>
5#include <linux/list.h>
6#include <linux/cpumask.h>
7
8#include <asm/atomic.h>
9#include <asm/pgtable.h>
10
11struct mm_struct init_mm = {
12 .mm_rb = RB_ROOT,
13 .pgd = swapper_pg_dir,
14 .mm_users = ATOMIC_INIT(2),
15 .mm_count = ATOMIC_INIT(1),
16 .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem),
17 .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
18 .mmlist = LIST_HEAD_INIT(init_mm.mmlist),
19 .cpu_vm_mask = CPU_MASK_ALL,
20};
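
init_mm above uses the kernel's self-referential static-initializer idiom: macros such as LIST_HEAD_INIT() and __RWSEM_INITIALIZER() name the very field they are initializing. A tiny userspace illustration of the same idiom, with made-up struct and macro names:

/* A static object whose initializer refers to its own fields, as
 * LIST_HEAD_INIT(init_mm.mmlist) does above.  Illustrative names only. */
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };
#define LIST_HEAD_INIT(name) { &(name), &(name) }

struct mm_like {
	int users;
	struct list_head mmlist;
};

static struct mm_like demo_mm = {
	.users  = 2,
	.mmlist = LIST_HEAD_INIT(demo_mm.mmlist),	/* points at its own field */
};

int main(void)
{
	printf("empty list points to itself: %d\n",
	       demo_mm.mmlist.next == &demo_mm.mmlist);
	return 0;
}
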
diff --git a/mm/internal.h b/mm/internal.h
index 987bb03fbdd8..22ec8d2b0fb8 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -16,9 +16,6 @@
16void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, 16void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
17 unsigned long floor, unsigned long ceiling); 17 unsigned long floor, unsigned long ceiling);
18 18
19extern void prep_compound_page(struct page *page, unsigned long order);
20extern void prep_compound_gigantic_page(struct page *page, unsigned long order);
21
22static inline void set_page_count(struct page *page, int v) 19static inline void set_page_count(struct page *page, int v)
23{ 20{
24 atomic_set(&page->_count, v); 21 atomic_set(&page->_count, v);
@@ -40,6 +37,8 @@ static inline void __put_page(struct page *page)
40 atomic_dec(&page->_count); 37 atomic_dec(&page->_count);
41} 38}
42 39
40extern unsigned long highest_memmap_pfn;
41
43/* 42/*
44 * in mm/vmscan.c: 43 * in mm/vmscan.c:
45 */ 44 */
@@ -49,8 +48,9 @@ extern void putback_lru_page(struct page *page);
49/* 48/*
50 * in mm/page_alloc.c 49 * in mm/page_alloc.c
51 */ 50 */
52extern unsigned long highest_memmap_pfn;
53extern void __free_pages_bootmem(struct page *page, unsigned int order); 51extern void __free_pages_bootmem(struct page *page, unsigned int order);
52extern void prep_compound_page(struct page *page, unsigned long order);
53
54 54
55/* 55/*
56 * function for dealing with page's order in buddy system. 56 * function for dealing with page's order in buddy system.
@@ -74,7 +74,6 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
74} 74}
75#endif 75#endif
76 76
77#ifdef CONFIG_UNEVICTABLE_LRU
78/* 77/*
79 * unevictable_migrate_page() called only from migrate_page_copy() to 78 * unevictable_migrate_page() called only from migrate_page_copy() to
80 * migrate unevictable flag to new page. 79 * migrate unevictable flag to new page.
@@ -86,11 +85,6 @@ static inline void unevictable_migrate_page(struct page *new, struct page *old)
86 if (TestClearPageUnevictable(old)) 85 if (TestClearPageUnevictable(old))
87 SetPageUnevictable(new); 86 SetPageUnevictable(new);
88} 87}
89#else
90static inline void unevictable_migrate_page(struct page *new, struct page *old)
91{
92}
93#endif
94 88
95#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT 89#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
96/* 90/*
@@ -150,23 +144,6 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
150 } 144 }
151} 145}
152 146
153/*
154 * free_page_mlock() -- clean up attempts to free and mlocked() page.
155 * Page should not be on lru, so no need to fix that up.
156 * free_pages_check() will verify...
157 */
158static inline void free_page_mlock(struct page *page)
159{
160 if (unlikely(TestClearPageMlocked(page))) {
161 unsigned long flags;
162
163 local_irq_save(flags);
164 __dec_zone_page_state(page, NR_MLOCK);
165 __count_vm_event(UNEVICTABLE_MLOCKFREED);
166 local_irq_restore(flags);
167 }
168}
169
170#else /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ 147#else /* CONFIG_HAVE_MLOCKED_PAGE_BIT */
171static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) 148static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
172{ 149{
@@ -175,7 +152,6 @@ static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
175static inline void clear_page_mlock(struct page *page) { } 152static inline void clear_page_mlock(struct page *page) { }
176static inline void mlock_vma_page(struct page *page) { } 153static inline void mlock_vma_page(struct page *page) { }
177static inline void mlock_migrate_page(struct page *new, struct page *old) { } 154static inline void mlock_migrate_page(struct page *new, struct page *old) { }
178static inline void free_page_mlock(struct page *page) { }
179 155
180#endif /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ 156#endif /* CONFIG_HAVE_MLOCKED_PAGE_BIT */
181 157
@@ -275,13 +251,12 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
275} 251}
276#endif /* CONFIG_SPARSEMEM */ 252#endif /* CONFIG_SPARSEMEM */
277 253
278#define GUP_FLAGS_WRITE 0x1
279#define GUP_FLAGS_FORCE 0x2
280#define GUP_FLAGS_IGNORE_VMA_PERMISSIONS 0x4
281#define GUP_FLAGS_IGNORE_SIGKILL 0x8
282
283int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 254int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
284 unsigned long start, int len, int flags, 255 unsigned long start, int len, unsigned int foll_flags,
285 struct page **pages, struct vm_area_struct **vmas); 256 struct page **pages, struct vm_area_struct **vmas);
286 257
258#define ZONE_RECLAIM_NOSCAN -2
259#define ZONE_RECLAIM_FULL -1
260#define ZONE_RECLAIM_SOME 0
261#define ZONE_RECLAIM_SUCCESS 1
287#endif 262#endif
diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c
new file mode 100644
index 000000000000..fd814fd61319
--- /dev/null
+++ b/mm/kmemcheck.c
@@ -0,0 +1,122 @@
1#include <linux/gfp.h>
2#include <linux/mm_types.h>
3#include <linux/mm.h>
4#include <linux/slab.h>
5#include <linux/kmemcheck.h>
6
7void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node)
8{
9 struct page *shadow;
10 int pages;
11 int i;
12
13 pages = 1 << order;
14
15 /*
16 * With kmemcheck enabled, we need to allocate a memory area for the
17 * shadow bits as well.
18 */
19 shadow = alloc_pages_node(node, flags | __GFP_NOTRACK, order);
20 if (!shadow) {
21 if (printk_ratelimit())
22 printk(KERN_ERR "kmemcheck: failed to allocate "
23 "shadow bitmap\n");
24 return;
25 }
26
27 for(i = 0; i < pages; ++i)
28 page[i].shadow = page_address(&shadow[i]);
29
30 /*
31 * Mark it as non-present for the MMU so that our accesses to
32 * this memory will trigger a page fault and let us analyze
33 * the memory accesses.
34 */
35 kmemcheck_hide_pages(page, pages);
36}
37
38void kmemcheck_free_shadow(struct page *page, int order)
39{
40 struct page *shadow;
41 int pages;
42 int i;
43
44 if (!kmemcheck_page_is_tracked(page))
45 return;
46
47 pages = 1 << order;
48
49 kmemcheck_show_pages(page, pages);
50
51 shadow = virt_to_page(page[0].shadow);
52
53 for(i = 0; i < pages; ++i)
54 page[i].shadow = NULL;
55
56 __free_pages(shadow, order);
57}
58
59void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object,
60 size_t size)
61{
62 /*
63 * Has already been memset(), which initializes the shadow for us
64 * as well.
65 */
66 if (gfpflags & __GFP_ZERO)
67 return;
68
69 /* No need to initialize the shadow of a non-tracked slab. */
70 if (s->flags & SLAB_NOTRACK)
71 return;
72
73 if (!kmemcheck_enabled || gfpflags & __GFP_NOTRACK) {
74 /*
75 * Allow notracked objects to be allocated from
76 * tracked caches. Note however that these objects
77 * will still get page faults on access, they just
78 * won't ever be flagged as uninitialized. If page
79 * faults are not acceptable, the slab cache itself
80 * should be marked NOTRACK.
81 */
82 kmemcheck_mark_initialized(object, size);
83 } else if (!s->ctor) {
84 /*
85 * New objects should be marked uninitialized before
86		 * they're returned to the caller.
87 */
88 kmemcheck_mark_uninitialized(object, size);
89 }
90}
91
92void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size)
93{
94 /* TODO: RCU freeing is unsupported for now; hide false positives. */
95 if (!s->ctor && !(s->flags & SLAB_DESTROY_BY_RCU))
96 kmemcheck_mark_freed(object, size);
97}
98
99void kmemcheck_pagealloc_alloc(struct page *page, unsigned int order,
100 gfp_t gfpflags)
101{
102 int pages;
103
104 if (gfpflags & (__GFP_HIGHMEM | __GFP_NOTRACK))
105 return;
106
107 pages = 1 << order;
108
109 /*
110 * NOTE: We choose to track GFP_ZERO pages too; in fact, they
111 * can become uninitialized by copying uninitialized memory
112 * into them.
113 */
114
115 /* XXX: Can use zone->node for node? */
116 kmemcheck_alloc_shadow(page, order, gfpflags, -1);
117
118 if (gfpflags & __GFP_ZERO)
119 kmemcheck_mark_initialized_pages(page, pages);
120 else
121 kmemcheck_mark_uninitialized_pages(page, pages);
122}
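kmemcheck only reports on allocations it tracks, and the code above shows the two opt-out knobs: SLAB_NOTRACK leaves a whole cache untracked, while __GFP_NOTRACK exempts a single allocation from a tracked cache (such an object still takes the shadow-page faults, it just is never flagged as uninitialized). A hedged usage sketch follows; the cache name, size and function names are made up.

#include <linux/init.h>
#include <linux/slab.h>
#include <linux/gfp.h>

static struct kmem_cache *my_cache;

static int __init my_driver_init(void)
{
	/* whole cache opted out of kmemcheck tracking */
	my_cache = kmem_cache_create("my_cache", 128, 0, SLAB_NOTRACK, NULL);
	if (!my_cache)
		return -ENOMEM;

	/* single allocation opted out of a normally tracked cache */
	kfree(kmalloc(256, GFP_KERNEL | __GFP_NOTRACK));

	return 0;
}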
diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c
new file mode 100644
index 000000000000..177a5169bbde
--- /dev/null
+++ b/mm/kmemleak-test.c
@@ -0,0 +1,111 @@
1/*
2 * mm/kmemleak-test.c
3 *
4 * Copyright (C) 2008 ARM Limited
5 * Written by Catalin Marinas <catalin.marinas@arm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
20
21#include <linux/init.h>
22#include <linux/kernel.h>
23#include <linux/module.h>
24#include <linux/slab.h>
25#include <linux/vmalloc.h>
26#include <linux/list.h>
27#include <linux/percpu.h>
28#include <linux/fdtable.h>
29
30#include <linux/kmemleak.h>
31
32struct test_node {
33 long header[25];
34 struct list_head list;
35 long footer[25];
36};
37
38static LIST_HEAD(test_list);
39static DEFINE_PER_CPU(void *, kmemleak_test_pointer);
40
41/*
42 * Some very simple testing. This function needs to be extended for
43 * proper testing.
44 */
45static int __init kmemleak_test_init(void)
46{
47 struct test_node *elem;
48 int i;
49
50 printk(KERN_INFO "Kmemleak testing\n");
51
52 /* make some orphan objects */
53 pr_info("kmemleak: kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL));
54 pr_info("kmemleak: kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL));
55 pr_info("kmemleak: kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL));
56 pr_info("kmemleak: kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL));
57 pr_info("kmemleak: kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL));
58 pr_info("kmemleak: kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL));
59 pr_info("kmemleak: kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL));
60 pr_info("kmemleak: kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL));
61#ifndef CONFIG_MODULES
62 pr_info("kmemleak: kmem_cache_alloc(files_cachep) = %p\n",
63 kmem_cache_alloc(files_cachep, GFP_KERNEL));
64 pr_info("kmemleak: kmem_cache_alloc(files_cachep) = %p\n",
65 kmem_cache_alloc(files_cachep, GFP_KERNEL));
66#endif
67 pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64));
68 pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64));
69 pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64));
70 pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64));
71 pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64));
72
73 /*
74 * Add elements to a list. They should only appear as orphan
75 * after the module is removed.
76 */
77 for (i = 0; i < 10; i++) {
78 elem = kmalloc(sizeof(*elem), GFP_KERNEL);
79 pr_info("kmemleak: kmalloc(sizeof(*elem)) = %p\n", elem);
80 if (!elem)
81 return -ENOMEM;
82 memset(elem, 0, sizeof(*elem));
83 INIT_LIST_HEAD(&elem->list);
84
85 list_add_tail(&elem->list, &test_list);
86 }
87
88 for_each_possible_cpu(i) {
89 per_cpu(kmemleak_test_pointer, i) = kmalloc(129, GFP_KERNEL);
90 pr_info("kmemleak: kmalloc(129) = %p\n",
91 per_cpu(kmemleak_test_pointer, i));
92 }
93
94 return 0;
95}
96module_init(kmemleak_test_init);
97
98static void __exit kmemleak_test_exit(void)
99{
100 struct test_node *elem, *tmp;
101
102 /*
103 * Remove the list elements without actually freeing the
104 * memory.
105 */
106 list_for_each_entry_safe(elem, tmp, &test_list, list)
107 list_del(&elem->list);
108}
109module_exit(kmemleak_test_exit);
110
111MODULE_LICENSE("GPL");
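The init function above notes that it needs to be extended for proper testing. One plausible extra case, sketched here only as an illustration (the function below is not part of the patch and would be called from kmemleak_test_init()), is a true leak whose sole reference is obfuscated, since the kmemleak scanner only recognises plain pointer values stored in scanned memory:

/*
 * Hypothetical extra test: the only reference is XOR-ed, so the block
 * should show up as unreferenced after the next scan.
 */
static unsigned long hidden_ref;

static void __init kmemleak_test_hidden_pointer(void)
{
	void *obj = kmalloc(64, GFP_KERNEL);

	pr_info("kmemleak: hidden kmalloc(64) = %p\n", obj);
	hidden_ref = (unsigned long)obj ^ 0x5a5a5a5aUL;
}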
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
new file mode 100644
index 000000000000..8bf765c4f58d
--- /dev/null
+++ b/mm/kmemleak.c
@@ -0,0 +1,1689 @@
1/*
2 * mm/kmemleak.c
3 *
4 * Copyright (C) 2008 ARM Limited
5 * Written by Catalin Marinas <catalin.marinas@arm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 *
20 *
21 * For more information on the algorithm and kmemleak usage, please see
22 * Documentation/kmemleak.txt.
23 *
24 * Notes on locking
25 * ----------------
26 *
27 * The following locks and mutexes are used by kmemleak:
28 *
29 * - kmemleak_lock (rwlock): protects the object_list modifications and
30 * accesses to the object_tree_root. The object_list is the main list
31 * holding the metadata (struct kmemleak_object) for the allocated memory
32 * blocks. The object_tree_root is a priority search tree used to look-up
33 * metadata based on a pointer to the corresponding memory block. The
34 * kmemleak_object structures are added to the object_list and
35 * object_tree_root in the create_object() function called from the
36 * kmemleak_alloc() callback and removed in delete_object() called from the
37 * kmemleak_free() callback
38 * - kmemleak_object.lock (spinlock): protects a kmemleak_object. Accesses to
39 * the metadata (e.g. count) are protected by this lock. Note that some
40 * members of this structure may be protected by other means (atomic or
41 * kmemleak_lock). This lock is also held when scanning the corresponding
42 * memory block to avoid the kernel freeing it via the kmemleak_free()
43 * callback. This is less heavyweight than holding a global lock like
44 * kmemleak_lock during scanning
45 * - scan_mutex (mutex): ensures that only one thread may scan the memory for
46 * unreferenced objects at a time. The gray_list contains the objects which
47 * are already referenced or marked as false positives and need to be
48 * scanned. This list is only modified during a scanning episode when the
49 * scan_mutex is held. At the end of a scan, the gray_list is always empty.
50 * Note that the kmemleak_object.use_count is incremented when an object is
51 * added to the gray_list and therefore cannot be freed. This mutex also
52 * prevents multiple users of the "kmemleak" debugfs file together with
53 * modifications to the memory scanning parameters including the scan_thread
54 * pointer
55 *
56 * The kmemleak_object structures have a use_count incremented or decremented
57 * using the get_object()/put_object() functions. When the use_count becomes
58 * 0, this count can no longer be incremented and put_object() schedules the
59 * kmemleak_object freeing via an RCU callback. All calls to the get_object()
60 * function must be protected by rcu_read_lock() to avoid accessing a freed
61 * structure.
62 */
63
64#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
65
66#include <linux/init.h>
67#include <linux/kernel.h>
68#include <linux/list.h>
69#include <linux/sched.h>
70#include <linux/jiffies.h>
71#include <linux/delay.h>
72#include <linux/module.h>
73#include <linux/kthread.h>
74#include <linux/prio_tree.h>
75#include <linux/gfp.h>
76#include <linux/fs.h>
77#include <linux/debugfs.h>
78#include <linux/seq_file.h>
79#include <linux/cpumask.h>
80#include <linux/spinlock.h>
81#include <linux/mutex.h>
82#include <linux/rcupdate.h>
83#include <linux/stacktrace.h>
84#include <linux/cache.h>
85#include <linux/percpu.h>
86#include <linux/hardirq.h>
87#include <linux/mmzone.h>
88#include <linux/slab.h>
89#include <linux/thread_info.h>
90#include <linux/err.h>
91#include <linux/uaccess.h>
92#include <linux/string.h>
93#include <linux/nodemask.h>
94#include <linux/mm.h>
95#include <linux/workqueue.h>
96
97#include <asm/sections.h>
98#include <asm/processor.h>
99#include <asm/atomic.h>
100
101#include <linux/kmemcheck.h>
102#include <linux/kmemleak.h>
103
104/*
105 * Kmemleak configuration and common defines.
106 */
107#define MAX_TRACE 16 /* stack trace length */
108#define MSECS_MIN_AGE 5000 /* minimum object age for reporting */
109#define SECS_FIRST_SCAN 60 /* delay before the first scan */
110#define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */
111#define GRAY_LIST_PASSES 25 /* maximum number of gray list scans */
112#define MAX_SCAN_SIZE 4096 /* maximum size of a scanned block */
113
114#define BYTES_PER_POINTER sizeof(void *)
115
116/* GFP bitmask for kmemleak internal allocations */
117#define GFP_KMEMLEAK_MASK (GFP_KERNEL | GFP_ATOMIC)
118
119/* scanning area inside a memory block */
120struct kmemleak_scan_area {
121 struct hlist_node node;
122 unsigned long offset;
123 size_t length;
124};
125
126#define KMEMLEAK_GREY 0
127#define KMEMLEAK_BLACK -1
128
129/*
130 * Structure holding the metadata for each allocated memory block.
131 * Modifications to such objects should be made while holding the
132 * object->lock. Insertions or deletions from object_list, gray_list or
133 * tree_node are already protected by the corresponding locks or mutex (see
134 * the notes on locking above). These objects are reference-counted
135 * (use_count) and freed using the RCU mechanism.
136 */
137struct kmemleak_object {
138 spinlock_t lock;
139 unsigned long flags; /* object status flags */
140 struct list_head object_list;
141 struct list_head gray_list;
142 struct prio_tree_node tree_node;
143 struct rcu_head rcu; /* object_list lockless traversal */
144 /* object usage count; object freed when use_count == 0 */
145 atomic_t use_count;
146 unsigned long pointer;
147 size_t size;
148	/* minimum number of pointers found before it is considered a leak */
149 int min_count;
150 /* the total number of pointers found pointing to this object */
151 int count;
152 /* memory ranges to be scanned inside an object (empty for all) */
153 struct hlist_head area_list;
154 unsigned long trace[MAX_TRACE];
155 unsigned int trace_len;
156 unsigned long jiffies; /* creation timestamp */
157 pid_t pid; /* pid of the current task */
158 char comm[TASK_COMM_LEN]; /* executable name */
159};
160
161/* flag representing the memory block allocation status */
162#define OBJECT_ALLOCATED (1 << 0)
163/* flag set after the first reporting of an unreferenced object */
164#define OBJECT_REPORTED (1 << 1)
165/* flag set to not scan the object */
166#define OBJECT_NO_SCAN (1 << 2)
167/* flag set on newly allocated objects */
168#define OBJECT_NEW (1 << 3)
169
170/* number of bytes to print per line; must be 16 or 32 */
171#define HEX_ROW_SIZE 16
172/* number of bytes to print at a time (1, 2, 4, 8) */
173#define HEX_GROUP_SIZE 1
174/* include ASCII after the hex output */
175#define HEX_ASCII 1
176/* max number of lines to be printed */
177#define HEX_MAX_LINES 2
178
179/* the list of all allocated objects */
180static LIST_HEAD(object_list);
181/* the list of gray-colored objects (see color_gray comment below) */
182static LIST_HEAD(gray_list);
183/* prio search tree for object boundaries */
184static struct prio_tree_root object_tree_root;
185/* rw_lock protecting the access to object_list and prio_tree_root */
186static DEFINE_RWLOCK(kmemleak_lock);
187
188/* allocation caches for kmemleak internal data */
189static struct kmem_cache *object_cache;
190static struct kmem_cache *scan_area_cache;
191
192/* set if tracing memory operations is enabled */
193static atomic_t kmemleak_enabled = ATOMIC_INIT(0);
194/* set in the late_initcall if there were no errors */
195static atomic_t kmemleak_initialized = ATOMIC_INIT(0);
196/* enables or disables early logging of the memory operations */
197static atomic_t kmemleak_early_log = ATOMIC_INIT(1);
198/* set if a fatal kmemleak error has occurred */
199static atomic_t kmemleak_error = ATOMIC_INIT(0);
200
201/* minimum and maximum address that may be valid pointers */
202static unsigned long min_addr = ULONG_MAX;
203static unsigned long max_addr;
204
205static struct task_struct *scan_thread;
206/* used to avoid reporting of recently allocated objects */
207static unsigned long jiffies_min_age;
208static unsigned long jiffies_last_scan;
209/* delay between automatic memory scannings */
210static signed long jiffies_scan_wait;
211/* enables or disables the task stacks scanning */
212static int kmemleak_stack_scan = 1;
213/* protects the memory scanning, parameters and debug/kmemleak file access */
214static DEFINE_MUTEX(scan_mutex);
215
216/*
217 * Early object allocation/freeing logging. Kmemleak is initialized after the
218 * kernel allocator. However, both the kernel allocator and kmemleak may
219 * allocate memory blocks which need to be tracked. Kmemleak defines an
220 * arbitrary buffer to hold the allocation/freeing information before it is
221 * fully initialized.
222 */
223
224/* kmemleak operation type for early logging */
225enum {
226 KMEMLEAK_ALLOC,
227 KMEMLEAK_FREE,
228 KMEMLEAK_FREE_PART,
229 KMEMLEAK_NOT_LEAK,
230 KMEMLEAK_IGNORE,
231 KMEMLEAK_SCAN_AREA,
232 KMEMLEAK_NO_SCAN
233};
234
235/*
236 * Structure holding the information passed to kmemleak callbacks during the
237 * early logging.
238 */
239struct early_log {
240 int op_type; /* kmemleak operation type */
241 const void *ptr; /* allocated/freed memory block */
242 size_t size; /* memory block size */
243 int min_count; /* minimum reference count */
244 unsigned long offset; /* scan area offset */
245 size_t length; /* scan area length */
246 unsigned long trace[MAX_TRACE]; /* stack trace */
247 unsigned int trace_len; /* stack trace length */
248};
249
250/* early logging buffer and current position */
251static struct early_log
252 early_log[CONFIG_DEBUG_KMEMLEAK_EARLY_LOG_SIZE] __initdata;
253static int crt_early_log __initdata;
254
255static void kmemleak_disable(void);
256
257/*
258 * Print a warning and dump the stack trace.
259 */
260#define kmemleak_warn(x...) do { \
261 pr_warning(x); \
262 dump_stack(); \
263} while (0)
264
265/*
266 * Macro invoked when a serious kmemleak condition has occurred and cannot be
267 * recovered from. Kmemleak will be disabled and further allocation/freeing
268 * tracing is no longer available.
269 */
270#define kmemleak_stop(x...) do { \
271 kmemleak_warn(x); \
272 kmemleak_disable(); \
273} while (0)
274
275/*
276 * Print the object's hex dump to the seq file. The number of lines to be
277 * printed is limited to HEX_MAX_LINES to prevent seq file spamming. The
278 * actual number of printed bytes depends on HEX_ROW_SIZE. It must be called
279 * with the object->lock held.
280 */
281static void hex_dump_object(struct seq_file *seq,
282 struct kmemleak_object *object)
283{
284 const u8 *ptr = (const u8 *)object->pointer;
285 int i, len, remaining;
286 unsigned char linebuf[HEX_ROW_SIZE * 5];
287
288 /* limit the number of lines to HEX_MAX_LINES */
289 remaining = len =
290 min(object->size, (size_t)(HEX_MAX_LINES * HEX_ROW_SIZE));
291
292 seq_printf(seq, " hex dump (first %d bytes):\n", len);
293 for (i = 0; i < len; i += HEX_ROW_SIZE) {
294 int linelen = min(remaining, HEX_ROW_SIZE);
295
296 remaining -= HEX_ROW_SIZE;
297 hex_dump_to_buffer(ptr + i, linelen, HEX_ROW_SIZE,
298 HEX_GROUP_SIZE, linebuf, sizeof(linebuf),
299 HEX_ASCII);
300 seq_printf(seq, " %s\n", linebuf);
301 }
302}
303
304/*
305 * Object colors, encoded with count and min_count:
306 * - white - orphan object, not enough references to it (count < min_count)
307 * - gray - not orphan, not marked as false positive (min_count == 0) or
308 * sufficient references to it (count >= min_count)
309 * - black - ignore, it doesn't contain references (e.g. text section)
310 * (min_count == -1). No function defined for this color.
311 * Newly created objects don't have any color assigned (object->count == -1)
312 * before the next memory scan when they become white.
313 */
314static bool color_white(const struct kmemleak_object *object)
315{
316 return object->count != KMEMLEAK_BLACK &&
317 object->count < object->min_count;
318}
319
320static bool color_gray(const struct kmemleak_object *object)
321{
322 return object->min_count != KMEMLEAK_BLACK &&
323 object->count >= object->min_count;
324}
325
326static bool color_black(const struct kmemleak_object *object)
327{
328 return object->min_count == KMEMLEAK_BLACK;
329}
330
331/*
332 * Objects are considered unreferenced only if their color is white, they have
333 * not been deleted and have a minimum age to avoid false positives caused by
334 * pointers temporarily stored in CPU registers.
335 */
336static bool unreferenced_object(struct kmemleak_object *object)
337{
338 return (object->flags & OBJECT_ALLOCATED) && color_white(object) &&
339 time_before_eq(object->jiffies + jiffies_min_age,
340 jiffies_last_scan);
341}
342
343/*
344 * Print the unreferenced object's information to the seq file. The
345 * print_unreferenced function must be called with the object->lock held.
346 */
347static void print_unreferenced(struct seq_file *seq,
348 struct kmemleak_object *object)
349{
350 int i;
351
352 seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n",
353 object->pointer, object->size);
354 seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu\n",
355 object->comm, object->pid, object->jiffies);
356 hex_dump_object(seq, object);
357 seq_printf(seq, " backtrace:\n");
358
359 for (i = 0; i < object->trace_len; i++) {
360 void *ptr = (void *)object->trace[i];
361 seq_printf(seq, " [<%p>] %pS\n", ptr, ptr);
362 }
363}
364
365/*
366 * Print the kmemleak_object information. This function is used mainly for
367 * debugging special cases of kmemleak operations. It must be called with
368 * the object->lock held.
369 */
370static void dump_object_info(struct kmemleak_object *object)
371{
372 struct stack_trace trace;
373
374 trace.nr_entries = object->trace_len;
375 trace.entries = object->trace;
376
377 pr_notice("Object 0x%08lx (size %zu):\n",
378 object->tree_node.start, object->size);
379 pr_notice(" comm \"%s\", pid %d, jiffies %lu\n",
380 object->comm, object->pid, object->jiffies);
381 pr_notice(" min_count = %d\n", object->min_count);
382 pr_notice(" count = %d\n", object->count);
383 pr_notice(" flags = 0x%lx\n", object->flags);
384 pr_notice(" backtrace:\n");
385 print_stack_trace(&trace, 4);
386}
387
388/*
389 * Look up the metadata (kmemleak_object) of a memory block in the priority search
390 * tree based on a pointer value. If alias is 0, only values pointing to the
391 * beginning of the memory block are allowed. The kmemleak_lock must be held
392 * when calling this function.
393 */
394static struct kmemleak_object *lookup_object(unsigned long ptr, int alias)
395{
396 struct prio_tree_node *node;
397 struct prio_tree_iter iter;
398 struct kmemleak_object *object;
399
400 prio_tree_iter_init(&iter, &object_tree_root, ptr, ptr);
401 node = prio_tree_next(&iter);
402 if (node) {
403 object = prio_tree_entry(node, struct kmemleak_object,
404 tree_node);
405 if (!alias && object->pointer != ptr) {
406 kmemleak_warn("Found object by alias");
407 object = NULL;
408 }
409 } else
410 object = NULL;
411
412 return object;
413}
414
415/*
416 * Increment the object use_count. Return 1 if successful or 0 otherwise. Note
417 * that once an object's use_count reached 0, the RCU freeing was already
418 * registered and the object should no longer be used. This function must be
419 * called under the protection of rcu_read_lock().
420 */
421static int get_object(struct kmemleak_object *object)
422{
423 return atomic_inc_not_zero(&object->use_count);
424}
425
426/*
427 * RCU callback to free a kmemleak_object.
428 */
429static void free_object_rcu(struct rcu_head *rcu)
430{
431 struct hlist_node *elem, *tmp;
432 struct kmemleak_scan_area *area;
433 struct kmemleak_object *object =
434 container_of(rcu, struct kmemleak_object, rcu);
435
436 /*
437 * Once use_count is 0 (guaranteed by put_object), there is no other
438 * code accessing this object, hence no need for locking.
439 */
440 hlist_for_each_entry_safe(area, elem, tmp, &object->area_list, node) {
441 hlist_del(elem);
442 kmem_cache_free(scan_area_cache, area);
443 }
444 kmem_cache_free(object_cache, object);
445}
446
447/*
448 * Decrement the object use_count. Once the count is 0, free the object using
449 * an RCU callback. Since put_object() may be called via the kmemleak_free() ->
450 * delete_object() path, the delayed RCU freeing ensures that there is no
451 * recursive call to the kernel allocator. Lock-less RCU object_list traversal
452 * is also possible.
453 */
454static void put_object(struct kmemleak_object *object)
455{
456 if (!atomic_dec_and_test(&object->use_count))
457 return;
458
459 /* should only get here after delete_object was called */
460 WARN_ON(object->flags & OBJECT_ALLOCATED);
461
462 call_rcu(&object->rcu, free_object_rcu);
463}
464
465/*
466 * Look up an object in the prio search tree and increase its use_count.
467 */
468static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
469{
470 unsigned long flags;
471 struct kmemleak_object *object = NULL;
472
473 rcu_read_lock();
474 read_lock_irqsave(&kmemleak_lock, flags);
475 if (ptr >= min_addr && ptr < max_addr)
476 object = lookup_object(ptr, alias);
477 read_unlock_irqrestore(&kmemleak_lock, flags);
478
479 /* check whether the object is still available */
480 if (object && !get_object(object))
481 object = NULL;
482 rcu_read_unlock();
483
484 return object;
485}
486
487/*
488 * Save stack trace to the given array of MAX_TRACE size.
489 */
490static int __save_stack_trace(unsigned long *trace)
491{
492 struct stack_trace stack_trace;
493
494 stack_trace.max_entries = MAX_TRACE;
495 stack_trace.nr_entries = 0;
496 stack_trace.entries = trace;
497 stack_trace.skip = 2;
498 save_stack_trace(&stack_trace);
499
500 return stack_trace.nr_entries;
501}
502
503/*
504 * Create the metadata (struct kmemleak_object) corresponding to an allocated
505 * memory block and add it to the object_list and object_tree_root.
506 */
507static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
508 int min_count, gfp_t gfp)
509{
510 unsigned long flags;
511 struct kmemleak_object *object;
512 struct prio_tree_node *node;
513
514 object = kmem_cache_alloc(object_cache, gfp & GFP_KMEMLEAK_MASK);
515 if (!object) {
516 kmemleak_stop("Cannot allocate a kmemleak_object structure\n");
517 return NULL;
518 }
519
520 INIT_LIST_HEAD(&object->object_list);
521 INIT_LIST_HEAD(&object->gray_list);
522 INIT_HLIST_HEAD(&object->area_list);
523 spin_lock_init(&object->lock);
524 atomic_set(&object->use_count, 1);
525 object->flags = OBJECT_ALLOCATED | OBJECT_NEW;
526 object->pointer = ptr;
527 object->size = size;
528 object->min_count = min_count;
529 object->count = -1; /* no color initially */
530 object->jiffies = jiffies;
531
532 /* task information */
533 if (in_irq()) {
534 object->pid = 0;
535 strncpy(object->comm, "hardirq", sizeof(object->comm));
536 } else if (in_softirq()) {
537 object->pid = 0;
538 strncpy(object->comm, "softirq", sizeof(object->comm));
539 } else {
540 object->pid = current->pid;
541 /*
542 * There is a small chance of a race with set_task_comm(),
543 * however using get_task_comm() here may cause locking
544 * dependency issues with current->alloc_lock. In the worst
545 * case, the command line is not correct.
546 */
547 strncpy(object->comm, current->comm, sizeof(object->comm));
548 }
549
550 /* kernel backtrace */
551 object->trace_len = __save_stack_trace(object->trace);
552
553 INIT_PRIO_TREE_NODE(&object->tree_node);
554 object->tree_node.start = ptr;
555 object->tree_node.last = ptr + size - 1;
556
557 write_lock_irqsave(&kmemleak_lock, flags);
558
559 min_addr = min(min_addr, ptr);
560 max_addr = max(max_addr, ptr + size);
561 node = prio_tree_insert(&object_tree_root, &object->tree_node);
562 /*
563 * The code calling the kernel does not yet have the pointer to the
564 * memory block to be able to free it. However, we still hold the
565 * kmemleak_lock here in case parts of the kernel started freeing
566 * random memory blocks.
567 */
568 if (node != &object->tree_node) {
569 kmemleak_stop("Cannot insert 0x%lx into the object search tree "
570 "(already existing)\n", ptr);
571 object = lookup_object(ptr, 1);
572 spin_lock(&object->lock);
573 dump_object_info(object);
574 spin_unlock(&object->lock);
575
576 goto out;
577 }
578 list_add_tail_rcu(&object->object_list, &object_list);
579out:
580 write_unlock_irqrestore(&kmemleak_lock, flags);
581 return object;
582}
583
584/*
585 * Remove the metadata (struct kmemleak_object) for a memory block from the
586 * object_list and object_tree_root and decrement its use_count.
587 */
588static void __delete_object(struct kmemleak_object *object)
589{
590 unsigned long flags;
591
592 write_lock_irqsave(&kmemleak_lock, flags);
593 prio_tree_remove(&object_tree_root, &object->tree_node);
594 list_del_rcu(&object->object_list);
595 write_unlock_irqrestore(&kmemleak_lock, flags);
596
597 WARN_ON(!(object->flags & OBJECT_ALLOCATED));
598 WARN_ON(atomic_read(&object->use_count) < 2);
599
600 /*
601 * Locking here also ensures that the corresponding memory block
602 * cannot be freed when it is being scanned.
603 */
604 spin_lock_irqsave(&object->lock, flags);
605 object->flags &= ~OBJECT_ALLOCATED;
606 spin_unlock_irqrestore(&object->lock, flags);
607 put_object(object);
608}
609
610/*
611 * Look up the metadata (struct kmemleak_object) corresponding to ptr and
612 * delete it.
613 */
614static void delete_object_full(unsigned long ptr)
615{
616 struct kmemleak_object *object;
617
618 object = find_and_get_object(ptr, 0);
619 if (!object) {
620#ifdef DEBUG
621 kmemleak_warn("Freeing unknown object at 0x%08lx\n",
622 ptr);
623#endif
624 return;
625 }
626 __delete_object(object);
627 put_object(object);
628}
629
630/*
631 * Look up the metadata (struct kmemleak_object) corresponding to ptr and
632 * delete it. If the memory block is partially freed, the function may create
633 * additional metadata for the remaining parts of the block.
634 */
635static void delete_object_part(unsigned long ptr, size_t size)
636{
637 struct kmemleak_object *object;
638 unsigned long start, end;
639
640 object = find_and_get_object(ptr, 1);
641 if (!object) {
642#ifdef DEBUG
643 kmemleak_warn("Partially freeing unknown object at 0x%08lx "
644 "(size %zu)\n", ptr, size);
645#endif
646 return;
647 }
648 __delete_object(object);
649
650 /*
651 * Create one or two objects that may result from the memory block
652 * split. Note that partial freeing is only done by free_bootmem() and
653 * this happens before kmemleak_init() is called. The path below is
654 * only executed during early log recording in kmemleak_init(), so
655 * GFP_KERNEL is enough.
656 */
657 start = object->pointer;
658 end = object->pointer + object->size;
659 if (ptr > start)
660 create_object(start, ptr - start, object->min_count,
661 GFP_KERNEL);
662 if (ptr + size < end)
663 create_object(ptr + size, end - ptr - size, object->min_count,
664 GFP_KERNEL);
665
666 put_object(object);
667}
668
669static void __paint_it(struct kmemleak_object *object, int color)
670{
671 object->min_count = color;
672 if (color == KMEMLEAK_BLACK)
673 object->flags |= OBJECT_NO_SCAN;
674}
675
676static void paint_it(struct kmemleak_object *object, int color)
677{
678 unsigned long flags;
679
680 spin_lock_irqsave(&object->lock, flags);
681 __paint_it(object, color);
682 spin_unlock_irqrestore(&object->lock, flags);
683}
684
685static void paint_ptr(unsigned long ptr, int color)
686{
687 struct kmemleak_object *object;
688
689 object = find_and_get_object(ptr, 0);
690 if (!object) {
691 kmemleak_warn("Trying to color unknown object "
692 "at 0x%08lx as %s\n", ptr,
693 (color == KMEMLEAK_GREY) ? "Grey" :
694 (color == KMEMLEAK_BLACK) ? "Black" : "Unknown");
695 return;
696 }
697 paint_it(object, color);
698 put_object(object);
699}
700
701/*
702 * Mark an object permanently as gray-colored so that it can no longer be
703 * reported as a leak. This is used in general to mark a false positive.
704 */
705static void make_gray_object(unsigned long ptr)
706{
707 paint_ptr(ptr, KMEMLEAK_GREY);
708}
709
710/*
711 * Mark the object as black-colored so that it is ignored from scans and
712 * reporting.
713 */
714static void make_black_object(unsigned long ptr)
715{
716 paint_ptr(ptr, KMEMLEAK_BLACK);
717}
718
719/*
720 * Add a scanning area to the object. If at least one such area is added,
721 * kmemleak will only scan these ranges rather than the whole memory block.
722 */
723static void add_scan_area(unsigned long ptr, unsigned long offset,
724 size_t length, gfp_t gfp)
725{
726 unsigned long flags;
727 struct kmemleak_object *object;
728 struct kmemleak_scan_area *area;
729
730 object = find_and_get_object(ptr, 0);
731 if (!object) {
732 kmemleak_warn("Adding scan area to unknown object at 0x%08lx\n",
733 ptr);
734 return;
735 }
736
737 area = kmem_cache_alloc(scan_area_cache, gfp & GFP_KMEMLEAK_MASK);
738 if (!area) {
739 kmemleak_warn("Cannot allocate a scan area\n");
740 goto out;
741 }
742
743 spin_lock_irqsave(&object->lock, flags);
744 if (offset + length > object->size) {
745 kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr);
746 dump_object_info(object);
747 kmem_cache_free(scan_area_cache, area);
748 goto out_unlock;
749 }
750
751 INIT_HLIST_NODE(&area->node);
752 area->offset = offset;
753 area->length = length;
754
755 hlist_add_head(&area->node, &object->area_list);
756out_unlock:
757 spin_unlock_irqrestore(&object->lock, flags);
758out:
759 put_object(object);
760}
761
762/*
763 * Set the OBJECT_NO_SCAN flag for the object corresponding to the given
764 * pointer. Such an object will not be scanned by kmemleak but references to it
765 * are searched.
766 */
767static void object_no_scan(unsigned long ptr)
768{
769 unsigned long flags;
770 struct kmemleak_object *object;
771
772 object = find_and_get_object(ptr, 0);
773 if (!object) {
774 kmemleak_warn("Not scanning unknown object at 0x%08lx\n", ptr);
775 return;
776 }
777
778 spin_lock_irqsave(&object->lock, flags);
779 object->flags |= OBJECT_NO_SCAN;
780 spin_unlock_irqrestore(&object->lock, flags);
781 put_object(object);
782}
783
784/*
785 * Log an early kmemleak_* call to the early_log buffer. These calls will be
786 * processed later once kmemleak is fully initialized.
787 */
788static void __init log_early(int op_type, const void *ptr, size_t size,
789 int min_count, unsigned long offset, size_t length)
790{
791 unsigned long flags;
792 struct early_log *log;
793
794 if (crt_early_log >= ARRAY_SIZE(early_log)) {
795 pr_warning("Early log buffer exceeded, "
796 "please increase DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n");
797 kmemleak_disable();
798 return;
799 }
800
801 /*
802 * There is no need for locking since the kernel is still in UP mode
803 * at this stage. Disabling the IRQs is enough.
804 */
805 local_irq_save(flags);
806 log = &early_log[crt_early_log];
807 log->op_type = op_type;
808 log->ptr = ptr;
809 log->size = size;
810 log->min_count = min_count;
811 log->offset = offset;
812 log->length = length;
813 if (op_type == KMEMLEAK_ALLOC)
814 log->trace_len = __save_stack_trace(log->trace);
815 crt_early_log++;
816 local_irq_restore(flags);
817}
818
819/*
820 * Log an early allocated block and populate the stack trace.
821 */
822static void early_alloc(struct early_log *log)
823{
824 struct kmemleak_object *object;
825 unsigned long flags;
826 int i;
827
828 if (!atomic_read(&kmemleak_enabled) || !log->ptr || IS_ERR(log->ptr))
829 return;
830
831 /*
832 * RCU locking needed to ensure object is not freed via put_object().
833 */
834 rcu_read_lock();
835 object = create_object((unsigned long)log->ptr, log->size,
836 log->min_count, GFP_ATOMIC);
837 if (!object)
838 goto out;
839 spin_lock_irqsave(&object->lock, flags);
840 for (i = 0; i < log->trace_len; i++)
841 object->trace[i] = log->trace[i];
842 object->trace_len = log->trace_len;
843 spin_unlock_irqrestore(&object->lock, flags);
844out:
845 rcu_read_unlock();
846}
847
848/*
849 * Memory allocation function callback. This function is called from the
850 * kernel allocators when a new block is allocated (kmem_cache_alloc, kmalloc,
851 * vmalloc etc.).
852 */
853void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count,
854 gfp_t gfp)
855{
856 pr_debug("%s(0x%p, %zu, %d)\n", __func__, ptr, size, min_count);
857
858 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
859 create_object((unsigned long)ptr, size, min_count, gfp);
860 else if (atomic_read(&kmemleak_early_log))
861 log_early(KMEMLEAK_ALLOC, ptr, size, min_count, 0, 0);
862}
863EXPORT_SYMBOL_GPL(kmemleak_alloc);
864
865/*
866 * Memory freeing function callback. This function is called from the kernel
867 * allocators when a block is freed (kmem_cache_free, kfree, vfree etc.).
868 */
869void __ref kmemleak_free(const void *ptr)
870{
871 pr_debug("%s(0x%p)\n", __func__, ptr);
872
873 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
874 delete_object_full((unsigned long)ptr);
875 else if (atomic_read(&kmemleak_early_log))
876 log_early(KMEMLEAK_FREE, ptr, 0, 0, 0, 0);
877}
878EXPORT_SYMBOL_GPL(kmemleak_free);
879
880/*
881 * Partial memory freeing function callback. This function is usually called
882 * from the bootmem allocator when (part of) a memory block is freed.
883 */
884void __ref kmemleak_free_part(const void *ptr, size_t size)
885{
886 pr_debug("%s(0x%p)\n", __func__, ptr);
887
888 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
889 delete_object_part((unsigned long)ptr, size);
890 else if (atomic_read(&kmemleak_early_log))
891 log_early(KMEMLEAK_FREE_PART, ptr, size, 0, 0, 0);
892}
893EXPORT_SYMBOL_GPL(kmemleak_free_part);
894
895/*
896 * Mark an already allocated memory block as a false positive. This will cause
897 * the block to no longer be reported as a leak and always be scanned.
898 */
899void __ref kmemleak_not_leak(const void *ptr)
900{
901 pr_debug("%s(0x%p)\n", __func__, ptr);
902
903 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
904 make_gray_object((unsigned long)ptr);
905 else if (atomic_read(&kmemleak_early_log))
906 log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0, 0, 0);
907}
908EXPORT_SYMBOL(kmemleak_not_leak);
909
910/*
911 * Ignore a memory block. This is usually done when it is known that the
912 * corresponding block is not a leak and does not contain any references to
913 * other allocated memory blocks.
914 */
915void __ref kmemleak_ignore(const void *ptr)
916{
917 pr_debug("%s(0x%p)\n", __func__, ptr);
918
919 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
920 make_black_object((unsigned long)ptr);
921 else if (atomic_read(&kmemleak_early_log))
922 log_early(KMEMLEAK_IGNORE, ptr, 0, 0, 0, 0);
923}
924EXPORT_SYMBOL(kmemleak_ignore);
925
926/*
927 * Limit the range to be scanned in an allocated memory block.
928 */
929void __ref kmemleak_scan_area(const void *ptr, unsigned long offset,
930 size_t length, gfp_t gfp)
931{
932 pr_debug("%s(0x%p)\n", __func__, ptr);
933
934 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
935 add_scan_area((unsigned long)ptr, offset, length, gfp);
936 else if (atomic_read(&kmemleak_early_log))
937 log_early(KMEMLEAK_SCAN_AREA, ptr, 0, 0, offset, length);
938}
939EXPORT_SYMBOL(kmemleak_scan_area);
940
941/*
942 * Inform kmemleak not to scan the given memory block.
943 */
944void __ref kmemleak_no_scan(const void *ptr)
945{
946 pr_debug("%s(0x%p)\n", __func__, ptr);
947
948 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
949 object_no_scan((unsigned long)ptr);
950 else if (atomic_read(&kmemleak_early_log))
951 log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0, 0, 0);
952}
953EXPORT_SYMBOL(kmemleak_no_scan);
954
955/*
956 * Memory scanning is a long process and it needs to be interruptible. This
957 * function checks whether such an interrupt condition has occurred.
958 */
959static int scan_should_stop(void)
960{
961 if (!atomic_read(&kmemleak_enabled))
962 return 1;
963
964 /*
965 * This function may be called from either process or kthread context,
966 * hence the need to check for both stop conditions.
967 */
968 if (current->mm)
969 return signal_pending(current);
970 else
971 return kthread_should_stop();
972
973 return 0;
974}
975
976/*
977 * Scan a memory block (exclusive range) for valid pointers and add those
978 * found to the gray list.
979 */
980static void scan_block(void *_start, void *_end,
981 struct kmemleak_object *scanned, int allow_resched)
982{
983 unsigned long *ptr;
984 unsigned long *start = PTR_ALIGN(_start, BYTES_PER_POINTER);
985 unsigned long *end = _end - (BYTES_PER_POINTER - 1);
986
987 for (ptr = start; ptr < end; ptr++) {
988 struct kmemleak_object *object;
989 unsigned long flags;
990 unsigned long pointer;
991
992 if (allow_resched)
993 cond_resched();
994 if (scan_should_stop())
995 break;
996
997 /* don't scan uninitialized memory */
998 if (!kmemcheck_is_obj_initialized((unsigned long)ptr,
999 BYTES_PER_POINTER))
1000 continue;
1001
1002 pointer = *ptr;
1003
1004 object = find_and_get_object(pointer, 1);
1005 if (!object)
1006 continue;
1007 if (object == scanned) {
1008 /* self referenced, ignore */
1009 put_object(object);
1010 continue;
1011 }
1012
1013 /*
1014 * Avoid the lockdep recursive warning on object->lock being
1015 * previously acquired in scan_object(). These locks are
1016 * enclosed by scan_mutex.
1017 */
1018 spin_lock_irqsave_nested(&object->lock, flags,
1019 SINGLE_DEPTH_NESTING);
1020 if (!color_white(object)) {
1021 /* non-orphan, ignored or new */
1022 spin_unlock_irqrestore(&object->lock, flags);
1023 put_object(object);
1024 continue;
1025 }
1026
1027 /*
1028 * Increase the object's reference count (number of pointers
1029 * to the memory block). If this count reaches the required
1030 * minimum, the object's color will become gray and it will be
1031 * added to the gray_list.
1032 */
1033 object->count++;
1034 if (color_gray(object))
1035 list_add_tail(&object->gray_list, &gray_list);
1036 else
1037 put_object(object);
1038 spin_unlock_irqrestore(&object->lock, flags);
1039 }
1040}
1041
1042/*
1043 * Scan a memory block corresponding to a kmemleak_object. A condition is
1044 * that object->use_count >= 1.
1045 */
1046static void scan_object(struct kmemleak_object *object)
1047{
1048 struct kmemleak_scan_area *area;
1049 struct hlist_node *elem;
1050 unsigned long flags;
1051
1052 /*
1053	 * Once the object->lock is acquired, the corresponding memory block
1054	 * cannot be freed (the same lock is acquired in delete_object).
1055 */
1056 spin_lock_irqsave(&object->lock, flags);
1057 if (object->flags & OBJECT_NO_SCAN)
1058 goto out;
1059 if (!(object->flags & OBJECT_ALLOCATED))
1060 /* already freed object */
1061 goto out;
1062 if (hlist_empty(&object->area_list)) {
1063 void *start = (void *)object->pointer;
1064 void *end = (void *)(object->pointer + object->size);
1065
1066 while (start < end && (object->flags & OBJECT_ALLOCATED) &&
1067 !(object->flags & OBJECT_NO_SCAN)) {
1068 scan_block(start, min(start + MAX_SCAN_SIZE, end),
1069 object, 0);
1070 start += MAX_SCAN_SIZE;
1071
1072 spin_unlock_irqrestore(&object->lock, flags);
1073 cond_resched();
1074 spin_lock_irqsave(&object->lock, flags);
1075 }
1076 } else
1077 hlist_for_each_entry(area, elem, &object->area_list, node)
1078 scan_block((void *)(object->pointer + area->offset),
1079 (void *)(object->pointer + area->offset
1080 + area->length), object, 0);
1081out:
1082 spin_unlock_irqrestore(&object->lock, flags);
1083}
1084
1085/*
1086 * Scan data sections and all the referenced memory blocks allocated via the
1087 * kernel's standard allocators. This function must be called with the
1088 * scan_mutex held.
1089 */
1090static void kmemleak_scan(void)
1091{
1092 unsigned long flags;
1093 struct kmemleak_object *object, *tmp;
1094 int i;
1095 int new_leaks = 0;
1096 int gray_list_pass = 0;
1097
1098 jiffies_last_scan = jiffies;
1099
1100 /* prepare the kmemleak_object's */
1101 rcu_read_lock();
1102 list_for_each_entry_rcu(object, &object_list, object_list) {
1103 spin_lock_irqsave(&object->lock, flags);
1104#ifdef DEBUG
1105 /*
1106 * With a few exceptions there should be a maximum of
1107 * 1 reference to any object at this point.
1108 */
1109 if (atomic_read(&object->use_count) > 1) {
1110 pr_debug("object->use_count = %d\n",
1111 atomic_read(&object->use_count));
1112 dump_object_info(object);
1113 }
1114#endif
1115 /* reset the reference count (whiten the object) */
1116 object->count = 0;
1117 object->flags &= ~OBJECT_NEW;
1118 if (color_gray(object) && get_object(object))
1119 list_add_tail(&object->gray_list, &gray_list);
1120
1121 spin_unlock_irqrestore(&object->lock, flags);
1122 }
1123 rcu_read_unlock();
1124
1125 /* data/bss scanning */
1126 scan_block(_sdata, _edata, NULL, 1);
1127 scan_block(__bss_start, __bss_stop, NULL, 1);
1128
1129#ifdef CONFIG_SMP
1130 /* per-cpu sections scanning */
1131 for_each_possible_cpu(i)
1132 scan_block(__per_cpu_start + per_cpu_offset(i),
1133 __per_cpu_end + per_cpu_offset(i), NULL, 1);
1134#endif
1135
1136 /*
1137 * Struct page scanning for each node. The code below is not yet safe
1138 * with MEMORY_HOTPLUG.
1139 */
1140 for_each_online_node(i) {
1141 pg_data_t *pgdat = NODE_DATA(i);
1142 unsigned long start_pfn = pgdat->node_start_pfn;
1143 unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
1144 unsigned long pfn;
1145
1146 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
1147 struct page *page;
1148
1149 if (!pfn_valid(pfn))
1150 continue;
1151 page = pfn_to_page(pfn);
1152 /* only scan if page is in use */
1153 if (page_count(page) == 0)
1154 continue;
1155 scan_block(page, page + 1, NULL, 1);
1156 }
1157 }
1158
1159 /*
1160 * Scanning the task stacks (may introduce false negatives).
1161 */
1162 if (kmemleak_stack_scan) {
1163 struct task_struct *p, *g;
1164
1165 read_lock(&tasklist_lock);
1166 do_each_thread(g, p) {
1167 scan_block(task_stack_page(p), task_stack_page(p) +
1168 THREAD_SIZE, NULL, 0);
1169 } while_each_thread(g, p);
1170 read_unlock(&tasklist_lock);
1171 }
1172
1173 /*
1174 * Scan the objects already referenced from the sections scanned
1175 * above. More objects will be referenced and, if there are no memory
1176 * leaks, all the objects will be scanned. The list traversal is safe
1177 * for both tail additions and removals from inside the loop. The
1178 * kmemleak objects cannot be freed from outside the loop because their
1179 * use_count was increased.
1180 */
1181repeat:
1182 object = list_entry(gray_list.next, typeof(*object), gray_list);
1183 while (&object->gray_list != &gray_list) {
1184 cond_resched();
1185
1186 /* may add new objects to the list */
1187 if (!scan_should_stop())
1188 scan_object(object);
1189
1190 tmp = list_entry(object->gray_list.next, typeof(*object),
1191 gray_list);
1192
1193 /* remove the object from the list and release it */
1194 list_del(&object->gray_list);
1195 put_object(object);
1196
1197 object = tmp;
1198 }
1199
1200 if (scan_should_stop() || ++gray_list_pass >= GRAY_LIST_PASSES)
1201 goto scan_end;
1202
1203 /*
1204 * Check for new objects allocated during this scanning and add them
1205 * to the gray list.
1206 */
1207 rcu_read_lock();
1208 list_for_each_entry_rcu(object, &object_list, object_list) {
1209 spin_lock_irqsave(&object->lock, flags);
1210 if ((object->flags & OBJECT_NEW) && !color_black(object) &&
1211 get_object(object)) {
1212 object->flags &= ~OBJECT_NEW;
1213 list_add_tail(&object->gray_list, &gray_list);
1214 }
1215 spin_unlock_irqrestore(&object->lock, flags);
1216 }
1217 rcu_read_unlock();
1218
1219 if (!list_empty(&gray_list))
1220 goto repeat;
1221
1222scan_end:
1223 WARN_ON(!list_empty(&gray_list));
1224
1225 /*
1226 * If scanning was stopped or new objects were being allocated at a
1227 * higher rate than gray list scanning, do not report any new
1228 * unreferenced objects.
1229 */
1230 if (scan_should_stop() || gray_list_pass >= GRAY_LIST_PASSES)
1231 return;
1232
1233 /*
1234 * Scanning result reporting.
1235 */
1236 rcu_read_lock();
1237 list_for_each_entry_rcu(object, &object_list, object_list) {
1238 spin_lock_irqsave(&object->lock, flags);
1239 if (unreferenced_object(object) &&
1240 !(object->flags & OBJECT_REPORTED)) {
1241 object->flags |= OBJECT_REPORTED;
1242 new_leaks++;
1243 }
1244 spin_unlock_irqrestore(&object->lock, flags);
1245 }
1246 rcu_read_unlock();
1247
1248 if (new_leaks)
1249 pr_info("%d new suspected memory leaks (see "
1250 "/sys/kernel/debug/kmemleak)\n", new_leaks);
1251
1252}
1253
1254/*
1255 * Thread function performing automatic memory scanning. Unreferenced objects
1256 * at the end of a memory scan are reported but only the first time.
1257 */
1258static int kmemleak_scan_thread(void *arg)
1259{
1260 static int first_run = 1;
1261
1262 pr_info("Automatic memory scanning thread started\n");
1263 set_user_nice(current, 10);
1264
1265 /*
1266 * Wait before the first scan to allow the system to fully initialize.
1267 */
1268 if (first_run) {
1269 first_run = 0;
1270 ssleep(SECS_FIRST_SCAN);
1271 }
1272
1273 while (!kthread_should_stop()) {
1274 signed long timeout = jiffies_scan_wait;
1275
1276 mutex_lock(&scan_mutex);
1277 kmemleak_scan();
1278 mutex_unlock(&scan_mutex);
1279
1280 /* wait before the next scan */
1281 while (timeout && !kthread_should_stop())
1282 timeout = schedule_timeout_interruptible(timeout);
1283 }
1284
1285 pr_info("Automatic memory scanning thread ended\n");
1286
1287 return 0;
1288}
1289
1290/*
1291 * Start the automatic memory scanning thread. This function must be called
1292 * with the scan_mutex held.
1293 */
1294static void start_scan_thread(void)
1295{
1296 if (scan_thread)
1297 return;
1298 scan_thread = kthread_run(kmemleak_scan_thread, NULL, "kmemleak");
1299 if (IS_ERR(scan_thread)) {
1300 pr_warning("Failed to create the scan thread\n");
1301 scan_thread = NULL;
1302 }
1303}
1304
1305/*
1306 * Stop the automatic memory scanning thread. This function must be called
1307 * with the scan_mutex held.
1308 */
1309static void stop_scan_thread(void)
1310{
1311 if (scan_thread) {
1312 kthread_stop(scan_thread);
1313 scan_thread = NULL;
1314 }
1315}
1316
1317/*
1318 * Iterate over the object_list and return the first valid object at or after
1319 * the required position with its use_count incremented. The function triggers
1320 * a memory scan when the pos argument points to the first position.
1321 */
1322static void *kmemleak_seq_start(struct seq_file *seq, loff_t *pos)
1323{
1324 struct kmemleak_object *object;
1325 loff_t n = *pos;
1326 int err;
1327
1328 err = mutex_lock_interruptible(&scan_mutex);
1329 if (err < 0)
1330 return ERR_PTR(err);
1331
1332 rcu_read_lock();
1333 list_for_each_entry_rcu(object, &object_list, object_list) {
1334 if (n-- > 0)
1335 continue;
1336 if (get_object(object))
1337 goto out;
1338 }
1339 object = NULL;
1340out:
1341 return object;
1342}
1343
1344/*
1345 * Return the next object in the object_list. The function decrements the
1346 * use_count of the previous object and increases that of the next one.
1347 */
1348static void *kmemleak_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1349{
1350 struct kmemleak_object *prev_obj = v;
1351 struct kmemleak_object *next_obj = NULL;
1352 struct list_head *n = &prev_obj->object_list;
1353
1354 ++(*pos);
1355
1356 list_for_each_continue_rcu(n, &object_list) {
1357 next_obj = list_entry(n, struct kmemleak_object, object_list);
1358 if (get_object(next_obj))
1359 break;
1360 }
1361
1362 put_object(prev_obj);
1363 return next_obj;
1364}
1365
1366/*
1367 * Decrement the use_count of the last object required, if any.
1368 */
1369static void kmemleak_seq_stop(struct seq_file *seq, void *v)
1370{
1371 if (!IS_ERR(v)) {
1372 /*
1373 * kmemleak_seq_start may return ERR_PTR if the scan_mutex
1374 * waiting was interrupted, so only release it if !IS_ERR.
1375 */
1376 rcu_read_unlock();
1377 mutex_unlock(&scan_mutex);
1378 if (v)
1379 put_object(v);
1380 }
1381}
1382
1383/*
1384 * Print the information for an unreferenced object to the seq file.
1385 */
1386static int kmemleak_seq_show(struct seq_file *seq, void *v)
1387{
1388 struct kmemleak_object *object = v;
1389 unsigned long flags;
1390
1391 spin_lock_irqsave(&object->lock, flags);
1392 if ((object->flags & OBJECT_REPORTED) && unreferenced_object(object))
1393 print_unreferenced(seq, object);
1394 spin_unlock_irqrestore(&object->lock, flags);
1395 return 0;
1396}
1397
1398static const struct seq_operations kmemleak_seq_ops = {
1399 .start = kmemleak_seq_start,
1400 .next = kmemleak_seq_next,
1401 .stop = kmemleak_seq_stop,
1402 .show = kmemleak_seq_show,
1403};
1404
1405static int kmemleak_open(struct inode *inode, struct file *file)
1406{
1407 if (!atomic_read(&kmemleak_enabled))
1408 return -EBUSY;
1409
1410 return seq_open(file, &kmemleak_seq_ops);
1411}
1412
1413static int kmemleak_release(struct inode *inode, struct file *file)
1414{
1415 return seq_release(inode, file);
1416}
1417
1418static int dump_str_object_info(const char *str)
1419{
1420 unsigned long flags;
1421 struct kmemleak_object *object;
1422 unsigned long addr;
1423
1424	addr = simple_strtoul(str, NULL, 0);
1425 object = find_and_get_object(addr, 0);
1426 if (!object) {
1427 pr_info("Unknown object at 0x%08lx\n", addr);
1428 return -EINVAL;
1429 }
1430
1431 spin_lock_irqsave(&object->lock, flags);
1432 dump_object_info(object);
1433 spin_unlock_irqrestore(&object->lock, flags);
1434
1435 put_object(object);
1436 return 0;
1437}
1438
1439/*
1440 * We use grey instead of black to ensure we can do future scans on the same
1441 * objects. If we did not do future scans, these black objects could
1442 * potentially contain references to newly allocated objects in the future and
1443 * we'd end up with false positives.
1444 */
1445static void kmemleak_clear(void)
1446{
1447 struct kmemleak_object *object;
1448 unsigned long flags;
1449
1450 rcu_read_lock();
1451 list_for_each_entry_rcu(object, &object_list, object_list) {
1452 spin_lock_irqsave(&object->lock, flags);
1453 if ((object->flags & OBJECT_REPORTED) &&
1454 unreferenced_object(object))
1455 __paint_it(object, KMEMLEAK_GREY);
1456 spin_unlock_irqrestore(&object->lock, flags);
1457 }
1458 rcu_read_unlock();
1459}
1460
1461/*
1462 * File write operation to configure kmemleak at run-time. The following
1463 * commands can be written to the /sys/kernel/debug/kmemleak file:
1464 * off - disable kmemleak (irreversible)
1465 * stack=on - enable the task stacks scanning
1466 * stack=off	- disable the task stacks scanning
1467 * scan=on - start the automatic memory scanning thread
1468 * scan=off - stop the automatic memory scanning thread
1469 * scan=... - set the automatic memory scanning period in seconds (0 to
1470 * disable it)
1471 * scan - trigger a memory scan
1472 * clear	- mark all currently reported unreferenced kmemleak objects as
1473 *		  grey so that they are no longer printed
1474 * dump=... - dump information about the object found at the given address
1475 */
1476static ssize_t kmemleak_write(struct file *file, const char __user *user_buf,
1477 size_t size, loff_t *ppos)
1478{
1479 char buf[64];
1480 int buf_size;
1481 int ret;
1482
1483 buf_size = min(size, (sizeof(buf) - 1));
1484 if (strncpy_from_user(buf, user_buf, buf_size) < 0)
1485 return -EFAULT;
1486 buf[buf_size] = 0;
1487
1488 ret = mutex_lock_interruptible(&scan_mutex);
1489 if (ret < 0)
1490 return ret;
1491
1492 if (strncmp(buf, "off", 3) == 0)
1493 kmemleak_disable();
1494 else if (strncmp(buf, "stack=on", 8) == 0)
1495 kmemleak_stack_scan = 1;
1496 else if (strncmp(buf, "stack=off", 9) == 0)
1497 kmemleak_stack_scan = 0;
1498 else if (strncmp(buf, "scan=on", 7) == 0)
1499 start_scan_thread();
1500 else if (strncmp(buf, "scan=off", 8) == 0)
1501 stop_scan_thread();
1502 else if (strncmp(buf, "scan=", 5) == 0) {
1503 unsigned long secs;
1504
1505 ret = strict_strtoul(buf + 5, 0, &secs);
1506 if (ret < 0)
1507 goto out;
1508 stop_scan_thread();
1509 if (secs) {
1510 jiffies_scan_wait = msecs_to_jiffies(secs * 1000);
1511 start_scan_thread();
1512 }
1513 } else if (strncmp(buf, "scan", 4) == 0)
1514 kmemleak_scan();
1515 else if (strncmp(buf, "clear", 5) == 0)
1516 kmemleak_clear();
1517 else if (strncmp(buf, "dump=", 5) == 0)
1518 ret = dump_str_object_info(buf + 5);
1519 else
1520 ret = -EINVAL;
1521
1522out:
1523 mutex_unlock(&scan_mutex);
1524 if (ret < 0)
1525 return ret;
1526
1527 /* ignore the rest of the buffer, only one command at a time */
1528 *ppos += size;
1529 return size;
1530}
1531
1532static const struct file_operations kmemleak_fops = {
1533 .owner = THIS_MODULE,
1534 .open = kmemleak_open,
1535 .read = seq_read,
1536 .write = kmemleak_write,
1537 .llseek = seq_lseek,
1538 .release = kmemleak_release,
1539};
1540
1541/*
1542 * Perform the freeing of the kmemleak internal objects after waiting for any
1543 * current memory scan to complete.
1544 */
1545static void kmemleak_do_cleanup(struct work_struct *work)
1546{
1547 struct kmemleak_object *object;
1548
1549 mutex_lock(&scan_mutex);
1550 stop_scan_thread();
1551
1552 rcu_read_lock();
1553 list_for_each_entry_rcu(object, &object_list, object_list)
1554 delete_object_full(object->pointer);
1555 rcu_read_unlock();
1556 mutex_unlock(&scan_mutex);
1557}
1558
1559static DECLARE_WORK(cleanup_work, kmemleak_do_cleanup);
1560
1561/*
1562 * Disable kmemleak. No memory allocation/freeing will be traced once this
1563 * function is called. Disabling kmemleak is an irreversible operation.
1564 */
1565static void kmemleak_disable(void)
1566{
1567 /* atomically check whether it was already invoked */
1568 if (atomic_cmpxchg(&kmemleak_error, 0, 1))
1569 return;
1570
1571 /* stop any memory operation tracing */
1572 atomic_set(&kmemleak_early_log, 0);
1573 atomic_set(&kmemleak_enabled, 0);
1574
1575 /* check whether it is too early for a kernel thread */
1576 if (atomic_read(&kmemleak_initialized))
1577 schedule_work(&cleanup_work);
1578
1579 pr_info("Kernel memory leak detector disabled\n");
1580}
1581
1582/*
1583 * Allow boot-time kmemleak disabling (enabled by default).
1584 */
1585static int kmemleak_boot_config(char *str)
1586{
1587 if (!str)
1588 return -EINVAL;
1589 if (strcmp(str, "off") == 0)
1590 kmemleak_disable();
1591 else if (strcmp(str, "on") != 0)
1592 return -EINVAL;
1593 return 0;
1594}
1595early_param("kmemleak", kmemleak_boot_config);
1596
1597/*
1598 * Kmemleak initialization.
1599 */
1600void __init kmemleak_init(void)
1601{
1602 int i;
1603 unsigned long flags;
1604
1605 jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE);
1606 jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000);
1607
1608 object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE);
1609 scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE);
1610 INIT_PRIO_TREE_ROOT(&object_tree_root);
1611
1612 /* the kernel is still in UP mode, so disabling the IRQs is enough */
1613 local_irq_save(flags);
1614 if (!atomic_read(&kmemleak_error)) {
1615 atomic_set(&kmemleak_enabled, 1);
1616 atomic_set(&kmemleak_early_log, 0);
1617 }
1618 local_irq_restore(flags);
1619
1620 /*
1621 * This is the point where tracking allocations is safe. Automatic
1622 * scanning is started during the late initcall. Add the early logged
1623 * callbacks to the kmemleak infrastructure.
1624 */
1625 for (i = 0; i < crt_early_log; i++) {
1626 struct early_log *log = &early_log[i];
1627
1628 switch (log->op_type) {
1629 case KMEMLEAK_ALLOC:
1630 early_alloc(log);
1631 break;
1632 case KMEMLEAK_FREE:
1633 kmemleak_free(log->ptr);
1634 break;
1635 case KMEMLEAK_FREE_PART:
1636 kmemleak_free_part(log->ptr, log->size);
1637 break;
1638 case KMEMLEAK_NOT_LEAK:
1639 kmemleak_not_leak(log->ptr);
1640 break;
1641 case KMEMLEAK_IGNORE:
1642 kmemleak_ignore(log->ptr);
1643 break;
1644 case KMEMLEAK_SCAN_AREA:
1645 kmemleak_scan_area(log->ptr, log->offset, log->length,
1646 GFP_KERNEL);
1647 break;
1648 case KMEMLEAK_NO_SCAN:
1649 kmemleak_no_scan(log->ptr);
1650 break;
1651 default:
1652 WARN_ON(1);
1653 }
1654 }
1655}
1656
1657/*
1658 * Late initialization function.
1659 */
1660static int __init kmemleak_late_init(void)
1661{
1662 struct dentry *dentry;
1663
1664 atomic_set(&kmemleak_initialized, 1);
1665
1666 if (atomic_read(&kmemleak_error)) {
1667 /*
1668		 * Some error occurred and kmemleak was disabled. There is a
1669 * small chance that kmemleak_disable() was called immediately
1670 * after setting kmemleak_initialized and we may end up with
1671 * two clean-up threads but serialized by scan_mutex.
1672 */
1673 schedule_work(&cleanup_work);
1674 return -ENOMEM;
1675 }
1676
1677 dentry = debugfs_create_file("kmemleak", S_IRUGO, NULL, NULL,
1678 &kmemleak_fops);
1679 if (!dentry)
1680 pr_warning("Failed to create the debugfs kmemleak file\n");
1681 mutex_lock(&scan_mutex);
1682 start_scan_thread();
1683 mutex_unlock(&scan_mutex);
1684
1685 pr_info("Kernel memory leak detector initialized\n");
1686
1687 return 0;
1688}
1689late_initcall(kmemleak_late_init);
diff --git a/mm/ksm.c b/mm/ksm.c
new file mode 100644
index 000000000000..5575f8628fef
--- /dev/null
+++ b/mm/ksm.c
@@ -0,0 +1,1710 @@
1/*
2 * Memory merging support.
3 *
4 * This code enables dynamic sharing of identical pages found in different
5 * memory areas, even if they are not shared by fork()
6 *
7 * Copyright (C) 2008-2009 Red Hat, Inc.
8 * Authors:
9 * Izik Eidus
10 * Andrea Arcangeli
11 * Chris Wright
12 * Hugh Dickins
13 *
14 * This work is licensed under the terms of the GNU GPL, version 2.
15 */
16
17#include <linux/errno.h>
18#include <linux/mm.h>
19#include <linux/fs.h>
20#include <linux/mman.h>
21#include <linux/sched.h>
22#include <linux/rwsem.h>
23#include <linux/pagemap.h>
24#include <linux/rmap.h>
25#include <linux/spinlock.h>
26#include <linux/jhash.h>
27#include <linux/delay.h>
28#include <linux/kthread.h>
29#include <linux/wait.h>
30#include <linux/slab.h>
31#include <linux/rbtree.h>
32#include <linux/mmu_notifier.h>
33#include <linux/swap.h>
34#include <linux/ksm.h>
35
36#include <asm/tlbflush.h>
37
38/*
39 * A few notes about the KSM scanning process,
40 * to make it easier to understand the data structures below:
41 *
42 * In order to reduce excessive scanning, KSM sorts the memory pages by their
43 * contents into a data structure that holds pointers to the pages' locations.
44 *
45 * Since the contents of the pages may change at any moment, KSM cannot just
46 * insert the pages into a normal sorted tree and expect it to find anything.
47 * Therefore KSM uses two data structures - the stable and the unstable tree.
48 *
49 * The stable tree holds pointers to all the merged pages (ksm pages), sorted
50 * by their contents. Because each such page is write-protected, searching on
51 * this tree is fully assured to be working (except when pages are unmapped),
52 * and therefore this tree is called the stable tree.
53 *
54 * In addition to the stable tree, KSM uses a second data structure called the
55 * unstable tree: this tree holds pointers to pages which have been found to
56 * be "unchanged for a period of time". The unstable tree sorts these pages
57 * by their contents, but since they are not write-protected, KSM cannot rely
58 * upon the unstable tree to work correctly - the unstable tree is liable to
59 * be corrupted as its contents are modified, and so it is called unstable.
60 *
61 * KSM solves this problem by several techniques:
62 *
63 * 1) The unstable tree is flushed every time KSM completes scanning all
64 * memory areas, and then the tree is rebuilt again from the beginning.
 65 * 2) KSM will only insert into the unstable tree pages whose hash value
66 * has not changed since the previous scan of all memory areas.
 67 * 3) The unstable tree is a red-black tree - so its balancing is based on the
68 * colors of the nodes and not on their contents, assuring that even when
69 * the tree gets "corrupted" it won't get out of balance, so scanning time
70 * remains the same (also, searching and inserting nodes in an rbtree uses
71 * the same algorithm, so we have no overhead when we flush and rebuild).
72 * 4) KSM never flushes the stable tree, which means that even if it were to
73 * take 10 attempts to find a page in the unstable tree, once it is found,
74 * it is secured in the stable tree. (When we scan a new page, we first
75 * compare it against the stable tree, and then against the unstable tree.)
76 */
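As a side note on points (3) and (4) above (not from this patch): the claim that searching and inserting share the same walk is easiest to see on a plain binary search tree keyed by page contents. Below is a simplified userspace analogue of the search-or-insert walk used further down; the real code uses the kernel rbtree and struct rmap_item, and rebalances after insertion.

/* Hedged sketch: one tree walk serves both lookup and insertion. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_SZ 4096

struct node {
	unsigned char page[PAGE_SZ];
	struct node *left, *right;
};

/*
 * Walk the tree once: return an existing node with identical contents,
 * or link a new node in at the point where the walk ended and return NULL.
 */
static struct node *search_or_insert(struct node **root,
				     const unsigned char *page)
{
	struct node **link = root;

	while (*link) {
		int cmp = memcmp(page, (*link)->page, PAGE_SZ);

		if (cmp < 0)
			link = &(*link)->left;
		else if (cmp > 0)
			link = &(*link)->right;
		else
			return *link;	/* identical contents already present */
	}

	*link = calloc(1, sizeof(**link));
	if (*link)
		memcpy((*link)->page, page, PAGE_SZ);
	return NULL;			/* inserted as a new node */
}

int main(void)
{
	struct node *root = NULL;
	unsigned char a[PAGE_SZ], b[PAGE_SZ];

	memset(a, 0x5a, sizeof(a));
	memset(b, 0x5a, sizeof(b));	/* identical to a */

	printf("first pass:  %s\n", search_or_insert(&root, a) ? "found" : "inserted");
	printf("second pass: %s\n", search_or_insert(&root, b) ? "found" : "inserted");
	return 0;
}
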
77
78/**
79 * struct mm_slot - ksm information per mm that is being scanned
80 * @link: link to the mm_slots hash list
81 * @mm_list: link into the mm_slots list, rooted in ksm_mm_head
82 * @rmap_list: head for this mm_slot's list of rmap_items
83 * @mm: the mm that this information is valid for
84 */
85struct mm_slot {
86 struct hlist_node link;
87 struct list_head mm_list;
88 struct list_head rmap_list;
89 struct mm_struct *mm;
90};
91
92/**
93 * struct ksm_scan - cursor for scanning
94 * @mm_slot: the current mm_slot we are scanning
95 * @address: the next address inside that to be scanned
96 * @rmap_item: the current rmap that we are scanning inside the rmap_list
97 * @seqnr: count of completed full scans (needed when removing unstable node)
98 *
99 * There is only the one ksm_scan instance of this cursor structure.
100 */
101struct ksm_scan {
102 struct mm_slot *mm_slot;
103 unsigned long address;
104 struct rmap_item *rmap_item;
105 unsigned long seqnr;
106};
107
108/**
109 * struct rmap_item - reverse mapping item for virtual addresses
110 * @link: link into mm_slot's rmap_list (rmap_list is per mm)
111 * @mm: the memory structure this rmap_item is pointing into
112 * @address: the virtual address this rmap_item tracks (+ flags in low bits)
113 * @oldchecksum: previous checksum of the page at that virtual address
114 * @node: rb_node of this rmap_item in either unstable or stable tree
115 * @next: next rmap_item hanging off the same node of the stable tree
116 * @prev: previous rmap_item hanging off the same node of the stable tree
117 */
118struct rmap_item {
119 struct list_head link;
120 struct mm_struct *mm;
121 unsigned long address; /* + low bits used for flags below */
122 union {
123 unsigned int oldchecksum; /* when unstable */
124 struct rmap_item *next; /* when stable */
125 };
126 union {
127 struct rb_node node; /* when tree node */
128 struct rmap_item *prev; /* in stable list */
129 };
130};
131
132#define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */
133#define NODE_FLAG 0x100 /* is a node of unstable or stable tree */
134#define STABLE_FLAG 0x200 /* is a node or list item of stable tree */
135
136/* The stable and unstable tree heads */
137static struct rb_root root_stable_tree = RB_ROOT;
138static struct rb_root root_unstable_tree = RB_ROOT;
139
140#define MM_SLOTS_HASH_HEADS 1024
141static struct hlist_head *mm_slots_hash;
142
143static struct mm_slot ksm_mm_head = {
144 .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
145};
146static struct ksm_scan ksm_scan = {
147 .mm_slot = &ksm_mm_head,
148};
149
150static struct kmem_cache *rmap_item_cache;
151static struct kmem_cache *mm_slot_cache;
152
153/* The number of nodes in the stable tree */
154static unsigned long ksm_pages_shared;
155
156/* The number of page slots additionally sharing those nodes */
157static unsigned long ksm_pages_sharing;
158
159/* The number of nodes in the unstable tree */
160static unsigned long ksm_pages_unshared;
161
162/* The number of rmap_items in use: to calculate pages_volatile */
163static unsigned long ksm_rmap_items;
164
165/* Limit on the number of unswappable pages used */
166static unsigned long ksm_max_kernel_pages;
167
168/* Number of pages ksmd should scan in one batch */
169static unsigned int ksm_thread_pages_to_scan = 100;
170
171/* Milliseconds ksmd should sleep between batches */
172static unsigned int ksm_thread_sleep_millisecs = 20;
173
174#define KSM_RUN_STOP 0
175#define KSM_RUN_MERGE 1
176#define KSM_RUN_UNMERGE 2
177static unsigned int ksm_run = KSM_RUN_STOP;
178
179static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
180static DEFINE_MUTEX(ksm_thread_mutex);
181static DEFINE_SPINLOCK(ksm_mmlist_lock);
182
183#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
184 sizeof(struct __struct), __alignof__(struct __struct),\
185 (__flags), NULL)
186
187static int __init ksm_slab_init(void)
188{
189 rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
190 if (!rmap_item_cache)
191 goto out;
192
193 mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
194 if (!mm_slot_cache)
195 goto out_free;
196
197 return 0;
198
199out_free:
200 kmem_cache_destroy(rmap_item_cache);
201out:
202 return -ENOMEM;
203}
204
205static void __init ksm_slab_free(void)
206{
207 kmem_cache_destroy(mm_slot_cache);
208 kmem_cache_destroy(rmap_item_cache);
209 mm_slot_cache = NULL;
210}
211
212static inline struct rmap_item *alloc_rmap_item(void)
213{
214 struct rmap_item *rmap_item;
215
216 rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL);
217 if (rmap_item)
218 ksm_rmap_items++;
219 return rmap_item;
220}
221
222static inline void free_rmap_item(struct rmap_item *rmap_item)
223{
224 ksm_rmap_items--;
225 rmap_item->mm = NULL; /* debug safety */
226 kmem_cache_free(rmap_item_cache, rmap_item);
227}
228
229static inline struct mm_slot *alloc_mm_slot(void)
230{
231 if (!mm_slot_cache) /* initialization failed */
232 return NULL;
233 return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
234}
235
236static inline void free_mm_slot(struct mm_slot *mm_slot)
237{
238 kmem_cache_free(mm_slot_cache, mm_slot);
239}
240
241static int __init mm_slots_hash_init(void)
242{
243 mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head),
244 GFP_KERNEL);
245 if (!mm_slots_hash)
246 return -ENOMEM;
247 return 0;
248}
249
250static void __init mm_slots_hash_free(void)
251{
252 kfree(mm_slots_hash);
253}
254
255static struct mm_slot *get_mm_slot(struct mm_struct *mm)
256{
257 struct mm_slot *mm_slot;
258 struct hlist_head *bucket;
259 struct hlist_node *node;
260
261 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
262 % MM_SLOTS_HASH_HEADS];
263 hlist_for_each_entry(mm_slot, node, bucket, link) {
264 if (mm == mm_slot->mm)
265 return mm_slot;
266 }
267 return NULL;
268}
269
270static void insert_to_mm_slots_hash(struct mm_struct *mm,
271 struct mm_slot *mm_slot)
272{
273 struct hlist_head *bucket;
274
275 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
276 % MM_SLOTS_HASH_HEADS];
277 mm_slot->mm = mm;
278 INIT_LIST_HEAD(&mm_slot->rmap_list);
279 hlist_add_head(&mm_slot->link, bucket);
280}
281
282static inline int in_stable_tree(struct rmap_item *rmap_item)
283{
284 return rmap_item->address & STABLE_FLAG;
285}
286
287/*
288 * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
289 * page tables after it has passed through ksm_exit() - which, if necessary,
290 * takes mmap_sem briefly to serialize against them. ksm_exit() does not set
291 * a special flag: they can just back out as soon as mm_users goes to zero.
292 * ksm_test_exit() is used throughout to make this test for exit: in some
293 * places for correctness, in some places just to avoid unnecessary work.
294 */
295static inline bool ksm_test_exit(struct mm_struct *mm)
296{
297 return atomic_read(&mm->mm_users) == 0;
298}
299
300/*
301 * We use break_ksm to break COW on a ksm page: it's a stripped down
302 *
303 * if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1)
304 * put_page(page);
305 *
306 * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
307 * in case the application has unmapped and remapped mm,addr meanwhile.
308 * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP
309 * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
310 */
311static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
312{
313 struct page *page;
314 int ret = 0;
315
316 do {
317 cond_resched();
318 page = follow_page(vma, addr, FOLL_GET);
319 if (!page)
320 break;
321 if (PageKsm(page))
322 ret = handle_mm_fault(vma->vm_mm, vma, addr,
323 FAULT_FLAG_WRITE);
324 else
325 ret = VM_FAULT_WRITE;
326 put_page(page);
327 } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_OOM)));
328 /*
329 * We must loop because handle_mm_fault() may back out if there's
330 * any difficulty e.g. if pte accessed bit gets updated concurrently.
331 *
332 * VM_FAULT_WRITE is what we have been hoping for: it indicates that
333 * COW has been broken, even if the vma does not permit VM_WRITE;
334 * but note that a concurrent fault might break PageKsm for us.
335 *
336 * VM_FAULT_SIGBUS could occur if we race with truncation of the
337 * backing file, which also invalidates anonymous pages: that's
338 * okay, that truncation will have unmapped the PageKsm for us.
339 *
340 * VM_FAULT_OOM: at the time of writing (late July 2009), setting
341 * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
342 * current task has TIF_MEMDIE set, and will be OOM killed on return
343 * to user; and ksmd, having no mm, would never be chosen for that.
344 *
345 * But if the mm is in a limited mem_cgroup, then the fault may fail
346 * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
347 * even ksmd can fail in this way - though it's usually breaking ksm
348 * just to undo a merge it made a moment before, so unlikely to oom.
349 *
350 * That's a pity: we might therefore have more kernel pages allocated
351 * than we're counting as nodes in the stable tree; but ksm_do_scan
352 * will retry to break_cow on each pass, so should recover the page
353 * in due course. The important thing is to not let VM_MERGEABLE
354 * be cleared while any such pages might remain in the area.
355 */
356 return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
357}
358
359static void break_cow(struct mm_struct *mm, unsigned long addr)
360{
361 struct vm_area_struct *vma;
362
363 down_read(&mm->mmap_sem);
364 if (ksm_test_exit(mm))
365 goto out;
366 vma = find_vma(mm, addr);
367 if (!vma || vma->vm_start > addr)
368 goto out;
369 if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
370 goto out;
371 break_ksm(vma, addr);
372out:
373 up_read(&mm->mmap_sem);
374}
375
376static struct page *get_mergeable_page(struct rmap_item *rmap_item)
377{
378 struct mm_struct *mm = rmap_item->mm;
379 unsigned long addr = rmap_item->address;
380 struct vm_area_struct *vma;
381 struct page *page;
382
383 down_read(&mm->mmap_sem);
384 if (ksm_test_exit(mm))
385 goto out;
386 vma = find_vma(mm, addr);
387 if (!vma || vma->vm_start > addr)
388 goto out;
389 if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
390 goto out;
391
392 page = follow_page(vma, addr, FOLL_GET);
393 if (!page)
394 goto out;
395 if (PageAnon(page)) {
396 flush_anon_page(vma, page, addr);
397 flush_dcache_page(page);
398 } else {
399 put_page(page);
400out: page = NULL;
401 }
402 up_read(&mm->mmap_sem);
403 return page;
404}
405
406/*
407 * get_ksm_page: checks if the page at the virtual address in rmap_item
408 * is still PageKsm, in which case we can trust the content of the page,
409 * and it returns the gotten page; but NULL if the page has been zapped.
410 */
411static struct page *get_ksm_page(struct rmap_item *rmap_item)
412{
413 struct page *page;
414
415 page = get_mergeable_page(rmap_item);
416 if (page && !PageKsm(page)) {
417 put_page(page);
418 page = NULL;
419 }
420 return page;
421}
422
423/*
424 * Removing rmap_item from stable or unstable tree.
425 * This function will clean the information from the stable/unstable tree.
426 */
427static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
428{
429 if (in_stable_tree(rmap_item)) {
430 struct rmap_item *next_item = rmap_item->next;
431
432 if (rmap_item->address & NODE_FLAG) {
433 if (next_item) {
434 rb_replace_node(&rmap_item->node,
435 &next_item->node,
436 &root_stable_tree);
437 next_item->address |= NODE_FLAG;
438 ksm_pages_sharing--;
439 } else {
440 rb_erase(&rmap_item->node, &root_stable_tree);
441 ksm_pages_shared--;
442 }
443 } else {
444 struct rmap_item *prev_item = rmap_item->prev;
445
446 BUG_ON(prev_item->next != rmap_item);
447 prev_item->next = next_item;
448 if (next_item) {
449 BUG_ON(next_item->prev != rmap_item);
450 next_item->prev = rmap_item->prev;
451 }
452 ksm_pages_sharing--;
453 }
454
455 rmap_item->next = NULL;
456
457 } else if (rmap_item->address & NODE_FLAG) {
458 unsigned char age;
459 /*
460 * Usually ksmd can and must skip the rb_erase, because
461 * root_unstable_tree was already reset to RB_ROOT.
462 * But be careful when an mm is exiting: do the rb_erase
463 * if this rmap_item was inserted by this scan, rather
464 * than left over from before.
465 */
466 age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
467 BUG_ON(age > 1);
468 if (!age)
469 rb_erase(&rmap_item->node, &root_unstable_tree);
470 ksm_pages_unshared--;
471 }
472
473 rmap_item->address &= PAGE_MASK;
474
475 cond_resched(); /* we're called from many long loops */
476}
477
478static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
479 struct list_head *cur)
480{
481 struct rmap_item *rmap_item;
482
483 while (cur != &mm_slot->rmap_list) {
484 rmap_item = list_entry(cur, struct rmap_item, link);
485 cur = cur->next;
486 remove_rmap_item_from_tree(rmap_item);
487 list_del(&rmap_item->link);
488 free_rmap_item(rmap_item);
489 }
490}
491
492/*
493 * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather
494 * than check every pte of a given vma, the locking doesn't quite work for
495 * that - an rmap_item is assigned to the stable tree after inserting ksm
496 * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing
497 * rmap_items from parent to child at fork time (so as not to waste time
498 * if exit comes before the next scan reaches it).
499 *
500 * Similarly, although we'd like to remove rmap_items (so updating counts
501 * and freeing memory) when unmerging an area, it's easier to leave that
502 * to the next pass of ksmd - consider, for example, how ksmd might be
503 * in cmp_and_merge_page on one of the rmap_items we would be removing.
504 */
505static int unmerge_ksm_pages(struct vm_area_struct *vma,
506 unsigned long start, unsigned long end)
507{
508 unsigned long addr;
509 int err = 0;
510
511 for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
512 if (ksm_test_exit(vma->vm_mm))
513 break;
514 if (signal_pending(current))
515 err = -ERESTARTSYS;
516 else
517 err = break_ksm(vma, addr);
518 }
519 return err;
520}
521
522#ifdef CONFIG_SYSFS
523/*
524 * Only called through the sysfs control interface:
525 */
526static int unmerge_and_remove_all_rmap_items(void)
527{
528 struct mm_slot *mm_slot;
529 struct mm_struct *mm;
530 struct vm_area_struct *vma;
531 int err = 0;
532
533 spin_lock(&ksm_mmlist_lock);
534 ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next,
535 struct mm_slot, mm_list);
536 spin_unlock(&ksm_mmlist_lock);
537
538 for (mm_slot = ksm_scan.mm_slot;
539 mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) {
540 mm = mm_slot->mm;
541 down_read(&mm->mmap_sem);
542 for (vma = mm->mmap; vma; vma = vma->vm_next) {
543 if (ksm_test_exit(mm))
544 break;
545 if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
546 continue;
547 err = unmerge_ksm_pages(vma,
548 vma->vm_start, vma->vm_end);
549 if (err)
550 goto error;
551 }
552
553 remove_trailing_rmap_items(mm_slot, mm_slot->rmap_list.next);
554
555 spin_lock(&ksm_mmlist_lock);
556 ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
557 struct mm_slot, mm_list);
558 if (ksm_test_exit(mm)) {
559 hlist_del(&mm_slot->link);
560 list_del(&mm_slot->mm_list);
561 spin_unlock(&ksm_mmlist_lock);
562
563 free_mm_slot(mm_slot);
564 clear_bit(MMF_VM_MERGEABLE, &mm->flags);
565 up_read(&mm->mmap_sem);
566 mmdrop(mm);
567 } else {
568 spin_unlock(&ksm_mmlist_lock);
569 up_read(&mm->mmap_sem);
570 }
571 }
572
573 ksm_scan.seqnr = 0;
574 return 0;
575
576error:
577 up_read(&mm->mmap_sem);
578 spin_lock(&ksm_mmlist_lock);
579 ksm_scan.mm_slot = &ksm_mm_head;
580 spin_unlock(&ksm_mmlist_lock);
581 return err;
582}
583#endif /* CONFIG_SYSFS */
584
585static u32 calc_checksum(struct page *page)
586{
587 u32 checksum;
588 void *addr = kmap_atomic(page, KM_USER0);
589 checksum = jhash2(addr, PAGE_SIZE / 4, 17);
590 kunmap_atomic(addr, KM_USER0);
591 return checksum;
592}
593
594static int memcmp_pages(struct page *page1, struct page *page2)
595{
596 char *addr1, *addr2;
597 int ret;
598
599 addr1 = kmap_atomic(page1, KM_USER0);
600 addr2 = kmap_atomic(page2, KM_USER1);
601 ret = memcmp(addr1, addr2, PAGE_SIZE);
602 kunmap_atomic(addr2, KM_USER1);
603 kunmap_atomic(addr1, KM_USER0);
604 return ret;
605}
606
607static inline int pages_identical(struct page *page1, struct page *page2)
608{
609 return !memcmp_pages(page1, page2);
610}
611
612static int write_protect_page(struct vm_area_struct *vma, struct page *page,
613 pte_t *orig_pte)
614{
615 struct mm_struct *mm = vma->vm_mm;
616 unsigned long addr;
617 pte_t *ptep;
618 spinlock_t *ptl;
619 int swapped;
620 int err = -EFAULT;
621
622 addr = page_address_in_vma(page, vma);
623 if (addr == -EFAULT)
624 goto out;
625
626 ptep = page_check_address(page, mm, addr, &ptl, 0);
627 if (!ptep)
628 goto out;
629
630 if (pte_write(*ptep)) {
631 pte_t entry;
632
633 swapped = PageSwapCache(page);
634 flush_cache_page(vma, addr, page_to_pfn(page));
635 /*
636		 * Ok, this is tricky: when get_user_pages_fast() runs it doesn't
637		 * take any lock, therefore the check that we are going to make
638		 * with the page count against the map count is racy and
639		 * O_DIRECT can happen right after the check.
640		 * So we clear the pte and flush the tlb before the check;
641		 * this assures us that no O_DIRECT can happen after the check
642		 * or in the middle of the check.
643 */
644 entry = ptep_clear_flush(vma, addr, ptep);
645 /*
646 * Check that no O_DIRECT or similar I/O is in progress on the
647 * page
648 */
649 if ((page_mapcount(page) + 2 + swapped) != page_count(page)) {
650 set_pte_at_notify(mm, addr, ptep, entry);
651 goto out_unlock;
652 }
653 entry = pte_wrprotect(entry);
654 set_pte_at_notify(mm, addr, ptep, entry);
655 }
656 *orig_pte = *ptep;
657 err = 0;
658
659out_unlock:
660 pte_unmap_unlock(ptep, ptl);
661out:
662 return err;
663}
664
665/**
666 * replace_page - replace page in vma by new ksm page
667 * @vma: vma that holds the pte pointing to oldpage
668 * @oldpage: the page we are replacing by newpage
669 * @newpage: the ksm page we replace oldpage by
670 * @orig_pte: the original value of the pte
671 *
672 * Returns 0 on success, -EFAULT on failure.
673 */
674static int replace_page(struct vm_area_struct *vma, struct page *oldpage,
675 struct page *newpage, pte_t orig_pte)
676{
677 struct mm_struct *mm = vma->vm_mm;
678 pgd_t *pgd;
679 pud_t *pud;
680 pmd_t *pmd;
681 pte_t *ptep;
682 spinlock_t *ptl;
683 unsigned long addr;
684 pgprot_t prot;
685 int err = -EFAULT;
686
687 prot = vm_get_page_prot(vma->vm_flags & ~VM_WRITE);
688
689 addr = page_address_in_vma(oldpage, vma);
690 if (addr == -EFAULT)
691 goto out;
692
693 pgd = pgd_offset(mm, addr);
694 if (!pgd_present(*pgd))
695 goto out;
696
697 pud = pud_offset(pgd, addr);
698 if (!pud_present(*pud))
699 goto out;
700
701 pmd = pmd_offset(pud, addr);
702 if (!pmd_present(*pmd))
703 goto out;
704
705 ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
706 if (!pte_same(*ptep, orig_pte)) {
707 pte_unmap_unlock(ptep, ptl);
708 goto out;
709 }
710
711 get_page(newpage);
712 page_add_ksm_rmap(newpage);
713
714 flush_cache_page(vma, addr, pte_pfn(*ptep));
715 ptep_clear_flush(vma, addr, ptep);
716 set_pte_at_notify(mm, addr, ptep, mk_pte(newpage, prot));
717
718 page_remove_rmap(oldpage);
719 put_page(oldpage);
720
721 pte_unmap_unlock(ptep, ptl);
722 err = 0;
723out:
724 return err;
725}
726
727/*
728 * try_to_merge_one_page - take two pages and merge them into one
729 * @vma: the vma that holds the pte pointing into oldpage
730 * @oldpage: the page that we want to replace with newpage
731 * @newpage: the page that we want to map instead of oldpage
732 *
733 * Note:
734 * oldpage should be a PageAnon page, while newpage should be a PageKsm page,
735 * or a newly allocated kernel page which page_add_ksm_rmap will make PageKsm.
736 *
737 * This function returns 0 if the pages were merged, -EFAULT otherwise.
738 */
739static int try_to_merge_one_page(struct vm_area_struct *vma,
740 struct page *oldpage,
741 struct page *newpage)
742{
743 pte_t orig_pte = __pte(0);
744 int err = -EFAULT;
745
746 if (!(vma->vm_flags & VM_MERGEABLE))
747 goto out;
748
749 if (!PageAnon(oldpage))
750 goto out;
751
752 get_page(newpage);
753 get_page(oldpage);
754
755 /*
756 * We need the page lock to read a stable PageSwapCache in
757 * write_protect_page(). We use trylock_page() instead of
758 * lock_page() because we don't want to wait here - we
759 * prefer to continue scanning and merging different pages,
760 * then come back to this page when it is unlocked.
761 */
762 if (!trylock_page(oldpage))
763 goto out_putpage;
764 /*
765 * If this anonymous page is mapped only here, its pte may need
766 * to be write-protected. If it's mapped elsewhere, all of its
767 * ptes are necessarily already write-protected. But in either
768 * case, we need to lock and check page_count is not raised.
769 */
770 if (write_protect_page(vma, oldpage, &orig_pte)) {
771 unlock_page(oldpage);
772 goto out_putpage;
773 }
774 unlock_page(oldpage);
775
776 if (pages_identical(oldpage, newpage))
777 err = replace_page(vma, oldpage, newpage, orig_pte);
778
779out_putpage:
780 put_page(oldpage);
781 put_page(newpage);
782out:
783 return err;
784}
785
786/*
787 * try_to_merge_with_ksm_page - like try_to_merge_two_pages,
788 * but no new kernel page is allocated: kpage must already be a ksm page.
789 */
790static int try_to_merge_with_ksm_page(struct mm_struct *mm1,
791 unsigned long addr1,
792 struct page *page1,
793 struct page *kpage)
794{
795 struct vm_area_struct *vma;
796 int err = -EFAULT;
797
798 down_read(&mm1->mmap_sem);
799 if (ksm_test_exit(mm1))
800 goto out;
801
802 vma = find_vma(mm1, addr1);
803 if (!vma || vma->vm_start > addr1)
804 goto out;
805
806 err = try_to_merge_one_page(vma, page1, kpage);
807out:
808 up_read(&mm1->mmap_sem);
809 return err;
810}
811
812/*
813 * try_to_merge_two_pages - take two identical pages and prepare them
814 * to be merged into one page.
815 *
816 * This function returns 0 if we successfully mapped two identical pages
817 * into one page, -EFAULT otherwise.
818 *
819 * Note that this function allocates a new kernel page: if one of the pages
820 * is already a ksm page, try_to_merge_with_ksm_page should be used.
821 */
822static int try_to_merge_two_pages(struct mm_struct *mm1, unsigned long addr1,
823 struct page *page1, struct mm_struct *mm2,
824 unsigned long addr2, struct page *page2)
825{
826 struct vm_area_struct *vma;
827 struct page *kpage;
828 int err = -EFAULT;
829
830 /*
831 * The number of nodes in the stable tree
832 * is the number of kernel pages that we hold.
833 */
834 if (ksm_max_kernel_pages &&
835 ksm_max_kernel_pages <= ksm_pages_shared)
836 return err;
837
838 kpage = alloc_page(GFP_HIGHUSER);
839 if (!kpage)
840 return err;
841
842 down_read(&mm1->mmap_sem);
843 if (ksm_test_exit(mm1)) {
844 up_read(&mm1->mmap_sem);
845 goto out;
846 }
847 vma = find_vma(mm1, addr1);
848 if (!vma || vma->vm_start > addr1) {
849 up_read(&mm1->mmap_sem);
850 goto out;
851 }
852
853 copy_user_highpage(kpage, page1, addr1, vma);
854 err = try_to_merge_one_page(vma, page1, kpage);
855 up_read(&mm1->mmap_sem);
856
857 if (!err) {
858 err = try_to_merge_with_ksm_page(mm2, addr2, page2, kpage);
859 /*
860 * If that fails, we have a ksm page with only one pte
861 * pointing to it: so break it.
862 */
863 if (err)
864 break_cow(mm1, addr1);
865 }
866out:
867 put_page(kpage);
868 return err;
869}
870
871/*
872 * stable_tree_search - search page inside the stable tree
873 * @page: the page that we are searching for identical pages to.
874 * @page2: pointer to the identical page that we have found and are holding
875 *         inside the stable tree.
876 * @rmap_item: the reverse mapping item
877 *
878 * This function checks if there is a page inside the stable tree
879 * with identical content to the page that we are scanning right now.
880 *
881 * This function returns the rmap_item pointer to the identical item if found,
882 * NULL otherwise.
883 */
884static struct rmap_item *stable_tree_search(struct page *page,
885 struct page **page2,
886 struct rmap_item *rmap_item)
887{
888 struct rb_node *node = root_stable_tree.rb_node;
889
890 while (node) {
891 struct rmap_item *tree_rmap_item, *next_rmap_item;
892 int ret;
893
894 tree_rmap_item = rb_entry(node, struct rmap_item, node);
895 while (tree_rmap_item) {
896 BUG_ON(!in_stable_tree(tree_rmap_item));
897 cond_resched();
898 page2[0] = get_ksm_page(tree_rmap_item);
899 if (page2[0])
900 break;
901 next_rmap_item = tree_rmap_item->next;
902 remove_rmap_item_from_tree(tree_rmap_item);
903 tree_rmap_item = next_rmap_item;
904 }
905 if (!tree_rmap_item)
906 return NULL;
907
908 ret = memcmp_pages(page, page2[0]);
909
910 if (ret < 0) {
911 put_page(page2[0]);
912 node = node->rb_left;
913 } else if (ret > 0) {
914 put_page(page2[0]);
915 node = node->rb_right;
916 } else {
917 return tree_rmap_item;
918 }
919 }
920
921 return NULL;
922}
923
924/*
925 * stable_tree_insert - insert rmap_item pointing to new ksm page
926 * into the stable tree.
927 *
928 * @page: the page that we are searching for an identical page to inside the stable
929 * tree.
930 * @rmap_item: pointer to the reverse mapping item.
931 *
932 * This function returns the rmap_item on success, NULL otherwise.
933 */
934static struct rmap_item *stable_tree_insert(struct page *page,
935 struct rmap_item *rmap_item)
936{
937 struct rb_node **new = &root_stable_tree.rb_node;
938 struct rb_node *parent = NULL;
939
940 while (*new) {
941 struct rmap_item *tree_rmap_item, *next_rmap_item;
942 struct page *tree_page;
943 int ret;
944
945 tree_rmap_item = rb_entry(*new, struct rmap_item, node);
946 while (tree_rmap_item) {
947 BUG_ON(!in_stable_tree(tree_rmap_item));
948 cond_resched();
949 tree_page = get_ksm_page(tree_rmap_item);
950 if (tree_page)
951 break;
952 next_rmap_item = tree_rmap_item->next;
953 remove_rmap_item_from_tree(tree_rmap_item);
954 tree_rmap_item = next_rmap_item;
955 }
956 if (!tree_rmap_item)
957 return NULL;
958
959 ret = memcmp_pages(page, tree_page);
960 put_page(tree_page);
961
962 parent = *new;
963 if (ret < 0)
964 new = &parent->rb_left;
965 else if (ret > 0)
966 new = &parent->rb_right;
967 else {
968 /*
969 * It is not a bug that stable_tree_search() didn't
970 * find this node: because at that time our page was
971 * not yet write-protected, so may have changed since.
972 */
973 return NULL;
974 }
975 }
976
977 rmap_item->address |= NODE_FLAG | STABLE_FLAG;
978 rmap_item->next = NULL;
979 rb_link_node(&rmap_item->node, parent, new);
980 rb_insert_color(&rmap_item->node, &root_stable_tree);
981
982 ksm_pages_shared++;
983 return rmap_item;
984}
985
986/*
987 * unstable_tree_search_insert - search and insert items into the unstable tree.
988 *
989 * @page: the page that we are going to search for an identical page or to insert
990 * into the unstable tree
991 * @page2: pointer into identical page that was found inside the unstable tree
992 * @rmap_item: the reverse mapping item of page
993 *
994 * This function searches for a page in the unstable tree identical to the
995 * page currently being scanned; and if no identical page is found in the
996 * tree, we insert rmap_item as a new object into the unstable tree.
997 *
998 * This function returns a pointer to the rmap_item found to be identical
999 * to the currently scanned page, NULL otherwise.
1000 *
1001 * This function does both searching and inserting, because they share
1002 * the same walking algorithm in an rbtree.
1003 */
1004static struct rmap_item *unstable_tree_search_insert(struct page *page,
1005 struct page **page2,
1006 struct rmap_item *rmap_item)
1007{
1008 struct rb_node **new = &root_unstable_tree.rb_node;
1009 struct rb_node *parent = NULL;
1010
1011 while (*new) {
1012 struct rmap_item *tree_rmap_item;
1013 int ret;
1014
1015 cond_resched();
1016 tree_rmap_item = rb_entry(*new, struct rmap_item, node);
1017 page2[0] = get_mergeable_page(tree_rmap_item);
1018 if (!page2[0])
1019 return NULL;
1020
1021 /*
1022 * Don't substitute an unswappable ksm page
1023 * just for one good swappable forked page.
1024 */
1025 if (page == page2[0]) {
1026 put_page(page2[0]);
1027 return NULL;
1028 }
1029
1030 ret = memcmp_pages(page, page2[0]);
1031
1032 parent = *new;
1033 if (ret < 0) {
1034 put_page(page2[0]);
1035 new = &parent->rb_left;
1036 } else if (ret > 0) {
1037 put_page(page2[0]);
1038 new = &parent->rb_right;
1039 } else {
1040 return tree_rmap_item;
1041 }
1042 }
1043
1044 rmap_item->address |= NODE_FLAG;
1045 rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
1046 rb_link_node(&rmap_item->node, parent, new);
1047 rb_insert_color(&rmap_item->node, &root_unstable_tree);
1048
1049 ksm_pages_unshared++;
1050 return NULL;
1051}
1052
1053/*
1054 * stable_tree_append - add another rmap_item to the linked list of
1055 * rmap_items hanging off a given node of the stable tree, all sharing
1056 * the same ksm page.
1057 */
1058static void stable_tree_append(struct rmap_item *rmap_item,
1059 struct rmap_item *tree_rmap_item)
1060{
1061 rmap_item->next = tree_rmap_item->next;
1062 rmap_item->prev = tree_rmap_item;
1063
1064 if (tree_rmap_item->next)
1065 tree_rmap_item->next->prev = rmap_item;
1066
1067 tree_rmap_item->next = rmap_item;
1068 rmap_item->address |= STABLE_FLAG;
1069
1070 ksm_pages_sharing++;
1071}
1072
1073/*
1074 * cmp_and_merge_page - first see if page can be merged into the stable tree;
1075 * if not, compare checksum to previous and if it's the same, see if page can
1076 * be inserted into the unstable tree, or merged with a page already there and
1077 * both transferred to the stable tree.
1078 *
1079 * @page: the page that we are searching identical page to.
1080 * @rmap_item: the reverse mapping into the virtual address of this page
1081 */
1082static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
1083{
1084 struct page *page2[1];
1085 struct rmap_item *tree_rmap_item;
1086 unsigned int checksum;
1087 int err;
1088
1089 if (in_stable_tree(rmap_item))
1090 remove_rmap_item_from_tree(rmap_item);
1091
1092 /* We first start with searching the page inside the stable tree */
1093 tree_rmap_item = stable_tree_search(page, page2, rmap_item);
1094 if (tree_rmap_item) {
1095 if (page == page2[0]) /* forked */
1096 err = 0;
1097 else
1098 err = try_to_merge_with_ksm_page(rmap_item->mm,
1099 rmap_item->address,
1100 page, page2[0]);
1101 put_page(page2[0]);
1102
1103 if (!err) {
1104 /*
1105 * The page was successfully merged:
1106 * add its rmap_item to the stable tree.
1107 */
1108 stable_tree_append(rmap_item, tree_rmap_item);
1109 }
1110 return;
1111 }
1112
1113 /*
1114 * A ksm page might have got here by fork, but its other
1115 * references have already been removed from the stable tree.
1116 * Or it might be left over from a break_ksm which failed
1117 * when the mem_cgroup had reached its limit: try again now.
1118 */
1119 if (PageKsm(page))
1120 break_cow(rmap_item->mm, rmap_item->address);
1121
1122 /*
1123	 * If the hash value of the page has changed since the last time we
1124	 * calculated it, this page is being changed frequently; therefore we
1125	 * don't want to insert it into the unstable tree, and we don't want
1126	 * to waste our time searching for something identical to it there.
1127 */
1128 checksum = calc_checksum(page);
1129 if (rmap_item->oldchecksum != checksum) {
1130 rmap_item->oldchecksum = checksum;
1131 return;
1132 }
1133
1134 tree_rmap_item = unstable_tree_search_insert(page, page2, rmap_item);
1135 if (tree_rmap_item) {
1136 err = try_to_merge_two_pages(rmap_item->mm,
1137 rmap_item->address, page,
1138 tree_rmap_item->mm,
1139 tree_rmap_item->address, page2[0]);
1140 /*
1141 * As soon as we merge this page, we want to remove the
1142 * rmap_item of the page we have merged with from the unstable
1143 * tree, and insert it instead as new node in the stable tree.
1144 */
1145 if (!err) {
1146 rb_erase(&tree_rmap_item->node, &root_unstable_tree);
1147 tree_rmap_item->address &= ~NODE_FLAG;
1148 ksm_pages_unshared--;
1149
1150 /*
1151 * If we fail to insert the page into the stable tree,
1152 * we will have 2 virtual addresses that are pointing
1153 * to a ksm page left outside the stable tree,
1154 * in which case we need to break_cow on both.
1155 */
1156 if (stable_tree_insert(page2[0], tree_rmap_item))
1157 stable_tree_append(rmap_item, tree_rmap_item);
1158 else {
1159 break_cow(tree_rmap_item->mm,
1160 tree_rmap_item->address);
1161 break_cow(rmap_item->mm, rmap_item->address);
1162 }
1163 }
1164
1165 put_page(page2[0]);
1166 }
1167}
1168
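A standalone sketch of the checksum gate used by cmp_and_merge_page() above (not kernel code): a page only becomes an unstable-tree candidate once its checksum matches the value remembered from the previous full scan. FNV-1a stands in here for the kernel's jhash2() in calc_checksum().

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

struct tracked_page {
	unsigned char *data;
	size_t len;
	uint32_t oldchecksum;	/* checksum remembered from the previous scan */
};

/* FNV-1a used as a stand-in for jhash2(). */
static uint32_t checksum(const unsigned char *p, size_t len)
{
	uint32_t h = 2166136261u;

	while (len--)
		h = (h ^ *p++) * 16777619u;
	return h;
}

/* Return 1 only if the page looks unchanged since the last scan. */
static int stable_since_last_scan(struct tracked_page *tp)
{
	uint32_t sum = checksum(tp->data, tp->len);

	if (tp->oldchecksum != sum) {
		tp->oldchecksum = sum;	/* remember it; check again next scan */
		return 0;
	}
	return 1;
}

int main(void)
{
	unsigned char page[4096] = { 0 };
	struct tracked_page tp = { .data = page, .len = sizeof(page) };

	printf("scan 1: %s\n", stable_since_last_scan(&tp) ? "candidate" : "volatile");
	page[0] = 1;		/* page modified between scans */
	printf("scan 2: %s\n", stable_since_last_scan(&tp) ? "candidate" : "volatile");
	printf("scan 3: %s\n", stable_since_last_scan(&tp) ? "candidate" : "volatile");
	return 0;
}
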
1169static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
1170 struct list_head *cur,
1171 unsigned long addr)
1172{
1173 struct rmap_item *rmap_item;
1174
1175 while (cur != &mm_slot->rmap_list) {
1176 rmap_item = list_entry(cur, struct rmap_item, link);
1177 if ((rmap_item->address & PAGE_MASK) == addr) {
1178 if (!in_stable_tree(rmap_item))
1179 remove_rmap_item_from_tree(rmap_item);
1180 return rmap_item;
1181 }
1182 if (rmap_item->address > addr)
1183 break;
1184 cur = cur->next;
1185 remove_rmap_item_from_tree(rmap_item);
1186 list_del(&rmap_item->link);
1187 free_rmap_item(rmap_item);
1188 }
1189
1190 rmap_item = alloc_rmap_item();
1191 if (rmap_item) {
1192 /* It has already been zeroed */
1193 rmap_item->mm = mm_slot->mm;
1194 rmap_item->address = addr;
1195 list_add_tail(&rmap_item->link, cur);
1196 }
1197 return rmap_item;
1198}
1199
1200static struct rmap_item *scan_get_next_rmap_item(struct page **page)
1201{
1202 struct mm_struct *mm;
1203 struct mm_slot *slot;
1204 struct vm_area_struct *vma;
1205 struct rmap_item *rmap_item;
1206
1207 if (list_empty(&ksm_mm_head.mm_list))
1208 return NULL;
1209
1210 slot = ksm_scan.mm_slot;
1211 if (slot == &ksm_mm_head) {
1212 root_unstable_tree = RB_ROOT;
1213
1214 spin_lock(&ksm_mmlist_lock);
1215 slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
1216 ksm_scan.mm_slot = slot;
1217 spin_unlock(&ksm_mmlist_lock);
1218next_mm:
1219 ksm_scan.address = 0;
1220 ksm_scan.rmap_item = list_entry(&slot->rmap_list,
1221 struct rmap_item, link);
1222 }
1223
1224 mm = slot->mm;
1225 down_read(&mm->mmap_sem);
1226 if (ksm_test_exit(mm))
1227 vma = NULL;
1228 else
1229 vma = find_vma(mm, ksm_scan.address);
1230
1231 for (; vma; vma = vma->vm_next) {
1232 if (!(vma->vm_flags & VM_MERGEABLE))
1233 continue;
1234 if (ksm_scan.address < vma->vm_start)
1235 ksm_scan.address = vma->vm_start;
1236 if (!vma->anon_vma)
1237 ksm_scan.address = vma->vm_end;
1238
1239 while (ksm_scan.address < vma->vm_end) {
1240 if (ksm_test_exit(mm))
1241 break;
1242 *page = follow_page(vma, ksm_scan.address, FOLL_GET);
1243 if (*page && PageAnon(*page)) {
1244 flush_anon_page(vma, *page, ksm_scan.address);
1245 flush_dcache_page(*page);
1246 rmap_item = get_next_rmap_item(slot,
1247 ksm_scan.rmap_item->link.next,
1248 ksm_scan.address);
1249 if (rmap_item) {
1250 ksm_scan.rmap_item = rmap_item;
1251 ksm_scan.address += PAGE_SIZE;
1252 } else
1253 put_page(*page);
1254 up_read(&mm->mmap_sem);
1255 return rmap_item;
1256 }
1257 if (*page)
1258 put_page(*page);
1259 ksm_scan.address += PAGE_SIZE;
1260 cond_resched();
1261 }
1262 }
1263
1264 if (ksm_test_exit(mm)) {
1265 ksm_scan.address = 0;
1266 ksm_scan.rmap_item = list_entry(&slot->rmap_list,
1267 struct rmap_item, link);
1268 }
1269 /*
1270 * Nuke all the rmap_items that are above this current rmap:
1271 * because there were no VM_MERGEABLE vmas with such addresses.
1272 */
1273 remove_trailing_rmap_items(slot, ksm_scan.rmap_item->link.next);
1274
1275 spin_lock(&ksm_mmlist_lock);
1276 ksm_scan.mm_slot = list_entry(slot->mm_list.next,
1277 struct mm_slot, mm_list);
1278 if (ksm_scan.address == 0) {
1279 /*
1280 * We've completed a full scan of all vmas, holding mmap_sem
1281 * throughout, and found no VM_MERGEABLE: so do the same as
1282 * __ksm_exit does to remove this mm from all our lists now.
1283 * This applies either when cleaning up after __ksm_exit
1284 * (but beware: we can reach here even before __ksm_exit),
1285 * or when all VM_MERGEABLE areas have been unmapped (and
1286 * mmap_sem then protects against race with MADV_MERGEABLE).
1287 */
1288 hlist_del(&slot->link);
1289 list_del(&slot->mm_list);
1290 spin_unlock(&ksm_mmlist_lock);
1291
1292 free_mm_slot(slot);
1293 clear_bit(MMF_VM_MERGEABLE, &mm->flags);
1294 up_read(&mm->mmap_sem);
1295 mmdrop(mm);
1296 } else {
1297 spin_unlock(&ksm_mmlist_lock);
1298 up_read(&mm->mmap_sem);
1299 }
1300
1301 /* Repeat until we've completed scanning the whole list */
1302 slot = ksm_scan.mm_slot;
1303 if (slot != &ksm_mm_head)
1304 goto next_mm;
1305
1306 ksm_scan.seqnr++;
1307 return NULL;
1308}
1309
1310/**
1311 * ksm_do_scan - the ksm scanner main worker function.
1312 * @scan_npages: number of pages we want to scan before we return.
1313 */
1314static void ksm_do_scan(unsigned int scan_npages)
1315{
1316 struct rmap_item *rmap_item;
1317 struct page *page;
1318
1319 while (scan_npages--) {
1320 cond_resched();
1321 rmap_item = scan_get_next_rmap_item(&page);
1322 if (!rmap_item)
1323 return;
1324 if (!PageKsm(page) || !in_stable_tree(rmap_item))
1325 cmp_and_merge_page(page, rmap_item);
1326 else if (page_mapcount(page) == 1) {
1327 /*
1328 * Replace now-unshared ksm page by ordinary page.
1329 */
1330 break_cow(rmap_item->mm, rmap_item->address);
1331 remove_rmap_item_from_tree(rmap_item);
1332 rmap_item->oldchecksum = calc_checksum(page);
1333 }
1334 put_page(page);
1335 }
1336}
1337
1338static int ksmd_should_run(void)
1339{
1340 return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list);
1341}
1342
1343static int ksm_scan_thread(void *nothing)
1344{
1345 set_user_nice(current, 5);
1346
1347 while (!kthread_should_stop()) {
1348 mutex_lock(&ksm_thread_mutex);
1349 if (ksmd_should_run())
1350 ksm_do_scan(ksm_thread_pages_to_scan);
1351 mutex_unlock(&ksm_thread_mutex);
1352
1353 if (ksmd_should_run()) {
1354 schedule_timeout_interruptible(
1355 msecs_to_jiffies(ksm_thread_sleep_millisecs));
1356 } else {
1357 wait_event_interruptible(ksm_thread_wait,
1358 ksmd_should_run() || kthread_should_stop());
1359 }
1360 }
1361 return 0;
1362}
1363
1364int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
1365 unsigned long end, int advice, unsigned long *vm_flags)
1366{
1367 struct mm_struct *mm = vma->vm_mm;
1368 int err;
1369
1370 switch (advice) {
1371 case MADV_MERGEABLE:
1372 /*
1373 * Be somewhat over-protective for now!
1374 */
1375 if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE |
1376 VM_PFNMAP | VM_IO | VM_DONTEXPAND |
1377 VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
1378 VM_MIXEDMAP | VM_SAO))
1379 return 0; /* just ignore the advice */
1380
1381 if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
1382 err = __ksm_enter(mm);
1383 if (err)
1384 return err;
1385 }
1386
1387 *vm_flags |= VM_MERGEABLE;
1388 break;
1389
1390 case MADV_UNMERGEABLE:
1391 if (!(*vm_flags & VM_MERGEABLE))
1392 return 0; /* just ignore the advice */
1393
1394 if (vma->anon_vma) {
1395 err = unmerge_ksm_pages(vma, start, end);
1396 if (err)
1397 return err;
1398 }
1399
1400 *vm_flags &= ~VM_MERGEABLE;
1401 break;
1402 }
1403
1404 return 0;
1405}
1406
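From the application side (not part of this patch), the ksm_madvise() path above is reached through plain madvise(). A minimal sketch, assuming a kernel with CONFIG_KSM=y and ksmd set running; the MADV_* values are the ones introduced by this series and may be missing from older userspace headers:

/* Hedged test program: two identical anonymous areas handed to KSM. */
#define _GNU_SOURCE		/* MAP_ANONYMOUS on older libcs */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_MERGEABLE
#define MADV_MERGEABLE   12	/* values from asm-generic/mman-common.h */
#endif
#ifndef MADV_UNMERGEABLE
#define MADV_UNMERGEABLE 13
#endif

#define LEN (256 * 4096)

int main(void)
{
	char *a = mmap(NULL, LEN, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	char *b = mmap(NULL, LEN, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (a == MAP_FAILED || b == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(a, 0x5a, LEN);		/* identical contents in both areas */
	memset(b, 0x5a, LEN);

	/* Mark both areas VM_MERGEABLE; ksmd merges them over time
	 * (watch /sys/kernel/mm/ksm/pages_sharing while run=1). */
	if (madvise(a, LEN, MADV_MERGEABLE) || madvise(b, LEN, MADV_MERGEABLE)) {
		perror("madvise(MADV_MERGEABLE)");
		return 1;
	}

	pause();			/* keep the mappings alive for ksmd */
	return 0;
}
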
1407int __ksm_enter(struct mm_struct *mm)
1408{
1409 struct mm_slot *mm_slot;
1410 int needs_wakeup;
1411
1412 mm_slot = alloc_mm_slot();
1413 if (!mm_slot)
1414 return -ENOMEM;
1415
1416 /* Check ksm_run too? Would need tighter locking */
1417 needs_wakeup = list_empty(&ksm_mm_head.mm_list);
1418
1419 spin_lock(&ksm_mmlist_lock);
1420 insert_to_mm_slots_hash(mm, mm_slot);
1421 /*
1422 * Insert just behind the scanning cursor, to let the area settle
1423 * down a little; when fork is followed by immediate exec, we don't
1424 * want ksmd to waste time setting up and tearing down an rmap_list.
1425 */
1426 list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
1427 spin_unlock(&ksm_mmlist_lock);
1428
1429 set_bit(MMF_VM_MERGEABLE, &mm->flags);
1430 atomic_inc(&mm->mm_count);
1431
1432 if (needs_wakeup)
1433 wake_up_interruptible(&ksm_thread_wait);
1434
1435 return 0;
1436}
1437
1438void __ksm_exit(struct mm_struct *mm)
1439{
1440 struct mm_slot *mm_slot;
1441 int easy_to_free = 0;
1442
1443 /*
1444 * This process is exiting: if it's straightforward (as is the
1445 * case when ksmd was never running), free mm_slot immediately.
1446 * But if it's at the cursor or has rmap_items linked to it, use
1447 * mmap_sem to synchronize with any break_cows before pagetables
1448 * are freed, and leave the mm_slot on the list for ksmd to free.
1449 * Beware: ksm may already have noticed it exiting and freed the slot.
1450 */
1451
1452 spin_lock(&ksm_mmlist_lock);
1453 mm_slot = get_mm_slot(mm);
1454 if (mm_slot && ksm_scan.mm_slot != mm_slot) {
1455 if (list_empty(&mm_slot->rmap_list)) {
1456 hlist_del(&mm_slot->link);
1457 list_del(&mm_slot->mm_list);
1458 easy_to_free = 1;
1459 } else {
1460 list_move(&mm_slot->mm_list,
1461 &ksm_scan.mm_slot->mm_list);
1462 }
1463 }
1464 spin_unlock(&ksm_mmlist_lock);
1465
1466 if (easy_to_free) {
1467 free_mm_slot(mm_slot);
1468 clear_bit(MMF_VM_MERGEABLE, &mm->flags);
1469 mmdrop(mm);
1470 } else if (mm_slot) {
1471 down_write(&mm->mmap_sem);
1472 up_write(&mm->mmap_sem);
1473 }
1474}
1475
1476#ifdef CONFIG_SYSFS
1477/*
1478 * This all compiles without CONFIG_SYSFS, but is a waste of space.
1479 */
1480
1481#define KSM_ATTR_RO(_name) \
1482 static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
1483#define KSM_ATTR(_name) \
1484 static struct kobj_attribute _name##_attr = \
1485 __ATTR(_name, 0644, _name##_show, _name##_store)
1486
1487static ssize_t sleep_millisecs_show(struct kobject *kobj,
1488 struct kobj_attribute *attr, char *buf)
1489{
1490 return sprintf(buf, "%u\n", ksm_thread_sleep_millisecs);
1491}
1492
1493static ssize_t sleep_millisecs_store(struct kobject *kobj,
1494 struct kobj_attribute *attr,
1495 const char *buf, size_t count)
1496{
1497 unsigned long msecs;
1498 int err;
1499
1500 err = strict_strtoul(buf, 10, &msecs);
1501 if (err || msecs > UINT_MAX)
1502 return -EINVAL;
1503
1504 ksm_thread_sleep_millisecs = msecs;
1505
1506 return count;
1507}
1508KSM_ATTR(sleep_millisecs);
1509
1510static ssize_t pages_to_scan_show(struct kobject *kobj,
1511 struct kobj_attribute *attr, char *buf)
1512{
1513 return sprintf(buf, "%u\n", ksm_thread_pages_to_scan);
1514}
1515
1516static ssize_t pages_to_scan_store(struct kobject *kobj,
1517 struct kobj_attribute *attr,
1518 const char *buf, size_t count)
1519{
1520 int err;
1521 unsigned long nr_pages;
1522
1523 err = strict_strtoul(buf, 10, &nr_pages);
1524 if (err || nr_pages > UINT_MAX)
1525 return -EINVAL;
1526
1527 ksm_thread_pages_to_scan = nr_pages;
1528
1529 return count;
1530}
1531KSM_ATTR(pages_to_scan);
1532
1533static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
1534 char *buf)
1535{
1536 return sprintf(buf, "%u\n", ksm_run);
1537}
1538
1539static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
1540 const char *buf, size_t count)
1541{
1542 int err;
1543 unsigned long flags;
1544
1545 err = strict_strtoul(buf, 10, &flags);
1546 if (err || flags > UINT_MAX)
1547 return -EINVAL;
1548 if (flags > KSM_RUN_UNMERGE)
1549 return -EINVAL;
1550
1551 /*
1552 * KSM_RUN_MERGE sets ksmd running, and 0 stops it running.
1553 * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items,
1554 * breaking COW to free the unswappable pages_shared (but leaves
1555 * mm_slots on the list for when ksmd may be set running again).
1556 */
1557
1558 mutex_lock(&ksm_thread_mutex);
1559 if (ksm_run != flags) {
1560 ksm_run = flags;
1561 if (flags & KSM_RUN_UNMERGE) {
1562 current->flags |= PF_OOM_ORIGIN;
1563 err = unmerge_and_remove_all_rmap_items();
1564 current->flags &= ~PF_OOM_ORIGIN;
1565 if (err) {
1566 ksm_run = KSM_RUN_STOP;
1567 count = err;
1568 }
1569 }
1570 }
1571 mutex_unlock(&ksm_thread_mutex);
1572
1573 if (flags & KSM_RUN_MERGE)
1574 wake_up_interruptible(&ksm_thread_wait);
1575
1576 return count;
1577}
1578KSM_ATTR(run);
1579
1580static ssize_t max_kernel_pages_store(struct kobject *kobj,
1581 struct kobj_attribute *attr,
1582 const char *buf, size_t count)
1583{
1584 int err;
1585 unsigned long nr_pages;
1586
1587 err = strict_strtoul(buf, 10, &nr_pages);
1588 if (err)
1589 return -EINVAL;
1590
1591 ksm_max_kernel_pages = nr_pages;
1592
1593 return count;
1594}
1595
1596static ssize_t max_kernel_pages_show(struct kobject *kobj,
1597 struct kobj_attribute *attr, char *buf)
1598{
1599 return sprintf(buf, "%lu\n", ksm_max_kernel_pages);
1600}
1601KSM_ATTR(max_kernel_pages);
1602
1603static ssize_t pages_shared_show(struct kobject *kobj,
1604 struct kobj_attribute *attr, char *buf)
1605{
1606 return sprintf(buf, "%lu\n", ksm_pages_shared);
1607}
1608KSM_ATTR_RO(pages_shared);
1609
1610static ssize_t pages_sharing_show(struct kobject *kobj,
1611 struct kobj_attribute *attr, char *buf)
1612{
1613 return sprintf(buf, "%lu\n", ksm_pages_sharing);
1614}
1615KSM_ATTR_RO(pages_sharing);
1616
1617static ssize_t pages_unshared_show(struct kobject *kobj,
1618 struct kobj_attribute *attr, char *buf)
1619{
1620 return sprintf(buf, "%lu\n", ksm_pages_unshared);
1621}
1622KSM_ATTR_RO(pages_unshared);
1623
1624static ssize_t pages_volatile_show(struct kobject *kobj,
1625 struct kobj_attribute *attr, char *buf)
1626{
1627 long ksm_pages_volatile;
1628
1629 ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared
1630 - ksm_pages_sharing - ksm_pages_unshared;
1631 /*
1632 * It was not worth any locking to calculate that statistic,
1633 * but it might therefore sometimes be negative: conceal that.
1634 */
1635 if (ksm_pages_volatile < 0)
1636 ksm_pages_volatile = 0;
1637 return sprintf(buf, "%ld\n", ksm_pages_volatile);
1638}
1639KSM_ATTR_RO(pages_volatile);
1640
1641static ssize_t full_scans_show(struct kobject *kobj,
1642 struct kobj_attribute *attr, char *buf)
1643{
1644 return sprintf(buf, "%lu\n", ksm_scan.seqnr);
1645}
1646KSM_ATTR_RO(full_scans);
1647
1648static struct attribute *ksm_attrs[] = {
1649 &sleep_millisecs_attr.attr,
1650 &pages_to_scan_attr.attr,
1651 &run_attr.attr,
1652 &max_kernel_pages_attr.attr,
1653 &pages_shared_attr.attr,
1654 &pages_sharing_attr.attr,
1655 &pages_unshared_attr.attr,
1656 &pages_volatile_attr.attr,
1657 &full_scans_attr.attr,
1658 NULL,
1659};
1660
1661static struct attribute_group ksm_attr_group = {
1662 .attrs = ksm_attrs,
1663 .name = "ksm",
1664};
1665#endif /* CONFIG_SYSFS */
1666
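The attributes defined above appear under /sys/kernel/mm/ksm/. A hedged userspace sketch (not part of this patch) that tunes ksmd and reads back the sharing counters, assuming CONFIG_KSM=y, CONFIG_SYSFS=y, and root privileges:

#include <stdio.h>

static int ksm_write(const char *name, const char *val)
{
	char path[128];
	FILE *f;

	snprintf(path, sizeof(path), "/sys/kernel/mm/ksm/%s", name);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fputs(val, f);
	return fclose(f);
}

static long ksm_read(const char *name)
{
	char path[128];
	long val = -1;
	FILE *f;

	snprintf(path, sizeof(path), "/sys/kernel/mm/ksm/%s", name);
	f = fopen(path, "r");
	if (!f)
		return -1;
	if (fscanf(f, "%ld", &val) != 1)
		val = -1;
	fclose(f);
	return val;
}

int main(void)
{
	ksm_write("pages_to_scan", "200");	/* pages per ksmd batch */
	ksm_write("sleep_millisecs", "50");	/* sleep between batches */
	ksm_write("run", "1");			/* KSM_RUN_MERGE: start ksmd */

	printf("pages_shared:  %ld\n", ksm_read("pages_shared"));
	printf("pages_sharing: %ld\n", ksm_read("pages_sharing"));
	return 0;
}
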
1667static int __init ksm_init(void)
1668{
1669 struct task_struct *ksm_thread;
1670 int err;
1671
1672 ksm_max_kernel_pages = totalram_pages / 4;
1673
1674 err = ksm_slab_init();
1675 if (err)
1676 goto out;
1677
1678 err = mm_slots_hash_init();
1679 if (err)
1680 goto out_free1;
1681
1682 ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
1683 if (IS_ERR(ksm_thread)) {
1684 printk(KERN_ERR "ksm: creating kthread failed\n");
1685 err = PTR_ERR(ksm_thread);
1686 goto out_free2;
1687 }
1688
1689#ifdef CONFIG_SYSFS
1690 err = sysfs_create_group(mm_kobj, &ksm_attr_group);
1691 if (err) {
1692 printk(KERN_ERR "ksm: register sysfs failed\n");
1693 kthread_stop(ksm_thread);
1694 goto out_free2;
1695 }
1696#else
1697 ksm_run = KSM_RUN_MERGE; /* no way for user to start it */
1698
1699#endif /* CONFIG_SYSFS */
1700
1701 return 0;
1702
1703out_free2:
1704 mm_slots_hash_free();
1705out_free1:
1706 ksm_slab_free();
1707out:
1708 return err;
1709}
1710module_init(ksm_init)
diff --git a/mm/maccess.c b/mm/maccess.c
index ac40796cfb15..9073695ff25f 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -39,7 +39,7 @@ EXPORT_SYMBOL_GPL(probe_kernel_read);
39 * Safely write to address @dst from the buffer at @src. If a kernel fault 39 * Safely write to address @dst from the buffer at @src. If a kernel fault
40 * happens, handle that and return -EFAULT. 40 * happens, handle that and return -EFAULT.
41 */ 41 */
42long probe_kernel_write(void *dst, void *src, size_t size) 42long notrace __weak probe_kernel_write(void *dst, void *src, size_t size)
43{ 43{
44 long ret; 44 long ret;
45 mm_segment_t old_fs = get_fs(); 45 mm_segment_t old_fs = get_fs();
diff --git a/mm/madvise.c b/mm/madvise.c
index b9ce574827c8..35b1479b7c9d 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -11,6 +11,7 @@
11#include <linux/mempolicy.h> 11#include <linux/mempolicy.h>
12#include <linux/hugetlb.h> 12#include <linux/hugetlb.h>
13#include <linux/sched.h> 13#include <linux/sched.h>
14#include <linux/ksm.h>
14 15
15/* 16/*
16 * Any behaviour which results in changes to the vma->vm_flags needs to 17 * Any behaviour which results in changes to the vma->vm_flags needs to
@@ -41,7 +42,7 @@ static long madvise_behavior(struct vm_area_struct * vma,
41 struct mm_struct * mm = vma->vm_mm; 42 struct mm_struct * mm = vma->vm_mm;
42 int error = 0; 43 int error = 0;
43 pgoff_t pgoff; 44 pgoff_t pgoff;
44 int new_flags = vma->vm_flags; 45 unsigned long new_flags = vma->vm_flags;
45 46
46 switch (behavior) { 47 switch (behavior) {
47 case MADV_NORMAL: 48 case MADV_NORMAL:
@@ -57,8 +58,18 @@ static long madvise_behavior(struct vm_area_struct * vma,
57 new_flags |= VM_DONTCOPY; 58 new_flags |= VM_DONTCOPY;
58 break; 59 break;
59 case MADV_DOFORK: 60 case MADV_DOFORK:
61 if (vma->vm_flags & VM_IO) {
62 error = -EINVAL;
63 goto out;
64 }
60 new_flags &= ~VM_DONTCOPY; 65 new_flags &= ~VM_DONTCOPY;
61 break; 66 break;
67 case MADV_MERGEABLE:
68 case MADV_UNMERGEABLE:
69 error = ksm_madvise(vma, start, end, behavior, &new_flags);
70 if (error)
71 goto out;
72 break;
62 } 73 }
63 74
64 if (new_flags == vma->vm_flags) { 75 if (new_flags == vma->vm_flags) {
@@ -123,8 +134,7 @@ static long madvise_willneed(struct vm_area_struct * vma,
123 end = vma->vm_end; 134 end = vma->vm_end;
124 end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 135 end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
125 136
126 force_page_cache_readahead(file->f_mapping, 137 force_page_cache_readahead(file->f_mapping, file, start, end - start);
127 file, start, max_sane_readahead(end - start));
128 return 0; 138 return 0;
129} 139}
130 140
@@ -208,41 +218,69 @@ static long madvise_remove(struct vm_area_struct *vma,
208 return error; 218 return error;
209} 219}
210 220
221#ifdef CONFIG_MEMORY_FAILURE
222/*
223 * Error injection support for memory error handling.
224 */
225static int madvise_hwpoison(unsigned long start, unsigned long end)
226{
227 int ret = 0;
228
229 if (!capable(CAP_SYS_ADMIN))
230 return -EPERM;
231 for (; start < end; start += PAGE_SIZE) {
232 struct page *p;
233 int ret = get_user_pages(current, current->mm, start, 1,
234 0, 0, &p, NULL);
235 if (ret != 1)
236 return ret;
237 printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n",
238 page_to_pfn(p), start);
239 /* Ignore return value for now */
240 __memory_failure(page_to_pfn(p), 0, 1);
241 put_page(p);
242 }
243 return ret;
244}
245#endif
246
211static long 247static long
212madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, 248madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
213 unsigned long start, unsigned long end, int behavior) 249 unsigned long start, unsigned long end, int behavior)
214{ 250{
215 long error; 251 switch (behavior) {
252 case MADV_REMOVE:
253 return madvise_remove(vma, prev, start, end);
254 case MADV_WILLNEED:
255 return madvise_willneed(vma, prev, start, end);
256 case MADV_DONTNEED:
257 return madvise_dontneed(vma, prev, start, end);
258 default:
259 return madvise_behavior(vma, prev, start, end, behavior);
260 }
261}
216 262
263static int
264madvise_behavior_valid(int behavior)
265{
217 switch (behavior) { 266 switch (behavior) {
218 case MADV_DOFORK: 267 case MADV_DOFORK:
219 if (vma->vm_flags & VM_IO) {
220 error = -EINVAL;
221 break;
222 }
223 case MADV_DONTFORK: 268 case MADV_DONTFORK:
224 case MADV_NORMAL: 269 case MADV_NORMAL:
225 case MADV_SEQUENTIAL: 270 case MADV_SEQUENTIAL:
226 case MADV_RANDOM: 271 case MADV_RANDOM:
227 error = madvise_behavior(vma, prev, start, end, behavior);
228 break;
229 case MADV_REMOVE: 272 case MADV_REMOVE:
230 error = madvise_remove(vma, prev, start, end);
231 break;
232
233 case MADV_WILLNEED: 273 case MADV_WILLNEED:
234 error = madvise_willneed(vma, prev, start, end);
235 break;
236
237 case MADV_DONTNEED: 274 case MADV_DONTNEED:
238 error = madvise_dontneed(vma, prev, start, end); 275#ifdef CONFIG_KSM
239 break; 276 case MADV_MERGEABLE:
277 case MADV_UNMERGEABLE:
278#endif
279 return 1;
240 280
241 default: 281 default:
242 error = -EINVAL; 282 return 0;
243 break;
244 } 283 }
245 return error;
246} 284}
247 285
248/* 286/*
@@ -269,6 +307,12 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
269 * so the kernel can free resources associated with it. 307 * so the kernel can free resources associated with it.
270 * MADV_REMOVE - the application wants to free up the given range of 308 * MADV_REMOVE - the application wants to free up the given range of
271 * pages and associated backing store. 309 * pages and associated backing store.
310 * MADV_DONTFORK - omit this area from child's address space when forking:
311 * typically, to avoid COWing pages pinned by get_user_pages().
312 * MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
313 * MADV_MERGEABLE - the application recommends that KSM try to merge pages in
314 * this area with pages of identical content from other such areas.
315 * MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others.
272 * 316 *
273 * return values: 317 * return values:
274 * zero - success 318 * zero - success
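A minimal userspace sketch of the two new advice values documented above (not part of this patch). It assumes CONFIG_KSM, that ksmd has been enabled via /sys/kernel/mm/ksm/run, and the asm-generic values 12/13 for MADV_MERGEABLE/MADV_UNMERGEABLE; the fallback defines are only for libc headers that do not yet carry them.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_MERGEABLE
#define MADV_MERGEABLE   12	/* asm-generic value; assumption for older headers */
#endif
#ifndef MADV_UNMERGEABLE
#define MADV_UNMERGEABLE 13
#endif

int main(void)
{
	size_t len = 64 << 20;	/* 64MB of identical anonymous pages - good KSM fodder */
	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED)
		return 1;
	memset(buf, 0x5a, len);
	if (madvise(buf, len, MADV_MERGEABLE))		/* opt the range in to KSM */
		perror("madvise(MADV_MERGEABLE)");
	/* ... let ksmd scan for a while, then opt the range out again: */
	if (madvise(buf, len, MADV_UNMERGEABLE))
		perror("madvise(MADV_UNMERGEABLE)");
	munmap(buf, len);
	return 0;
}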
@@ -290,6 +334,13 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
290 int write; 334 int write;
291 size_t len; 335 size_t len;
292 336
337#ifdef CONFIG_MEMORY_FAILURE
338 if (behavior == MADV_HWPOISON)
339 return madvise_hwpoison(start, start+len_in);
340#endif
341 if (!madvise_behavior_valid(behavior))
342 return error;
343
293 write = madvise_need_mmap_write(behavior); 344 write = madvise_need_mmap_write(behavior);
294 if (write) 345 if (write)
295 down_write(&current->mm->mmap_sem); 346 down_write(&current->mm->mmap_sem);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8e4be9cb2a6a..f99f5991d6bb 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -27,7 +27,9 @@
27#include <linux/backing-dev.h> 27#include <linux/backing-dev.h>
28#include <linux/bit_spinlock.h> 28#include <linux/bit_spinlock.h>
29#include <linux/rcupdate.h> 29#include <linux/rcupdate.h>
30#include <linux/limits.h>
30#include <linux/mutex.h> 31#include <linux/mutex.h>
32#include <linux/rbtree.h>
31#include <linux/slab.h> 33#include <linux/slab.h>
32#include <linux/swap.h> 34#include <linux/swap.h>
33#include <linux/spinlock.h> 35#include <linux/spinlock.h>
@@ -42,9 +44,10 @@
42 44
43struct cgroup_subsys mem_cgroup_subsys __read_mostly; 45struct cgroup_subsys mem_cgroup_subsys __read_mostly;
44#define MEM_CGROUP_RECLAIM_RETRIES 5 46#define MEM_CGROUP_RECLAIM_RETRIES 5
47struct mem_cgroup *root_mem_cgroup __read_mostly;
45 48
46#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 49#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
47/* Turned on only when memory cgroup is enabled && really_do_swap_account = 0 */ 50/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
48int do_swap_account __read_mostly; 51int do_swap_account __read_mostly;
49static int really_do_swap_account __initdata = 1; /* for remember boot option*/ 52static int really_do_swap_account __initdata = 1; /* for remember boot option*/
50#else 53#else
@@ -52,6 +55,7 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/
52#endif 55#endif
53 56
54static DEFINE_MUTEX(memcg_tasklist); /* can be held under cgroup_mutex */ 57static DEFINE_MUTEX(memcg_tasklist); /* can be held under cgroup_mutex */
58#define SOFTLIMIT_EVENTS_THRESH (1000)
55 59
56/* 60/*
57 * Statistics for memory cgroup. 61 * Statistics for memory cgroup.
@@ -61,9 +65,12 @@ enum mem_cgroup_stat_index {
61 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. 65 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
62 */ 66 */
63 MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ 67 MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
64 MEM_CGROUP_STAT_RSS, /* # of pages charged as rss */ 68 MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
69 MEM_CGROUP_STAT_MAPPED_FILE, /* # of pages charged as file rss */
65 MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ 70 MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */
66 MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ 71 MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
72 MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */
73 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
67 74
68 MEM_CGROUP_STAT_NSTATS, 75 MEM_CGROUP_STAT_NSTATS,
69}; 76};
@@ -76,6 +83,20 @@ struct mem_cgroup_stat {
76 struct mem_cgroup_stat_cpu cpustat[0]; 83 struct mem_cgroup_stat_cpu cpustat[0];
77}; 84};
78 85
86static inline void
87__mem_cgroup_stat_reset_safe(struct mem_cgroup_stat_cpu *stat,
88 enum mem_cgroup_stat_index idx)
89{
90 stat->count[idx] = 0;
91}
92
93static inline s64
94__mem_cgroup_stat_read_local(struct mem_cgroup_stat_cpu *stat,
95 enum mem_cgroup_stat_index idx)
96{
97 return stat->count[idx];
98}
99
79/* 100/*
80 * For accounting under irq disable, no need for increment preempt count. 101 * For accounting under irq disable, no need for increment preempt count.
81 */ 102 */
@@ -95,6 +116,15 @@ static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
95 return ret; 116 return ret;
96} 117}
97 118
119static s64 mem_cgroup_local_usage(struct mem_cgroup_stat *stat)
120{
121 s64 ret;
122
123 ret = mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_CACHE);
124 ret += mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_RSS);
125 return ret;
126}
127
98/* 128/*
99 * per-zone information in memory controller. 129 * per-zone information in memory controller.
100 */ 130 */
@@ -106,6 +136,12 @@ struct mem_cgroup_per_zone {
106 unsigned long count[NR_LRU_LISTS]; 136 unsigned long count[NR_LRU_LISTS];
107 137
108 struct zone_reclaim_stat reclaim_stat; 138 struct zone_reclaim_stat reclaim_stat;
139 struct rb_node tree_node; /* RB tree node */
140 unsigned long long usage_in_excess;/* Set to the value by which */
141 /* the soft limit is exceeded*/
142 bool on_tree;
143 struct mem_cgroup *mem; /* Back pointer, we cannot */
144 /* use container_of */
109}; 145};
110/* Macro for accessing counter */ 146/* Macro for accessing counter */
111#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) 147#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)])
@@ -119,6 +155,26 @@ struct mem_cgroup_lru_info {
119}; 155};
120 156
121/* 157/*
158 * Cgroups above their limits are maintained in a RB-Tree, independent of
159 * their hierarchy representation
160 */
161
162struct mem_cgroup_tree_per_zone {
163 struct rb_root rb_root;
164 spinlock_t lock;
165};
166
167struct mem_cgroup_tree_per_node {
168 struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
169};
170
171struct mem_cgroup_tree {
172 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
173};
174
175static struct mem_cgroup_tree soft_limit_tree __read_mostly;
176
177/*
122 * The memory controller data structure. The memory controller controls both 178 * The memory controller data structure. The memory controller controls both
123 * page cache and RSS per cgroup. We would eventually like to provide 179 * page cache and RSS per cgroup. We would eventually like to provide
124 * statistics based on the statistics developed by Rik Van Riel for clock-pro, 180 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
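For orientation while reading the tree code that follows: each per-zone node is keyed by usage_in_excess, i.e. how far the group's usage is above its soft limit. A minimal sketch of that key, assuming byte-granular counters as in res_counter (the real value comes from res_counter_soft_limit_excess()):

/* Sketch only: res_counter_soft_limit_excess() does this under the counter's
 * own lock; shown here just to make the RB-tree key concrete. */
static unsigned long long soft_limit_excess_sketch(unsigned long long usage,
						   unsigned long long soft_limit)
{
	return usage > soft_limit ? usage - soft_limit : 0;
}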
@@ -154,9 +210,9 @@ struct mem_cgroup {
154 210
155 /* 211 /*
156 * While reclaiming in a hierarchy, we cache the last child we 212 * While reclaiming in a hierarchy, we cache the last child we
157 * reclaimed from. Protected by hierarchy_mutex 213 * reclaimed from.
158 */ 214 */
159 struct mem_cgroup *last_scanned_child; 215 int last_scanned_child;
160 /* 216 /*
161 * Should the accounting and control be hierarchical, per subtree? 217 * Should the accounting and control be hierarchical, per subtree?
162 */ 218 */
@@ -166,18 +222,29 @@ struct mem_cgroup {
166 222
167 unsigned int swappiness; 223 unsigned int swappiness;
168 224
225 /* set when res.limit == memsw.limit */
226 bool memsw_is_minimum;
227
169 /* 228 /*
170 * statistics. This must be placed at the end of memcg. 229 * statistics. This must be placed at the end of memcg.
171 */ 230 */
172 struct mem_cgroup_stat stat; 231 struct mem_cgroup_stat stat;
173}; 232};
174 233
234/*
235 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
236 * limit reclaim to prevent infinite loops, if they ever occur.
237 */
238#define MEM_CGROUP_MAX_RECLAIM_LOOPS (100)
239#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2)
240
175enum charge_type { 241enum charge_type {
176 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 242 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
177 MEM_CGROUP_CHARGE_TYPE_MAPPED, 243 MEM_CGROUP_CHARGE_TYPE_MAPPED,
178 MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */ 244 MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */
179 MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ 245 MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */
180 MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ 246 MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
247 MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */
181 NR_CHARGE_TYPE, 248 NR_CHARGE_TYPE,
182}; 249};
183 250
@@ -185,13 +252,8 @@ enum charge_type {
185#define PCGF_CACHE (1UL << PCG_CACHE) 252#define PCGF_CACHE (1UL << PCG_CACHE)
186#define PCGF_USED (1UL << PCG_USED) 253#define PCGF_USED (1UL << PCG_USED)
187#define PCGF_LOCK (1UL << PCG_LOCK) 254#define PCGF_LOCK (1UL << PCG_LOCK)
188static const unsigned long 255/* Not used, but added here for completeness */
189pcg_default_flags[NR_CHARGE_TYPE] = { 256#define PCGF_ACCT (1UL << PCG_ACCT)
190 PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* File Cache */
191 PCGF_USED | PCGF_LOCK, /* Anon */
192 PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
193 0, /* FORCE */
194};
195 257
196/* for encoding cft->private value on file */ 258/* for encoding cft->private value on file */
197#define _MEM (0) 259#define _MEM (0)
@@ -200,15 +262,237 @@ pcg_default_flags[NR_CHARGE_TYPE] = {
200#define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) 262#define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff)
201#define MEMFILE_ATTR(val) ((val) & 0xffff) 263#define MEMFILE_ATTR(val) ((val) & 0xffff)
202 264
265/*
266 * Reclaim flags for mem_cgroup_hierarchical_reclaim
267 */
268#define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0
269#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
270#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1
271#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
272#define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2
273#define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
274
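These bits travel in the reclaim_options argument of mem_cgroup_hierarchical_reclaim() further down in this patch; the sketch below mirrors how that function decodes them and is not an additional call site.

/* Sketch of the flag plumbing, matching the decode in
 * mem_cgroup_hierarchical_reclaim() below. */
static void reclaim_options_sketch(unsigned long reclaim_options)
{
	bool noswap     = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
	bool shrink     = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
	bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;

	/* e.g. soft-limit reclaim passes MEM_CGROUP_RECLAIM_SOFT only,
	 * so check_soft is true while noswap and shrink stay false. */
	(void)noswap; (void)shrink; (void)check_soft;
}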
203static void mem_cgroup_get(struct mem_cgroup *mem); 275static void mem_cgroup_get(struct mem_cgroup *mem);
204static void mem_cgroup_put(struct mem_cgroup *mem); 276static void mem_cgroup_put(struct mem_cgroup *mem);
205static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); 277static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
206 278
279static struct mem_cgroup_per_zone *
280mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
281{
282 return &mem->info.nodeinfo[nid]->zoneinfo[zid];
283}
284
285static struct mem_cgroup_per_zone *
286page_cgroup_zoneinfo(struct page_cgroup *pc)
287{
288 struct mem_cgroup *mem = pc->mem_cgroup;
289 int nid = page_cgroup_nid(pc);
290 int zid = page_cgroup_zid(pc);
291
292 if (!mem)
293 return NULL;
294
295 return mem_cgroup_zoneinfo(mem, nid, zid);
296}
297
298static struct mem_cgroup_tree_per_zone *
299soft_limit_tree_node_zone(int nid, int zid)
300{
301 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
302}
303
304static struct mem_cgroup_tree_per_zone *
305soft_limit_tree_from_page(struct page *page)
306{
307 int nid = page_to_nid(page);
308 int zid = page_zonenum(page);
309
310 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
311}
312
313static void
314__mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
315 struct mem_cgroup_per_zone *mz,
316 struct mem_cgroup_tree_per_zone *mctz,
317 unsigned long long new_usage_in_excess)
318{
319 struct rb_node **p = &mctz->rb_root.rb_node;
320 struct rb_node *parent = NULL;
321 struct mem_cgroup_per_zone *mz_node;
322
323 if (mz->on_tree)
324 return;
325
326 mz->usage_in_excess = new_usage_in_excess;
327 if (!mz->usage_in_excess)
328 return;
329 while (*p) {
330 parent = *p;
331 mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
332 tree_node);
333 if (mz->usage_in_excess < mz_node->usage_in_excess)
334 p = &(*p)->rb_left;
335 /*
336 * We can't avoid mem cgroups that are over their soft
337 * limit by the same amount
338 */
339 else if (mz->usage_in_excess >= mz_node->usage_in_excess)
340 p = &(*p)->rb_right;
341 }
342 rb_link_node(&mz->tree_node, parent, p);
343 rb_insert_color(&mz->tree_node, &mctz->rb_root);
344 mz->on_tree = true;
345}
346
347static void
348__mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
349 struct mem_cgroup_per_zone *mz,
350 struct mem_cgroup_tree_per_zone *mctz)
351{
352 if (!mz->on_tree)
353 return;
354 rb_erase(&mz->tree_node, &mctz->rb_root);
355 mz->on_tree = false;
356}
357
358static void
359mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
360 struct mem_cgroup_per_zone *mz,
361 struct mem_cgroup_tree_per_zone *mctz)
362{
363 spin_lock(&mctz->lock);
364 __mem_cgroup_remove_exceeded(mem, mz, mctz);
365 spin_unlock(&mctz->lock);
366}
367
368static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem)
369{
370 bool ret = false;
371 int cpu;
372 s64 val;
373 struct mem_cgroup_stat_cpu *cpustat;
374
375 cpu = get_cpu();
376 cpustat = &mem->stat.cpustat[cpu];
377 val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_EVENTS);
378 if (unlikely(val > SOFTLIMIT_EVENTS_THRESH)) {
379 __mem_cgroup_stat_reset_safe(cpustat, MEM_CGROUP_STAT_EVENTS);
380 ret = true;
381 }
382 put_cpu();
383 return ret;
384}
385
386static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
387{
388 unsigned long long excess;
389 struct mem_cgroup_per_zone *mz;
390 struct mem_cgroup_tree_per_zone *mctz;
391 int nid = page_to_nid(page);
392 int zid = page_zonenum(page);
393 mctz = soft_limit_tree_from_page(page);
394
395 /*
396 * Necessary to update all ancestors when hierarchy is used,
397 * because their event counter is not touched.
398 */
399 for (; mem; mem = parent_mem_cgroup(mem)) {
400 mz = mem_cgroup_zoneinfo(mem, nid, zid);
401 excess = res_counter_soft_limit_excess(&mem->res);
402 /*
403 * We have to update the tree if mz is on RB-tree or
404 * mem is over its softlimit.
405 */
406 if (excess || mz->on_tree) {
407 spin_lock(&mctz->lock);
408 /* if on-tree, remove it */
409 if (mz->on_tree)
410 __mem_cgroup_remove_exceeded(mem, mz, mctz);
411 /*
412 * Insert again. mz->usage_in_excess will be updated.
413 * If excess is 0, no tree ops.
414 */
415 __mem_cgroup_insert_exceeded(mem, mz, mctz, excess);
416 spin_unlock(&mctz->lock);
417 }
418 }
419}
420
421static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
422{
423 int node, zone;
424 struct mem_cgroup_per_zone *mz;
425 struct mem_cgroup_tree_per_zone *mctz;
426
427 for_each_node_state(node, N_POSSIBLE) {
428 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
429 mz = mem_cgroup_zoneinfo(mem, node, zone);
430 mctz = soft_limit_tree_node_zone(node, zone);
431 mem_cgroup_remove_exceeded(mem, mz, mctz);
432 }
433 }
434}
435
436static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem)
437{
438 return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT;
439}
440
441static struct mem_cgroup_per_zone *
442__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
443{
444 struct rb_node *rightmost = NULL;
445 struct mem_cgroup_per_zone *mz;
446
447retry:
448 mz = NULL;
449 rightmost = rb_last(&mctz->rb_root);
450 if (!rightmost)
451 goto done; /* Nothing to reclaim from */
452
453 mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
454 /*
455 * Remove the node now but someone else can add it back,
456 * we will add it back at the end of reclaim to its correct
457 * position in the tree.
458 */
459 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
460 if (!res_counter_soft_limit_excess(&mz->mem->res) ||
461 !css_tryget(&mz->mem->css))
462 goto retry;
463done:
464 return mz;
465}
466
467static struct mem_cgroup_per_zone *
468mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
469{
470 struct mem_cgroup_per_zone *mz;
471
472 spin_lock(&mctz->lock);
473 mz = __mem_cgroup_largest_soft_limit_node(mctz);
474 spin_unlock(&mctz->lock);
475 return mz;
476}
477
478static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
479 bool charge)
480{
481 int val = (charge) ? 1 : -1;
482 struct mem_cgroup_stat *stat = &mem->stat;
483 struct mem_cgroup_stat_cpu *cpustat;
484 int cpu = get_cpu();
485
486 cpustat = &stat->cpustat[cpu];
487 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SWAPOUT, val);
488 put_cpu();
489}
490
207static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, 491static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
208 struct page_cgroup *pc, 492 struct page_cgroup *pc,
209 bool charge) 493 bool charge)
210{ 494{
211 int val = (charge)? 1 : -1; 495 int val = (charge) ? 1 : -1;
212 struct mem_cgroup_stat *stat = &mem->stat; 496 struct mem_cgroup_stat *stat = &mem->stat;
213 struct mem_cgroup_stat_cpu *cpustat; 497 struct mem_cgroup_stat_cpu *cpustat;
214 int cpu = get_cpu(); 498 int cpu = get_cpu();
@@ -225,29 +509,11 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
225 else 509 else
226 __mem_cgroup_stat_add_safe(cpustat, 510 __mem_cgroup_stat_add_safe(cpustat,
227 MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); 511 MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
512 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_EVENTS, 1);
228 put_cpu(); 513 put_cpu();
229} 514}
230 515
231static struct mem_cgroup_per_zone * 516static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
232mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
233{
234 return &mem->info.nodeinfo[nid]->zoneinfo[zid];
235}
236
237static struct mem_cgroup_per_zone *
238page_cgroup_zoneinfo(struct page_cgroup *pc)
239{
240 struct mem_cgroup *mem = pc->mem_cgroup;
241 int nid = page_cgroup_nid(pc);
242 int zid = page_cgroup_zid(pc);
243
244 if (!mem)
245 return NULL;
246
247 return mem_cgroup_zoneinfo(mem, nid, zid);
248}
249
250static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem,
251 enum lru_list idx) 517 enum lru_list idx)
252{ 518{
253 int nid, zid; 519 int nid, zid;
@@ -286,6 +552,9 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
286static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) 552static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
287{ 553{
288 struct mem_cgroup *mem = NULL; 554 struct mem_cgroup *mem = NULL;
555
556 if (!mm)
557 return NULL;
289 /* 558 /*
290 * Because we have no locks, mm->owner's may be being moved to other 559 * Because we have no locks, mm->owner's may be being moved to other
291 * cgroup. We use css_tryget() here even if this looks 560 * cgroup. We use css_tryget() here even if this looks
@@ -301,11 +570,44 @@ static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
301 return mem; 570 return mem;
302} 571}
303 572
304static bool mem_cgroup_is_obsolete(struct mem_cgroup *mem) 573/*
574 * Call the callback function against all cgroups under the hierarchy tree.
575 */
576static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
577 int (*func)(struct mem_cgroup *, void *))
305{ 578{
306 if (!mem) 579 int found, ret, nextid;
307 return true; 580 struct cgroup_subsys_state *css;
308 return css_is_removed(&mem->css); 581 struct mem_cgroup *mem;
582
583 if (!root->use_hierarchy)
584 return (*func)(root, data);
585
586 nextid = 1;
587 do {
588 ret = 0;
589 mem = NULL;
590
591 rcu_read_lock();
592 css = css_get_next(&mem_cgroup_subsys, nextid, &root->css,
593 &found);
594 if (css && css_tryget(css))
595 mem = container_of(css, struct mem_cgroup, css);
596 rcu_read_unlock();
597
598 if (mem) {
599 ret = (*func)(mem, data);
600 css_put(&mem->css);
601 }
602 nextid = found + 1;
603 } while (!ret && css);
604
605 return ret;
606}
607
608static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
609{
610 return (mem == root_mem_cgroup);
309} 611}
310 612
311/* 613/*
@@ -325,22 +627,24 @@ static bool mem_cgroup_is_obsolete(struct mem_cgroup *mem)
325void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) 627void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
326{ 628{
327 struct page_cgroup *pc; 629 struct page_cgroup *pc;
328 struct mem_cgroup *mem;
329 struct mem_cgroup_per_zone *mz; 630 struct mem_cgroup_per_zone *mz;
330 631
331 if (mem_cgroup_disabled()) 632 if (mem_cgroup_disabled())
332 return; 633 return;
333 pc = lookup_page_cgroup(page); 634 pc = lookup_page_cgroup(page);
334 /* can happen while we handle swapcache. */ 635 /* can happen while we handle swapcache. */
335 if (list_empty(&pc->lru) || !pc->mem_cgroup) 636 if (!TestClearPageCgroupAcctLRU(pc))
336 return; 637 return;
638 VM_BUG_ON(!pc->mem_cgroup);
337 /* 639 /*
338 * We don't check PCG_USED bit. It's cleared when the "page" is finally 640 * We don't check PCG_USED bit. It's cleared when the "page" is finally
339 * removed from global LRU. 641 * removed from global LRU.
340 */ 642 */
341 mz = page_cgroup_zoneinfo(pc); 643 mz = page_cgroup_zoneinfo(pc);
342 mem = pc->mem_cgroup;
343 MEM_CGROUP_ZSTAT(mz, lru) -= 1; 644 MEM_CGROUP_ZSTAT(mz, lru) -= 1;
645 if (mem_cgroup_is_root(pc->mem_cgroup))
646 return;
647 VM_BUG_ON(list_empty(&pc->lru));
344 list_del_init(&pc->lru); 648 list_del_init(&pc->lru);
345 return; 649 return;
346} 650}
@@ -364,8 +668,8 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
364 * For making pc->mem_cgroup visible, insert smp_rmb() here. 668 * For making pc->mem_cgroup visible, insert smp_rmb() here.
365 */ 669 */
366 smp_rmb(); 670 smp_rmb();
367 /* unused page is not rotated. */ 671 /* unused or root page is not rotated. */
368 if (!PageCgroupUsed(pc)) 672 if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup))
369 return; 673 return;
370 mz = page_cgroup_zoneinfo(pc); 674 mz = page_cgroup_zoneinfo(pc);
371 list_move(&pc->lru, &mz->lists[lru]); 675 list_move(&pc->lru, &mz->lists[lru]);
@@ -379,6 +683,7 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
379 if (mem_cgroup_disabled()) 683 if (mem_cgroup_disabled())
380 return; 684 return;
381 pc = lookup_page_cgroup(page); 685 pc = lookup_page_cgroup(page);
686 VM_BUG_ON(PageCgroupAcctLRU(pc));
382 /* 687 /*
383 * Used bit is set without atomic ops but after smp_wmb(). 688 * Used bit is set without atomic ops but after smp_wmb().
384 * For making pc->mem_cgroup visible, insert smp_rmb() here. 689 * For making pc->mem_cgroup visible, insert smp_rmb() here.
@@ -389,6 +694,9 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
389 694
390 mz = page_cgroup_zoneinfo(pc); 695 mz = page_cgroup_zoneinfo(pc);
391 MEM_CGROUP_ZSTAT(mz, lru) += 1; 696 MEM_CGROUP_ZSTAT(mz, lru) += 1;
697 SetPageCgroupAcctLRU(pc);
698 if (mem_cgroup_is_root(pc->mem_cgroup))
699 return;
392 list_add(&pc->lru, &mz->lists[lru]); 700 list_add(&pc->lru, &mz->lists[lru]);
393} 701}
394 702
@@ -423,7 +731,7 @@ static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page)
423 731
424 spin_lock_irqsave(&zone->lru_lock, flags); 732 spin_lock_irqsave(&zone->lru_lock, flags);
425 /* link when the page is linked to LRU but page_cgroup isn't */ 733 /* link when the page is linked to LRU but page_cgroup isn't */
426 if (PageLRU(page) && list_empty(&pc->lru)) 734 if (PageLRU(page) && !PageCgroupAcctLRU(pc))
427 mem_cgroup_add_lru_list(page, page_lru(page)); 735 mem_cgroup_add_lru_list(page, page_lru(page));
428 spin_unlock_irqrestore(&zone->lru_lock, flags); 736 spin_unlock_irqrestore(&zone->lru_lock, flags);
429} 737}
@@ -441,31 +749,24 @@ void mem_cgroup_move_lists(struct page *page,
441int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) 749int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
442{ 750{
443 int ret; 751 int ret;
752 struct mem_cgroup *curr = NULL;
444 753
445 task_lock(task); 754 task_lock(task);
446 ret = task->mm && mm_match_cgroup(task->mm, mem); 755 rcu_read_lock();
756 curr = try_get_mem_cgroup_from_mm(task->mm);
757 rcu_read_unlock();
447 task_unlock(task); 758 task_unlock(task);
759 if (!curr)
760 return 0;
761 if (curr->use_hierarchy)
762 ret = css_is_ancestor(&curr->css, &mem->css);
763 else
764 ret = (curr == mem);
765 css_put(&curr->css);
448 return ret; 766 return ret;
449} 767}
450 768
451/* 769/*
452 * Calculate mapped_ratio under memory controller. This will be used in
453 * vmscan.c for deteremining we have to reclaim mapped pages.
454 */
455int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
456{
457 long total, rss;
458
459 /*
460 * usage is recorded in bytes. But, here, we assume the number of
461 * physical pages can be represented by "long" on any arch.
462 */
463 total = (long) (mem->res.usage >> PAGE_SHIFT) + 1L;
464 rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
465 return (int)((rss * 100L) / total);
466}
467
468/*
469 * prev_priority control...this will be used in memory reclaim path. 770 * prev_priority control...this will be used in memory reclaim path.
470 */ 771 */
471int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem) 772int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
@@ -501,8 +802,8 @@ static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_
501 unsigned long gb; 802 unsigned long gb;
502 unsigned long inactive_ratio; 803 unsigned long inactive_ratio;
503 804
504 inactive = mem_cgroup_get_all_zonestat(memcg, LRU_INACTIVE_ANON); 805 inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON);
505 active = mem_cgroup_get_all_zonestat(memcg, LRU_ACTIVE_ANON); 806 active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON);
506 807
507 gb = (inactive + active) >> (30 - PAGE_SHIFT); 808 gb = (inactive + active) >> (30 - PAGE_SHIFT);
508 if (gb) 809 if (gb)
@@ -536,6 +837,17 @@ int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
536 return 0; 837 return 0;
537} 838}
538 839
840int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
841{
842 unsigned long active;
843 unsigned long inactive;
844
845 inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE);
846 active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE);
847
848 return (active > inactive);
849}
850
539unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, 851unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
540 struct zone *zone, 852 struct zone *zone,
541 enum lru_list lru) 853 enum lru_list lru)
@@ -598,7 +910,8 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
598 int nid = z->zone_pgdat->node_id; 910 int nid = z->zone_pgdat->node_id;
599 int zid = zone_idx(z); 911 int zid = zone_idx(z);
600 struct mem_cgroup_per_zone *mz; 912 struct mem_cgroup_per_zone *mz;
601 int lru = LRU_FILE * !!file + !!active; 913 int lru = LRU_FILE * file + active;
914 int ret;
602 915
603 BUG_ON(!mem_cont); 916 BUG_ON(!mem_cont);
604 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); 917 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
@@ -616,9 +929,19 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
616 continue; 929 continue;
617 930
618 scan++; 931 scan++;
619 if (__isolate_lru_page(page, mode, file) == 0) { 932 ret = __isolate_lru_page(page, mode, file);
933 switch (ret) {
934 case 0:
620 list_move(&page->lru, dst); 935 list_move(&page->lru, dst);
936 mem_cgroup_del_lru(page);
621 nr_taken++; 937 nr_taken++;
938 break;
939 case -EBUSY:
940 /* we don't affect global LRU but rotate in our LRU */
941 mem_cgroup_rotate_lru_list(page, page_lru(page));
942 break;
943 default:
944 break;
622 } 945 }
623 } 946 }
624 947
@@ -629,172 +952,243 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
629#define mem_cgroup_from_res_counter(counter, member) \ 952#define mem_cgroup_from_res_counter(counter, member) \
630 container_of(counter, struct mem_cgroup, member) 953 container_of(counter, struct mem_cgroup, member)
631 954
632/* 955static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
633 * This routine finds the DFS walk successor. This routine should be
634 * called with hierarchy_mutex held
635 */
636static struct mem_cgroup *
637__mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem)
638{ 956{
639 struct cgroup *cgroup, *curr_cgroup, *root_cgroup; 957 if (do_swap_account) {
640 958 if (res_counter_check_under_limit(&mem->res) &&
641 curr_cgroup = curr->css.cgroup; 959 res_counter_check_under_limit(&mem->memsw))
642 root_cgroup = root_mem->css.cgroup; 960 return true;
961 } else
962 if (res_counter_check_under_limit(&mem->res))
963 return true;
964 return false;
965}
643 966
644 if (!list_empty(&curr_cgroup->children)) { 967static unsigned int get_swappiness(struct mem_cgroup *memcg)
645 /* 968{
646 * Walk down to children 969 struct cgroup *cgrp = memcg->css.cgroup;
647 */ 970 unsigned int swappiness;
648 cgroup = list_entry(curr_cgroup->children.next,
649 struct cgroup, sibling);
650 curr = mem_cgroup_from_cont(cgroup);
651 goto done;
652 }
653 971
654visit_parent: 972 /* root ? */
655 if (curr_cgroup == root_cgroup) { 973 if (cgrp->parent == NULL)
656 /* caller handles NULL case */ 974 return vm_swappiness;
657 curr = NULL;
658 goto done;
659 }
660 975
661 /* 976 spin_lock(&memcg->reclaim_param_lock);
662 * Goto next sibling 977 swappiness = memcg->swappiness;
663 */ 978 spin_unlock(&memcg->reclaim_param_lock);
664 if (curr_cgroup->sibling.next != &curr_cgroup->parent->children) {
665 cgroup = list_entry(curr_cgroup->sibling.next, struct cgroup,
666 sibling);
667 curr = mem_cgroup_from_cont(cgroup);
668 goto done;
669 }
670 979
671 /* 980 return swappiness;
672 * Go up to next parent and next parent's sibling if need be 981}
673 */
674 curr_cgroup = curr_cgroup->parent;
675 goto visit_parent;
676 982
677done: 983static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
678 return curr; 984{
985 int *val = data;
986 (*val)++;
987 return 0;
679} 988}
680 989
681/* 990/**
682 * Visit the first child (need not be the first child as per the ordering 991 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
683 * of the cgroup list, since we track last_scanned_child) of @mem and use 992 * @memcg: The memory cgroup that went over limit
684 * that to reclaim free pages from. 993 * @p: Task that is going to be killed
994 *
995 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
996 * enabled
685 */ 997 */
686static struct mem_cgroup * 998void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
687mem_cgroup_get_next_node(struct mem_cgroup *root_mem)
688{ 999{
689 struct cgroup *cgroup; 1000 struct cgroup *task_cgrp;
690 struct mem_cgroup *orig, *next; 1001 struct cgroup *mem_cgrp;
691 bool obsolete;
692
693 /* 1002 /*
694 * Scan all children under the mem_cgroup mem 1003 * Need a buffer in BSS, can't rely on allocations. The code relies
1004 * on the assumption that OOM is serialized for memory controller.
1005 * If this assumption is broken, revisit this code.
695 */ 1006 */
696 mutex_lock(&mem_cgroup_subsys.hierarchy_mutex); 1007 static char memcg_name[PATH_MAX];
1008 int ret;
1009
1010 if (!memcg)
1011 return;
697 1012
698 orig = root_mem->last_scanned_child;
699 obsolete = mem_cgroup_is_obsolete(orig);
700 1013
701 if (list_empty(&root_mem->css.cgroup->children)) { 1014 rcu_read_lock();
1015
1016 mem_cgrp = memcg->css.cgroup;
1017 task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
1018
1019 ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
1020 if (ret < 0) {
702 /* 1021 /*
703 * root_mem might have children before and last_scanned_child 1022 * Unfortunately, we are unable to convert to a useful name
704 * may point to one of them. We put it later. 1023 * But we'll still print out the usage information
705 */ 1024 */
706 if (orig) 1025 rcu_read_unlock();
707 VM_BUG_ON(!obsolete);
708 next = NULL;
709 goto done; 1026 goto done;
710 } 1027 }
1028 rcu_read_unlock();
711 1029
712 if (!orig || obsolete) { 1030 printk(KERN_INFO "Task in %s killed", memcg_name);
713 cgroup = list_first_entry(&root_mem->css.cgroup->children, 1031
714 struct cgroup, sibling); 1032 rcu_read_lock();
715 next = mem_cgroup_from_cont(cgroup); 1033 ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
716 } else 1034 if (ret < 0) {
717 next = __mem_cgroup_get_next_node(orig, root_mem); 1035 rcu_read_unlock();
1036 goto done;
1037 }
1038 rcu_read_unlock();
718 1039
1040 /*
1041 * Continues from above, so we don't need a KERN_ level
1042 */
1043 printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
719done: 1044done:
720 if (next) 1045
721 mem_cgroup_get(next); 1046 printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
722 root_mem->last_scanned_child = next; 1047 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
723 if (orig) 1048 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
724 mem_cgroup_put(orig); 1049 res_counter_read_u64(&memcg->res, RES_FAILCNT));
725 mutex_unlock(&mem_cgroup_subsys.hierarchy_mutex); 1050 printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
726 return (next) ? next : root_mem; 1051 "failcnt %llu\n",
1052 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
1053 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
1054 res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
727} 1055}
728 1056
729static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem) 1057/*
1058 * This function returns the number of memcgs under the hierarchy tree.
1059 * Returns 1 (self count) if there are no children.
1060 */
1061static int mem_cgroup_count_children(struct mem_cgroup *mem)
730{ 1062{
731 if (do_swap_account) { 1063 int num = 0;
732 if (res_counter_check_under_limit(&mem->res) && 1064 mem_cgroup_walk_tree(mem, &num, mem_cgroup_count_children_cb);
733 res_counter_check_under_limit(&mem->memsw)) 1065 return num;
734 return true;
735 } else
736 if (res_counter_check_under_limit(&mem->res))
737 return true;
738 return false;
739} 1066}
740 1067
741static unsigned int get_swappiness(struct mem_cgroup *memcg) 1068/*
1069 * Visit the first child (need not be the first child as per the ordering
1070 * of the cgroup list, since we track last_scanned_child) of @mem and use
1071 * that to reclaim free pages from.
1072 */
1073static struct mem_cgroup *
1074mem_cgroup_select_victim(struct mem_cgroup *root_mem)
742{ 1075{
743 struct cgroup *cgrp = memcg->css.cgroup; 1076 struct mem_cgroup *ret = NULL;
744 unsigned int swappiness; 1077 struct cgroup_subsys_state *css;
1078 int nextid, found;
745 1079
746 /* root ? */ 1080 if (!root_mem->use_hierarchy) {
747 if (cgrp->parent == NULL) 1081 css_get(&root_mem->css);
748 return vm_swappiness; 1082 ret = root_mem;
1083 }
749 1084
750 spin_lock(&memcg->reclaim_param_lock); 1085 while (!ret) {
751 swappiness = memcg->swappiness; 1086 rcu_read_lock();
752 spin_unlock(&memcg->reclaim_param_lock); 1087 nextid = root_mem->last_scanned_child + 1;
1088 css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css,
1089 &found);
1090 if (css && css_tryget(css))
1091 ret = container_of(css, struct mem_cgroup, css);
1092
1093 rcu_read_unlock();
1094 /* Updates scanning parameter */
1095 spin_lock(&root_mem->reclaim_param_lock);
1096 if (!css) {
1097 /* this means start scan from ID:1 */
1098 root_mem->last_scanned_child = 0;
1099 } else
1100 root_mem->last_scanned_child = found;
1101 spin_unlock(&root_mem->reclaim_param_lock);
1102 }
753 1103
754 return swappiness; 1104 return ret;
755} 1105}
756 1106
757/* 1107/*
758 * Dance down the hierarchy if needed to reclaim memory. We remember the 1108 * Scan the hierarchy if needed to reclaim memory. We remember the last child
759 * last child we reclaimed from, so that we don't end up penalizing 1109 * we reclaimed from, so that we don't end up penalizing one child extensively
760 * one child extensively based on its position in the children list. 1110 * based on its position in the children list.
761 * 1111 *
762 * root_mem is the original ancestor that we've been reclaiming from. 1112 * root_mem is the original ancestor that we've been reclaiming from.
1113 *
1114 * We give up and return to the caller when we visit root_mem twice.
1115 * (other groups can be removed while we're walking....)
1116 *
1117 * If shrink==true, this returns immediately, to avoid freeing too much.
763 */ 1118 */
764static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, 1119static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
765 gfp_t gfp_mask, bool noswap) 1120 struct zone *zone,
766{ 1121 gfp_t gfp_mask,
767 struct mem_cgroup *next_mem; 1122 unsigned long reclaim_options)
768 int ret = 0; 1123{
769 1124 struct mem_cgroup *victim;
770 /* 1125 int ret, total = 0;
771 * Reclaim unconditionally and don't check for return value. 1126 int loop = 0;
772 * We need to reclaim in the current group and down the tree. 1127 bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
773 * One might think about checking for children before reclaiming, 1128 bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
774 * but there might be left over accounting, even after children 1129 bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
775 * have left. 1130 unsigned long excess = mem_cgroup_get_excess(root_mem);
776 */ 1131
777 ret += try_to_free_mem_cgroup_pages(root_mem, gfp_mask, noswap, 1132 /* If memsw_is_minimum==1, swap-out is of-no-use. */
778 get_swappiness(root_mem)); 1133 if (root_mem->memsw_is_minimum)
779 if (mem_cgroup_check_under_limit(root_mem)) 1134 noswap = true;
780 return 1; /* indicate reclaim has succeeded */
781 if (!root_mem->use_hierarchy)
782 return ret;
783 1135
784 next_mem = mem_cgroup_get_next_node(root_mem); 1136 while (1) {
785 1137 victim = mem_cgroup_select_victim(root_mem);
786 while (next_mem != root_mem) { 1138 if (victim == root_mem) {
787 if (mem_cgroup_is_obsolete(next_mem)) { 1139 loop++;
788 next_mem = mem_cgroup_get_next_node(root_mem); 1140 if (loop >= 2) {
1141 /*
1142 * If we have not been able to reclaim
1143 * anything, it might be because there are
1144 * no reclaimable pages under this hierarchy
1145 */
1146 if (!check_soft || !total) {
1147 css_put(&victim->css);
1148 break;
1149 }
1150 /*
1151 * We want to do more targeted reclaim.
1152 * excess >> 2 is not too large, so we don't
1153 * reclaim too much, nor too small, so we don't keep
1154 * coming back to reclaim from this cgroup
1155 */
1156 if (total >= (excess >> 2) ||
1157 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
1158 css_put(&victim->css);
1159 break;
1160 }
1161 }
1162 }
1163 if (!mem_cgroup_local_usage(&victim->stat)) {
1164 /* this cgroup's local usage == 0 */
1165 css_put(&victim->css);
789 continue; 1166 continue;
790 } 1167 }
791 ret += try_to_free_mem_cgroup_pages(next_mem, gfp_mask, noswap, 1168 /* we use swappiness of local cgroup */
792 get_swappiness(next_mem)); 1169 if (check_soft)
793 if (mem_cgroup_check_under_limit(root_mem)) 1170 ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
794 return 1; /* indicate reclaim has succeeded */ 1171 noswap, get_swappiness(victim), zone,
795 next_mem = mem_cgroup_get_next_node(root_mem); 1172 zone->zone_pgdat->node_id);
1173 else
1174 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
1175 noswap, get_swappiness(victim));
1176 css_put(&victim->css);
1177 /*
1178 * When shrinking usage, we can't check whether we should stop here or
1179 * reclaim more; that depends on the caller. last_scanned_child
1180 * is enough to keep fairness under the tree.
1181 */
1182 if (shrink)
1183 return ret;
1184 total += ret;
1185 if (check_soft) {
1186 if (res_counter_check_under_soft_limit(&root_mem->res))
1187 return total;
1188 } else if (mem_cgroup_check_under_limit(root_mem))
1189 return 1 + total;
796 } 1190 }
797 return ret; 1191 return total;
798} 1192}
799 1193
800bool mem_cgroup_oom_called(struct task_struct *task) 1194bool mem_cgroup_oom_called(struct task_struct *task)
@@ -813,13 +1207,64 @@ bool mem_cgroup_oom_called(struct task_struct *task)
813 rcu_read_unlock(); 1207 rcu_read_unlock();
814 return ret; 1208 return ret;
815} 1209}
1210
1211static int record_last_oom_cb(struct mem_cgroup *mem, void *data)
1212{
1213 mem->last_oom_jiffies = jiffies;
1214 return 0;
1215}
1216
1217static void record_last_oom(struct mem_cgroup *mem)
1218{
1219 mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb);
1220}
1221
1222/*
1223 * Currently used to update mapped file statistics, but the routine can be
1224 * generalized to update other statistics as well.
1225 */
1226void mem_cgroup_update_mapped_file_stat(struct page *page, int val)
1227{
1228 struct mem_cgroup *mem;
1229 struct mem_cgroup_stat *stat;
1230 struct mem_cgroup_stat_cpu *cpustat;
1231 int cpu;
1232 struct page_cgroup *pc;
1233
1234 if (!page_is_file_cache(page))
1235 return;
1236
1237 pc = lookup_page_cgroup(page);
1238 if (unlikely(!pc))
1239 return;
1240
1241 lock_page_cgroup(pc);
1242 mem = pc->mem_cgroup;
1243 if (!mem)
1244 goto done;
1245
1246 if (!PageCgroupUsed(pc))
1247 goto done;
1248
1249 /*
1250 * Preemption is already disabled, we don't need get_cpu()
1251 */
1252 cpu = smp_processor_id();
1253 stat = &mem->stat;
1254 cpustat = &stat->cpustat[cpu];
1255
1256 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, val);
1257done:
1258 unlock_page_cgroup(pc);
1259}
1260
816/* 1261/*
817 * Unlike exported interface, "oom" parameter is added. if oom==true, 1262 * Unlike exported interface, "oom" parameter is added. if oom==true,
818 * oom-killer can be invoked. 1263 * oom-killer can be invoked.
819 */ 1264 */
820static int __mem_cgroup_try_charge(struct mm_struct *mm, 1265static int __mem_cgroup_try_charge(struct mm_struct *mm,
821 gfp_t gfp_mask, struct mem_cgroup **memcg, 1266 gfp_t gfp_mask, struct mem_cgroup **memcg,
822 bool oom) 1267 bool oom, struct page *page)
823{ 1268{
824 struct mem_cgroup *mem, *mem_over_limit; 1269 struct mem_cgroup *mem, *mem_over_limit;
825 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 1270 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
@@ -847,12 +1292,14 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
847 if (unlikely(!mem)) 1292 if (unlikely(!mem))
848 return 0; 1293 return 0;
849 1294
850 VM_BUG_ON(mem_cgroup_is_obsolete(mem)); 1295 VM_BUG_ON(css_is_removed(&mem->css));
851 1296
852 while (1) { 1297 while (1) {
853 int ret; 1298 int ret = 0;
854 bool noswap = false; 1299 unsigned long flags = 0;
855 1300
1301 if (mem_cgroup_is_root(mem))
1302 goto done;
856 ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res); 1303 ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res);
857 if (likely(!ret)) { 1304 if (likely(!ret)) {
858 if (!do_swap_account) 1305 if (!do_swap_account)
@@ -863,7 +1310,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
863 break; 1310 break;
864 /* mem+swap counter fails */ 1311 /* mem+swap counter fails */
865 res_counter_uncharge(&mem->res, PAGE_SIZE); 1312 res_counter_uncharge(&mem->res, PAGE_SIZE);
866 noswap = true; 1313 flags |= MEM_CGROUP_RECLAIM_NOSWAP;
867 mem_over_limit = mem_cgroup_from_res_counter(fail_res, 1314 mem_over_limit = mem_cgroup_from_res_counter(fail_res,
868 memsw); 1315 memsw);
869 } else 1316 } else
@@ -874,8 +1321,8 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
874 if (!(gfp_mask & __GFP_WAIT)) 1321 if (!(gfp_mask & __GFP_WAIT))
875 goto nomem; 1322 goto nomem;
876 1323
877 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask, 1324 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
878 noswap); 1325 gfp_mask, flags);
879 if (ret) 1326 if (ret)
880 continue; 1327 continue;
881 1328
@@ -895,31 +1342,71 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
895 mutex_lock(&memcg_tasklist); 1342 mutex_lock(&memcg_tasklist);
896 mem_cgroup_out_of_memory(mem_over_limit, gfp_mask); 1343 mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
897 mutex_unlock(&memcg_tasklist); 1344 mutex_unlock(&memcg_tasklist);
898 mem_over_limit->last_oom_jiffies = jiffies; 1345 record_last_oom(mem_over_limit);
899 } 1346 }
900 goto nomem; 1347 goto nomem;
901 } 1348 }
902 } 1349 }
1350 /*
1351 * Insert the ancestor (and the ancestor's ancestors) into the softlimit
1352 * RB-tree if they exceed their softlimit.
1353 */
1354 if (mem_cgroup_soft_limit_check(mem))
1355 mem_cgroup_update_tree(mem, page);
1356done:
903 return 0; 1357 return 0;
904nomem: 1358nomem:
905 css_put(&mem->css); 1359 css_put(&mem->css);
906 return -ENOMEM; 1360 return -ENOMEM;
907} 1361}
908 1362
1363/*
1364 * A helper function to get a mem_cgroup from an ID. Must be called under
1365 * rcu_read_lock(). The caller must check css_is_removed() or similar if
1366 * that is a concern (dropping the refcnt from swap can be called against a
1367 * removed memcg).
1368 */
1369static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
1370{
1371 struct cgroup_subsys_state *css;
1372
1373 /* ID 0 is unused ID */
1374 if (!id)
1375 return NULL;
1376 css = css_lookup(&mem_cgroup_subsys, id);
1377 if (!css)
1378 return NULL;
1379 return container_of(css, struct mem_cgroup, css);
1380}
1381
909static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page) 1382static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page)
910{ 1383{
911 struct mem_cgroup *mem; 1384 struct mem_cgroup *mem;
1385 struct page_cgroup *pc;
1386 unsigned short id;
912 swp_entry_t ent; 1387 swp_entry_t ent;
913 1388
1389 VM_BUG_ON(!PageLocked(page));
1390
914 if (!PageSwapCache(page)) 1391 if (!PageSwapCache(page))
915 return NULL; 1392 return NULL;
916 1393
917 ent.val = page_private(page); 1394 pc = lookup_page_cgroup(page);
918 mem = lookup_swap_cgroup(ent); 1395 lock_page_cgroup(pc);
919 if (!mem) 1396 if (PageCgroupUsed(pc)) {
920 return NULL; 1397 mem = pc->mem_cgroup;
921 if (!css_tryget(&mem->css)) 1398 if (mem && !css_tryget(&mem->css))
922 return NULL; 1399 mem = NULL;
1400 } else {
1401 ent.val = page_private(page);
1402 id = lookup_swap_cgroup(ent);
1403 rcu_read_lock();
1404 mem = mem_cgroup_lookup(id);
1405 if (mem && !css_tryget(&mem->css))
1406 mem = NULL;
1407 rcu_read_unlock();
1408 }
1409 unlock_page_cgroup(pc);
923 return mem; 1410 return mem;
924} 1411}
925 1412
@@ -939,15 +1426,37 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
939 lock_page_cgroup(pc); 1426 lock_page_cgroup(pc);
940 if (unlikely(PageCgroupUsed(pc))) { 1427 if (unlikely(PageCgroupUsed(pc))) {
941 unlock_page_cgroup(pc); 1428 unlock_page_cgroup(pc);
942 res_counter_uncharge(&mem->res, PAGE_SIZE); 1429 if (!mem_cgroup_is_root(mem)) {
943 if (do_swap_account) 1430 res_counter_uncharge(&mem->res, PAGE_SIZE);
944 res_counter_uncharge(&mem->memsw, PAGE_SIZE); 1431 if (do_swap_account)
1432 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1433 }
945 css_put(&mem->css); 1434 css_put(&mem->css);
946 return; 1435 return;
947 } 1436 }
1437
948 pc->mem_cgroup = mem; 1438 pc->mem_cgroup = mem;
1439 /*
1440 * We access a page_cgroup asynchronously without lock_page_cgroup().
1441 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup
1442 * is accessed after testing USED bit. To make pc->mem_cgroup visible
1443 * before USED bit, we need memory barrier here.
1444 * See mem_cgroup_add_lru_list(), etc.
1445 */
949 smp_wmb(); 1446 smp_wmb();
950 pc->flags = pcg_default_flags[ctype]; 1447 switch (ctype) {
1448 case MEM_CGROUP_CHARGE_TYPE_CACHE:
1449 case MEM_CGROUP_CHARGE_TYPE_SHMEM:
1450 SetPageCgroupCache(pc);
1451 SetPageCgroupUsed(pc);
1452 break;
1453 case MEM_CGROUP_CHARGE_TYPE_MAPPED:
1454 ClearPageCgroupCache(pc);
1455 SetPageCgroupUsed(pc);
1456 break;
1457 default:
1458 break;
1459 }
951 1460
952 mem_cgroup_charge_statistics(mem, pc, true); 1461 mem_cgroup_charge_statistics(mem, pc, true);
953 1462
@@ -976,6 +1485,10 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
976 struct mem_cgroup_per_zone *from_mz, *to_mz; 1485 struct mem_cgroup_per_zone *from_mz, *to_mz;
977 int nid, zid; 1486 int nid, zid;
978 int ret = -EBUSY; 1487 int ret = -EBUSY;
1488 struct page *page;
1489 int cpu;
1490 struct mem_cgroup_stat *stat;
1491 struct mem_cgroup_stat_cpu *cpustat;
979 1492
980 VM_BUG_ON(from == to); 1493 VM_BUG_ON(from == to);
981 VM_BUG_ON(PageLRU(pc->page)); 1494 VM_BUG_ON(PageLRU(pc->page));
@@ -994,9 +1507,27 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
994 if (pc->mem_cgroup != from) 1507 if (pc->mem_cgroup != from)
995 goto out; 1508 goto out;
996 1509
997 res_counter_uncharge(&from->res, PAGE_SIZE); 1510 if (!mem_cgroup_is_root(from))
1511 res_counter_uncharge(&from->res, PAGE_SIZE);
998 mem_cgroup_charge_statistics(from, pc, false); 1512 mem_cgroup_charge_statistics(from, pc, false);
999 if (do_swap_account) 1513
1514 page = pc->page;
1515 if (page_is_file_cache(page) && page_mapped(page)) {
1516 cpu = smp_processor_id();
1517 /* Update mapped_file data for mem_cgroup "from" */
1518 stat = &from->stat;
1519 cpustat = &stat->cpustat[cpu];
1520 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE,
1521 -1);
1522
1523 /* Update mapped_file data for mem_cgroup "to" */
1524 stat = &to->stat;
1525 cpustat = &stat->cpustat[cpu];
1526 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE,
1527 1);
1528 }
1529
1530 if (do_swap_account && !mem_cgroup_is_root(from))
1000 res_counter_uncharge(&from->memsw, PAGE_SIZE); 1531 res_counter_uncharge(&from->memsw, PAGE_SIZE);
1001 css_put(&from->css); 1532 css_put(&from->css);
1002 1533
@@ -1006,6 +1537,12 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
1006 ret = 0; 1537 ret = 0;
1007out: 1538out:
1008 unlock_page_cgroup(pc); 1539 unlock_page_cgroup(pc);
1540 /*
1541 * We charge against "to", which may not have any tasks. Then, "to"
1542 * can be under rmdir(). But in the current implementation, the caller of
1543 * this function is just force_empty() and it is guaranteed that
1544 * "to" is never removed. So, we don't check the rmdir status here.
1545 */
1009 return ret; 1546 return ret;
1010} 1547}
1011 1548
@@ -1031,7 +1568,7 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
1031 parent = mem_cgroup_from_cont(pcg); 1568 parent = mem_cgroup_from_cont(pcg);
1032 1569
1033 1570
1034 ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); 1571 ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page);
1035 if (ret || !parent) 1572 if (ret || !parent)
1036 return ret; 1573 return ret;
1037 1574
@@ -1061,9 +1598,11 @@ uncharge:
1061 /* drop extra refcnt by try_charge() */ 1598 /* drop extra refcnt by try_charge() */
1062 css_put(&parent->css); 1599 css_put(&parent->css);
1063 /* uncharge if move fails */ 1600 /* uncharge if move fails */
1064 res_counter_uncharge(&parent->res, PAGE_SIZE); 1601 if (!mem_cgroup_is_root(parent)) {
1065 if (do_swap_account) 1602 res_counter_uncharge(&parent->res, PAGE_SIZE);
1066 res_counter_uncharge(&parent->memsw, PAGE_SIZE); 1603 if (do_swap_account)
1604 res_counter_uncharge(&parent->memsw, PAGE_SIZE);
1605 }
1067 return ret; 1606 return ret;
1068} 1607}
1069 1608
@@ -1088,7 +1627,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
1088 prefetchw(pc); 1627 prefetchw(pc);
1089 1628
1090 mem = memcg; 1629 mem = memcg;
1091 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); 1630 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page);
1092 if (ret || !mem) 1631 if (ret || !mem)
1093 return ret; 1632 return ret;
1094 1633
@@ -1118,6 +1657,10 @@ int mem_cgroup_newpage_charge(struct page *page,
1118 MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL); 1657 MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
1119} 1658}
1120 1659
1660static void
1661__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
1662 enum charge_type ctype);
1663
1121int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, 1664int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
1122 gfp_t gfp_mask) 1665 gfp_t gfp_mask)
1123{ 1666{
@@ -1154,16 +1697,6 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
1154 unlock_page_cgroup(pc); 1697 unlock_page_cgroup(pc);
1155 } 1698 }
1156 1699
1157 if (do_swap_account && PageSwapCache(page)) {
1158 mem = try_get_mem_cgroup_from_swapcache(page);
1159 if (mem)
1160 mm = NULL;
1161 else
1162 mem = NULL;
1163 /* SwapCache may be still linked to LRU now. */
1164 mem_cgroup_lru_del_before_commit_swapcache(page);
1165 }
1166
1167 if (unlikely(!mm && !mem)) 1700 if (unlikely(!mm && !mem))
1168 mm = &init_mm; 1701 mm = &init_mm;
1169 1702
@@ -1171,22 +1704,16 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
1171 return mem_cgroup_charge_common(page, mm, gfp_mask, 1704 return mem_cgroup_charge_common(page, mm, gfp_mask,
1172 MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); 1705 MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
1173 1706
1174 ret = mem_cgroup_charge_common(page, mm, gfp_mask, 1707 /* shmem */
1175 MEM_CGROUP_CHARGE_TYPE_SHMEM, mem); 1708 if (PageSwapCache(page)) {
1176 if (mem) 1709 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
1177 css_put(&mem->css); 1710 if (!ret)
1178 if (PageSwapCache(page)) 1711 __mem_cgroup_commit_charge_swapin(page, mem,
1179 mem_cgroup_lru_add_after_commit_swapcache(page); 1712 MEM_CGROUP_CHARGE_TYPE_SHMEM);
1713 } else
1714 ret = mem_cgroup_charge_common(page, mm, gfp_mask,
1715 MEM_CGROUP_CHARGE_TYPE_SHMEM, mem);
1180 1716
1181 if (do_swap_account && !ret && PageSwapCache(page)) {
1182 swp_entry_t ent = {.val = page_private(page)};
1183 /* avoid double counting */
1184 mem = swap_cgroup_record(ent, NULL);
1185 if (mem) {
1186 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1187 mem_cgroup_put(mem);
1188 }
1189 }
1190 return ret; 1717 return ret;
1191} 1718}
1192 1719
@@ -1219,17 +1746,19 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
1219 if (!mem) 1746 if (!mem)
1220 goto charge_cur_mm; 1747 goto charge_cur_mm;
1221 *ptr = mem; 1748 *ptr = mem;
1222 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); 1749 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, page);
1223 /* drop extra refcnt from tryget */ 1750 /* drop extra refcnt from tryget */
1224 css_put(&mem->css); 1751 css_put(&mem->css);
1225 return ret; 1752 return ret;
1226charge_cur_mm: 1753charge_cur_mm:
1227 if (unlikely(!mm)) 1754 if (unlikely(!mm))
1228 mm = &init_mm; 1755 mm = &init_mm;
1229 return __mem_cgroup_try_charge(mm, mask, ptr, true); 1756 return __mem_cgroup_try_charge(mm, mask, ptr, true, page);
1230} 1757}
1231 1758
1232void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) 1759static void
1760__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
1761 enum charge_type ctype)
1233{ 1762{
1234 struct page_cgroup *pc; 1763 struct page_cgroup *pc;
1235 1764
@@ -1237,9 +1766,10 @@ void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
1237 return; 1766 return;
1238 if (!ptr) 1767 if (!ptr)
1239 return; 1768 return;
1769 cgroup_exclude_rmdir(&ptr->css);
1240 pc = lookup_page_cgroup(page); 1770 pc = lookup_page_cgroup(page);
1241 mem_cgroup_lru_del_before_commit_swapcache(page); 1771 mem_cgroup_lru_del_before_commit_swapcache(page);
1242 __mem_cgroup_commit_charge(ptr, pc, MEM_CGROUP_CHARGE_TYPE_MAPPED); 1772 __mem_cgroup_commit_charge(ptr, pc, ctype);
1243 mem_cgroup_lru_add_after_commit_swapcache(page); 1773 mem_cgroup_lru_add_after_commit_swapcache(page);
1244 /* 1774 /*
1245 * Now swap is on-memory. This means this page may be 1775 * Now swap is on-memory. This means this page may be
@@ -1250,16 +1780,36 @@ void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
1250 */ 1780 */
1251 if (do_swap_account && PageSwapCache(page)) { 1781 if (do_swap_account && PageSwapCache(page)) {
1252 swp_entry_t ent = {.val = page_private(page)}; 1782 swp_entry_t ent = {.val = page_private(page)};
1783 unsigned short id;
1253 struct mem_cgroup *memcg; 1784 struct mem_cgroup *memcg;
1254 memcg = swap_cgroup_record(ent, NULL); 1785
1786 id = swap_cgroup_record(ent, 0);
1787 rcu_read_lock();
1788 memcg = mem_cgroup_lookup(id);
1255 if (memcg) { 1789 if (memcg) {
1256 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 1790 /*
1791 * This recorded memcg can be an obsolete one. So, avoid
1792 * calling css_tryget().
1793 */
1794 if (!mem_cgroup_is_root(memcg))
1795 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
1796 mem_cgroup_swap_statistics(memcg, false);
1257 mem_cgroup_put(memcg); 1797 mem_cgroup_put(memcg);
1258 } 1798 }
1259 1799 rcu_read_unlock();
1260 } 1800 }
1261 /* add this page(page_cgroup) to the LRU we want. */ 1801 /*
1802 * At swapin, we may charge an account against a cgroup which has no tasks.
1803 * So, rmdir()->pre_destroy() can be called while we do this charge.
1804 * In that case, we need to call pre_destroy() again. Check it here.
1805 */
1806 cgroup_release_and_wakeup_rmdir(&ptr->css);
1807}
1262 1808
1809void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
1810{
1811 __mem_cgroup_commit_charge_swapin(page, ptr,
1812 MEM_CGROUP_CHARGE_TYPE_MAPPED);
1263} 1813}
1264 1814
1265void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) 1815void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
@@ -1268,9 +1818,11 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
1268 return; 1818 return;
1269 if (!mem) 1819 if (!mem)
1270 return; 1820 return;
1271 res_counter_uncharge(&mem->res, PAGE_SIZE); 1821 if (!mem_cgroup_is_root(mem)) {
1272 if (do_swap_account) 1822 res_counter_uncharge(&mem->res, PAGE_SIZE);
1273 res_counter_uncharge(&mem->memsw, PAGE_SIZE); 1823 if (do_swap_account)
1824 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1825 }
1274 css_put(&mem->css); 1826 css_put(&mem->css);
1275} 1827}
1276 1828
@@ -1307,6 +1859,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
1307 1859
1308 switch (ctype) { 1860 switch (ctype) {
1309 case MEM_CGROUP_CHARGE_TYPE_MAPPED: 1861 case MEM_CGROUP_CHARGE_TYPE_MAPPED:
1862 case MEM_CGROUP_CHARGE_TYPE_DROP:
1310 if (page_mapped(page)) 1863 if (page_mapped(page))
1311 goto unlock_out; 1864 goto unlock_out;
1312 break; 1865 break;
@@ -1321,11 +1874,16 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
1321 break; 1874 break;
1322 } 1875 }
1323 1876
1324 res_counter_uncharge(&mem->res, PAGE_SIZE); 1877 if (!mem_cgroup_is_root(mem)) {
1325 if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) 1878 res_counter_uncharge(&mem->res, PAGE_SIZE);
1326 res_counter_uncharge(&mem->memsw, PAGE_SIZE); 1879 if (do_swap_account &&
1327 1880 (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
1881 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1882 }
1883 if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
1884 mem_cgroup_swap_statistics(mem, true);
1328 mem_cgroup_charge_statistics(mem, pc, false); 1885 mem_cgroup_charge_statistics(mem, pc, false);
1886
1329 ClearPageCgroupUsed(pc); 1887 ClearPageCgroupUsed(pc);
1330 /* 1888 /*
1331 * pc->mem_cgroup is not cleared here. It will be accessed when it's 1889 * pc->mem_cgroup is not cleared here. It will be accessed when it's
@@ -1337,6 +1895,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
1337 mz = page_cgroup_zoneinfo(pc); 1895 mz = page_cgroup_zoneinfo(pc);
1338 unlock_page_cgroup(pc); 1896 unlock_page_cgroup(pc);
1339 1897
1898 if (mem_cgroup_soft_limit_check(mem))
1899 mem_cgroup_update_tree(mem, page);
1340 /* at swapout, this memcg will be accessed to record to swap */ 1900 /* at swapout, this memcg will be accessed to record to swap */
1341 if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 1901 if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
1342 css_put(&mem->css); 1902 css_put(&mem->css);
@@ -1365,24 +1925,31 @@ void mem_cgroup_uncharge_cache_page(struct page *page)
1365 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); 1925 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
1366} 1926}
1367 1927
1928#ifdef CONFIG_SWAP
1368/* 1929/*
1369 * called from __delete_from_swap_cache() and drop "page" account. 1930 * called after __delete_from_swap_cache() and drop "page" account.
1370 * memcg information is recorded to swap_cgroup of "ent" 1931 * memcg information is recorded to swap_cgroup of "ent"
1371 */ 1932 */
1372void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent) 1933void
1934mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
1373{ 1935{
1374 struct mem_cgroup *memcg; 1936 struct mem_cgroup *memcg;
1937 int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
1938
1939 if (!swapout) /* this was a swap cache page but the swap is unused! */
1940 ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
1941
1942 memcg = __mem_cgroup_uncharge_common(page, ctype);
1375 1943
1376 memcg = __mem_cgroup_uncharge_common(page,
1377 MEM_CGROUP_CHARGE_TYPE_SWAPOUT);
1378 /* record memcg information */ 1944 /* record memcg information */
1379 if (do_swap_account && memcg) { 1945 if (do_swap_account && swapout && memcg) {
1380 swap_cgroup_record(ent, memcg); 1946 swap_cgroup_record(ent, css_id(&memcg->css));
1381 mem_cgroup_get(memcg); 1947 mem_cgroup_get(memcg);
1382 } 1948 }
1383 if (memcg) 1949 if (swapout && memcg)
1384 css_put(&memcg->css); 1950 css_put(&memcg->css);
1385} 1951}
1952#endif
1386 1953
1387#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 1954#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
1388/* 1955/*
@@ -1392,15 +1959,25 @@ void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent)
1392void mem_cgroup_uncharge_swap(swp_entry_t ent) 1959void mem_cgroup_uncharge_swap(swp_entry_t ent)
1393{ 1960{
1394 struct mem_cgroup *memcg; 1961 struct mem_cgroup *memcg;
1962 unsigned short id;
1395 1963
1396 if (!do_swap_account) 1964 if (!do_swap_account)
1397 return; 1965 return;
1398 1966
1399 memcg = swap_cgroup_record(ent, NULL); 1967 id = swap_cgroup_record(ent, 0);
1968 rcu_read_lock();
1969 memcg = mem_cgroup_lookup(id);
1400 if (memcg) { 1970 if (memcg) {
1401 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 1971 /*
1972 * We uncharge this because swap is freed.
1973 * This memcg can be an obsolete one. We avoid calling css_tryget().
1974 */
1975 if (!mem_cgroup_is_root(memcg))
1976 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
1977 mem_cgroup_swap_statistics(memcg, false);
1402 mem_cgroup_put(memcg); 1978 mem_cgroup_put(memcg);
1403 } 1979 }
1980 rcu_read_unlock();
1404} 1981}
1405#endif 1982#endif
1406 1983
@@ -1426,7 +2003,8 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
1426 unlock_page_cgroup(pc); 2003 unlock_page_cgroup(pc);
1427 2004
1428 if (mem) { 2005 if (mem) {
1429 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); 2006 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false,
2007 page);
1430 css_put(&mem->css); 2008 css_put(&mem->css);
1431 } 2009 }
1432 *ptr = mem; 2010 *ptr = mem;
@@ -1443,7 +2021,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
1443 2021
1444 if (!mem) 2022 if (!mem)
1445 return; 2023 return;
1446 2024 cgroup_exclude_rmdir(&mem->css);
1447 /* at migration success, oldpage->mapping is NULL. */ 2025 /* at migration success, oldpage->mapping is NULL. */
1448 if (oldpage->mapping) { 2026 if (oldpage->mapping) {
1449 target = oldpage; 2027 target = oldpage;
@@ -1483,39 +2061,37 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
1483 */ 2061 */
1484 if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) 2062 if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
1485 mem_cgroup_uncharge_page(target); 2063 mem_cgroup_uncharge_page(target);
2064 /*
2065 * At migration, we may charge an account against a cgroup which has no tasks.
2066 * So, rmdir()->pre_destroy() can be called while we do this charge.
2067 * In that case, we need to call pre_destroy() again. Check it here.
2068 */
2069 cgroup_release_and_wakeup_rmdir(&mem->css);
1486} 2070}
1487 2071
1488/* 2072/*
1489 * A call to try to shrink memory usage under specified resource controller. 2073 * A call to try to shrink memory usage on charge failure at shmem's swapin.
1490 * This is typically used for page reclaiming for shmem for reducing side 2074 * Calling hierarchical_reclaim is not enough because we should update
1491 * effect of page allocation from shmem, which is used by some mem_cgroup. 2075 * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM.
2076 * Moreover, considering hierarchy, we should reclaim from the mem_over_limit,
2077 * not from the memcg which this page would be charged to.
2078 * try_charge_swapin does all of this work properly.
1492 */ 2079 */
1493int mem_cgroup_shrink_usage(struct page *page, 2080int mem_cgroup_shmem_charge_fallback(struct page *page,
1494 struct mm_struct *mm, 2081 struct mm_struct *mm,
1495 gfp_t gfp_mask) 2082 gfp_t gfp_mask)
1496{ 2083{
1497 struct mem_cgroup *mem = NULL; 2084 struct mem_cgroup *mem = NULL;
1498 int progress = 0; 2085 int ret;
1499 int retry = MEM_CGROUP_RECLAIM_RETRIES;
1500 2086
1501 if (mem_cgroup_disabled()) 2087 if (mem_cgroup_disabled())
1502 return 0; 2088 return 0;
1503 if (page)
1504 mem = try_get_mem_cgroup_from_swapcache(page);
1505 if (!mem && mm)
1506 mem = try_get_mem_cgroup_from_mm(mm);
1507 if (unlikely(!mem))
1508 return 0;
1509 2089
1510 do { 2090 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
1511 progress = mem_cgroup_hierarchical_reclaim(mem, gfp_mask, true); 2091 if (!ret)
1512 progress += mem_cgroup_check_under_limit(mem); 2092 mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */
1513 } while (!progress && --retry);
1514 2093
1515 css_put(&mem->css); 2094 return ret;
1516 if (!retry)
1517 return -ENOMEM;
1518 return 0;
1519} 2095}
1520 2096
1521static DEFINE_MUTEX(set_limit_mutex); 2097static DEFINE_MUTEX(set_limit_mutex);
@@ -1523,11 +2099,21 @@ static DEFINE_MUTEX(set_limit_mutex);
1523static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, 2099static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
1524 unsigned long long val) 2100 unsigned long long val)
1525{ 2101{
1526 2102 int retry_count;
1527 int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
1528 int progress; 2103 int progress;
1529 u64 memswlimit; 2104 u64 memswlimit;
1530 int ret = 0; 2105 int ret = 0;
2106 int children = mem_cgroup_count_children(memcg);
2107 u64 curusage, oldusage;
2108
2109 /*
2110 * For keeping hierarchical_reclaim simple, how long we should retry
2111 * depends on the callers. We set our retry count to be a function
2112 * of the number of children which we should visit in this loop.
2113 */
2114 retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;
2115
2116 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
1531 2117
1532 while (retry_count) { 2118 while (retry_count) {
1533 if (signal_pending(current)) { 2119 if (signal_pending(current)) {
@@ -1547,29 +2133,42 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
1547 break; 2133 break;
1548 } 2134 }
1549 ret = res_counter_set_limit(&memcg->res, val); 2135 ret = res_counter_set_limit(&memcg->res, val);
2136 if (!ret) {
2137 if (memswlimit == val)
2138 memcg->memsw_is_minimum = true;
2139 else
2140 memcg->memsw_is_minimum = false;
2141 }
1550 mutex_unlock(&set_limit_mutex); 2142 mutex_unlock(&set_limit_mutex);
1551 2143
1552 if (!ret) 2144 if (!ret)
1553 break; 2145 break;
1554 2146
1555 progress = mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, 2147 progress = mem_cgroup_hierarchical_reclaim(memcg, NULL,
1556 false); 2148 GFP_KERNEL,
1557 if (!progress) retry_count--; 2149 MEM_CGROUP_RECLAIM_SHRINK);
2150 curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
2151 /* Usage is reduced ? */
2152 if (curusage >= oldusage)
2153 retry_count--;
2154 else
2155 oldusage = curusage;
1558 } 2156 }
1559 2157
1560 return ret; 2158 return ret;
1561} 2159}
1562 2160
1563int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, 2161static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
1564 unsigned long long val) 2162 unsigned long long val)
1565{ 2163{
1566 int retry_count = MEM_CGROUP_RECLAIM_RETRIES; 2164 int retry_count;
1567 u64 memlimit, oldusage, curusage; 2165 u64 memlimit, oldusage, curusage;
1568 int ret; 2166 int children = mem_cgroup_count_children(memcg);
1569 2167 int ret = -EBUSY;
1570 if (!do_swap_account)
1571 return -EINVAL;
1572 2168
2169 /* see mem_cgroup_resize_limit() */
2170 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
2171 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
1573 while (retry_count) { 2172 while (retry_count) {
1574 if (signal_pending(current)) { 2173 if (signal_pending(current)) {
1575 ret = -EINTR; 2174 ret = -EINTR;
@@ -1588,20 +2187,121 @@ int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
1588 break; 2187 break;
1589 } 2188 }
1590 ret = res_counter_set_limit(&memcg->memsw, val); 2189 ret = res_counter_set_limit(&memcg->memsw, val);
2190 if (!ret) {
2191 if (memlimit == val)
2192 memcg->memsw_is_minimum = true;
2193 else
2194 memcg->memsw_is_minimum = false;
2195 }
1591 mutex_unlock(&set_limit_mutex); 2196 mutex_unlock(&set_limit_mutex);
1592 2197
1593 if (!ret) 2198 if (!ret)
1594 break; 2199 break;
1595 2200
1596 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 2201 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
1597 mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, true); 2202 MEM_CGROUP_RECLAIM_NOSWAP |
2203 MEM_CGROUP_RECLAIM_SHRINK);
1598 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 2204 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
2205 /* Usage is reduced ? */
1599 if (curusage >= oldusage) 2206 if (curusage >= oldusage)
1600 retry_count--; 2207 retry_count--;
2208 else
2209 oldusage = curusage;
1601 } 2210 }
1602 return ret; 2211 return ret;
1603} 2212}
1604 2213
2214unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
2215 gfp_t gfp_mask, int nid,
2216 int zid)
2217{
2218 unsigned long nr_reclaimed = 0;
2219 struct mem_cgroup_per_zone *mz, *next_mz = NULL;
2220 unsigned long reclaimed;
2221 int loop = 0;
2222 struct mem_cgroup_tree_per_zone *mctz;
2223 unsigned long long excess;
2224
2225 if (order > 0)
2226 return 0;
2227
2228 mctz = soft_limit_tree_node_zone(nid, zid);
2229 /*
2230 * This loop can run for a while, especially if mem_cgroups continuously
2231 * keep exceeding their soft limit and putting the system under
2232 * pressure
2233 */
2234 do {
2235 if (next_mz)
2236 mz = next_mz;
2237 else
2238 mz = mem_cgroup_largest_soft_limit_node(mctz);
2239 if (!mz)
2240 break;
2241
2242 reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone,
2243 gfp_mask,
2244 MEM_CGROUP_RECLAIM_SOFT);
2245 nr_reclaimed += reclaimed;
2246 spin_lock(&mctz->lock);
2247
2248 /*
2249 * If we failed to reclaim anything from this memory cgroup
2250 * it is time to move on to the next cgroup
2251 */
2252 next_mz = NULL;
2253 if (!reclaimed) {
2254 do {
2255 /*
2256 * Loop until we find yet another one.
2257 *
2258 * By the time we get the soft_limit lock
2259 * again, someone might have added the
2260 * group back on the RB tree. Iterate to
2261 * make sure we get a different mem.
2262 * mem_cgroup_largest_soft_limit_node returns
2263 * NULL if no other cgroup is present on
2264 * the tree
2265 */
2266 next_mz =
2267 __mem_cgroup_largest_soft_limit_node(mctz);
2268 if (next_mz == mz) {
2269 css_put(&next_mz->mem->css);
2270 next_mz = NULL;
2271 } else /* next_mz == NULL or other memcg */
2272 break;
2273 } while (1);
2274 }
2275 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
2276 excess = res_counter_soft_limit_excess(&mz->mem->res);
2277 /*
2278 * One school of thought says that we should not add
2279 * back the node to the tree if reclaim returns 0.
2280 * But our reclaim could return 0 simply because, due
2281 * to priority, we are exposing a smaller subset of
2282 * memory to reclaim from. Consider this as a longer
2283 * term TODO.
2284 */
2285 /* If excess == 0, no tree ops */
2286 __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess);
2287 spin_unlock(&mctz->lock);
2288 css_put(&mz->mem->css);
2289 loop++;
2290 /*
2291 * Could not reclaim anything and there are no more
2292 * mem cgroups to try or we seem to be looping without
2293 * reclaiming anything.
2294 */
2295 if (!nr_reclaimed &&
2296 (next_mz == NULL ||
2297 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
2298 break;
2299 } while (!nr_reclaimed);
2300 if (next_mz)
2301 css_put(&next_mz->mem->css);
2302 return nr_reclaimed;
2303}
2304
1605/* 2305/*
1606 * This routine traverse page_cgroup in given list and drop them all. 2306 * This routine traverse page_cgroup in given list and drop them all.
1607 * *And* this routine doesn't reclaim page itself, just removes page_cgroup. 2307 * *And* this routine doesn't reclaim page itself, just removes page_cgroup.
@@ -1730,7 +2430,7 @@ try_to_free:
1730 if (!progress) { 2430 if (!progress) {
1731 nr_retries--; 2431 nr_retries--;
1732 /* maybe some writeback is necessary */ 2432 /* maybe some writeback is necessary */
1733 congestion_wait(WRITE, HZ/10); 2433 congestion_wait(BLK_RW_ASYNC, HZ/10);
1734 } 2434 }
1735 2435
1736 } 2436 }
@@ -1786,20 +2486,63 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
1786 return retval; 2486 return retval;
1787} 2487}
1788 2488
2489struct mem_cgroup_idx_data {
2490 s64 val;
2491 enum mem_cgroup_stat_index idx;
2492};
2493
2494static int
2495mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data)
2496{
2497 struct mem_cgroup_idx_data *d = data;
2498 d->val += mem_cgroup_read_stat(&mem->stat, d->idx);
2499 return 0;
2500}
2501
2502static void
2503mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem,
2504 enum mem_cgroup_stat_index idx, s64 *val)
2505{
2506 struct mem_cgroup_idx_data d;
2507 d.idx = idx;
2508 d.val = 0;
2509 mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat);
2510 *val = d.val;
2511}
2512
1789static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) 2513static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
1790{ 2514{
1791 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 2515 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
1792 u64 val = 0; 2516 u64 idx_val, val;
1793 int type, name; 2517 int type, name;
1794 2518
1795 type = MEMFILE_TYPE(cft->private); 2519 type = MEMFILE_TYPE(cft->private);
1796 name = MEMFILE_ATTR(cft->private); 2520 name = MEMFILE_ATTR(cft->private);
1797 switch (type) { 2521 switch (type) {
1798 case _MEM: 2522 case _MEM:
1799 val = res_counter_read_u64(&mem->res, name); 2523 if (name == RES_USAGE && mem_cgroup_is_root(mem)) {
2524 mem_cgroup_get_recursive_idx_stat(mem,
2525 MEM_CGROUP_STAT_CACHE, &idx_val);
2526 val = idx_val;
2527 mem_cgroup_get_recursive_idx_stat(mem,
2528 MEM_CGROUP_STAT_RSS, &idx_val);
2529 val += idx_val;
2530 val <<= PAGE_SHIFT;
2531 } else
2532 val = res_counter_read_u64(&mem->res, name);
1800 break; 2533 break;
1801 case _MEMSWAP: 2534 case _MEMSWAP:
1802 if (do_swap_account) 2535 if (name == RES_USAGE && mem_cgroup_is_root(mem)) {
2536 mem_cgroup_get_recursive_idx_stat(mem,
2537 MEM_CGROUP_STAT_CACHE, &idx_val);
2538 val = idx_val;
2539 mem_cgroup_get_recursive_idx_stat(mem,
2540 MEM_CGROUP_STAT_RSS, &idx_val);
2541 val += idx_val;
2542 mem_cgroup_get_recursive_idx_stat(mem,
2543 MEM_CGROUP_STAT_SWAPOUT, &idx_val);
2544 val <<= PAGE_SHIFT;
2545 } else
1803 val = res_counter_read_u64(&mem->memsw, name); 2546 val = res_counter_read_u64(&mem->memsw, name);
1804 break; 2547 break;
1805 default: 2548 default:
@@ -1824,6 +2567,10 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
1824 name = MEMFILE_ATTR(cft->private); 2567 name = MEMFILE_ATTR(cft->private);
1825 switch (name) { 2568 switch (name) {
1826 case RES_LIMIT: 2569 case RES_LIMIT:
2570 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
2571 ret = -EINVAL;
2572 break;
2573 }
1827 /* This function does all necessary parse...reuse it */ 2574 /* This function does all necessary parse...reuse it */
1828 ret = res_counter_memparse_write_strategy(buffer, &val); 2575 ret = res_counter_memparse_write_strategy(buffer, &val);
1829 if (ret) 2576 if (ret)
@@ -1833,6 +2580,20 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
1833 else 2580 else
1834 ret = mem_cgroup_resize_memsw_limit(memcg, val); 2581 ret = mem_cgroup_resize_memsw_limit(memcg, val);
1835 break; 2582 break;
2583 case RES_SOFT_LIMIT:
2584 ret = res_counter_memparse_write_strategy(buffer, &val);
2585 if (ret)
2586 break;
2587 /*
2588 * For memsw, soft limits are hard to implement in terms
2589 * of semantics. For now, we support soft limits for
2590 * control without swap.
2591 */
2592 if (type == _MEM)
2593 ret = res_counter_set_soft_limit(&memcg->res, val);
2594 else
2595 ret = -EINVAL;
2596 break;
1836 default: 2597 default:
1837 ret = -EINVAL; /* should be BUG() ? */ 2598 ret = -EINVAL; /* should be BUG() ? */
1838 break; 2599 break;
@@ -1890,57 +2651,107 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
1890 res_counter_reset_failcnt(&mem->memsw); 2651 res_counter_reset_failcnt(&mem->memsw);
1891 break; 2652 break;
1892 } 2653 }
2654
1893 return 0; 2655 return 0;
1894} 2656}
1895 2657
1896static const struct mem_cgroup_stat_desc { 2658
1897 const char *msg; 2659/* For read statistics */
1898 u64 unit; 2660enum {
1899} mem_cgroup_stat_desc[] = { 2661 MCS_CACHE,
1900 [MEM_CGROUP_STAT_CACHE] = { "cache", PAGE_SIZE, }, 2662 MCS_RSS,
1901 [MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, }, 2663 MCS_MAPPED_FILE,
1902 [MEM_CGROUP_STAT_PGPGIN_COUNT] = {"pgpgin", 1, }, 2664 MCS_PGPGIN,
1903 [MEM_CGROUP_STAT_PGPGOUT_COUNT] = {"pgpgout", 1, }, 2665 MCS_PGPGOUT,
2666 MCS_SWAP,
2667 MCS_INACTIVE_ANON,
2668 MCS_ACTIVE_ANON,
2669 MCS_INACTIVE_FILE,
2670 MCS_ACTIVE_FILE,
2671 MCS_UNEVICTABLE,
2672 NR_MCS_STAT,
2673};
2674
2675struct mcs_total_stat {
2676 s64 stat[NR_MCS_STAT];
2677};
2678
2679struct {
2680 char *local_name;
2681 char *total_name;
2682} memcg_stat_strings[NR_MCS_STAT] = {
2683 {"cache", "total_cache"},
2684 {"rss", "total_rss"},
2685 {"mapped_file", "total_mapped_file"},
2686 {"pgpgin", "total_pgpgin"},
2687 {"pgpgout", "total_pgpgout"},
2688 {"swap", "total_swap"},
2689 {"inactive_anon", "total_inactive_anon"},
2690 {"active_anon", "total_active_anon"},
2691 {"inactive_file", "total_inactive_file"},
2692 {"active_file", "total_active_file"},
2693 {"unevictable", "total_unevictable"}
1904}; 2694};
1905 2695
2696
2697static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
2698{
2699 struct mcs_total_stat *s = data;
2700 s64 val;
2701
2702 /* per cpu stat */
2703 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_CACHE);
2704 s->stat[MCS_CACHE] += val * PAGE_SIZE;
2705 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
2706 s->stat[MCS_RSS] += val * PAGE_SIZE;
2707 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_MAPPED_FILE);
2708 s->stat[MCS_MAPPED_FILE] += val * PAGE_SIZE;
2709 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT);
2710 s->stat[MCS_PGPGIN] += val;
2711 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT);
2712 s->stat[MCS_PGPGOUT] += val;
2713 if (do_swap_account) {
2714 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_SWAPOUT);
2715 s->stat[MCS_SWAP] += val * PAGE_SIZE;
2716 }
2717
2718 /* per zone stat */
2719 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON);
2720 s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE;
2721 val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_ANON);
2722 s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE;
2723 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_FILE);
2724 s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE;
2725 val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_FILE);
2726 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
2727 val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE);
2728 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
2729 return 0;
2730}
2731
2732static void
2733mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
2734{
2735 mem_cgroup_walk_tree(mem, s, mem_cgroup_get_local_stat);
2736}
2737
1906static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, 2738static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
1907 struct cgroup_map_cb *cb) 2739 struct cgroup_map_cb *cb)
1908{ 2740{
1909 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); 2741 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
1910 struct mem_cgroup_stat *stat = &mem_cont->stat; 2742 struct mcs_total_stat mystat;
1911 int i; 2743 int i;
1912 2744
1913 for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) { 2745 memset(&mystat, 0, sizeof(mystat));
1914 s64 val; 2746 mem_cgroup_get_local_stat(mem_cont, &mystat);
1915 2747
1916 val = mem_cgroup_read_stat(stat, i); 2748 for (i = 0; i < NR_MCS_STAT; i++) {
1917 val *= mem_cgroup_stat_desc[i].unit; 2749 if (i == MCS_SWAP && !do_swap_account)
1918 cb->fill(cb, mem_cgroup_stat_desc[i].msg, val); 2750 continue;
2751 cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]);
1919 } 2752 }
1920 /* showing # of active pages */
1921 {
1922 unsigned long active_anon, inactive_anon;
1923 unsigned long active_file, inactive_file;
1924 unsigned long unevictable;
1925
1926 inactive_anon = mem_cgroup_get_all_zonestat(mem_cont,
1927 LRU_INACTIVE_ANON);
1928 active_anon = mem_cgroup_get_all_zonestat(mem_cont,
1929 LRU_ACTIVE_ANON);
1930 inactive_file = mem_cgroup_get_all_zonestat(mem_cont,
1931 LRU_INACTIVE_FILE);
1932 active_file = mem_cgroup_get_all_zonestat(mem_cont,
1933 LRU_ACTIVE_FILE);
1934 unevictable = mem_cgroup_get_all_zonestat(mem_cont,
1935 LRU_UNEVICTABLE);
1936
1937 cb->fill(cb, "active_anon", (active_anon) * PAGE_SIZE);
1938 cb->fill(cb, "inactive_anon", (inactive_anon) * PAGE_SIZE);
1939 cb->fill(cb, "active_file", (active_file) * PAGE_SIZE);
1940 cb->fill(cb, "inactive_file", (inactive_file) * PAGE_SIZE);
1941 cb->fill(cb, "unevictable", unevictable * PAGE_SIZE);
1942 2753
1943 } 2754 /* Hierarchical information */
1944 { 2755 {
1945 unsigned long long limit, memsw_limit; 2756 unsigned long long limit, memsw_limit;
1946 memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit); 2757 memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit);
@@ -1949,6 +2760,14 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
1949 cb->fill(cb, "hierarchical_memsw_limit", memsw_limit); 2760 cb->fill(cb, "hierarchical_memsw_limit", memsw_limit);
1950 } 2761 }
1951 2762
2763 memset(&mystat, 0, sizeof(mystat));
2764 mem_cgroup_get_total_stat(mem_cont, &mystat);
2765 for (i = 0; i < NR_MCS_STAT; i++) {
2766 if (i == MCS_SWAP && !do_swap_account)
2767 continue;
2768 cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]);
2769 }
2770
1952#ifdef CONFIG_DEBUG_VM 2771#ifdef CONFIG_DEBUG_VM
1953 cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL)); 2772 cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));
1954 2773
@@ -2040,6 +2859,12 @@ static struct cftype mem_cgroup_files[] = {
2040 .read_u64 = mem_cgroup_read, 2859 .read_u64 = mem_cgroup_read,
2041 }, 2860 },
2042 { 2861 {
2862 .name = "soft_limit_in_bytes",
2863 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
2864 .write_string = mem_cgroup_write,
2865 .read_u64 = mem_cgroup_read,
2866 },
2867 {
2043 .name = "failcnt", 2868 .name = "failcnt",
2044 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 2869 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
2045 .trigger = mem_cgroup_reset, 2870 .trigger = mem_cgroup_reset,
@@ -2133,6 +2958,9 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
2133 mz = &pn->zoneinfo[zone]; 2958 mz = &pn->zoneinfo[zone];
2134 for_each_lru(l) 2959 for_each_lru(l)
2135 INIT_LIST_HEAD(&mz->lists[l]); 2960 INIT_LIST_HEAD(&mz->lists[l]);
2961 mz->usage_in_excess = 0;
2962 mz->on_tree = false;
2963 mz->mem = mem;
2136 } 2964 }
2137 return 0; 2965 return 0;
2138} 2966}
@@ -2178,6 +3006,9 @@ static void __mem_cgroup_free(struct mem_cgroup *mem)
2178{ 3006{
2179 int node; 3007 int node;
2180 3008
3009 mem_cgroup_remove_from_trees(mem);
3010 free_css_id(&mem_cgroup_subsys, &mem->css);
3011
2181 for_each_node_state(node, N_POSSIBLE) 3012 for_each_node_state(node, N_POSSIBLE)
2182 free_mem_cgroup_per_zone_info(mem, node); 3013 free_mem_cgroup_per_zone_info(mem, node);
2183 3014
@@ -2224,23 +3055,54 @@ static void __init enable_swap_cgroup(void)
2224} 3055}
2225#endif 3056#endif
2226 3057
3058static int mem_cgroup_soft_limit_tree_init(void)
3059{
3060 struct mem_cgroup_tree_per_node *rtpn;
3061 struct mem_cgroup_tree_per_zone *rtpz;
3062 int tmp, node, zone;
3063
3064 for_each_node_state(node, N_POSSIBLE) {
3065 tmp = node;
3066 if (!node_state(node, N_NORMAL_MEMORY))
3067 tmp = -1;
3068 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
3069 if (!rtpn)
3070 return 1;
3071
3072 soft_limit_tree.rb_tree_per_node[node] = rtpn;
3073
3074 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
3075 rtpz = &rtpn->rb_tree_per_zone[zone];
3076 rtpz->rb_root = RB_ROOT;
3077 spin_lock_init(&rtpz->lock);
3078 }
3079 }
3080 return 0;
3081}
3082
2227static struct cgroup_subsys_state * __ref 3083static struct cgroup_subsys_state * __ref
2228mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) 3084mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
2229{ 3085{
2230 struct mem_cgroup *mem, *parent; 3086 struct mem_cgroup *mem, *parent;
3087 long error = -ENOMEM;
2231 int node; 3088 int node;
2232 3089
2233 mem = mem_cgroup_alloc(); 3090 mem = mem_cgroup_alloc();
2234 if (!mem) 3091 if (!mem)
2235 return ERR_PTR(-ENOMEM); 3092 return ERR_PTR(error);
2236 3093
2237 for_each_node_state(node, N_POSSIBLE) 3094 for_each_node_state(node, N_POSSIBLE)
2238 if (alloc_mem_cgroup_per_zone_info(mem, node)) 3095 if (alloc_mem_cgroup_per_zone_info(mem, node))
2239 goto free_out; 3096 goto free_out;
3097
2240 /* root ? */ 3098 /* root ? */
2241 if (cont->parent == NULL) { 3099 if (cont->parent == NULL) {
2242 enable_swap_cgroup(); 3100 enable_swap_cgroup();
2243 parent = NULL; 3101 parent = NULL;
3102 root_mem_cgroup = mem;
3103 if (mem_cgroup_soft_limit_tree_init())
3104 goto free_out;
3105
2244 } else { 3106 } else {
2245 parent = mem_cgroup_from_cont(cont->parent); 3107 parent = mem_cgroup_from_cont(cont->parent);
2246 mem->use_hierarchy = parent->use_hierarchy; 3108 mem->use_hierarchy = parent->use_hierarchy;
@@ -2260,7 +3122,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
2260 res_counter_init(&mem->res, NULL); 3122 res_counter_init(&mem->res, NULL);
2261 res_counter_init(&mem->memsw, NULL); 3123 res_counter_init(&mem->memsw, NULL);
2262 } 3124 }
2263 mem->last_scanned_child = NULL; 3125 mem->last_scanned_child = 0;
2264 spin_lock_init(&mem->reclaim_param_lock); 3126 spin_lock_init(&mem->reclaim_param_lock);
2265 3127
2266 if (parent) 3128 if (parent)
@@ -2269,26 +3131,23 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
2269 return &mem->css; 3131 return &mem->css;
2270free_out: 3132free_out:
2271 __mem_cgroup_free(mem); 3133 __mem_cgroup_free(mem);
2272 return ERR_PTR(-ENOMEM); 3134 root_mem_cgroup = NULL;
3135 return ERR_PTR(error);
2273} 3136}
2274 3137
2275static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss, 3138static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
2276 struct cgroup *cont) 3139 struct cgroup *cont)
2277{ 3140{
2278 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 3141 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
2279 mem_cgroup_force_empty(mem, false); 3142
3143 return mem_cgroup_force_empty(mem, false);
2280} 3144}
2281 3145
2282static void mem_cgroup_destroy(struct cgroup_subsys *ss, 3146static void mem_cgroup_destroy(struct cgroup_subsys *ss,
2283 struct cgroup *cont) 3147 struct cgroup *cont)
2284{ 3148{
2285 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 3149 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
2286 struct mem_cgroup *last_scanned_child = mem->last_scanned_child;
2287 3150
2288 if (last_scanned_child) {
2289 VM_BUG_ON(!mem_cgroup_is_obsolete(last_scanned_child));
2290 mem_cgroup_put(last_scanned_child);
2291 }
2292 mem_cgroup_put(mem); 3151 mem_cgroup_put(mem);
2293} 3152}
2294 3153
@@ -2308,7 +3167,8 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss,
2308static void mem_cgroup_move_task(struct cgroup_subsys *ss, 3167static void mem_cgroup_move_task(struct cgroup_subsys *ss,
2309 struct cgroup *cont, 3168 struct cgroup *cont,
2310 struct cgroup *old_cont, 3169 struct cgroup *old_cont,
2311 struct task_struct *p) 3170 struct task_struct *p,
3171 bool threadgroup)
2312{ 3172{
2313 mutex_lock(&memcg_tasklist); 3173 mutex_lock(&memcg_tasklist);
2314 /* 3174 /*
@@ -2327,6 +3187,7 @@ struct cgroup_subsys mem_cgroup_subsys = {
2327 .populate = mem_cgroup_populate, 3187 .populate = mem_cgroup_populate,
2328 .attach = mem_cgroup_move_task, 3188 .attach = mem_cgroup_move_task,
2329 .early_init = 0, 3189 .early_init = 0,
3190 .use_id = 1,
2330}; 3191};
2331 3192
2332#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 3193#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
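
The hunks above introduce mem_cgroup_soft_limit_reclaim() for pushing back cgroups that exceed their soft limit, but its caller sits outside this file. Below is a minimal sketch of how a per-zone reclaim path could drive it; only the function signature comes from the hunk above, while the helper name soft_limit_pass() and the surrounding reclaim context are assumptions for illustration.

/*
 * Sketch only: invoke the soft-limit reclaim hook added above for one zone.
 * mem_cgroup_soft_limit_reclaim() is taken from the diff; the helper name
 * and its call site are assumed.
 */
static unsigned long soft_limit_pass(struct zone *zone, int order,
				     gfp_t gfp_mask)
{
	int nid = zone_to_nid(zone);	/* node this zone belongs to */
	int zid = zone_idx(zone);	/* zone index within that node */

	/* Returns the number of pages reclaimed from over-soft-limit memcgs. */
	return mem_cgroup_soft_limit_reclaim(zone, order, gfp_mask, nid, zid);
}

A kswapd-style caller would typically add the returned count to its own reclaim tally before rechecking the zone watermarks.
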
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
new file mode 100644
index 000000000000..dacc64183874
--- /dev/null
+++ b/mm/memory-failure.c
@@ -0,0 +1,835 @@
1/*
2 * Copyright (C) 2008, 2009 Intel Corporation
3 * Authors: Andi Kleen, Fengguang Wu
4 *
5 * This software may be redistributed and/or modified under the terms of
6 * the GNU General Public License ("GPL") version 2 only as published by the
7 * Free Software Foundation.
8 *
9 * High level machine check handler. Handles pages reported by the
10 * hardware as being corrupted usually due to a 2-bit ECC memory or cache
11 * failure.
12 *
13 * Handles page cache pages in various states. The tricky part
14 * here is that we can access any page asynchronously to other VM
15 * users, because memory failures could happen anytime and anywhere,
16 * possibly violating some of their assumptions. This is why this code
17 * has to be extremely careful. Generally it tries to use normal locking
18 * rules, as in get the standard locks, even if that means the
19 * error handling takes potentially a long time.
20 *
21 * The operation to map back from RMAP chains to processes has to walk
22 * the complete process list and has non-linear complexity with the number
23 * of mappings. In short, it can be quite slow. But since memory corruptions
24 * are rare we hope to get away with this.
25 */
26
27/*
28 * Notebook:
29 * - hugetlb needs more code
30 * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages
31 * - pass bad pages to kdump next kernel
32 */
33#define DEBUG 1 /* remove me in 2.6.34 */
34#include <linux/kernel.h>
35#include <linux/mm.h>
36#include <linux/page-flags.h>
37#include <linux/sched.h>
38#include <linux/ksm.h>
39#include <linux/rmap.h>
40#include <linux/pagemap.h>
41#include <linux/swap.h>
42#include <linux/backing-dev.h>
43#include "internal.h"
44
45int sysctl_memory_failure_early_kill __read_mostly = 0;
46
47int sysctl_memory_failure_recovery __read_mostly = 1;
48
49atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0);
50
51/*
52 * Send all the processes who have the page mapped an ``action optional''
53 * signal.
54 */
55static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
56 unsigned long pfn)
57{
58 struct siginfo si;
59 int ret;
60
61 printk(KERN_ERR
62 "MCE %#lx: Killing %s:%d early due to hardware memory corruption\n",
63 pfn, t->comm, t->pid);
64 si.si_signo = SIGBUS;
65 si.si_errno = 0;
66 si.si_code = BUS_MCEERR_AO;
67 si.si_addr = (void *)addr;
68#ifdef __ARCH_SI_TRAPNO
69 si.si_trapno = trapno;
70#endif
71 si.si_addr_lsb = PAGE_SHIFT;
72 /*
73 * Don't use force here, it's convenient if the signal
74 * can be temporarily blocked.
75 * This could cause a loop when the user sets SIGBUS
76 * to SIG_IGN, but hopefully no one will do that?
77 */
78 ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */
79 if (ret < 0)
80 printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n",
81 t->comm, t->pid, ret);
82 return ret;
83}
84
85/*
86 * Kill all processes that have a poisoned page mapped and then isolate
87 * the page.
88 *
89 * General strategy:
90 * Find all processes having the page mapped and kill them.
91 * But we keep a page reference around so that the page is not
92 * actually freed yet.
93 * Then stash the page away
94 *
95 * There's no convenient way to get back to mapped processes
96 * from the VMAs. So do a brute-force search over all
97 * running processes.
98 *
99 * Remember that machine checks are not common (or rather
100 * if they are common you have other problems), so this shouldn't
101 * be a performance issue.
102 *
103 * Also there are some races possible while we get from the
104 * error detection to actually handle it.
105 */
106
107struct to_kill {
108 struct list_head nd;
109 struct task_struct *tsk;
110 unsigned long addr;
111 unsigned addr_valid:1;
112};
113
114/*
115 * Failure handling: if we can't find or can't kill a process there's
116 * not much we can do. We just print a message and ignore otherwise.
117 */
118
119/*
120 * Schedule a process for later kill.
121 * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
122 * TBD would GFP_NOIO be enough?
123 */
124static void add_to_kill(struct task_struct *tsk, struct page *p,
125 struct vm_area_struct *vma,
126 struct list_head *to_kill,
127 struct to_kill **tkc)
128{
129 struct to_kill *tk;
130
131 if (*tkc) {
132 tk = *tkc;
133 *tkc = NULL;
134 } else {
135 tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
136 if (!tk) {
137 printk(KERN_ERR
138 "MCE: Out of memory while machine check handling\n");
139 return;
140 }
141 }
142 tk->addr = page_address_in_vma(p, vma);
143 tk->addr_valid = 1;
144
145 /*
146 * In theory we don't have to kill when the page was
147 * munmaped. But it could be also a mremap. Since that's
148 * likely very rare kill anyways just out of paranoia, but use
149 * a SIGKILL because the error is not contained anymore.
150 */
151 if (tk->addr == -EFAULT) {
152 pr_debug("MCE: Unable to find user space address %lx in %s\n",
153 page_to_pfn(p), tsk->comm);
154 tk->addr_valid = 0;
155 }
156 get_task_struct(tsk);
157 tk->tsk = tsk;
158 list_add_tail(&tk->nd, to_kill);
159}
160
161/*
162 * Kill the processes that have been collected earlier.
163 *
164 * Only do anything when DOIT is set, otherwise just free the list
165 * (this is used for clean pages which do not need killing)
166 * Also when FAIL is set do a force kill because something went
167 * wrong earlier.
168 */
169static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
170 int fail, unsigned long pfn)
171{
172 struct to_kill *tk, *next;
173
174 list_for_each_entry_safe (tk, next, to_kill, nd) {
175 if (doit) {
176 /*
177 * In case something went wrong with munmapping,
178 * make sure the process doesn't catch the
179 * signal and then access the memory. Just kill it
180 * instead of relying on its signal handlers.
181 */
182 if (fail || tk->addr_valid == 0) {
183 printk(KERN_ERR
184 "MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
185 pfn, tk->tsk->comm, tk->tsk->pid);
186 force_sig(SIGKILL, tk->tsk);
187 }
188
189 /*
190 * In theory the process could have mapped
191 * something else at the address in-between. We could
192 * check for that, but we need to tell the
193 * process anyway.
194 */
195 else if (kill_proc_ao(tk->tsk, tk->addr, trapno,
196 pfn) < 0)
197 printk(KERN_ERR
198 "MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
199 pfn, tk->tsk->comm, tk->tsk->pid);
200 }
201 put_task_struct(tk->tsk);
202 kfree(tk);
203 }
204}
205
206static int task_early_kill(struct task_struct *tsk)
207{
208 if (!tsk->mm)
209 return 0;
210 if (tsk->flags & PF_MCE_PROCESS)
211 return !!(tsk->flags & PF_MCE_EARLY);
212 return sysctl_memory_failure_early_kill;
213}
214
215/*
216 * Collect processes when the error hit an anonymous page.
217 */
218static void collect_procs_anon(struct page *page, struct list_head *to_kill,
219 struct to_kill **tkc)
220{
221 struct vm_area_struct *vma;
222 struct task_struct *tsk;
223 struct anon_vma *av;
224
225 read_lock(&tasklist_lock);
226 av = page_lock_anon_vma(page);
227 if (av == NULL) /* Not actually mapped anymore */
228 goto out;
229 for_each_process (tsk) {
230 if (!task_early_kill(tsk))
231 continue;
232 list_for_each_entry (vma, &av->head, anon_vma_node) {
233 if (!page_mapped_in_vma(page, vma))
234 continue;
235 if (vma->vm_mm == tsk->mm)
236 add_to_kill(tsk, page, vma, to_kill, tkc);
237 }
238 }
239 page_unlock_anon_vma(av);
240out:
241 read_unlock(&tasklist_lock);
242}
243
244/*
245 * Collect processes when the error hit a file mapped page.
246 */
247static void collect_procs_file(struct page *page, struct list_head *to_kill,
248 struct to_kill **tkc)
249{
250 struct vm_area_struct *vma;
251 struct task_struct *tsk;
252 struct prio_tree_iter iter;
253 struct address_space *mapping = page->mapping;
254
255 /*
256 * A note on the locking order between the two locks.
257 * We don't rely on this particular order.
258 * If you have some other code that needs a different order
259 * feel free to switch them around. Or add a reverse link
260 * from mm_struct to task_struct, then this could be all
261 * done without taking tasklist_lock and looping over all tasks.
262 */
263
264 read_lock(&tasklist_lock);
265 spin_lock(&mapping->i_mmap_lock);
266 for_each_process(tsk) {
267 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
268
269 if (!task_early_kill(tsk))
270 continue;
271
272 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff,
273 pgoff) {
274 /*
275 * Send early kill signal to tasks where a vma covers
276 * the page but the corrupted page is not necessarily
277 * mapped in its pte.
278 * Assume applications who requested early kill want
279 * to be informed of all such data corruptions.
280 */
281 if (vma->vm_mm == tsk->mm)
282 add_to_kill(tsk, page, vma, to_kill, tkc);
283 }
284 }
285 spin_unlock(&mapping->i_mmap_lock);
286 read_unlock(&tasklist_lock);
287}
288
289/*
290 * Collect the processes who have the corrupted page mapped to kill.
291 * This is done in two steps for locking reasons.
292 * First preallocate one tokill structure outside the spin locks,
293 * so that we can kill at least one process reasonably reliably.
294 */
295static void collect_procs(struct page *page, struct list_head *tokill)
296{
297 struct to_kill *tk;
298
299 if (!page->mapping)
300 return;
301
302 tk = kmalloc(sizeof(struct to_kill), GFP_NOIO);
303 if (!tk)
304 return;
305 if (PageAnon(page))
306 collect_procs_anon(page, tokill, &tk);
307 else
308 collect_procs_file(page, tokill, &tk);
309 kfree(tk);
310}
311
312/*
313 * Error handlers for various types of pages.
314 */
315
316enum outcome {
317 FAILED, /* Error handling failed */
318 DELAYED, /* Will be handled later */
319 IGNORED, /* Error safely ignored */
320 RECOVERED, /* Successfully recovered */
321};
322
323static const char *action_name[] = {
324 [FAILED] = "Failed",
325 [DELAYED] = "Delayed",
326 [IGNORED] = "Ignored",
327 [RECOVERED] = "Recovered",
328};
329
330/*
331 * Error hit kernel page.
332 * Do nothing, try to be lucky and not touch this instead. For a few cases we
333 * could be more sophisticated.
334 */
335static int me_kernel(struct page *p, unsigned long pfn)
336{
337 return DELAYED;
338}
339
340/*
341 * Already poisoned page.
342 */
343static int me_ignore(struct page *p, unsigned long pfn)
344{
345 return IGNORED;
346}
347
348/*
349 * Page in unknown state. Do nothing.
350 */
351static int me_unknown(struct page *p, unsigned long pfn)
352{
353 printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn);
354 return FAILED;
355}
356
357/*
358 * Free memory
359 */
360static int me_free(struct page *p, unsigned long pfn)
361{
362 return DELAYED;
363}
364
365/*
366 * Clean (or cleaned) page cache page.
367 */
368static int me_pagecache_clean(struct page *p, unsigned long pfn)
369{
370 int err;
371 int ret = FAILED;
372 struct address_space *mapping;
373
374 /*
375 * For anonymous pages we're done; the only reference left
376 * should be the one m_f() holds.
377 */
378 if (PageAnon(p))
379 return RECOVERED;
380
381 /*
382 * Now truncate the page in the page cache. This is really
383 * more like a "temporary hole punch".
384 * Don't do this for block devices when someone else
385 * has a reference, because it could be file system metadata
386 * and that's not safe to truncate.
387 */
388 mapping = page_mapping(p);
389 if (!mapping) {
390 /*
391 * Page has been torn down in the meantime.
392 */
393 return FAILED;
394 }
395
396 /*
397 * Truncation is a bit tricky. Enable it per file system for now.
398 *
399 * Open: to take i_mutex or not for this? Right now we don't.
400 */
401 if (mapping->a_ops->error_remove_page) {
402 err = mapping->a_ops->error_remove_page(mapping, p);
403 if (err != 0) {
404 printk(KERN_INFO "MCE %#lx: Failed to punch page: %d\n",
405 pfn, err);
406 } else if (page_has_private(p) &&
407 !try_to_release_page(p, GFP_NOIO)) {
408 pr_debug("MCE %#lx: failed to release buffers\n", pfn);
409 } else {
410 ret = RECOVERED;
411 }
412 } else {
413 /*
414 * If the file system doesn't support it, just invalidate.
415 * This fails on dirty pages or anything with private pages.
416 */
417 if (invalidate_inode_page(p))
418 ret = RECOVERED;
419 else
420 printk(KERN_INFO "MCE %#lx: Failed to invalidate\n",
421 pfn);
422 }
423 return ret;
424}
425
426/*
427 * Dirty page cache page.
428 * Issues: when the error hit a hole page the error is not properly
429 * propagated.
430 */
431static int me_pagecache_dirty(struct page *p, unsigned long pfn)
432{
433 struct address_space *mapping = page_mapping(p);
434
435 SetPageError(p);
436 /* TBD: print more information about the file. */
437 if (mapping) {
438 /*
439 * IO error will be reported by write(), fsync(), etc.
440 * that check the mapping.
441 * This way the application knows that something went
442 * wrong with its dirty file data.
443 *
444 * There's one open issue:
445 *
446 * The EIO will be only reported on the next IO
447 * operation and then cleared through the IO map.
448 * Normally Linux has two mechanisms to pass IO error
449 * first through the AS_EIO flag in the address space
450 * and then through the PageError flag in the page.
451 * Since we drop pages on memory failure handling the
452 * only mechanism open to use is through AS_EIO.
453 *
454 * This has the disadvantage that it gets cleared on
455 * the first operation that returns an error, while
456 * the PageError bit is more sticky and only cleared
457 * when the page is reread or dropped. If an
458 * application assumes it will always get an error on
459 * fsync, but does other operations on the fd before
460 * and the page is dropped in between, then the error
461 * will not be properly reported.
462 *
463 * This can already happen even without hwpoisoned
464 * pages: first on metadata IO errors (which only
465 * report through AS_EIO) or when the page is dropped
466 * at the wrong time.
467 *
468 * So right now we assume that the application DTRT on
469 * the first EIO, but we're not worse than other parts
470 * of the kernel.
471 */
472 mapping_set_error(mapping, EIO);
473 }
474
475 return me_pagecache_clean(p, pfn);
476}
477
478/*
479 * Clean and dirty swap cache.
480 *
481 * Dirty swap cache page is tricky to handle. The page could live both in page
482 * cache and swap cache (i.e. the page is freshly swapped in). So it could be
483 * referenced concurrently by 2 types of PTEs:
484 * normal PTEs and swap PTEs. We try to handle them consistently by calling
485 * try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs,
486 * and then
487 * - clear dirty bit to prevent IO
488 * - remove from LRU
489 * - but keep in the swap cache, so that when we return to it on
490 * a later page fault, we know the application is accessing
491 * corrupted data and shall be killed (we installed simple
492 * interception code in do_swap_page to catch it).
493 *
494 * Clean swap cache pages can be directly isolated. A later page fault will
495 * bring in the known good data from disk.
496 */
497static int me_swapcache_dirty(struct page *p, unsigned long pfn)
498{
499 ClearPageDirty(p);
500 /* Trigger EIO in shmem: */
501 ClearPageUptodate(p);
502
503 return DELAYED;
504}
505
506static int me_swapcache_clean(struct page *p, unsigned long pfn)
507{
508 delete_from_swap_cache(p);
509
510 return RECOVERED;
511}
512
513/*
514 * Huge pages. Needs work.
515 * Issues:
516 * No rmap support so we cannot find the original mapper. In theory we could walk
517 * all MMs and look for the mappings, but that would be non-atomic and racy.
518 * Need rmap for hugepages for this. Alternatively we could employ a heuristic,
519 * like just walking the current process and hoping it has it mapped (that
520 * should usually be true for the common "shared database cache" case).
521 * Should handle free huge pages and dequeue them too, but this needs to
522 * handle huge page accounting correctly.
523 */
524static int me_huge_page(struct page *p, unsigned long pfn)
525{
526 return FAILED;
527}
528
529/*
530 * Various page states we can handle.
531 *
532 * A page state is defined by its current page->flags bits.
533 * The table matches them in order and calls the right handler.
534 *
535 * This is quite tricky because we can access the page at any time
536 * in its life cycle, so all accesses have to be extremely careful.
537 *
538 * This is not complete. More states could be added.
539 * For any missing state don't attempt recovery.
540 */
541
542#define dirty (1UL << PG_dirty)
543#define sc (1UL << PG_swapcache)
544#define unevict (1UL << PG_unevictable)
545#define mlock (1UL << PG_mlocked)
546#define writeback (1UL << PG_writeback)
547#define lru (1UL << PG_lru)
548#define swapbacked (1UL << PG_swapbacked)
549#define head (1UL << PG_head)
550#define tail (1UL << PG_tail)
551#define compound (1UL << PG_compound)
552#define slab (1UL << PG_slab)
553#define buddy (1UL << PG_buddy)
554#define reserved (1UL << PG_reserved)
555
556static struct page_state {
557 unsigned long mask;
558 unsigned long res;
559 char *msg;
560 int (*action)(struct page *p, unsigned long pfn);
561} error_states[] = {
562 { reserved, reserved, "reserved kernel", me_ignore },
563 { buddy, buddy, "free kernel", me_free },
564
565 /*
566 * Could in theory check if slab page is free or if we can drop
567 * currently unused objects without touching them. But just
568 * treat it as standard kernel for now.
569 */
570 { slab, slab, "kernel slab", me_kernel },
571
572#ifdef CONFIG_PAGEFLAGS_EXTENDED
573 { head, head, "huge", me_huge_page },
574 { tail, tail, "huge", me_huge_page },
575#else
576 { compound, compound, "huge", me_huge_page },
577#endif
578
579 { sc|dirty, sc|dirty, "swapcache", me_swapcache_dirty },
580 { sc|dirty, sc, "swapcache", me_swapcache_clean },
581
582 { unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty},
583 { unevict, unevict, "unevictable LRU", me_pagecache_clean},
584
585#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
586 { mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty },
587 { mlock, mlock, "mlocked LRU", me_pagecache_clean },
588#endif
589
590 { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty },
591 { lru|dirty, lru, "clean LRU", me_pagecache_clean },
592 { swapbacked, swapbacked, "anonymous", me_pagecache_clean },
593
594 /*
595 * Catchall entry: must be at end.
596 */
597 { 0, 0, "unknown page state", me_unknown },
598};
599
600static void action_result(unsigned long pfn, char *msg, int result)
601{
602 struct page *page = NULL;
603 if (pfn_valid(pfn))
604 page = pfn_to_page(pfn);
605
606 printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n",
607 pfn,
608 page && PageDirty(page) ? "dirty " : "",
609 msg, action_name[result]);
610}
611
612static int page_action(struct page_state *ps, struct page *p,
613 unsigned long pfn, int ref)
614{
615 int result;
616 int count;
617
618 result = ps->action(p, pfn);
619 action_result(pfn, ps->msg, result);
620
621 count = page_count(p) - 1 - ref;
622 if (count != 0)
623 printk(KERN_ERR
624 "MCE %#lx: %s page still referenced by %d users\n",
625 pfn, ps->msg, count);
626
627 /* Could do more checks here if page looks ok */
628 /*
629 * Could adjust zone counters here to correct for the missing page.
630 */
631
632 return result == RECOVERED ? 0 : -EBUSY;
633}
634
635#define N_UNMAP_TRIES 5
636
637/*
638 * Do all that is necessary to remove user space mappings. Unmap
639 * the pages and send SIGBUS to the processes if the data was dirty.
640 */
641static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
642 int trapno)
643{
644 enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
645 struct address_space *mapping;
646 LIST_HEAD(tokill);
647 int ret;
648 int i;
649 int kill = 1;
650
651 if (PageReserved(p) || PageCompound(p) || PageSlab(p) || PageKsm(p))
652 return;
653
654 /*
655 * This check implies we don't kill processes if their pages
656 * are in the swap cache early. Those are always late kills.
657 */
658 if (!page_mapped(p))
659 return;
660
661 if (PageSwapCache(p)) {
662 printk(KERN_ERR
663 "MCE %#lx: keeping poisoned page in swap cache\n", pfn);
664 ttu |= TTU_IGNORE_HWPOISON;
665 }
666
667 /*
668 * Propagate the dirty bit from PTEs to struct page first, because we
669 * need this to decide if we should kill or just drop the page.
670 */
671 mapping = page_mapping(p);
672 if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) {
673 if (page_mkclean(p)) {
674 SetPageDirty(p);
675 } else {
676 kill = 0;
677 ttu |= TTU_IGNORE_HWPOISON;
678 printk(KERN_INFO
679 "MCE %#lx: corrupted page was clean: dropped without side effects\n",
680 pfn);
681 }
682 }
683
684 /*
685 * First collect all the processes that have the page
686 * mapped in dirty form. This has to be done before try_to_unmap,
687 * because ttu takes the rmap data structures down.
688 *
689 * Error handling: We ignore errors here because
690 * there's nothing that can be done.
691 */
692 if (kill)
693 collect_procs(p, &tokill);
694
695 /*
696 * try_to_unmap can fail temporarily due to races.
697 * Try a few times (RED-PEN better strategy?)
698 */
699 for (i = 0; i < N_UNMAP_TRIES; i++) {
700 ret = try_to_unmap(p, ttu);
701 if (ret == SWAP_SUCCESS)
702 break;
703 pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret);
704 }
705
706 if (ret != SWAP_SUCCESS)
707 printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
708 pfn, page_mapcount(p));
709
710 /*
711 * Now that the dirty bit has been propagated to the
712 * struct page and all unmaps done we can decide if
713 * killing is needed or not. Only kill when the page
714 * was dirty, otherwise the tokill list is merely
715 * freed. When there was a problem unmapping earlier
716 * use a more forceful, uncatchable kill to prevent
717 * any accesses to the poisoned memory.
718 */
719 kill_procs_ao(&tokill, !!PageDirty(p), trapno,
720 ret != SWAP_SUCCESS, pfn);
721}
722
723int __memory_failure(unsigned long pfn, int trapno, int ref)
724{
725 unsigned long lru_flag;
726 struct page_state *ps;
727 struct page *p;
728 int res;
729
730 if (!sysctl_memory_failure_recovery)
731 panic("Memory failure from trap %d on page %lx", trapno, pfn);
732
733 if (!pfn_valid(pfn)) {
734 action_result(pfn, "memory outside kernel control", IGNORED);
735 return -EIO;
736 }
737
738 p = pfn_to_page(pfn);
739 if (TestSetPageHWPoison(p)) {
740 action_result(pfn, "already hardware poisoned", IGNORED);
741 return 0;
742 }
743
744 atomic_long_add(1, &mce_bad_pages);
745
746 /*
747 * We need/can do nothing about count=0 pages.
748 * 1) it's a free page, and therefore in safe hand:
749 * prep_new_page() will be the gate keeper.
750 * 2) it's part of a non-compound high order page.
751 * Implies some kernel user: cannot stop them from
752 * R/W the page; let's pray that the page has been
753 * used and will be freed some time later.
754 * In fact it's dangerous to directly bump up page count from 0,
755 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
756 */
757 if (!get_page_unless_zero(compound_head(p))) {
758 action_result(pfn, "free or high order kernel", IGNORED);
759 return PageBuddy(compound_head(p)) ? 0 : -EBUSY;
760 }
761
762 /*
763 * We ignore non-LRU pages for good reasons.
764 * - PG_locked is only well defined for LRU pages and a few others
765 * - to avoid races with __set_page_locked()
766 * - to avoid races with __SetPageSlab*() (and more non-atomic ops)
767 * The check (unnecessarily) ignores LRU pages being isolated and
768 * walked by the page reclaim code, however that's not a big loss.
769 */
770 if (!PageLRU(p))
771 lru_add_drain_all();
772 lru_flag = p->flags & lru;
773 if (isolate_lru_page(p)) {
774 action_result(pfn, "non LRU", IGNORED);
775 put_page(p);
776 return -EBUSY;
777 }
778 page_cache_release(p);
779
780 /*
781 * Lock the page and wait for writeback to finish.
782 * It's very difficult to mess with pages currently under IO
783 * and in many cases impossible, so we just avoid it here.
784 */
785 lock_page_nosync(p);
786 wait_on_page_writeback(p);
787
788 /*
789 * Now take care of user space mappings.
790 */
791 hwpoison_user_mappings(p, pfn, trapno);
792
793 /*
794 * Torn down by someone else?
795 */
796 if ((lru_flag & lru) && !PageSwapCache(p) && p->mapping == NULL) {
797 action_result(pfn, "already truncated LRU", IGNORED);
798 res = 0;
799 goto out;
800 }
801
802 res = -EBUSY;
803 for (ps = error_states;; ps++) {
804 if (((p->flags | lru_flag) & ps->mask) == ps->res) {
805 res = page_action(ps, p, pfn, ref);
806 break;
807 }
808 }
809out:
810 unlock_page(p);
811 return res;
812}
813EXPORT_SYMBOL_GPL(__memory_failure);
814
815/**
816 * memory_failure - Handle memory failure of a page.
817 * @pfn: Page Number of the corrupted page
818 * @trapno: Trap number reported in the signal to user space.
819 *
820 * This function is called by the low level machine check code
821 * of an architecture when it detects hardware memory corruption
822 * of a page. It tries its best to recover, which includes
823 * dropping pages, killing processes etc.
824 *
825 * The function is primarily of use for corruptions that
826 * happen outside the current execution context (e.g. when
827 * detected by a background scrubber).
828 *
829 * Must run in process context (e.g. a work queue) with interrupts
830 * enabled and no spinlocks held.
831 */
832void memory_failure(unsigned long pfn, int trapno)
833{
834 __memory_failure(pfn, trapno, 0);
835}
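The memory_failure() entry point above is normally reached from an architecture's machine-check handler, but the same recovery path can be exercised from user space for testing. The fragment below is an illustrative sketch only; it assumes the MADV_HWPOISON madvise hint from the same patch series is available (CAP_SYS_ADMIN and CONFIG_MEMORY_FAILURE required; the value 100 is an assumption, check asm-generic/mman-common.h):

/* Hypothetical test program: poison one of our own dirty anonymous pages
 * and observe the SIGBUS the recovery code delivers on the next access. */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_HWPOISON
#define MADV_HWPOISON 100	/* assumed value */
#endif

int main(void)
{
	long pagesize = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(p, 0xaa, pagesize);		/* dirty the page */

	/* Ask the kernel to treat the page as hardware-corrupted. */
	if (madvise(p, pagesize, MADV_HWPOISON) != 0) {
		perror("madvise(MADV_HWPOISON)");
		return 1;
	}

	/* The pte now carries a hwpoison swap entry; touching the page
	 * again should fault and raise SIGBUS (see do_swap_page below). */
	printf("reading poisoned page...\n");
	return p[0];
}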
diff --git a/mm/memory.c b/mm/memory.c
index cf6873e91c6a..6ab19dd4a199 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -45,6 +45,7 @@
45#include <linux/swap.h> 45#include <linux/swap.h>
46#include <linux/highmem.h> 46#include <linux/highmem.h>
47#include <linux/pagemap.h> 47#include <linux/pagemap.h>
48#include <linux/ksm.h>
48#include <linux/rmap.h> 49#include <linux/rmap.h>
49#include <linux/module.h> 50#include <linux/module.h>
50#include <linux/delayacct.h> 51#include <linux/delayacct.h>
@@ -56,6 +57,7 @@
56#include <linux/swapops.h> 57#include <linux/swapops.h>
57#include <linux/elf.h> 58#include <linux/elf.h>
58 59
60#include <asm/io.h>
59#include <asm/pgalloc.h> 61#include <asm/pgalloc.h>
60#include <asm/uaccess.h> 62#include <asm/uaccess.h>
61#include <asm/tlb.h> 63#include <asm/tlb.h>
@@ -106,6 +108,18 @@ static int __init disable_randmaps(char *s)
106} 108}
107__setup("norandmaps", disable_randmaps); 109__setup("norandmaps", disable_randmaps);
108 110
111unsigned long zero_pfn __read_mostly;
112unsigned long highest_memmap_pfn __read_mostly;
113
114/*
115 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
116 */
117static int __init init_zero_pfn(void)
118{
119 zero_pfn = page_to_pfn(ZERO_PAGE(0));
120 return 0;
121}
122core_initcall(init_zero_pfn);
109 123
110/* 124/*
111 * If a p?d_bad entry is found while walking page tables, report 125 * If a p?d_bad entry is found while walking page tables, report
@@ -135,11 +149,12 @@ void pmd_clear_bad(pmd_t *pmd)
135 * Note: this doesn't free the actual pages themselves. That 149 * Note: this doesn't free the actual pages themselves. That
136 * has been handled earlier when unmapping all the memory regions. 150 * has been handled earlier when unmapping all the memory regions.
137 */ 151 */
138static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd) 152static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
153 unsigned long addr)
139{ 154{
140 pgtable_t token = pmd_pgtable(*pmd); 155 pgtable_t token = pmd_pgtable(*pmd);
141 pmd_clear(pmd); 156 pmd_clear(pmd);
142 pte_free_tlb(tlb, token); 157 pte_free_tlb(tlb, token, addr);
143 tlb->mm->nr_ptes--; 158 tlb->mm->nr_ptes--;
144} 159}
145 160
@@ -157,7 +172,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
157 next = pmd_addr_end(addr, end); 172 next = pmd_addr_end(addr, end);
158 if (pmd_none_or_clear_bad(pmd)) 173 if (pmd_none_or_clear_bad(pmd))
159 continue; 174 continue;
160 free_pte_range(tlb, pmd); 175 free_pte_range(tlb, pmd, addr);
161 } while (pmd++, addr = next, addr != end); 176 } while (pmd++, addr = next, addr != end);
162 177
163 start &= PUD_MASK; 178 start &= PUD_MASK;
@@ -173,7 +188,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
173 188
174 pmd = pmd_offset(pud, start); 189 pmd = pmd_offset(pud, start);
175 pud_clear(pud); 190 pud_clear(pud);
176 pmd_free_tlb(tlb, pmd); 191 pmd_free_tlb(tlb, pmd, start);
177} 192}
178 193
179static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, 194static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
@@ -206,7 +221,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
206 221
207 pud = pud_offset(pgd, start); 222 pud = pud_offset(pgd, start);
208 pgd_clear(pgd); 223 pgd_clear(pgd);
209 pud_free_tlb(tlb, pud); 224 pud_free_tlb(tlb, pud, start);
210} 225}
211 226
212/* 227/*
@@ -282,7 +297,8 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
282 unsigned long addr = vma->vm_start; 297 unsigned long addr = vma->vm_start;
283 298
284 /* 299 /*
285 * Hide vma from rmap and vmtruncate before freeing pgtables 300 * Hide vma from rmap and truncate_pagecache before freeing
301 * pgtables
286 */ 302 */
287 anon_vma_unlink(vma); 303 anon_vma_unlink(vma);
288 unlink_file_vma(vma); 304 unlink_file_vma(vma);
@@ -441,6 +457,20 @@ static inline int is_cow_mapping(unsigned int flags)
441 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 457 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
442} 458}
443 459
460#ifndef is_zero_pfn
461static inline int is_zero_pfn(unsigned long pfn)
462{
463 return pfn == zero_pfn;
464}
465#endif
466
467#ifndef my_zero_pfn
468static inline unsigned long my_zero_pfn(unsigned long addr)
469{
470 return zero_pfn;
471}
472#endif
473
444/* 474/*
445 * vm_normal_page -- This function gets the "struct page" associated with a pte. 475 * vm_normal_page -- This function gets the "struct page" associated with a pte.
446 * 476 *
@@ -496,7 +526,9 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
496 if (HAVE_PTE_SPECIAL) { 526 if (HAVE_PTE_SPECIAL) {
497 if (likely(!pte_special(pte))) 527 if (likely(!pte_special(pte)))
498 goto check_pfn; 528 goto check_pfn;
499 if (!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))) 529 if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
530 return NULL;
531 if (!is_zero_pfn(pfn))
500 print_bad_pte(vma, addr, pte, NULL); 532 print_bad_pte(vma, addr, pte, NULL);
501 return NULL; 533 return NULL;
502 } 534 }
@@ -518,6 +550,8 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
518 } 550 }
519 } 551 }
520 552
553 if (is_zero_pfn(pfn))
554 return NULL;
521check_pfn: 555check_pfn:
522 if (unlikely(pfn > highest_memmap_pfn)) { 556 if (unlikely(pfn > highest_memmap_pfn)) {
523 print_bad_pte(vma, addr, pte, NULL); 557 print_bad_pte(vma, addr, pte, NULL);
@@ -595,8 +629,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
595 page = vm_normal_page(vma, addr, pte); 629 page = vm_normal_page(vma, addr, pte);
596 if (page) { 630 if (page) {
597 get_page(page); 631 get_page(page);
598 page_dup_rmap(page, vma, addr); 632 page_dup_rmap(page);
599 rss[!!PageAnon(page)]++; 633 rss[PageAnon(page)]++;
600 } 634 }
601 635
602out_set_pte: 636out_set_pte:
@@ -607,6 +641,7 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
607 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, 641 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
608 unsigned long addr, unsigned long end) 642 unsigned long addr, unsigned long end)
609{ 643{
644 pte_t *orig_src_pte, *orig_dst_pte;
610 pte_t *src_pte, *dst_pte; 645 pte_t *src_pte, *dst_pte;
611 spinlock_t *src_ptl, *dst_ptl; 646 spinlock_t *src_ptl, *dst_ptl;
612 int progress = 0; 647 int progress = 0;
@@ -620,6 +655,8 @@ again:
620 src_pte = pte_offset_map_nested(src_pmd, addr); 655 src_pte = pte_offset_map_nested(src_pmd, addr);
621 src_ptl = pte_lockptr(src_mm, src_pmd); 656 src_ptl = pte_lockptr(src_mm, src_pmd);
622 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 657 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
658 orig_src_pte = src_pte;
659 orig_dst_pte = dst_pte;
623 arch_enter_lazy_mmu_mode(); 660 arch_enter_lazy_mmu_mode();
624 661
625 do { 662 do {
@@ -643,9 +680,9 @@ again:
643 680
644 arch_leave_lazy_mmu_mode(); 681 arch_leave_lazy_mmu_mode();
645 spin_unlock(src_ptl); 682 spin_unlock(src_ptl);
646 pte_unmap_nested(src_pte - 1); 683 pte_unmap_nested(orig_src_pte);
647 add_mm_rss(dst_mm, rss[0], rss[1]); 684 add_mm_rss(dst_mm, rss[0], rss[1]);
648 pte_unmap_unlock(dst_pte - 1, dst_ptl); 685 pte_unmap_unlock(orig_dst_pte, dst_ptl);
649 cond_resched(); 686 cond_resched();
650 if (addr != end) 687 if (addr != end)
651 goto again; 688 goto again;
@@ -1141,9 +1178,14 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1141 goto no_page; 1178 goto no_page;
1142 if ((flags & FOLL_WRITE) && !pte_write(pte)) 1179 if ((flags & FOLL_WRITE) && !pte_write(pte))
1143 goto unlock; 1180 goto unlock;
1181
1144 page = vm_normal_page(vma, address, pte); 1182 page = vm_normal_page(vma, address, pte);
1145 if (unlikely(!page)) 1183 if (unlikely(!page)) {
1146 goto bad_page; 1184 if ((flags & FOLL_DUMP) ||
1185 !is_zero_pfn(pte_pfn(pte)))
1186 goto bad_page;
1187 page = pte_page(pte);
1188 }
1147 1189
1148 if (flags & FOLL_GET) 1190 if (flags & FOLL_GET)
1149 get_page(page); 1191 get_page(page);
@@ -1171,65 +1213,46 @@ no_page:
1171 pte_unmap_unlock(ptep, ptl); 1213 pte_unmap_unlock(ptep, ptl);
1172 if (!pte_none(pte)) 1214 if (!pte_none(pte))
1173 return page; 1215 return page;
1174 /* Fall through to ZERO_PAGE handling */ 1216
1175no_page_table: 1217no_page_table:
1176 /* 1218 /*
1177 * When core dumping an enormous anonymous area that nobody 1219 * When core dumping an enormous anonymous area that nobody
1178 * has touched so far, we don't want to allocate page tables. 1220 * has touched so far, we don't want to allocate unnecessary pages or
1221 * page tables. Return error instead of NULL to skip handle_mm_fault,
1222 * then get_dump_page() will return NULL to leave a hole in the dump.
1223 * But we can only make this optimization where a hole would surely
1224 * be zero-filled if handle_mm_fault() actually did handle it.
1179 */ 1225 */
1180 if (flags & FOLL_ANON) { 1226 if ((flags & FOLL_DUMP) &&
1181 page = ZERO_PAGE(0); 1227 (!vma->vm_ops || !vma->vm_ops->fault))
1182 if (flags & FOLL_GET) 1228 return ERR_PTR(-EFAULT);
1183 get_page(page);
1184 BUG_ON(flags & FOLL_WRITE);
1185 }
1186 return page; 1229 return page;
1187} 1230}
1188 1231
1189/* Can we do the FOLL_ANON optimization? */
1190static inline int use_zero_page(struct vm_area_struct *vma)
1191{
1192 /*
1193 * We don't want to optimize FOLL_ANON for make_pages_present()
1194 * when it tries to page in a VM_LOCKED region. As to VM_SHARED,
1195 * we want to get the page from the page tables to make sure
1196 * that we serialize and update with any other user of that
1197 * mapping.
1198 */
1199 if (vma->vm_flags & (VM_LOCKED | VM_SHARED))
1200 return 0;
1201 /*
1202 * And if we have a fault routine, it's not an anonymous region.
1203 */
1204 return !vma->vm_ops || !vma->vm_ops->fault;
1205}
1206
1207
1208
1209int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 1232int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1210 unsigned long start, int len, int flags, 1233 unsigned long start, int nr_pages, unsigned int gup_flags,
1211 struct page **pages, struct vm_area_struct **vmas) 1234 struct page **pages, struct vm_area_struct **vmas)
1212{ 1235{
1213 int i; 1236 int i;
1214 unsigned int vm_flags = 0; 1237 unsigned long vm_flags;
1215 int write = !!(flags & GUP_FLAGS_WRITE);
1216 int force = !!(flags & GUP_FLAGS_FORCE);
1217 int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
1218 int ignore_sigkill = !!(flags & GUP_FLAGS_IGNORE_SIGKILL);
1219 1238
1220 if (len <= 0) 1239 if (nr_pages <= 0)
1221 return 0; 1240 return 0;
1241
1242 VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
1243
1222 /* 1244 /*
1223 * Require read or write permissions. 1245 * Require read or write permissions.
1224 * If 'force' is set, we only require the "MAY" flags. 1246 * If FOLL_FORCE is set, we only require the "MAY" flags.
1225 */ 1247 */
1226 vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); 1248 vm_flags = (gup_flags & FOLL_WRITE) ?
1227 vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); 1249 (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
1250 vm_flags &= (gup_flags & FOLL_FORCE) ?
1251 (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
1228 i = 0; 1252 i = 0;
1229 1253
1230 do { 1254 do {
1231 struct vm_area_struct *vma; 1255 struct vm_area_struct *vma;
1232 unsigned int foll_flags;
1233 1256
1234 vma = find_extend_vma(mm, start); 1257 vma = find_extend_vma(mm, start);
1235 if (!vma && in_gate_area(tsk, start)) { 1258 if (!vma && in_gate_area(tsk, start)) {
@@ -1241,7 +1264,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1241 pte_t *pte; 1264 pte_t *pte;
1242 1265
1243 /* user gate pages are read-only */ 1266 /* user gate pages are read-only */
1244 if (!ignore && write) 1267 if (gup_flags & FOLL_WRITE)
1245 return i ? : -EFAULT; 1268 return i ? : -EFAULT;
1246 if (pg > TASK_SIZE) 1269 if (pg > TASK_SIZE)
1247 pgd = pgd_offset_k(pg); 1270 pgd = pgd_offset_k(pg);
@@ -1269,53 +1292,45 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1269 vmas[i] = gate_vma; 1292 vmas[i] = gate_vma;
1270 i++; 1293 i++;
1271 start += PAGE_SIZE; 1294 start += PAGE_SIZE;
1272 len--; 1295 nr_pages--;
1273 continue; 1296 continue;
1274 } 1297 }
1275 1298
1276 if (!vma || 1299 if (!vma ||
1277 (vma->vm_flags & (VM_IO | VM_PFNMAP)) || 1300 (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
1278 (!ignore && !(vm_flags & vma->vm_flags))) 1301 !(vm_flags & vma->vm_flags))
1279 return i ? : -EFAULT; 1302 return i ? : -EFAULT;
1280 1303
1281 if (is_vm_hugetlb_page(vma)) { 1304 if (is_vm_hugetlb_page(vma)) {
1282 i = follow_hugetlb_page(mm, vma, pages, vmas, 1305 i = follow_hugetlb_page(mm, vma, pages, vmas,
1283 &start, &len, i, write); 1306 &start, &nr_pages, i, gup_flags);
1284 continue; 1307 continue;
1285 } 1308 }
1286 1309
1287 foll_flags = FOLL_TOUCH;
1288 if (pages)
1289 foll_flags |= FOLL_GET;
1290 if (!write && use_zero_page(vma))
1291 foll_flags |= FOLL_ANON;
1292
1293 do { 1310 do {
1294 struct page *page; 1311 struct page *page;
1312 unsigned int foll_flags = gup_flags;
1295 1313
1296 /* 1314 /*
1297 * If we have a pending SIGKILL, don't keep faulting 1315 * If we have a pending SIGKILL, don't keep faulting
1298 * pages and potentially allocating memory, unless 1316 * pages and potentially allocating memory.
1299 * current is handling munlock--e.g., on exit. In
1300 * that case, we are not allocating memory. Rather,
1301 * we're only unlocking already resident/mapped pages.
1302 */ 1317 */
1303 if (unlikely(!ignore_sigkill && 1318 if (unlikely(fatal_signal_pending(current)))
1304 fatal_signal_pending(current)))
1305 return i ? i : -ERESTARTSYS; 1319 return i ? i : -ERESTARTSYS;
1306 1320
1307 if (write)
1308 foll_flags |= FOLL_WRITE;
1309
1310 cond_resched(); 1321 cond_resched();
1311 while (!(page = follow_page(vma, start, foll_flags))) { 1322 while (!(page = follow_page(vma, start, foll_flags))) {
1312 int ret; 1323 int ret;
1324
1313 ret = handle_mm_fault(mm, vma, start, 1325 ret = handle_mm_fault(mm, vma, start,
1314 foll_flags & FOLL_WRITE); 1326 (foll_flags & FOLL_WRITE) ?
1327 FAULT_FLAG_WRITE : 0);
1328
1315 if (ret & VM_FAULT_ERROR) { 1329 if (ret & VM_FAULT_ERROR) {
1316 if (ret & VM_FAULT_OOM) 1330 if (ret & VM_FAULT_OOM)
1317 return i ? i : -ENOMEM; 1331 return i ? i : -ENOMEM;
1318 else if (ret & VM_FAULT_SIGBUS) 1332 if (ret &
1333 (VM_FAULT_HWPOISON|VM_FAULT_SIGBUS))
1319 return i ? i : -EFAULT; 1334 return i ? i : -EFAULT;
1320 BUG(); 1335 BUG();
1321 } 1336 }
@@ -1354,30 +1369,107 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1354 vmas[i] = vma; 1369 vmas[i] = vma;
1355 i++; 1370 i++;
1356 start += PAGE_SIZE; 1371 start += PAGE_SIZE;
1357 len--; 1372 nr_pages--;
1358 } while (len && start < vma->vm_end); 1373 } while (nr_pages && start < vma->vm_end);
1359 } while (len); 1374 } while (nr_pages);
1360 return i; 1375 return i;
1361} 1376}
1362 1377
1378/**
1379 * get_user_pages() - pin user pages in memory
1380 * @tsk: task_struct of target task
1381 * @mm: mm_struct of target mm
1382 * @start: starting user address
1383 * @nr_pages: number of pages from start to pin
1384 * @write: whether pages will be written to by the caller
1385 * @force: whether to force write access even if user mapping is
1386 * readonly. This will result in the page being COWed even
1387 * in MAP_SHARED mappings. You do not want this.
1388 * @pages: array that receives pointers to the pages pinned.
1389 * Should be at least nr_pages long. Or NULL, if caller
1390 * only intends to ensure the pages are faulted in.
1391 * @vmas: array of pointers to vmas corresponding to each page.
1392 * Or NULL if the caller does not require them.
1393 *
1394 * Returns number of pages pinned. This may be fewer than the number
1395 * requested. If nr_pages is 0 or negative, returns 0. If no pages
1396 * were pinned, returns -errno. Each page returned must be released
1397 * with a put_page() call when it is finished with. vmas will only
1398 * remain valid while mmap_sem is held.
1399 *
1400 * Must be called with mmap_sem held for read or write.
1401 *
1402 * get_user_pages walks a process's page tables and takes a reference to
1403 * each struct page that each user address corresponds to at a given
1404 * instant. That is, it takes the page that would be accessed if a user
1405 * thread accesses the given user virtual address at that instant.
1406 *
1407 * This does not guarantee that the page exists in the user mappings when
1408 * get_user_pages returns, and there may even be a completely different
1409 * page there in some cases (e.g. if mmapped pagecache has been invalidated
1410 * and subsequently re-faulted). However it does guarantee that the page
1411 * won't be freed completely. And mostly callers simply care that the page
1412 * contains data that was valid *at some point in time*. Typically, an IO
1413 * or similar operation cannot guarantee anything stronger anyway because
1414 * locks can't be held over the syscall boundary.
1415 *
1416 * If write=0, the page must not be written to. If the page is written to,
1417 * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called
1418 * after the page is finished with, and before put_page is called.
1419 *
1420 * get_user_pages is typically used for fewer-copy IO operations, to get a
1421 * handle on the memory by some means other than accesses via the user virtual
1422 * addresses. The pages may be submitted for DMA to devices or accessed via
1423 * their kernel linear mapping (via the kmap APIs). Care should be taken to
1424 * use the correct cache flushing APIs.
1425 *
1426 * See also get_user_pages_fast, for performance critical applications.
1427 */
1363int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 1428int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1364 unsigned long start, int len, int write, int force, 1429 unsigned long start, int nr_pages, int write, int force,
1365 struct page **pages, struct vm_area_struct **vmas) 1430 struct page **pages, struct vm_area_struct **vmas)
1366{ 1431{
1367 int flags = 0; 1432 int flags = FOLL_TOUCH;
1368 1433
1434 if (pages)
1435 flags |= FOLL_GET;
1369 if (write) 1436 if (write)
1370 flags |= GUP_FLAGS_WRITE; 1437 flags |= FOLL_WRITE;
1371 if (force) 1438 if (force)
1372 flags |= GUP_FLAGS_FORCE; 1439 flags |= FOLL_FORCE;
1373 1440
1374 return __get_user_pages(tsk, mm, 1441 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas);
1375 start, len, flags,
1376 pages, vmas);
1377} 1442}
1378
1379EXPORT_SYMBOL(get_user_pages); 1443EXPORT_SYMBOL(get_user_pages);
1380 1444
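The kerneldoc above spells out the pinning contract; the fragment below shows the usual call pattern against the signature introduced by this patch. It is a hedged sketch for illustration (the buffer handling in the middle is left out), not code from the patch itself:

/* Illustrative only: pin nr user pages of the current process for reading. */
static int example_pin_user_buffer(unsigned long uaddr, int nr,
				   struct page **pages)
{
	int i, got;

	down_read(&current->mm->mmap_sem);
	got = get_user_pages(current, current->mm, uaddr, nr,
			     0 /* write */, 0 /* force */, pages, NULL);
	up_read(&current->mm->mmap_sem);

	if (got <= 0)
		return got ? got : -EFAULT;

	/* ... access the pages via kmap() or set up DMA here ... */

	/* Every page returned by get_user_pages() must be released.  Had the
	 * kernel written to them (write=1), set_page_dirty_lock() would be
	 * called before put_page(), as the comment above requires. */
	for (i = 0; i < got; i++)
		put_page(pages[i]);

	return got;
}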
1445/**
1446 * get_dump_page() - pin user page in memory while writing it to core dump
1447 * @addr: user address
1448 *
1449 * Returns struct page pointer of user page pinned for dump,
1450 * to be freed afterwards by page_cache_release() or put_page().
1451 *
1452 * Returns NULL on any kind of failure - a hole must then be inserted into
1453 * the corefile, to preserve alignment with its headers; and also returns
1454 * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
1455 * allowing a hole to be left in the corefile to save diskspace.
1456 *
1457 * Called without mmap_sem, but after all other threads have been killed.
1458 */
1459#ifdef CONFIG_ELF_CORE
1460struct page *get_dump_page(unsigned long addr)
1461{
1462 struct vm_area_struct *vma;
1463 struct page *page;
1464
1465 if (__get_user_pages(current, current->mm, addr, 1,
1466 FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma) < 1)
1467 return NULL;
1468 flush_cache_page(vma, addr, page_to_pfn(page));
1469 return page;
1470}
1471#endif /* CONFIG_ELF_CORE */
1472
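A sketch of how a core-dump writer might consume get_dump_page(), following the contract described above; dump_write() and dump_seek() are hypothetical helpers standing in for whatever the binfmt writer actually uses:

/* Illustrative only: emit one page of the dumped process, or a hole. */
static int example_dump_one_page(struct file *file, unsigned long addr)
{
	struct page *page = get_dump_page(addr);
	int ok;

	if (page) {
		void *kaddr = kmap(page);

		ok = dump_write(file, kaddr, PAGE_SIZE);	/* hypothetical helper */
		kunmap(page);
		page_cache_release(page);
	} else {
		/* NULL means ZERO_PAGE, pte_none or failure: leave a hole. */
		ok = dump_seek(file, PAGE_SIZE);		/* hypothetical helper */
	}
	return ok;
}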
1381pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, 1473pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
1382 spinlock_t **ptl) 1474 spinlock_t **ptl)
1383{ 1475{
@@ -1555,7 +1647,8 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
1555 * If we don't have pte special, then we have to use the pfn_valid() 1647 * If we don't have pte special, then we have to use the pfn_valid()
1556 * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must* 1648 * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
1557 * refcount the page if pfn_valid is true (hence insert_page rather 1649 * refcount the page if pfn_valid is true (hence insert_page rather
1558 * than insert_pfn). 1650 * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP
1651 * without pte special, it would there be refcounted as a normal page.
1559 */ 1652 */
1560 if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) { 1653 if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) {
1561 struct page *page; 1654 struct page *page;
@@ -1730,10 +1823,10 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
1730 token = pmd_pgtable(*pmd); 1823 token = pmd_pgtable(*pmd);
1731 1824
1732 do { 1825 do {
1733 err = fn(pte, token, addr, data); 1826 err = fn(pte++, token, addr, data);
1734 if (err) 1827 if (err)
1735 break; 1828 break;
1736 } while (pte++, addr += PAGE_SIZE, addr != end); 1829 } while (addr += PAGE_SIZE, addr != end);
1737 1830
1738 arch_leave_lazy_mmu_mode(); 1831 arch_leave_lazy_mmu_mode();
1739 1832
@@ -1921,7 +2014,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1921 * Take out anonymous pages first, anonymous shared vmas are 2014 * Take out anonymous pages first, anonymous shared vmas are
1922 * not dirty accountable. 2015 * not dirty accountable.
1923 */ 2016 */
1924 if (PageAnon(old_page)) { 2017 if (PageAnon(old_page) && !PageKsm(old_page)) {
1925 if (!trylock_page(old_page)) { 2018 if (!trylock_page(old_page)) {
1926 page_cache_get(old_page); 2019 page_cache_get(old_page);
1927 pte_unmap_unlock(page_table, ptl); 2020 pte_unmap_unlock(page_table, ptl);
@@ -1971,6 +2064,15 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1971 ret = tmp; 2064 ret = tmp;
1972 goto unwritable_page; 2065 goto unwritable_page;
1973 } 2066 }
2067 if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
2068 lock_page(old_page);
2069 if (!old_page->mapping) {
2070 ret = 0; /* retry the fault */
2071 unlock_page(old_page);
2072 goto unwritable_page;
2073 }
2074 } else
2075 VM_BUG_ON(!PageLocked(old_page));
1974 2076
1975 /* 2077 /*
1976 * Since we dropped the lock we need to revalidate 2078 * Since we dropped the lock we need to revalidate
@@ -1980,9 +2082,11 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1980 */ 2082 */
1981 page_table = pte_offset_map_lock(mm, pmd, address, 2083 page_table = pte_offset_map_lock(mm, pmd, address,
1982 &ptl); 2084 &ptl);
1983 page_cache_release(old_page); 2085 if (!pte_same(*page_table, orig_pte)) {
1984 if (!pte_same(*page_table, orig_pte)) 2086 unlock_page(old_page);
2087 page_cache_release(old_page);
1985 goto unlock; 2088 goto unlock;
2089 }
1986 2090
1987 page_mkwrite = 1; 2091 page_mkwrite = 1;
1988 } 2092 }
@@ -2011,10 +2115,19 @@ gotten:
2011 2115
2012 if (unlikely(anon_vma_prepare(vma))) 2116 if (unlikely(anon_vma_prepare(vma)))
2013 goto oom; 2117 goto oom;
2014 VM_BUG_ON(old_page == ZERO_PAGE(0)); 2118
2015 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); 2119 if (is_zero_pfn(pte_pfn(orig_pte))) {
2016 if (!new_page) 2120 new_page = alloc_zeroed_user_highpage_movable(vma, address);
2017 goto oom; 2121 if (!new_page)
2122 goto oom;
2123 } else {
2124 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
2125 if (!new_page)
2126 goto oom;
2127 cow_user_page(new_page, old_page, address, vma);
2128 }
2129 __SetPageUptodate(new_page);
2130
2018 /* 2131 /*
2019 * Don't let another task, with possibly unlocked vma, 2132 * Don't let another task, with possibly unlocked vma,
2020 * keep the mlocked page. 2133 * keep the mlocked page.
@@ -2024,8 +2137,6 @@ gotten:
2024 clear_page_mlock(old_page); 2137 clear_page_mlock(old_page);
2025 unlock_page(old_page); 2138 unlock_page(old_page);
2026 } 2139 }
2027 cow_user_page(new_page, old_page, address, vma);
2028 __SetPageUptodate(new_page);
2029 2140
2030 if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) 2141 if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
2031 goto oom_free_new; 2142 goto oom_free_new;
@@ -2051,9 +2162,14 @@ gotten:
2051 * seen in the presence of one thread doing SMC and another 2162 * seen in the presence of one thread doing SMC and another
2052 * thread doing COW. 2163 * thread doing COW.
2053 */ 2164 */
2054 ptep_clear_flush_notify(vma, address, page_table); 2165 ptep_clear_flush(vma, address, page_table);
2055 page_add_new_anon_rmap(new_page, vma, address); 2166 page_add_new_anon_rmap(new_page, vma, address);
2056 set_pte_at(mm, address, page_table, entry); 2167 /*
2168 * We call the notify macro here because, when using secondary
2169 * mmu page tables (such as kvm shadow page tables), we want the
2170 * new page to be mapped directly into the secondary page table.
2171 */
2172 set_pte_at_notify(mm, address, page_table, entry);
2057 update_mmu_cache(vma, address, entry); 2173 update_mmu_cache(vma, address, entry);
2058 if (old_page) { 2174 if (old_page) {
2059 /* 2175 /*
@@ -2094,9 +2210,6 @@ gotten:
2094unlock: 2210unlock:
2095 pte_unmap_unlock(page_table, ptl); 2211 pte_unmap_unlock(page_table, ptl);
2096 if (dirty_page) { 2212 if (dirty_page) {
2097 if (vma->vm_file)
2098 file_update_time(vma->vm_file);
2099
2100 /* 2213 /*
2101 * Yes, Virginia, this is actually required to prevent a race 2214 * Yes, Virginia, this is actually required to prevent a race
2102 * with clear_page_dirty_for_io() from clearing the page dirty 2215 * with clear_page_dirty_for_io() from clearing the page dirty
@@ -2105,16 +2218,41 @@ unlock:
2105 * 2218 *
2106 * do_no_page is protected similarly. 2219 * do_no_page is protected similarly.
2107 */ 2220 */
2108 wait_on_page_locked(dirty_page); 2221 if (!page_mkwrite) {
2109 set_page_dirty_balance(dirty_page, page_mkwrite); 2222 wait_on_page_locked(dirty_page);
2223 set_page_dirty_balance(dirty_page, page_mkwrite);
2224 }
2110 put_page(dirty_page); 2225 put_page(dirty_page);
2226 if (page_mkwrite) {
2227 struct address_space *mapping = dirty_page->mapping;
2228
2229 set_page_dirty(dirty_page);
2230 unlock_page(dirty_page);
2231 page_cache_release(dirty_page);
2232 if (mapping) {
2233 /*
2234 * Some device drivers do not set page.mapping
2235 * but still dirty their pages
2236 */
2237 balance_dirty_pages_ratelimited(mapping);
2238 }
2239 }
2240
2241 /* file_update_time outside page_lock */
2242 if (vma->vm_file)
2243 file_update_time(vma->vm_file);
2111 } 2244 }
2112 return ret; 2245 return ret;
2113oom_free_new: 2246oom_free_new:
2114 page_cache_release(new_page); 2247 page_cache_release(new_page);
2115oom: 2248oom:
2116 if (old_page) 2249 if (old_page) {
2250 if (page_mkwrite) {
2251 unlock_page(old_page);
2252 page_cache_release(old_page);
2253 }
2117 page_cache_release(old_page); 2254 page_cache_release(old_page);
2255 }
2118 return VM_FAULT_OOM; 2256 return VM_FAULT_OOM;
2119 2257
2120unwritable_page: 2258unwritable_page:
@@ -2274,7 +2412,7 @@ restart:
2274 * @mapping: the address space containing mmaps to be unmapped. 2412 * @mapping: the address space containing mmaps to be unmapped.
2275 * @holebegin: byte in first page to unmap, relative to the start of 2413 * @holebegin: byte in first page to unmap, relative to the start of
2276 * the underlying file. This will be rounded down to a PAGE_SIZE 2414 * the underlying file. This will be rounded down to a PAGE_SIZE
2277 * boundary. Note that this is different from vmtruncate(), which 2415 * boundary. Note that this is different from truncate_pagecache(), which
2278 * must keep the partial page. In contrast, we must get rid of 2416 * must keep the partial page. In contrast, we must get rid of
2279 * partial pages. 2417 * partial pages.
2280 * @holelen: size of prospective hole in bytes. This will be rounded 2418 * @holelen: size of prospective hole in bytes. This will be rounded
@@ -2325,63 +2463,6 @@ void unmap_mapping_range(struct address_space *mapping,
2325} 2463}
2326EXPORT_SYMBOL(unmap_mapping_range); 2464EXPORT_SYMBOL(unmap_mapping_range);
2327 2465
2328/**
2329 * vmtruncate - unmap mappings "freed" by truncate() syscall
2330 * @inode: inode of the file used
2331 * @offset: file offset to start truncating
2332 *
2333 * NOTE! We have to be ready to update the memory sharing
2334 * between the file and the memory map for a potential last
2335 * incomplete page. Ugly, but necessary.
2336 */
2337int vmtruncate(struct inode * inode, loff_t offset)
2338{
2339 if (inode->i_size < offset) {
2340 unsigned long limit;
2341
2342 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
2343 if (limit != RLIM_INFINITY && offset > limit)
2344 goto out_sig;
2345 if (offset > inode->i_sb->s_maxbytes)
2346 goto out_big;
2347 i_size_write(inode, offset);
2348 } else {
2349 struct address_space *mapping = inode->i_mapping;
2350
2351 /*
2352 * truncation of in-use swapfiles is disallowed - it would
2353 * cause subsequent swapout to scribble on the now-freed
2354 * blocks.
2355 */
2356 if (IS_SWAPFILE(inode))
2357 return -ETXTBSY;
2358 i_size_write(inode, offset);
2359
2360 /*
2361 * unmap_mapping_range is called twice, first simply for
2362 * efficiency so that truncate_inode_pages does fewer
2363 * single-page unmaps. However after this first call, and
2364 * before truncate_inode_pages finishes, it is possible for
2365 * private pages to be COWed, which remain after
2366 * truncate_inode_pages finishes, hence the second
2367 * unmap_mapping_range call must be made for correctness.
2368 */
2369 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
2370 truncate_inode_pages(mapping, offset);
2371 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
2372 }
2373
2374 if (inode->i_op->truncate)
2375 inode->i_op->truncate(inode);
2376 return 0;
2377
2378out_sig:
2379 send_sig(SIGXFSZ, current, 0);
2380out_big:
2381 return -EFBIG;
2382}
2383EXPORT_SYMBOL(vmtruncate);
2384
2385int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) 2466int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
2386{ 2467{
2387 struct address_space *mapping = inode->i_mapping; 2468 struct address_space *mapping = inode->i_mapping;
@@ -2413,7 +2494,7 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
2413 */ 2494 */
2414static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, 2495static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2415 unsigned long address, pte_t *page_table, pmd_t *pmd, 2496 unsigned long address, pte_t *page_table, pmd_t *pmd,
2416 int write_access, pte_t orig_pte) 2497 unsigned int flags, pte_t orig_pte)
2417{ 2498{
2418 spinlock_t *ptl; 2499 spinlock_t *ptl;
2419 struct page *page; 2500 struct page *page;
@@ -2426,14 +2507,21 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2426 goto out; 2507 goto out;
2427 2508
2428 entry = pte_to_swp_entry(orig_pte); 2509 entry = pte_to_swp_entry(orig_pte);
2429 if (is_migration_entry(entry)) { 2510 if (unlikely(non_swap_entry(entry))) {
2430 migration_entry_wait(mm, pmd, address); 2511 if (is_migration_entry(entry)) {
2512 migration_entry_wait(mm, pmd, address);
2513 } else if (is_hwpoison_entry(entry)) {
2514 ret = VM_FAULT_HWPOISON;
2515 } else {
2516 print_bad_pte(vma, address, orig_pte, NULL);
2517 ret = VM_FAULT_OOM;
2518 }
2431 goto out; 2519 goto out;
2432 } 2520 }
2433 delayacct_set_flag(DELAYACCT_PF_SWAPIN); 2521 delayacct_set_flag(DELAYACCT_PF_SWAPIN);
2434 page = lookup_swap_cache(entry); 2522 page = lookup_swap_cache(entry);
2435 if (!page) { 2523 if (!page) {
2436 grab_swap_token(); /* Contend for token _before_ read-in */ 2524 grab_swap_token(mm); /* Contend for token _before_ read-in */
2437 page = swapin_readahead(entry, 2525 page = swapin_readahead(entry,
2438 GFP_HIGHUSER_MOVABLE, vma, address); 2526 GFP_HIGHUSER_MOVABLE, vma, address);
2439 if (!page) { 2527 if (!page) {
@@ -2451,6 +2539,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2451 /* Had to read the page from swap area: Major fault */ 2539 /* Had to read the page from swap area: Major fault */
2452 ret = VM_FAULT_MAJOR; 2540 ret = VM_FAULT_MAJOR;
2453 count_vm_event(PGMAJFAULT); 2541 count_vm_event(PGMAJFAULT);
2542 } else if (PageHWPoison(page)) {
2543 ret = VM_FAULT_HWPOISON;
2544 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2545 goto out_release;
2454 } 2546 }
2455 2547
2456 lock_page(page); 2548 lock_page(page);
@@ -2458,8 +2550,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2458 2550
2459 if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { 2551 if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
2460 ret = VM_FAULT_OOM; 2552 ret = VM_FAULT_OOM;
2461 unlock_page(page); 2553 goto out_page;
2462 goto out;
2463 } 2554 }
2464 2555
2465 /* 2556 /*
@@ -2490,9 +2581,9 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2490 2581
2491 inc_mm_counter(mm, anon_rss); 2582 inc_mm_counter(mm, anon_rss);
2492 pte = mk_pte(page, vma->vm_page_prot); 2583 pte = mk_pte(page, vma->vm_page_prot);
2493 if (write_access && reuse_swap_page(page)) { 2584 if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
2494 pte = maybe_mkwrite(pte_mkdirty(pte), vma); 2585 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
2495 write_access = 0; 2586 flags &= ~FAULT_FLAG_WRITE;
2496 } 2587 }
2497 flush_icache_page(vma, page); 2588 flush_icache_page(vma, page);
2498 set_pte_at(mm, address, page_table, pte); 2589 set_pte_at(mm, address, page_table, pte);
@@ -2505,7 +2596,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2505 try_to_free_swap(page); 2596 try_to_free_swap(page);
2506 unlock_page(page); 2597 unlock_page(page);
2507 2598
2508 if (write_access) { 2599 if (flags & FAULT_FLAG_WRITE) {
2509 ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte); 2600 ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
2510 if (ret & VM_FAULT_ERROR) 2601 if (ret & VM_FAULT_ERROR)
2511 ret &= VM_FAULT_ERROR; 2602 ret &= VM_FAULT_ERROR;
@@ -2521,7 +2612,9 @@ out:
2521out_nomap: 2612out_nomap:
2522 mem_cgroup_cancel_charge_swapin(ptr); 2613 mem_cgroup_cancel_charge_swapin(ptr);
2523 pte_unmap_unlock(page_table, ptl); 2614 pte_unmap_unlock(page_table, ptl);
2615out_page:
2524 unlock_page(page); 2616 unlock_page(page);
2617out_release:
2525 page_cache_release(page); 2618 page_cache_release(page);
2526 return ret; 2619 return ret;
2527} 2620}
@@ -2533,12 +2626,22 @@ out_nomap:
2533 */ 2626 */
2534static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, 2627static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2535 unsigned long address, pte_t *page_table, pmd_t *pmd, 2628 unsigned long address, pte_t *page_table, pmd_t *pmd,
2536 int write_access) 2629 unsigned int flags)
2537{ 2630{
2538 struct page *page; 2631 struct page *page;
2539 spinlock_t *ptl; 2632 spinlock_t *ptl;
2540 pte_t entry; 2633 pte_t entry;
2541 2634
2635 if (!(flags & FAULT_FLAG_WRITE)) {
2636 entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
2637 vma->vm_page_prot));
2638 ptl = pte_lockptr(mm, pmd);
2639 spin_lock(ptl);
2640 if (!pte_none(*page_table))
2641 goto unlock;
2642 goto setpte;
2643 }
2644
2542 /* Allocate our own private page. */ 2645 /* Allocate our own private page. */
2543 pte_unmap(page_table); 2646 pte_unmap(page_table);
2544 2647
@@ -2553,13 +2656,16 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2553 goto oom_free_page; 2656 goto oom_free_page;
2554 2657
2555 entry = mk_pte(page, vma->vm_page_prot); 2658 entry = mk_pte(page, vma->vm_page_prot);
2556 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2659 if (vma->vm_flags & VM_WRITE)
2660 entry = pte_mkwrite(pte_mkdirty(entry));
2557 2661
2558 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2662 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2559 if (!pte_none(*page_table)) 2663 if (!pte_none(*page_table))
2560 goto release; 2664 goto release;
2665
2561 inc_mm_counter(mm, anon_rss); 2666 inc_mm_counter(mm, anon_rss);
2562 page_add_new_anon_rmap(page, vma, address); 2667 page_add_new_anon_rmap(page, vma, address);
2668setpte:
2563 set_pte_at(mm, address, page_table, entry); 2669 set_pte_at(mm, address, page_table, entry);
2564 2670
2565 /* No need to invalidate - it was non-present before */ 2671 /* No need to invalidate - it was non-present before */
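With the do_anonymous_page() change above, a read fault on untouched anonymous memory installs the shared zero page instead of allocating one, and the is_zero_pfn() branch added to do_wp_page() earlier in this patch substitutes a private zeroed page on the first write. A small user-space illustration of the expected behaviour (a sketch; exact RSS accounting depends on the rest of the series):

/* Sketch: reads of untouched anonymous memory should not consume private
 * pages; the first write takes the COW path and allocates real pages. */
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 64UL << 20;		/* 64 MB */
	size_t i;
	char sum = 0;
	volatile char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
				MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;

	for (i = 0; i < len; i += 4096)
		sum += p[i];	/* read faults map the shared zero pfn */

	/* RSS should still be tiny here: every pte points at the zero page. */

	for (i = 0; i < len; i += 4096)
		p[i] = 1;	/* write faults now COW in private pages */

	printf("%d\n", sum);
	return 0;
}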
@@ -2614,6 +2720,12 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2614 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) 2720 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
2615 return ret; 2721 return ret;
2616 2722
2723 if (unlikely(PageHWPoison(vmf.page))) {
2724 if (ret & VM_FAULT_LOCKED)
2725 unlock_page(vmf.page);
2726 return VM_FAULT_HWPOISON;
2727 }
2728
2617 /* 2729 /*
2618 * For consistency in subsequent calls, make the faulted page always 2730 * For consistency in subsequent calls, make the faulted page always
2619 * locked. 2731 * locked.
@@ -2664,27 +2776,22 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2664 int tmp; 2776 int tmp;
2665 2777
2666 unlock_page(page); 2778 unlock_page(page);
2667 vmf.flags |= FAULT_FLAG_MKWRITE; 2779 vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
2668 tmp = vma->vm_ops->page_mkwrite(vma, &vmf); 2780 tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
2669 if (unlikely(tmp & 2781 if (unlikely(tmp &
2670 (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { 2782 (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
2671 ret = tmp; 2783 ret = tmp;
2672 anon = 1; /* no anon but release vmf.page */ 2784 goto unwritable_page;
2673 goto out_unlocked;
2674 }
2675 lock_page(page);
2676 /*
2677 * XXX: this is not quite right (racy vs
2678 * invalidate) to unlock and relock the page
2679 * like this, however a better fix requires
2680 * reworking page_mkwrite locking API, which
2681 * is better done later.
2682 */
2683 if (!page->mapping) {
2684 ret = 0;
2685 anon = 1; /* no anon but release vmf.page */
2686 goto out;
2687 } 2785 }
2786 if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
2787 lock_page(page);
2788 if (!page->mapping) {
2789 ret = 0; /* retry the fault */
2790 unlock_page(page);
2791 goto unwritable_page;
2792 }
2793 } else
2794 VM_BUG_ON(!PageLocked(page));
2688 page_mkwrite = 1; 2795 page_mkwrite = 1;
2689 } 2796 }
2690 } 2797 }
@@ -2698,7 +2805,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2698 * due to the bad i386 page protection. But it's valid 2805 * due to the bad i386 page protection. But it's valid
2699 * for other architectures too. 2806 * for other architectures too.
2700 * 2807 *
2701 * Note that if write_access is true, we either now have 2808 * Note that if FAULT_FLAG_WRITE is set, we either now have
2702 * an exclusive copy of the page, or this is a shared mapping, 2809 * an exclusive copy of the page, or this is a shared mapping,
2703 * so we can make it writable and dirty to avoid having to 2810 * so we can make it writable and dirty to avoid having to
2704 * handle that later. 2811 * handle that later.
@@ -2736,28 +2843,43 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2736 pte_unmap_unlock(page_table, ptl); 2843 pte_unmap_unlock(page_table, ptl);
2737 2844
2738out: 2845out:
2739 unlock_page(vmf.page); 2846 if (dirty_page) {
2740out_unlocked: 2847 struct address_space *mapping = page->mapping;
2741 if (anon)
2742 page_cache_release(vmf.page);
2743 else if (dirty_page) {
2744 if (vma->vm_file)
2745 file_update_time(vma->vm_file);
2746 2848
2747 set_page_dirty_balance(dirty_page, page_mkwrite); 2849 if (set_page_dirty(dirty_page))
2850 page_mkwrite = 1;
2851 unlock_page(dirty_page);
2748 put_page(dirty_page); 2852 put_page(dirty_page);
2853 if (page_mkwrite && mapping) {
2854 /*
2855 * Some device drivers do not set page.mapping but still
2856 * dirty their pages
2857 */
2858 balance_dirty_pages_ratelimited(mapping);
2859 }
2860
2861 /* file_update_time outside page_lock */
2862 if (vma->vm_file)
2863 file_update_time(vma->vm_file);
2864 } else {
2865 unlock_page(vmf.page);
2866 if (anon)
2867 page_cache_release(vmf.page);
2749 } 2868 }
2750 2869
2751 return ret; 2870 return ret;
2871
2872unwritable_page:
2873 page_cache_release(page);
2874 return ret;
2752} 2875}
2753 2876
2754static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, 2877static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2755 unsigned long address, pte_t *page_table, pmd_t *pmd, 2878 unsigned long address, pte_t *page_table, pmd_t *pmd,
2756 int write_access, pte_t orig_pte) 2879 unsigned int flags, pte_t orig_pte)
2757{ 2880{
2758 pgoff_t pgoff = (((address & PAGE_MASK) 2881 pgoff_t pgoff = (((address & PAGE_MASK)
2759 - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 2882 - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2760 unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);
2761 2883
2762 pte_unmap(page_table); 2884 pte_unmap(page_table);
2763 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); 2885 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
@@ -2774,12 +2896,12 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2774 */ 2896 */
2775static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, 2897static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2776 unsigned long address, pte_t *page_table, pmd_t *pmd, 2898 unsigned long address, pte_t *page_table, pmd_t *pmd,
2777 int write_access, pte_t orig_pte) 2899 unsigned int flags, pte_t orig_pte)
2778{ 2900{
2779 unsigned int flags = FAULT_FLAG_NONLINEAR |
2780 (write_access ? FAULT_FLAG_WRITE : 0);
2781 pgoff_t pgoff; 2901 pgoff_t pgoff;
2782 2902
2903 flags |= FAULT_FLAG_NONLINEAR;
2904
2783 if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) 2905 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
2784 return 0; 2906 return 0;
2785 2907
@@ -2810,7 +2932,7 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2810 */ 2932 */
2811static inline int handle_pte_fault(struct mm_struct *mm, 2933static inline int handle_pte_fault(struct mm_struct *mm,
2812 struct vm_area_struct *vma, unsigned long address, 2934 struct vm_area_struct *vma, unsigned long address,
2813 pte_t *pte, pmd_t *pmd, int write_access) 2935 pte_t *pte, pmd_t *pmd, unsigned int flags)
2814{ 2936{
2815 pte_t entry; 2937 pte_t entry;
2816 spinlock_t *ptl; 2938 spinlock_t *ptl;
@@ -2821,30 +2943,30 @@ static inline int handle_pte_fault(struct mm_struct *mm,
2821 if (vma->vm_ops) { 2943 if (vma->vm_ops) {
2822 if (likely(vma->vm_ops->fault)) 2944 if (likely(vma->vm_ops->fault))
2823 return do_linear_fault(mm, vma, address, 2945 return do_linear_fault(mm, vma, address,
2824 pte, pmd, write_access, entry); 2946 pte, pmd, flags, entry);
2825 } 2947 }
2826 return do_anonymous_page(mm, vma, address, 2948 return do_anonymous_page(mm, vma, address,
2827 pte, pmd, write_access); 2949 pte, pmd, flags);
2828 } 2950 }
2829 if (pte_file(entry)) 2951 if (pte_file(entry))
2830 return do_nonlinear_fault(mm, vma, address, 2952 return do_nonlinear_fault(mm, vma, address,
2831 pte, pmd, write_access, entry); 2953 pte, pmd, flags, entry);
2832 return do_swap_page(mm, vma, address, 2954 return do_swap_page(mm, vma, address,
2833 pte, pmd, write_access, entry); 2955 pte, pmd, flags, entry);
2834 } 2956 }
2835 2957
2836 ptl = pte_lockptr(mm, pmd); 2958 ptl = pte_lockptr(mm, pmd);
2837 spin_lock(ptl); 2959 spin_lock(ptl);
2838 if (unlikely(!pte_same(*pte, entry))) 2960 if (unlikely(!pte_same(*pte, entry)))
2839 goto unlock; 2961 goto unlock;
2840 if (write_access) { 2962 if (flags & FAULT_FLAG_WRITE) {
2841 if (!pte_write(entry)) 2963 if (!pte_write(entry))
2842 return do_wp_page(mm, vma, address, 2964 return do_wp_page(mm, vma, address,
2843 pte, pmd, ptl, entry); 2965 pte, pmd, ptl, entry);
2844 entry = pte_mkdirty(entry); 2966 entry = pte_mkdirty(entry);
2845 } 2967 }
2846 entry = pte_mkyoung(entry); 2968 entry = pte_mkyoung(entry);
2847 if (ptep_set_access_flags(vma, address, pte, entry, write_access)) { 2969 if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
2848 update_mmu_cache(vma, address, entry); 2970 update_mmu_cache(vma, address, entry);
2849 } else { 2971 } else {
2850 /* 2972 /*
@@ -2853,7 +2975,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
2853 * This still avoids useless tlb flushes for .text page faults 2975 * This still avoids useless tlb flushes for .text page faults
2854 * with threads. 2976 * with threads.
2855 */ 2977 */
2856 if (write_access) 2978 if (flags & FAULT_FLAG_WRITE)
2857 flush_tlb_page(vma, address); 2979 flush_tlb_page(vma, address);
2858 } 2980 }
2859unlock: 2981unlock:
@@ -2865,7 +2987,7 @@ unlock:
2865 * By the time we get here, we already hold the mm semaphore 2987 * By the time we get here, we already hold the mm semaphore
2866 */ 2988 */
2867int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, 2989int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2868 unsigned long address, int write_access) 2990 unsigned long address, unsigned int flags)
2869{ 2991{
2870 pgd_t *pgd; 2992 pgd_t *pgd;
2871 pud_t *pud; 2993 pud_t *pud;
@@ -2877,7 +2999,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2877 count_vm_event(PGFAULT); 2999 count_vm_event(PGFAULT);
2878 3000
2879 if (unlikely(is_vm_hugetlb_page(vma))) 3001 if (unlikely(is_vm_hugetlb_page(vma)))
2880 return hugetlb_fault(mm, vma, address, write_access); 3002 return hugetlb_fault(mm, vma, address, flags);
2881 3003
2882 pgd = pgd_offset(mm, address); 3004 pgd = pgd_offset(mm, address);
2883 pud = pud_alloc(mm, pgd, address); 3005 pud = pud_alloc(mm, pgd, address);
@@ -2890,7 +3012,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2890 if (!pte) 3012 if (!pte)
2891 return VM_FAULT_OOM; 3013 return VM_FAULT_OOM;
2892 3014
2893 return handle_pte_fault(mm, vma, address, pte, pmd, write_access); 3015 return handle_pte_fault(mm, vma, address, pte, pmd, flags);
2894} 3016}
2895 3017
2896#ifndef __PAGETABLE_PUD_FOLDED 3018#ifndef __PAGETABLE_PUD_FOLDED
@@ -3009,22 +3131,13 @@ int in_gate_area_no_task(unsigned long addr)
3009 3131
3010#endif /* __HAVE_ARCH_GATE_AREA */ 3132#endif /* __HAVE_ARCH_GATE_AREA */
3011 3133
3012#ifdef CONFIG_HAVE_IOREMAP_PROT 3134static int follow_pte(struct mm_struct *mm, unsigned long address,
3013int follow_phys(struct vm_area_struct *vma, 3135 pte_t **ptepp, spinlock_t **ptlp)
3014 unsigned long address, unsigned int flags,
3015 unsigned long *prot, resource_size_t *phys)
3016{ 3136{
3017 pgd_t *pgd; 3137 pgd_t *pgd;
3018 pud_t *pud; 3138 pud_t *pud;
3019 pmd_t *pmd; 3139 pmd_t *pmd;
3020 pte_t *ptep, pte; 3140 pte_t *ptep;
3021 spinlock_t *ptl;
3022 resource_size_t phys_addr = 0;
3023 struct mm_struct *mm = vma->vm_mm;
3024 int ret = -EINVAL;
3025
3026 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
3027 goto out;
3028 3141
3029 pgd = pgd_offset(mm, address); 3142 pgd = pgd_offset(mm, address);
3030 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) 3143 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
@@ -3042,22 +3155,71 @@ int follow_phys(struct vm_area_struct *vma,
3042 if (pmd_huge(*pmd)) 3155 if (pmd_huge(*pmd))
3043 goto out; 3156 goto out;
3044 3157
3045 ptep = pte_offset_map_lock(mm, pmd, address, &ptl); 3158 ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
3046 if (!ptep) 3159 if (!ptep)
3047 goto out; 3160 goto out;
3161 if (!pte_present(*ptep))
3162 goto unlock;
3163 *ptepp = ptep;
3164 return 0;
3165unlock:
3166 pte_unmap_unlock(ptep, *ptlp);
3167out:
3168 return -EINVAL;
3169}
3170
3171/**
3172 * follow_pfn - look up PFN at a user virtual address
3173 * @vma: memory mapping
3174 * @address: user virtual address
3175 * @pfn: location to store found PFN
3176 *
3177 * Only IO mappings and raw PFN mappings are allowed.
3178 *
3179 * Returns zero and the pfn at @pfn on success, -ve otherwise.
3180 */
3181int follow_pfn(struct vm_area_struct *vma, unsigned long address,
3182 unsigned long *pfn)
3183{
3184 int ret = -EINVAL;
3185 spinlock_t *ptl;
3186 pte_t *ptep;
3187
3188 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
3189 return ret;
3048 3190
3191 ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
3192 if (ret)
3193 return ret;
3194 *pfn = pte_pfn(*ptep);
3195 pte_unmap_unlock(ptep, ptl);
3196 return 0;
3197}
3198EXPORT_SYMBOL(follow_pfn);
3199
3200#ifdef CONFIG_HAVE_IOREMAP_PROT
3201int follow_phys(struct vm_area_struct *vma,
3202 unsigned long address, unsigned int flags,
3203 unsigned long *prot, resource_size_t *phys)
3204{
3205 int ret = -EINVAL;
3206 pte_t *ptep, pte;
3207 spinlock_t *ptl;
3208
3209 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
3210 goto out;
3211
3212 if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
3213 goto out;
3049 pte = *ptep; 3214 pte = *ptep;
3050 if (!pte_present(pte)) 3215
3051 goto unlock;
3052 if ((flags & FOLL_WRITE) && !pte_write(pte)) 3216 if ((flags & FOLL_WRITE) && !pte_write(pte))
3053 goto unlock; 3217 goto unlock;
3054 phys_addr = pte_pfn(pte);
3055 phys_addr <<= PAGE_SHIFT; /* Shift here to avoid overflow on PAE */
3056 3218
3057 *prot = pgprot_val(pte_pgprot(pte)); 3219 *prot = pgprot_val(pte_pgprot(pte));
3058 *phys = phys_addr; 3220 *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
3059 ret = 0;
3060 3221
3222 ret = 0;
3061unlock: 3223unlock:
3062 pte_unmap_unlock(ptep, ptl); 3224 pte_unmap_unlock(ptep, ptl);
3063out: 3225out:
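follow_pfn(), exported above, gives drivers a safe way to look up the frame behind an address in a VM_IO or VM_PFNMAP mapping (for example one created earlier with remap_pfn_range()). A hedged sketch of the call pattern; the wrapper name is illustrative:

/* Illustrative only: translate a user virtual address inside a PFN mapping. */
static int example_lookup_pfn(struct mm_struct *mm, unsigned long uaddr,
			      unsigned long *pfn)
{
	struct vm_area_struct *vma;
	int ret = -EFAULT;

	down_read(&mm->mmap_sem);
	vma = find_vma(mm, uaddr);
	if (vma && uaddr >= vma->vm_start)
		/* follow_pfn() itself rejects anything not VM_IO/VM_PFNMAP */
		ret = follow_pfn(vma, uaddr, pfn);
	up_read(&mm->mmap_sem);

	return ret;
}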
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index c083cf5fd6df..2047465cd27c 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -26,6 +26,7 @@
26#include <linux/migrate.h> 26#include <linux/migrate.h>
27#include <linux/page-isolation.h> 27#include <linux/page-isolation.h>
28#include <linux/pfn.h> 28#include <linux/pfn.h>
29#include <linux/suspend.h>
29 30
30#include <asm/tlbflush.h> 31#include <asm/tlbflush.h>
31 32
@@ -339,8 +340,11 @@ EXPORT_SYMBOL_GPL(__remove_pages);
339 340
340void online_page(struct page *page) 341void online_page(struct page *page)
341{ 342{
343 unsigned long pfn = page_to_pfn(page);
344
342 totalram_pages++; 345 totalram_pages++;
343 num_physpages++; 346 if (pfn >= num_physpages)
347 num_physpages = pfn + 1;
344 348
345#ifdef CONFIG_HIGHMEM 349#ifdef CONFIG_HIGHMEM
346 if (PageHighMem(page)) 350 if (PageHighMem(page))
@@ -410,7 +414,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
410 if (!populated_zone(zone)) 414 if (!populated_zone(zone))
411 need_zonelists_rebuild = 1; 415 need_zonelists_rebuild = 1;
412 416
413 ret = walk_memory_resource(pfn, nr_pages, &onlined_pages, 417 ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
414 online_pages_range); 418 online_pages_range);
415 if (ret) { 419 if (ret) {
416 printk(KERN_DEBUG "online_pages %lx at %lx failed\n", 420 printk(KERN_DEBUG "online_pages %lx at %lx failed\n",
@@ -422,7 +426,9 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
422 zone->present_pages += onlined_pages; 426 zone->present_pages += onlined_pages;
423 zone->zone_pgdat->node_present_pages += onlined_pages; 427 zone->zone_pgdat->node_present_pages += onlined_pages;
424 428
425 setup_per_zone_pages_min(); 429 zone_pcp_update(zone);
430 setup_per_zone_wmarks();
431 calculate_zone_inactive_ratio(zone);
426 if (onlined_pages) { 432 if (onlined_pages) {
427 kswapd_run(zone_to_nid(zone)); 433 kswapd_run(zone_to_nid(zone));
428 node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); 434 node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
@@ -442,7 +448,8 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
442} 448}
443#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ 449#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
444 450
445static pg_data_t *hotadd_new_pgdat(int nid, u64 start) 451/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
452static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
446{ 453{
447 struct pglist_data *pgdat; 454 struct pglist_data *pgdat;
448 unsigned long zones_size[MAX_NR_ZONES] = {0}; 455 unsigned long zones_size[MAX_NR_ZONES] = {0};
@@ -479,14 +486,18 @@ int __ref add_memory(int nid, u64 start, u64 size)
479 struct resource *res; 486 struct resource *res;
480 int ret; 487 int ret;
481 488
489 lock_system_sleep();
490
482 res = register_memory_resource(start, size); 491 res = register_memory_resource(start, size);
492 ret = -EEXIST;
483 if (!res) 493 if (!res)
484 return -EEXIST; 494 goto out;
485 495
486 if (!node_online(nid)) { 496 if (!node_online(nid)) {
487 pgdat = hotadd_new_pgdat(nid, start); 497 pgdat = hotadd_new_pgdat(nid, start);
498 ret = -ENOMEM;
488 if (!pgdat) 499 if (!pgdat)
489 return -ENOMEM; 500 goto out;
490 new_pgdat = 1; 501 new_pgdat = 1;
491 } 502 }
492 503
@@ -509,7 +520,8 @@ int __ref add_memory(int nid, u64 start, u64 size)
509 BUG_ON(ret); 520 BUG_ON(ret);
510 } 521 }
511 522
512 return ret; 523 goto out;
524
513error: 525error:
514 /* rollback pgdat allocation and others */ 526 /* rollback pgdat allocation and others */
515 if (new_pgdat) 527 if (new_pgdat)
@@ -517,6 +529,8 @@ error:
517 if (res) 529 if (res)
518 release_memory_resource(res); 530 release_memory_resource(res);
519 531
532out:
533 unlock_system_sleep();
520 return ret; 534 return ret;
521} 535}
522EXPORT_SYMBOL_GPL(add_memory); 536EXPORT_SYMBOL_GPL(add_memory);
@@ -700,7 +714,7 @@ offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
700static void 714static void
701offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) 715offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
702{ 716{
703 walk_memory_resource(start_pfn, end_pfn - start_pfn, NULL, 717 walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL,
704 offline_isolated_pages_cb); 718 offline_isolated_pages_cb);
705} 719}
706 720
@@ -726,7 +740,7 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
726 long offlined = 0; 740 long offlined = 0;
727 int ret; 741 int ret;
728 742
729 ret = walk_memory_resource(start_pfn, end_pfn - start_pfn, &offlined, 743 ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined,
730 check_pages_isolated_cb); 744 check_pages_isolated_cb);
731 if (ret < 0) 745 if (ret < 0)
732 offlined = (long)ret; 746 offlined = (long)ret;
@@ -753,6 +767,8 @@ int offline_pages(unsigned long start_pfn,
753 if (!test_pages_in_a_zone(start_pfn, end_pfn)) 767 if (!test_pages_in_a_zone(start_pfn, end_pfn))
754 return -EINVAL; 768 return -EINVAL;
755 769
770 lock_system_sleep();
771
756 zone = page_zone(pfn_to_page(start_pfn)); 772 zone = page_zone(pfn_to_page(start_pfn));
757 node = zone_to_nid(zone); 773 node = zone_to_nid(zone);
758 nr_pages = end_pfn - start_pfn; 774 nr_pages = end_pfn - start_pfn;
@@ -760,7 +776,7 @@ int offline_pages(unsigned long start_pfn,
760 /* set above range as isolated */ 776 /* set above range as isolated */
761 ret = start_isolate_page_range(start_pfn, end_pfn); 777 ret = start_isolate_page_range(start_pfn, end_pfn);
762 if (ret) 778 if (ret)
763 return ret; 779 goto out;
764 780
765 arg.start_pfn = start_pfn; 781 arg.start_pfn = start_pfn;
766 arg.nr_pages = nr_pages; 782 arg.nr_pages = nr_pages;
@@ -830,12 +846,15 @@ repeat:
830 zone->present_pages -= offlined_pages; 846 zone->present_pages -= offlined_pages;
831 zone->zone_pgdat->node_present_pages -= offlined_pages; 847 zone->zone_pgdat->node_present_pages -= offlined_pages;
832 totalram_pages -= offlined_pages; 848 totalram_pages -= offlined_pages;
833 num_physpages -= offlined_pages; 849
850 setup_per_zone_wmarks();
851 calculate_zone_inactive_ratio(zone);
834 852
835 vm_total_pages = nr_free_pagecache_pages(); 853 vm_total_pages = nr_free_pagecache_pages();
836 writeback_set_ratelimit(); 854 writeback_set_ratelimit();
837 855
838 memory_notify(MEM_OFFLINE, &arg); 856 memory_notify(MEM_OFFLINE, &arg);
857 unlock_system_sleep();
839 return 0; 858 return 0;
840 859
841failed_removal: 860failed_removal:
@@ -845,6 +864,8 @@ failed_removal:
845 /* pushback to free area */ 864 /* pushback to free area */
846 undo_isolate_page_range(start_pfn, end_pfn); 865 undo_isolate_page_range(start_pfn, end_pfn);
847 866
867out:
868 unlock_system_sleep();
848 return ret; 869 return ret;
849} 870}
850 871
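The memory_hotplug.c hunks above switch the online/offline paths to walk_system_ram_range(), recompute the zone watermarks and inactive ratio, and bracket add_memory()/offline_pages() with lock_system_sleep()/unlock_system_sleep(). As a rough user-space illustration (not part of this commit; the helper name and the block index taken from argv[1] are made up), these kernel paths are normally driven through the memory sysfs interface:

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>

static int set_memory_block_state(int block, const char *state)
{
	char path[128];
	FILE *f;
	int ret = 0;

	snprintf(path, sizeof(path),
		 "/sys/devices/system/memory/memory%d/state", block);
	f = fopen(path, "w");
	if (!f)
		return -errno;
	/* "online" ends up in online_pages(), "offline" in offline_pages() */
	if (fputs(state, f) == EOF)
		ret = -EIO;
	fclose(f);
	return ret;
}

int main(int argc, char **argv)
{
	int block = argc > 1 ? atoi(argv[1]) : 0;	/* illustrative block index */

	if (set_memory_block_state(block, "offline"))
		perror("offline");
	if (set_memory_block_state(block, "online"))
		perror("online");
	return 0;
}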
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 3eb4a6fdc043..4545d5944243 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -182,13 +182,58 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
182 return 0; 182 return 0;
183} 183}
184 184
185/* Create a new policy */ 185/*
186 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
187 * any, for the new policy. mpol_new() has already validated the nodes
188 * parameter with respect to the policy mode and flags. But, we need to
189 * handle an empty nodemask with MPOL_PREFERRED here.
190 *
191 * Must be called holding task's alloc_lock to protect task's mems_allowed
192 * and mempolicy. May also be called holding the mmap_semaphore for write.
193 */
194static int mpol_set_nodemask(struct mempolicy *pol,
195 const nodemask_t *nodes, struct nodemask_scratch *nsc)
196{
197 int ret;
198
199 /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
200 if (pol == NULL)
201 return 0;
202 /* Check N_HIGH_MEMORY */
203 nodes_and(nsc->mask1,
204 cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]);
205
206 VM_BUG_ON(!nodes);
207 if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
208 nodes = NULL; /* explicit local allocation */
209 else {
210 if (pol->flags & MPOL_F_RELATIVE_NODES)
211 mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1);
212 else
213 nodes_and(nsc->mask2, *nodes, nsc->mask1);
214
215 if (mpol_store_user_nodemask(pol))
216 pol->w.user_nodemask = *nodes;
217 else
218 pol->w.cpuset_mems_allowed =
219 cpuset_current_mems_allowed;
220 }
221
222 if (nodes)
223 ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
224 else
225 ret = mpol_ops[pol->mode].create(pol, NULL);
226 return ret;
227}
228
229/*
230 * This function just creates a new policy, does some check and simple
231 * initialization. You must invoke mpol_set_nodemask() to set nodes.
232 */
186static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, 233static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
187 nodemask_t *nodes) 234 nodemask_t *nodes)
188{ 235{
189 struct mempolicy *policy; 236 struct mempolicy *policy;
190 nodemask_t cpuset_context_nmask;
191 int ret;
192 237
193 pr_debug("setting mode %d flags %d nodes[0] %lx\n", 238 pr_debug("setting mode %d flags %d nodes[0] %lx\n",
194 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1); 239 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
@@ -210,7 +255,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
210 if (((flags & MPOL_F_STATIC_NODES) || 255 if (((flags & MPOL_F_STATIC_NODES) ||
211 (flags & MPOL_F_RELATIVE_NODES))) 256 (flags & MPOL_F_RELATIVE_NODES)))
212 return ERR_PTR(-EINVAL); 257 return ERR_PTR(-EINVAL);
213 nodes = NULL; /* flag local alloc */
214 } 258 }
215 } else if (nodes_empty(*nodes)) 259 } else if (nodes_empty(*nodes))
216 return ERR_PTR(-EINVAL); 260 return ERR_PTR(-EINVAL);
@@ -221,30 +265,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
221 policy->mode = mode; 265 policy->mode = mode;
222 policy->flags = flags; 266 policy->flags = flags;
223 267
224 if (nodes) {
225 /*
226 * cpuset related setup doesn't apply to local allocation
227 */
228 cpuset_update_task_memory_state();
229 if (flags & MPOL_F_RELATIVE_NODES)
230 mpol_relative_nodemask(&cpuset_context_nmask, nodes,
231 &cpuset_current_mems_allowed);
232 else
233 nodes_and(cpuset_context_nmask, *nodes,
234 cpuset_current_mems_allowed);
235 if (mpol_store_user_nodemask(policy))
236 policy->w.user_nodemask = *nodes;
237 else
238 policy->w.cpuset_mems_allowed =
239 cpuset_mems_allowed(current);
240 }
241
242 ret = mpol_ops[mode].create(policy,
243 nodes ? &cpuset_context_nmask : NULL);
244 if (ret < 0) {
245 kmem_cache_free(policy_cache, policy);
246 return ERR_PTR(ret);
247 }
248 return policy; 268 return policy;
249} 269}
250 270
@@ -324,6 +344,8 @@ static void mpol_rebind_policy(struct mempolicy *pol,
324/* 344/*
325 * Wrapper for mpol_rebind_policy() that just requires task 345 * Wrapper for mpol_rebind_policy() that just requires task
326 * pointer, and updates task mempolicy. 346 * pointer, and updates task mempolicy.
347 *
348 * Called with task's alloc_lock held.
327 */ 349 */
328 350
329void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) 351void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
@@ -600,13 +622,19 @@ static void mpol_set_task_struct_flag(void)
600static long do_set_mempolicy(unsigned short mode, unsigned short flags, 622static long do_set_mempolicy(unsigned short mode, unsigned short flags,
601 nodemask_t *nodes) 623 nodemask_t *nodes)
602{ 624{
603 struct mempolicy *new; 625 struct mempolicy *new, *old;
604 struct mm_struct *mm = current->mm; 626 struct mm_struct *mm = current->mm;
627 NODEMASK_SCRATCH(scratch);
628 int ret;
605 629
606 new = mpol_new(mode, flags, nodes); 630 if (!scratch)
607 if (IS_ERR(new)) 631 return -ENOMEM;
608 return PTR_ERR(new);
609 632
633 new = mpol_new(mode, flags, nodes);
634 if (IS_ERR(new)) {
635 ret = PTR_ERR(new);
636 goto out;
637 }
610 /* 638 /*
611 * prevent changing our mempolicy while show_numa_maps() 639 * prevent changing our mempolicy while show_numa_maps()
612 * is using it. 640 * is using it.
@@ -615,20 +643,36 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
615 */ 643 */
616 if (mm) 644 if (mm)
617 down_write(&mm->mmap_sem); 645 down_write(&mm->mmap_sem);
618 mpol_put(current->mempolicy); 646 task_lock(current);
647 ret = mpol_set_nodemask(new, nodes, scratch);
648 if (ret) {
649 task_unlock(current);
650 if (mm)
651 up_write(&mm->mmap_sem);
652 mpol_put(new);
653 goto out;
654 }
655 old = current->mempolicy;
619 current->mempolicy = new; 656 current->mempolicy = new;
620 mpol_set_task_struct_flag(); 657 mpol_set_task_struct_flag();
621 if (new && new->mode == MPOL_INTERLEAVE && 658 if (new && new->mode == MPOL_INTERLEAVE &&
622 nodes_weight(new->v.nodes)) 659 nodes_weight(new->v.nodes))
623 current->il_next = first_node(new->v.nodes); 660 current->il_next = first_node(new->v.nodes);
661 task_unlock(current);
624 if (mm) 662 if (mm)
625 up_write(&mm->mmap_sem); 663 up_write(&mm->mmap_sem);
626 664
627 return 0; 665 mpol_put(old);
666 ret = 0;
667out:
668 NODEMASK_SCRATCH_FREE(scratch);
669 return ret;
628} 670}
629 671
630/* 672/*
631 * Return nodemask for policy for get_mempolicy() query 673 * Return nodemask for policy for get_mempolicy() query
674 *
675 * Called with task's alloc_lock held
632 */ 676 */
633static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) 677static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
634{ 678{
@@ -674,7 +718,6 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
674 struct vm_area_struct *vma = NULL; 718 struct vm_area_struct *vma = NULL;
675 struct mempolicy *pol = current->mempolicy; 719 struct mempolicy *pol = current->mempolicy;
676 720
677 cpuset_update_task_memory_state();
678 if (flags & 721 if (flags &
679 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) 722 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
680 return -EINVAL; 723 return -EINVAL;
@@ -683,7 +726,9 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
683 if (flags & (MPOL_F_NODE|MPOL_F_ADDR)) 726 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
684 return -EINVAL; 727 return -EINVAL;
685 *policy = 0; /* just so it's initialized */ 728 *policy = 0; /* just so it's initialized */
729 task_lock(current);
686 *nmask = cpuset_current_mems_allowed; 730 *nmask = cpuset_current_mems_allowed;
731 task_unlock(current);
687 return 0; 732 return 0;
688 } 733 }
689 734
@@ -738,8 +783,11 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
738 } 783 }
739 784
740 err = 0; 785 err = 0;
741 if (nmask) 786 if (nmask) {
787 task_lock(current);
742 get_policy_nodemask(pol, nmask); 788 get_policy_nodemask(pol, nmask);
789 task_unlock(current);
790 }
743 791
744 out: 792 out:
745 mpol_cond_put(pol); 793 mpol_cond_put(pol);
@@ -767,7 +815,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
767 815
768static struct page *new_node_page(struct page *page, unsigned long node, int **x) 816static struct page *new_node_page(struct page *page, unsigned long node, int **x)
769{ 817{
770 return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0); 818 return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
771} 819}
772 820
773/* 821/*
@@ -976,9 +1024,24 @@ static long do_mbind(unsigned long start, unsigned long len,
976 1024
977 err = migrate_prep(); 1025 err = migrate_prep();
978 if (err) 1026 if (err)
979 return err; 1027 goto mpol_out;
980 } 1028 }
981 down_write(&mm->mmap_sem); 1029 {
1030 NODEMASK_SCRATCH(scratch);
1031 if (scratch) {
1032 down_write(&mm->mmap_sem);
1033 task_lock(current);
1034 err = mpol_set_nodemask(new, nmask, scratch);
1035 task_unlock(current);
1036 if (err)
1037 up_write(&mm->mmap_sem);
1038 } else
1039 err = -ENOMEM;
1040 NODEMASK_SCRATCH_FREE(scratch);
1041 }
1042 if (err)
1043 goto mpol_out;
1044
982 vma = check_range(mm, start, end, nmask, 1045 vma = check_range(mm, start, end, nmask,
983 flags | MPOL_MF_INVERT, &pagelist); 1046 flags | MPOL_MF_INVERT, &pagelist);
984 1047
@@ -994,9 +1057,11 @@ static long do_mbind(unsigned long start, unsigned long len,
994 1057
995 if (!err && nr_failed && (flags & MPOL_MF_STRICT)) 1058 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
996 err = -EIO; 1059 err = -EIO;
997 } 1060 } else
1061 putback_lru_pages(&pagelist);
998 1062
999 up_write(&mm->mmap_sem); 1063 up_write(&mm->mmap_sem);
1064 mpol_out:
1000 mpol_put(new); 1065 mpol_put(new);
1001 return err; 1066 return err;
1002} 1067}
@@ -1545,8 +1610,6 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1545 struct mempolicy *pol = get_vma_policy(current, vma, addr); 1610 struct mempolicy *pol = get_vma_policy(current, vma, addr);
1546 struct zonelist *zl; 1611 struct zonelist *zl;
1547 1612
1548 cpuset_update_task_memory_state();
1549
1550 if (unlikely(pol->mode == MPOL_INTERLEAVE)) { 1613 if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1551 unsigned nid; 1614 unsigned nid;
1552 1615
@@ -1593,8 +1656,6 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1593{ 1656{
1594 struct mempolicy *pol = current->mempolicy; 1657 struct mempolicy *pol = current->mempolicy;
1595 1658
1596 if ((gfp & __GFP_WAIT) && !in_interrupt())
1597 cpuset_update_task_memory_state();
1598 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) 1659 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1599 pol = &default_policy; 1660 pol = &default_policy;
1600 1661
@@ -1851,27 +1912,46 @@ restart:
1851 * Install non-NULL @mpol in inode's shared policy rb-tree. 1912 * Install non-NULL @mpol in inode's shared policy rb-tree.
1852 * On entry, the current task has a reference on a non-NULL @mpol. 1913 * On entry, the current task has a reference on a non-NULL @mpol.
1853 * This must be released on exit. 1914 * This must be released on exit.
1915 * This is called at get_inode() calls and we can use GFP_KERNEL.
1854 */ 1916 */
1855void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) 1917void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
1856{ 1918{
1919 int ret;
1920
1857 sp->root = RB_ROOT; /* empty tree == default mempolicy */ 1921 sp->root = RB_ROOT; /* empty tree == default mempolicy */
1858 spin_lock_init(&sp->lock); 1922 spin_lock_init(&sp->lock);
1859 1923
1860 if (mpol) { 1924 if (mpol) {
1861 struct vm_area_struct pvma; 1925 struct vm_area_struct pvma;
1862 struct mempolicy *new; 1926 struct mempolicy *new;
1927 NODEMASK_SCRATCH(scratch);
1863 1928
1929 if (!scratch)
1930 return;
1864 /* contextualize the tmpfs mount point mempolicy */ 1931 /* contextualize the tmpfs mount point mempolicy */
1865 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); 1932 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
1866 mpol_put(mpol); /* drop our ref on sb mpol */ 1933 if (IS_ERR(new)) {
1867 if (IS_ERR(new)) 1934 mpol_put(mpol); /* drop our ref on sb mpol */
1935 NODEMASK_SCRATCH_FREE(scratch);
1868 return; /* no valid nodemask intersection */ 1936 return; /* no valid nodemask intersection */
1937 }
1938
1939 task_lock(current);
1940 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
1941 task_unlock(current);
1942 mpol_put(mpol); /* drop our ref on sb mpol */
1943 if (ret) {
1944 NODEMASK_SCRATCH_FREE(scratch);
1945 mpol_put(new);
1946 return;
1947 }
1869 1948
1870 /* Create pseudo-vma that contains just the policy */ 1949 /* Create pseudo-vma that contains just the policy */
1871 memset(&pvma, 0, sizeof(struct vm_area_struct)); 1950 memset(&pvma, 0, sizeof(struct vm_area_struct));
1872 pvma.vm_end = TASK_SIZE; /* policy covers entire file */ 1951 pvma.vm_end = TASK_SIZE; /* policy covers entire file */
1873 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */ 1952 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
1874 mpol_put(new); /* drop initial ref */ 1953 mpol_put(new); /* drop initial ref */
1954 NODEMASK_SCRATCH_FREE(scratch);
1875 } 1955 }
1876} 1956}
1877 1957
@@ -2086,8 +2166,24 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2086 new = mpol_new(mode, mode_flags, &nodes); 2166 new = mpol_new(mode, mode_flags, &nodes);
2087 if (IS_ERR(new)) 2167 if (IS_ERR(new))
2088 err = 1; 2168 err = 1;
2089 else if (no_context) 2169 else {
2090 new->w.user_nodemask = nodes; /* save for contextualization */ 2170 int ret;
2171 NODEMASK_SCRATCH(scratch);
2172 if (scratch) {
2173 task_lock(current);
2174 ret = mpol_set_nodemask(new, &nodes, scratch);
2175 task_unlock(current);
2176 } else
2177 ret = -ENOMEM;
2178 NODEMASK_SCRATCH_FREE(scratch);
2179 if (ret) {
2180 err = 1;
2181 mpol_put(new);
2182 } else if (no_context) {
2183 /* save for contextualization */
2184 new->w.user_nodemask = nodes;
2185 }
2186 }
2091 2187
2092out: 2188out:
2093 /* Restore string for error message */ 2189 /* Restore string for error message */
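The mempolicy.c changes above split nodemask handling out of mpol_new() into mpol_set_nodemask(), called under task_lock() with a NODEMASK_SCRATCH buffer, and drop the cpuset_update_task_memory_state() calls. For orientation only (not from the commit; it assumes libnuma's <numaif.h>, linking with -lnuma, and that nodes 0 and 1 exist), the syscalls that land in do_set_mempolicy() and do_mbind() look like this from user space:

#include <numaif.h>
#include <sys/mman.h>
#include <stdio.h>

int main(void)
{
	unsigned long interleave_nodes = 0x3;	/* nodes 0 and 1 (assumed) */
	unsigned long bind_node = 0x1;		/* node 0 only */
	size_t len = 4UL << 20;
	void *buf;

	/* task policy: interleave new allocations across nodes 0-1 */
	if (set_mempolicy(MPOL_INTERLEAVE, &interleave_nodes,
			  sizeof(interleave_nodes) * 8))
		perror("set_mempolicy");

	buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* per-range policy: bind this mapping to node 0 and migrate any
	 * pages already allocated elsewhere */
	if (mbind(buf, len, MPOL_BIND, &bind_node, sizeof(bind_node) * 8,
		  MPOL_MF_MOVE | MPOL_MF_STRICT))
		perror("mbind");

	return 0;
}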
diff --git a/mm/mempool.c b/mm/mempool.c
index a46eb1b4bb66..1a3bc3d4d554 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -303,18 +303,11 @@ EXPORT_SYMBOL(mempool_free_slab);
303 */ 303 */
304void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data) 304void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data)
305{ 305{
306 size_t size = (size_t)(long)pool_data; 306 size_t size = (size_t)pool_data;
307 return kmalloc(size, gfp_mask); 307 return kmalloc(size, gfp_mask);
308} 308}
309EXPORT_SYMBOL(mempool_kmalloc); 309EXPORT_SYMBOL(mempool_kmalloc);
310 310
311void *mempool_kzalloc(gfp_t gfp_mask, void *pool_data)
312{
313 size_t size = (size_t) pool_data;
314 return kzalloc(size, gfp_mask);
315}
316EXPORT_SYMBOL(mempool_kzalloc);
317
318void mempool_kfree(void *element, void *pool_data) 311void mempool_kfree(void *element, void *pool_data)
319{ 312{
320 kfree(element); 313 kfree(element);
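mempool_kmalloc() above passes the element size through the opaque pool_data pointer, which is why the extra (long) cast could simply be dropped. A minimal in-kernel usage sketch of that convention (illustrative only, builds as part of a module; my_pool and the init/exit names are invented):

#include <linux/module.h>
#include <linux/mempool.h>
#include <linux/slab.h>

static mempool_t *my_pool;

static int __init my_pool_init(void)
{
	/* 16 pre-allocated 256-byte elements; the element size travels
	 * through pool_data, and mempool_kmalloc() casts it back to size_t */
	my_pool = mempool_create(16, mempool_kmalloc, mempool_kfree,
				 (void *)(size_t)256);
	return my_pool ? 0 : -ENOMEM;
}

static void __exit my_pool_exit(void)
{
	mempool_destroy(my_pool);
}

module_init(my_pool_init);
module_exit(my_pool_exit);
MODULE_LICENSE("GPL");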
diff --git a/mm/migrate.c b/mm/migrate.c
index a9eff3f092f6..7dbcb22316d2 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -67,6 +67,8 @@ int putback_lru_pages(struct list_head *l)
67 67
68 list_for_each_entry_safe(page, page2, l, lru) { 68 list_for_each_entry_safe(page, page2, l, lru) {
69 list_del(&page->lru); 69 list_del(&page->lru);
70 dec_zone_page_state(page, NR_ISOLATED_ANON +
71 page_is_file_cache(page));
70 putback_lru_page(page); 72 putback_lru_page(page);
71 count++; 73 count++;
72 } 74 }
@@ -147,7 +149,7 @@ out:
147static void remove_file_migration_ptes(struct page *old, struct page *new) 149static void remove_file_migration_ptes(struct page *old, struct page *new)
148{ 150{
149 struct vm_area_struct *vma; 151 struct vm_area_struct *vma;
150 struct address_space *mapping = page_mapping(new); 152 struct address_space *mapping = new->mapping;
151 struct prio_tree_iter iter; 153 struct prio_tree_iter iter;
152 pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 154 pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
153 155
@@ -250,7 +252,7 @@ out:
250 * The number of remaining references must be: 252 * The number of remaining references must be:
251 * 1 for anonymous pages without a mapping 253 * 1 for anonymous pages without a mapping
252 * 2 for pages with a mapping 254 * 2 for pages with a mapping
253 * 3 for pages with a mapping and PagePrivate set. 255 * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
254 */ 256 */
255static int migrate_page_move_mapping(struct address_space *mapping, 257static int migrate_page_move_mapping(struct address_space *mapping,
256 struct page *newpage, struct page *page) 258 struct page *newpage, struct page *page)
@@ -270,7 +272,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
270 pslot = radix_tree_lookup_slot(&mapping->page_tree, 272 pslot = radix_tree_lookup_slot(&mapping->page_tree,
271 page_index(page)); 273 page_index(page));
272 274
273 expected_count = 2 + !!PagePrivate(page); 275 expected_count = 2 + page_has_private(page);
274 if (page_count(page) != expected_count || 276 if (page_count(page) != expected_count ||
275 (struct page *)radix_tree_deref_slot(pslot) != page) { 277 (struct page *)radix_tree_deref_slot(pslot) != page) {
276 spin_unlock_irq(&mapping->tree_lock); 278 spin_unlock_irq(&mapping->tree_lock);
@@ -312,7 +314,10 @@ static int migrate_page_move_mapping(struct address_space *mapping,
312 */ 314 */
313 __dec_zone_page_state(page, NR_FILE_PAGES); 315 __dec_zone_page_state(page, NR_FILE_PAGES);
314 __inc_zone_page_state(newpage, NR_FILE_PAGES); 316 __inc_zone_page_state(newpage, NR_FILE_PAGES);
315 317 if (PageSwapBacked(page)) {
318 __dec_zone_page_state(page, NR_SHMEM);
319 __inc_zone_page_state(newpage, NR_SHMEM);
320 }
316 spin_unlock_irq(&mapping->tree_lock); 321 spin_unlock_irq(&mapping->tree_lock);
317 322
318 return 0; 323 return 0;
@@ -386,7 +391,7 @@ EXPORT_SYMBOL(fail_migrate_page);
386 391
387/* 392/*
388 * Common logic to directly migrate a single page suitable for 393 * Common logic to directly migrate a single page suitable for
389 * pages that do not use PagePrivate. 394 * pages that do not use PagePrivate/PagePrivate2.
390 * 395 *
391 * Pages are locked upon entry and exit. 396 * Pages are locked upon entry and exit.
392 */ 397 */
@@ -522,7 +527,7 @@ static int fallback_migrate_page(struct address_space *mapping,
522 * Buffers may be managed in a filesystem specific way. 527 * Buffers may be managed in a filesystem specific way.
523 * We must have no buffers or drop them. 528 * We must have no buffers or drop them.
524 */ 529 */
525 if (PagePrivate(page) && 530 if (page_has_private(page) &&
526 !try_to_release_page(page, GFP_KERNEL)) 531 !try_to_release_page(page, GFP_KERNEL))
527 return -EAGAIN; 532 return -EAGAIN;
528 533
@@ -597,7 +602,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
597 struct page *newpage = get_new_page(page, private, &result); 602 struct page *newpage = get_new_page(page, private, &result);
598 int rcu_locked = 0; 603 int rcu_locked = 0;
599 int charge = 0; 604 int charge = 0;
600 struct mem_cgroup *mem; 605 struct mem_cgroup *mem = NULL;
601 606
602 if (!newpage) 607 if (!newpage)
603 return -ENOMEM; 608 return -ENOMEM;
@@ -655,7 +660,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
655 * free the metadata, so the page can be freed. 660 * free the metadata, so the page can be freed.
656 */ 661 */
657 if (!page->mapping) { 662 if (!page->mapping) {
658 if (!PageAnon(page) && PagePrivate(page)) { 663 if (!PageAnon(page) && page_has_private(page)) {
659 /* 664 /*
660 * Go direct to try_to_free_buffers() here because 665 * Go direct to try_to_free_buffers() here because
661 * a) that's what try_to_release_page() would do anyway 666 * a) that's what try_to_release_page() would do anyway
@@ -664,13 +669,15 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
664 * needs to be effective. 669 * needs to be effective.
665 */ 670 */
666 try_to_free_buffers(page); 671 try_to_free_buffers(page);
672 goto rcu_unlock;
667 } 673 }
668 goto rcu_unlock; 674 goto skip_unmap;
669 } 675 }
670 676
671 /* Establish migration ptes or remove ptes */ 677 /* Establish migration ptes or remove ptes */
672 try_to_unmap(page, 1); 678 try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
673 679
680skip_unmap:
674 if (!page_mapped(page)) 681 if (!page_mapped(page))
675 rc = move_to_new_page(newpage, page); 682 rc = move_to_new_page(newpage, page);
676 683
@@ -693,6 +700,8 @@ unlock:
693 * restored. 700 * restored.
694 */ 701 */
695 list_del(&page->lru); 702 list_del(&page->lru);
703 dec_zone_page_state(page, NR_ISOLATED_ANON +
704 page_is_file_cache(page));
696 putback_lru_page(page); 705 putback_lru_page(page);
697 } 706 }
698 707
@@ -737,6 +746,13 @@ int migrate_pages(struct list_head *from,
737 struct page *page2; 746 struct page *page2;
738 int swapwrite = current->flags & PF_SWAPWRITE; 747 int swapwrite = current->flags & PF_SWAPWRITE;
739 int rc; 748 int rc;
749 unsigned long flags;
750
751 local_irq_save(flags);
752 list_for_each_entry(page, from, lru)
753 __inc_zone_page_state(page, NR_ISOLATED_ANON +
754 page_is_file_cache(page));
755 local_irq_restore(flags);
740 756
741 if (!swapwrite) 757 if (!swapwrite)
742 current->flags |= PF_SWAPWRITE; 758 current->flags |= PF_SWAPWRITE;
@@ -802,7 +818,7 @@ static struct page *new_page_node(struct page *p, unsigned long private,
802 818
803 *result = &pm->status; 819 *result = &pm->status;
804 820
805 return alloc_pages_node(pm->node, 821 return alloc_pages_exact_node(pm->node,
806 GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0); 822 GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0);
807} 823}
808 824
@@ -820,7 +836,6 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
820 struct page_to_node *pp; 836 struct page_to_node *pp;
821 LIST_HEAD(pagelist); 837 LIST_HEAD(pagelist);
822 838
823 migrate_prep();
824 down_read(&mm->mmap_sem); 839 down_read(&mm->mmap_sem);
825 840
826 /* 841 /*
@@ -907,6 +922,9 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
907 pm = (struct page_to_node *)__get_free_page(GFP_KERNEL); 922 pm = (struct page_to_node *)__get_free_page(GFP_KERNEL);
908 if (!pm) 923 if (!pm)
909 goto out; 924 goto out;
925
926 migrate_prep();
927
910 /* 928 /*
911 * Store a chunk of page_to_node array in a page, 929 * Store a chunk of page_to_node array in a page,
912 * but keep the last one as a marker 930 * but keep the last one as a marker
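The migrate.c hunks above add NR_ISOLATED_ANON/NR_ISOLATED_FILE accounting, switch PagePrivate checks to page_has_private(), and move migrate_prep() so it runs once per do_pages_move() chunk. A user-space sketch of the syscall that exercises this path (not from the commit; assumes libnuma's <numaif.h>, -lnuma, and that node 0 is allowed):

#include <numaif.h>
#include <sys/mman.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	long page_size = sysconf(_SC_PAGESIZE);
	void *page = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	void *pages[1] = { page };
	int nodes[1] = { 0 };
	int status[1] = { -1 };

	if (page == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(page, 0, page_size);	/* fault the page in first */

	/* pid 0 = calling process; ends up in do_pages_move() */
	if (move_pages(0, 1, pages, nodes, status, MPOL_MF_MOVE))
		perror("move_pages");
	else
		printf("page now on node %d\n", status[0]);
	return 0;
}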
diff --git a/mm/mlock.c b/mm/mlock.c
index cbe9e0581b75..bd6f0e466f6c 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -31,7 +31,6 @@ int can_do_mlock(void)
31} 31}
32EXPORT_SYMBOL(can_do_mlock); 32EXPORT_SYMBOL(can_do_mlock);
33 33
34#ifdef CONFIG_UNEVICTABLE_LRU
35/* 34/*
36 * Mlocked pages are marked with PageMlocked() flag for efficient testing 35 * Mlocked pages are marked with PageMlocked() flag for efficient testing
37 * in vmscan and, possibly, the fault path; and to support semi-accurate 36 * in vmscan and, possibly, the fault path; and to support semi-accurate
@@ -140,49 +139,36 @@ static void munlock_vma_page(struct page *page)
140} 139}
141 140
142/** 141/**
143 * __mlock_vma_pages_range() - mlock/munlock a range of pages in the vma. 142 * __mlock_vma_pages_range() - mlock a range of pages in the vma.
144 * @vma: target vma 143 * @vma: target vma
145 * @start: start address 144 * @start: start address
146 * @end: end address 145 * @end: end address
147 * @mlock: 0 indicate munlock, otherwise mlock.
148 * 146 *
149 * If @mlock == 0, unlock an mlocked range; 147 * This takes care of making the pages present too.
150 * else mlock the range of pages. This takes care of making the pages present ,
151 * too.
152 * 148 *
153 * return 0 on success, negative error code on error. 149 * return 0 on success, negative error code on error.
154 * 150 *
155 * vma->vm_mm->mmap_sem must be held for at least read. 151 * vma->vm_mm->mmap_sem must be held for at least read.
156 */ 152 */
157static long __mlock_vma_pages_range(struct vm_area_struct *vma, 153static long __mlock_vma_pages_range(struct vm_area_struct *vma,
158 unsigned long start, unsigned long end, 154 unsigned long start, unsigned long end)
159 int mlock)
160{ 155{
161 struct mm_struct *mm = vma->vm_mm; 156 struct mm_struct *mm = vma->vm_mm;
162 unsigned long addr = start; 157 unsigned long addr = start;
163 struct page *pages[16]; /* 16 gives a reasonable batch */ 158 struct page *pages[16]; /* 16 gives a reasonable batch */
164 int nr_pages = (end - start) / PAGE_SIZE; 159 int nr_pages = (end - start) / PAGE_SIZE;
165 int ret = 0; 160 int ret = 0;
166 int gup_flags = 0; 161 int gup_flags;
167 162
168 VM_BUG_ON(start & ~PAGE_MASK); 163 VM_BUG_ON(start & ~PAGE_MASK);
169 VM_BUG_ON(end & ~PAGE_MASK); 164 VM_BUG_ON(end & ~PAGE_MASK);
170 VM_BUG_ON(start < vma->vm_start); 165 VM_BUG_ON(start < vma->vm_start);
171 VM_BUG_ON(end > vma->vm_end); 166 VM_BUG_ON(end > vma->vm_end);
172 VM_BUG_ON((!rwsem_is_locked(&mm->mmap_sem)) && 167 VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
173 (atomic_read(&mm->mm_users) != 0));
174
175 /*
176 * mlock: don't page populate if vma has PROT_NONE permission.
177 * munlock: always do munlock although the vma has PROT_NONE
178 * permission, or SIGKILL is pending.
179 */
180 if (!mlock)
181 gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS |
182 GUP_FLAGS_IGNORE_SIGKILL;
183 168
169 gup_flags = FOLL_TOUCH | FOLL_GET;
184 if (vma->vm_flags & VM_WRITE) 170 if (vma->vm_flags & VM_WRITE)
185 gup_flags |= GUP_FLAGS_WRITE; 171 gup_flags |= FOLL_WRITE;
186 172
187 while (nr_pages > 0) { 173 while (nr_pages > 0) {
188 int i; 174 int i;
@@ -202,51 +188,45 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
202 * This can happen for, e.g., VM_NONLINEAR regions before 188 * This can happen for, e.g., VM_NONLINEAR regions before
203 * a page has been allocated and mapped at a given offset, 189 * a page has been allocated and mapped at a given offset,
204 * or for addresses that map beyond end of a file. 190 * or for addresses that map beyond end of a file.
205 * We'll mlock the the pages if/when they get faulted in. 191 * We'll mlock the pages if/when they get faulted in.
206 */ 192 */
207 if (ret < 0) 193 if (ret < 0)
208 break; 194 break;
209 if (ret == 0) {
210 /*
211 * We know the vma is there, so the only time
212 * we cannot get a single page should be an
213 * error (ret < 0) case.
214 */
215 WARN_ON(1);
216 break;
217 }
218 195
219 lru_add_drain(); /* push cached pages to LRU */ 196 lru_add_drain(); /* push cached pages to LRU */
220 197
221 for (i = 0; i < ret; i++) { 198 for (i = 0; i < ret; i++) {
222 struct page *page = pages[i]; 199 struct page *page = pages[i];
223 200
224 lock_page(page);
225 /*
226 * Because we lock page here and migration is blocked
227 * by the elevated reference, we need only check for
228 * page truncation (file-cache only).
229 */
230 if (page->mapping) { 201 if (page->mapping) {
231 if (mlock) 202 /*
203 * That preliminary check is mainly to avoid
204 * the pointless overhead of lock_page on the
205 * ZERO_PAGE: which might bounce very badly if
206 * there is contention. However, we're still
207 * dirtying its cacheline with get/put_page:
208 * we'll add another __get_user_pages flag to
209 * avoid it if that case turns out to matter.
210 */
211 lock_page(page);
212 /*
213 * Because we lock page here and migration is
214 * blocked by the elevated reference, we need
215 * only check for file-cache page truncation.
216 */
217 if (page->mapping)
232 mlock_vma_page(page); 218 mlock_vma_page(page);
233 else 219 unlock_page(page);
234 munlock_vma_page(page);
235 } 220 }
236 unlock_page(page); 221 put_page(page); /* ref from get_user_pages() */
237 put_page(page); /* ref from get_user_pages() */
238
239 /*
240 * here we assume that get_user_pages() has given us
241 * a list of virtually contiguous pages.
242 */
243 addr += PAGE_SIZE; /* for next get_user_pages() */
244 nr_pages--;
245 } 222 }
223
224 addr += ret * PAGE_SIZE;
225 nr_pages -= ret;
246 ret = 0; 226 ret = 0;
247 } 227 }
248 228
249 return ret; /* count entire vma as locked_vm */ 229 return ret; /* 0 or negative error code */
250} 230}
251 231
252/* 232/*
@@ -261,27 +241,6 @@ static int __mlock_posix_error_return(long retval)
261 return retval; 241 return retval;
262} 242}
263 243
264#else /* CONFIG_UNEVICTABLE_LRU */
265
266/*
267 * Just make pages present if VM_LOCKED. No-op if unlocking.
268 */
269static long __mlock_vma_pages_range(struct vm_area_struct *vma,
270 unsigned long start, unsigned long end,
271 int mlock)
272{
273 if (mlock && (vma->vm_flags & VM_LOCKED))
274 return make_pages_present(start, end);
275 return 0;
276}
277
278static inline int __mlock_posix_error_return(long retval)
279{
280 return 0;
281}
282
283#endif /* CONFIG_UNEVICTABLE_LRU */
284
285/** 244/**
286 * mlock_vma_pages_range() - mlock pages in specified vma range. 245 * mlock_vma_pages_range() - mlock pages in specified vma range.
287 * @vma - the vma containing the specfied address range 246 * @vma - the vma containing the specfied address range
@@ -311,7 +270,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma,
311 is_vm_hugetlb_page(vma) || 270 is_vm_hugetlb_page(vma) ||
312 vma == get_gate_vma(current))) { 271 vma == get_gate_vma(current))) {
313 272
314 __mlock_vma_pages_range(vma, start, end, 1); 273 __mlock_vma_pages_range(vma, start, end);
315 274
316 /* Hide errors from mmap() and other callers */ 275 /* Hide errors from mmap() and other callers */
317 return 0; 276 return 0;
@@ -332,7 +291,6 @@ no_mlock:
332 return nr_pages; /* error or pages NOT mlocked */ 291 return nr_pages; /* error or pages NOT mlocked */
333} 292}
334 293
335
336/* 294/*
337 * munlock_vma_pages_range() - munlock all pages in the vma range.' 295 * munlock_vma_pages_range() - munlock all pages in the vma range.'
338 * @vma - vma containing range to be munlock()ed. 296 * @vma - vma containing range to be munlock()ed.
@@ -352,10 +310,38 @@ no_mlock:
352 * free them. This will result in freeing mlocked pages. 310 * free them. This will result in freeing mlocked pages.
353 */ 311 */
354void munlock_vma_pages_range(struct vm_area_struct *vma, 312void munlock_vma_pages_range(struct vm_area_struct *vma,
355 unsigned long start, unsigned long end) 313 unsigned long start, unsigned long end)
356{ 314{
315 unsigned long addr;
316
317 lru_add_drain();
357 vma->vm_flags &= ~VM_LOCKED; 318 vma->vm_flags &= ~VM_LOCKED;
358 __mlock_vma_pages_range(vma, start, end, 0); 319
320 for (addr = start; addr < end; addr += PAGE_SIZE) {
321 struct page *page;
322 /*
323 * Although FOLL_DUMP is intended for get_dump_page(),
324 * it just so happens that its special treatment of the
325 * ZERO_PAGE (returning an error instead of doing get_page)
326 * suits munlock very well (and if somehow an abnormal page
327 * has sneaked into the range, we won't oops here: great).
328 */
329 page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
330 if (page && !IS_ERR(page)) {
331 lock_page(page);
332 /*
333 * Like in __mlock_vma_pages_range(),
334 * because we lock page here and migration is
335 * blocked by the elevated reference, we need
336 * only check for file-cache page truncation.
337 */
338 if (page->mapping)
339 munlock_vma_page(page);
340 unlock_page(page);
341 put_page(page);
342 }
343 cond_resched();
344 }
359} 345}
360 346
361/* 347/*
@@ -422,18 +408,14 @@ success:
422 * It's okay if try_to_unmap_one unmaps a page just after we 408 * It's okay if try_to_unmap_one unmaps a page just after we
423 * set VM_LOCKED, __mlock_vma_pages_range will bring it back. 409 * set VM_LOCKED, __mlock_vma_pages_range will bring it back.
424 */ 410 */
425 vma->vm_flags = newflags;
426 411
427 if (lock) { 412 if (lock) {
428 ret = __mlock_vma_pages_range(vma, start, end, 1); 413 vma->vm_flags = newflags;
429 414 ret = __mlock_vma_pages_range(vma, start, end);
430 if (ret > 0) { 415 if (ret < 0)
431 mm->locked_vm -= ret; 416 ret = __mlock_posix_error_return(ret);
432 ret = 0;
433 } else
434 ret = __mlock_posix_error_return(ret); /* translate if needed */
435 } else { 417 } else {
436 __mlock_vma_pages_range(vma, start, end, 0); 418 munlock_vma_pages_range(vma, start, end);
437 } 419 }
438 420
439out: 421out:
@@ -629,52 +611,43 @@ void user_shm_unlock(size_t size, struct user_struct *user)
629 free_uid(user); 611 free_uid(user);
630} 612}
631 613
632void *alloc_locked_buffer(size_t size) 614int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim,
615 size_t size)
633{ 616{
634 unsigned long rlim, vm, pgsz; 617 unsigned long lim, vm, pgsz;
635 void *buffer = NULL; 618 int error = -ENOMEM;
636 619
637 pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; 620 pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
638 621
639 down_write(&current->mm->mmap_sem); 622 down_write(&mm->mmap_sem);
640 623
641 rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; 624 lim = rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
642 vm = current->mm->total_vm + pgsz; 625 vm = mm->total_vm + pgsz;
643 if (rlim < vm) 626 if (lim < vm)
644 goto out; 627 goto out;
645 628
646 rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; 629 lim = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
647 vm = current->mm->locked_vm + pgsz; 630 vm = mm->locked_vm + pgsz;
648 if (rlim < vm) 631 if (lim < vm)
649 goto out; 632 goto out;
650 633
651 buffer = kzalloc(size, GFP_KERNEL); 634 mm->total_vm += pgsz;
652 if (!buffer) 635 mm->locked_vm += pgsz;
653 goto out;
654
655 current->mm->total_vm += pgsz;
656 current->mm->locked_vm += pgsz;
657 636
637 error = 0;
658 out: 638 out:
659 up_write(&current->mm->mmap_sem); 639 up_write(&mm->mmap_sem);
660 return buffer; 640 return error;
661} 641}
662 642
663void release_locked_buffer(void *buffer, size_t size) 643void refund_locked_memory(struct mm_struct *mm, size_t size)
664{ 644{
665 unsigned long pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; 645 unsigned long pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
666 646
667 down_write(&current->mm->mmap_sem); 647 down_write(&mm->mmap_sem);
668
669 current->mm->total_vm -= pgsz;
670 current->mm->locked_vm -= pgsz;
671
672 up_write(&current->mm->mmap_sem);
673}
674 648
675void free_locked_buffer(void *buffer, size_t size) 649 mm->total_vm -= pgsz;
676{ 650 mm->locked_vm -= pgsz;
677 release_locked_buffer(buffer, size);
678 651
679 kfree(buffer); 652 up_write(&mm->mmap_sem);
680} 653}
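With the rework above, __mlock_vma_pages_range() only mlocks, while munlock_vma_pages_range() walks the range itself with follow_page(FOLL_GET | FOLL_DUMP). A trivial user-space sequence that exercises both paths (illustrative, not part of the commit; may need a raised RLIMIT_MEMLOCK):

#include <sys/mman.h>
#include <stdio.h>

int main(void)
{
	size_t len = 1UL << 20;
	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* faults the range in and marks the pages mlocked */
	if (mlock(buf, len))
		perror("mlock (check RLIMIT_MEMLOCK)");

	/* walks the range page by page and clears the mlocked state */
	if (munlock(buf, len))
		perror("munlock");

	munmap(buf, len);
	return 0;
}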
diff --git a/mm/mmap.c b/mm/mmap.c
index 1abb9185a686..292ddc3cef9c 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -20,7 +20,6 @@
20#include <linux/fs.h> 20#include <linux/fs.h>
21#include <linux/personality.h> 21#include <linux/personality.h>
22#include <linux/security.h> 22#include <linux/security.h>
23#include <linux/ima.h>
24#include <linux/hugetlb.h> 23#include <linux/hugetlb.h>
25#include <linux/profile.h> 24#include <linux/profile.h>
26#include <linux/module.h> 25#include <linux/module.h>
@@ -28,6 +27,7 @@
28#include <linux/mempolicy.h> 27#include <linux/mempolicy.h>
29#include <linux/rmap.h> 28#include <linux/rmap.h>
30#include <linux/mmu_notifier.h> 29#include <linux/mmu_notifier.h>
30#include <linux/perf_event.h>
31 31
32#include <asm/uaccess.h> 32#include <asm/uaccess.h>
33#include <asm/cacheflush.h> 33#include <asm/cacheflush.h>
@@ -85,7 +85,7 @@ EXPORT_SYMBOL(vm_get_page_prot);
85int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ 85int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
86int sysctl_overcommit_ratio = 50; /* default is 50% */ 86int sysctl_overcommit_ratio = 50; /* default is 50% */
87int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; 87int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
88atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0); 88struct percpu_counter vm_committed_as;
89 89
90/* 90/*
91 * Check that a process has enough memory to allocate a new virtual 91 * Check that a process has enough memory to allocate a new virtual
@@ -179,11 +179,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
179 if (mm) 179 if (mm)
180 allowed -= mm->total_vm / 32; 180 allowed -= mm->total_vm / 32;
181 181
182 /* 182 if (percpu_counter_read_positive(&vm_committed_as) < allowed)
183 * cast `allowed' as a signed long because vm_committed_space
184 * sometimes has a negative value
185 */
186 if (atomic_long_read(&vm_committed_space) < (long)allowed)
187 return 0; 183 return 0;
188error: 184error:
189 vm_unacct_memory(pages); 185 vm_unacct_memory(pages);
@@ -573,9 +569,9 @@ again: remove_next = 1 + (end > next->vm_end);
573 569
574 /* 570 /*
575 * When changing only vma->vm_end, we don't really need 571 * When changing only vma->vm_end, we don't really need
576 * anon_vma lock: but is that case worth optimizing out? 572 * anon_vma lock.
577 */ 573 */
578 if (vma->anon_vma) 574 if (vma->anon_vma && (insert || importer || start != vma->vm_start))
579 anon_vma = vma->anon_vma; 575 anon_vma = vma->anon_vma;
580 if (anon_vma) { 576 if (anon_vma) {
581 spin_lock(&anon_vma->lock); 577 spin_lock(&anon_vma->lock);
@@ -659,9 +655,6 @@ again: remove_next = 1 + (end > next->vm_end);
659 validate_mm(mm); 655 validate_mm(mm);
660} 656}
661 657
662/* Flags that can be inherited from an existing mapping when merging */
663#define VM_MERGEABLE_FLAGS (VM_CAN_NONLINEAR)
664
665/* 658/*
666 * If the vma has a ->close operation then the driver probably needs to release 659 * If the vma has a ->close operation then the driver probably needs to release
667 * per-vma resources, so we don't attempt to merge those. 660 * per-vma resources, so we don't attempt to merge those.
@@ -669,7 +662,8 @@ again: remove_next = 1 + (end > next->vm_end);
669static inline int is_mergeable_vma(struct vm_area_struct *vma, 662static inline int is_mergeable_vma(struct vm_area_struct *vma,
670 struct file *file, unsigned long vm_flags) 663 struct file *file, unsigned long vm_flags)
671{ 664{
672 if ((vma->vm_flags ^ vm_flags) & ~VM_MERGEABLE_FLAGS) 665 /* VM_CAN_NONLINEAR may get set later by f_op->mmap() */
666 if ((vma->vm_flags ^ vm_flags) & ~VM_CAN_NONLINEAR)
673 return 0; 667 return 0;
674 if (vma->vm_file != file) 668 if (vma->vm_file != file)
675 return 0; 669 return 0;
@@ -908,7 +902,7 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags,
908#endif /* CONFIG_PROC_FS */ 902#endif /* CONFIG_PROC_FS */
909 903
910/* 904/*
911 * The caller must hold down_write(current->mm->mmap_sem). 905 * The caller must hold down_write(&current->mm->mmap_sem).
912 */ 906 */
913 907
914unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, 908unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
@@ -954,6 +948,24 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
954 if (mm->map_count > sysctl_max_map_count) 948 if (mm->map_count > sysctl_max_map_count)
955 return -ENOMEM; 949 return -ENOMEM;
956 950
951 if (flags & MAP_HUGETLB) {
952 struct user_struct *user = NULL;
953 if (file)
954 return -EINVAL;
955
956 /*
957 * VM_NORESERVE is used because the reservations will be
958 * taken when vm_ops->mmap() is called
959 * A dummy user value is used because we are not locking
960 * memory so no accounting is necessary
961 */
962 len = ALIGN(len, huge_page_size(&default_hstate));
963 file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE,
964 &user, HUGETLB_ANONHUGE_INODE);
965 if (IS_ERR(file))
966 return PTR_ERR(file);
967 }
968
957 /* Obtain the address to map to. we verify (or select) it and ensure 969 /* Obtain the address to map to. we verify (or select) it and ensure
958 * that it represents a valid section of the address space. 970 * that it represents a valid section of the address space.
959 */ 971 */
@@ -968,11 +980,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
968 vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) | 980 vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
969 mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; 981 mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
970 982
971 if (flags & MAP_LOCKED) { 983 if (flags & MAP_LOCKED)
972 if (!can_do_mlock()) 984 if (!can_do_mlock())
973 return -EPERM; 985 return -EPERM;
974 vm_flags |= VM_LOCKED;
975 }
976 986
977 /* mlock MCL_FUTURE? */ 987 /* mlock MCL_FUTURE? */
978 if (vm_flags & VM_LOCKED) { 988 if (vm_flags & VM_LOCKED) {
@@ -1050,9 +1060,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1050 error = security_file_mmap(file, reqprot, prot, flags, addr, 0); 1060 error = security_file_mmap(file, reqprot, prot, flags, addr, 0);
1051 if (error) 1061 if (error)
1052 return error; 1062 return error;
1053 error = ima_file_mmap(file, prot);
1054 if (error)
1055 return error;
1056 1063
1057 return mmap_region(file, addr, len, flags, vm_flags, pgoff); 1064 return mmap_region(file, addr, len, flags, vm_flags, pgoff);
1058} 1065}
@@ -1198,21 +1205,21 @@ munmap_back:
1198 goto unmap_and_free_vma; 1205 goto unmap_and_free_vma;
1199 if (vm_flags & VM_EXECUTABLE) 1206 if (vm_flags & VM_EXECUTABLE)
1200 added_exe_file_vma(mm); 1207 added_exe_file_vma(mm);
1208
1209 /* Can addr have changed??
1210 *
1211 * Answer: Yes, several device drivers can do it in their
1212 * f_op->mmap method. -DaveM
1213 */
1214 addr = vma->vm_start;
1215 pgoff = vma->vm_pgoff;
1216 vm_flags = vma->vm_flags;
1201 } else if (vm_flags & VM_SHARED) { 1217 } else if (vm_flags & VM_SHARED) {
1202 error = shmem_zero_setup(vma); 1218 error = shmem_zero_setup(vma);
1203 if (error) 1219 if (error)
1204 goto free_vma; 1220 goto free_vma;
1205 } 1221 }
1206 1222
1207 /* Can addr have changed??
1208 *
1209 * Answer: Yes, several device drivers can do it in their
1210 * f_op->mmap method. -DaveM
1211 */
1212 addr = vma->vm_start;
1213 pgoff = vma->vm_pgoff;
1214 vm_flags = vma->vm_flags;
1215
1216 if (vma_wants_writenotify(vma)) 1223 if (vma_wants_writenotify(vma))
1217 vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED); 1224 vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED);
1218 1225
@@ -1223,6 +1230,8 @@ munmap_back:
1223 if (correct_wcount) 1230 if (correct_wcount)
1224 atomic_inc(&inode->i_writecount); 1231 atomic_inc(&inode->i_writecount);
1225out: 1232out:
1233 perf_event_mmap(vma);
1234
1226 mm->total_vm += len >> PAGE_SHIFT; 1235 mm->total_vm += len >> PAGE_SHIFT;
1227 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); 1236 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
1228 if (vm_flags & VM_LOCKED) { 1237 if (vm_flags & VM_LOCKED) {
@@ -1575,7 +1584,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
1575 * Overcommit.. This must be the final test, as it will 1584 * Overcommit.. This must be the final test, as it will
1576 * update security statistics. 1585 * update security statistics.
1577 */ 1586 */
1578 if (security_vm_enough_memory(grow)) 1587 if (security_vm_enough_memory_mm(mm, grow))
1579 return -ENOMEM; 1588 return -ENOMEM;
1580 1589
1581 /* Ok, everything looks good - let it rip */ 1590 /* Ok, everything looks good - let it rip */
@@ -2112,6 +2121,7 @@ void exit_mmap(struct mm_struct *mm)
2112 /* Use -1 here to ensure all VMAs in the mm are unmapped */ 2121 /* Use -1 here to ensure all VMAs in the mm are unmapped */
2113 end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); 2122 end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
2114 vm_unacct_memory(nr_accounted); 2123 vm_unacct_memory(nr_accounted);
2124
2115 free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0); 2125 free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0);
2116 tlb_finish_mmu(tlb, 0, end); 2126 tlb_finish_mmu(tlb, 0, end);
2117 2127
@@ -2268,7 +2278,7 @@ static void special_mapping_close(struct vm_area_struct *vma)
2268{ 2278{
2269} 2279}
2270 2280
2271static struct vm_operations_struct special_mapping_vmops = { 2281static const struct vm_operations_struct special_mapping_vmops = {
2272 .close = special_mapping_close, 2282 .close = special_mapping_close,
2273 .fault = special_mapping_fault, 2283 .fault = special_mapping_fault,
2274}; 2284};
@@ -2309,6 +2319,8 @@ int install_special_mapping(struct mm_struct *mm,
2309 2319
2310 mm->total_vm += len >> PAGE_SHIFT; 2320 mm->total_vm += len >> PAGE_SHIFT;
2311 2321
2322 perf_event_mmap(vma);
2323
2312 return 0; 2324 return 0;
2313} 2325}
2314 2326
@@ -2481,7 +2493,8 @@ void mm_drop_all_locks(struct mm_struct *mm)
2481 */ 2493 */
2482void __init mmap_init(void) 2494void __init mmap_init(void)
2483{ 2495{
2484 vm_area_cachep = kmem_cache_create("vm_area_struct", 2496 int ret;
2485 sizeof(struct vm_area_struct), 0, 2497
2486 SLAB_PANIC, NULL); 2498 ret = percpu_counter_init(&vm_committed_as, 0);
2499 VM_BUG_ON(ret);
2487} 2500}
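do_mmap_pgoff() above gains a MAP_HUGETLB branch that fabricates an anonymous hugetlbfs file, and vm_committed_space becomes the vm_committed_as percpu counter. The caller's side of the new branch, as a sketch (not from the commit; the fallback MAP_HUGETLB value is the x86/asm-generic one, and huge pages must already be reserved, e.g. via vm.nr_hugepages):

#include <sys/mman.h>
#include <stdio.h>

#ifndef MAP_HUGETLB
#define MAP_HUGETLB 0x40000	/* asm-generic value; older libc headers lack it */
#endif

int main(void)
{
	size_t len = 2 * 1024 * 1024;	/* one 2 MiB huge page on x86 */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap(MAP_HUGETLB)");
		return 1;
	}
	((char *)p)[0] = 1;	/* touch it: backed by a huge page */
	munmap(p, len);
	return 0;
}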
diff --git a/mm/mmu_context.c b/mm/mmu_context.c
new file mode 100644
index 000000000000..ded9081f4021
--- /dev/null
+++ b/mm/mmu_context.c
@@ -0,0 +1,58 @@
1/* Copyright (C) 2009 Red Hat, Inc.
2 *
3 * See ../COPYING for licensing terms.
4 */
5
6#include <linux/mm.h>
7#include <linux/mmu_context.h>
8#include <linux/sched.h>
9
10#include <asm/mmu_context.h>
11
12/*
13 * use_mm
14 * Makes the calling kernel thread take on the specified
15 * mm context.
16 * Called by the retry thread execute retries within the
17 * iocb issuer's mm context, so that copy_from/to_user
18 * operations work seamlessly for aio.
19 * (Note: this routine is intended to be called only
20 * from a kernel thread context)
21 */
22void use_mm(struct mm_struct *mm)
23{
24 struct mm_struct *active_mm;
25 struct task_struct *tsk = current;
26
27 task_lock(tsk);
28 active_mm = tsk->active_mm;
29 if (active_mm != mm) {
30 atomic_inc(&mm->mm_count);
31 tsk->active_mm = mm;
32 }
33 tsk->mm = mm;
34 switch_mm(active_mm, mm, tsk);
35 task_unlock(tsk);
36
37 if (active_mm != mm)
38 mmdrop(active_mm);
39}
40
41/*
42 * unuse_mm
43 * Reverses the effect of use_mm, i.e. releases the
44 * specified mm context which was earlier taken on
45 * by the calling kernel thread
46 * (Note: this routine is intended to be called only
47 * from a kernel thread context)
48 */
49void unuse_mm(struct mm_struct *mm)
50{
51 struct task_struct *tsk = current;
52
53 task_lock(tsk);
54 tsk->mm = NULL;
55 /* active_mm is still 'mm' */
56 enter_lazy_tlb(mm, tsk);
57 task_unlock(tsk);
58}
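The new use_mm()/unuse_mm() helpers let a kernel thread temporarily adopt a user mm so that uaccess routines operate on that address space. A hedged kernel-side sketch (not from the commit; copy_flag_to_user is an invented name, and 'mm' is assumed to be a counted reference obtained elsewhere, e.g. via get_task_mm()):

#include <linux/mm.h>
#include <linux/mmu_context.h>
#include <linux/uaccess.h>
#include <linux/errno.h>

static int copy_flag_to_user(struct mm_struct *mm, int __user *uptr)
{
	int val = 1;
	int ret;

	use_mm(mm);			/* adopt the user mm */
	ret = copy_to_user(uptr, &val, sizeof(val)) ? -EFAULT : 0;
	unuse_mm(mm);			/* drop back to lazy-TLB mode */

	return ret;
}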
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 5f4ef0250bee..7e33f2cb3c77 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -99,6 +99,26 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
99 return young; 99 return young;
100} 100}
101 101
102void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
103 pte_t pte)
104{
105 struct mmu_notifier *mn;
106 struct hlist_node *n;
107
108 rcu_read_lock();
109 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
110 if (mn->ops->change_pte)
111 mn->ops->change_pte(mn, mm, address, pte);
112 /*
113 * Some drivers don't have change_pte,
114 * so we must call invalidate_page in that case.
115 */
116 else if (mn->ops->invalidate_page)
117 mn->ops->invalidate_page(mn, mm, address);
118 }
119 rcu_read_unlock();
120}
121
102void __mmu_notifier_invalidate_page(struct mm_struct *mm, 122void __mmu_notifier_invalidate_page(struct mm_struct *mm,
103 unsigned long address) 123 unsigned long address)
104{ 124{
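__mmu_notifier_change_pte() above gives drivers a chance to update a mirrored PTE in place, falling back to invalidate_page when the hook is absent. A driver-side sketch (illustrative, not from the commit; my_change_pte and my_notifier are invented names):

#include <linux/mm.h>
#include <linux/mmu_notifier.h>

static void my_change_pte(struct mmu_notifier *mn, struct mm_struct *mm,
			  unsigned long address, pte_t pte)
{
	/* update the driver's shadow mapping for 'address' to match 'pte'
	 * instead of tearing the secondary mapping down */
}

static const struct mmu_notifier_ops my_notifier_ops = {
	.change_pte	= my_change_pte,
};

static struct mmu_notifier my_notifier = {
	.ops	= &my_notifier_ops,
};

/* registration, typically from the driver's open/attach path:
 *	mmu_notifier_register(&my_notifier, current->mm);
 */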
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 16ce8b955dcf..f5b7d1760213 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -6,6 +6,7 @@
6 6
7 7
8#include <linux/stddef.h> 8#include <linux/stddef.h>
9#include <linux/mm.h>
9#include <linux/mmzone.h> 10#include <linux/mmzone.h>
10#include <linux/module.h> 11#include <linux/module.h>
11 12
@@ -72,3 +73,17 @@ struct zoneref *next_zones_zonelist(struct zoneref *z,
72 *zone = zonelist_zone(z); 73 *zone = zonelist_zone(z);
73 return z; 74 return z;
74} 75}
76
77#ifdef CONFIG_ARCH_HAS_HOLES_MEMORYMODEL
78int memmap_valid_within(unsigned long pfn,
79 struct page *page, struct zone *zone)
80{
81 if (page_to_pfn(page) != pfn)
82 return 0;
83
84 if (page_zone(page) != zone)
85 return 0;
86
87 return 1;
88}
89#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
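memmap_valid_within() is aimed at pfn walkers on ARCH_HAS_HOLES_MEMORYMODEL configurations, where a pfn that passes pfn_valid() can still map to a memmap entry belonging to a hole or another zone. A caller-pattern sketch (not from the commit; walk_zone_pfns is an invented example):

#include <linux/mm.h>
#include <linux/mmzone.h>

static void walk_zone_pfns(struct zone *zone)
{
	unsigned long pfn;

	for (pfn = zone->zone_start_pfn;
	     pfn < zone->zone_start_pfn + zone->spanned_pages; pfn++) {
		struct page *page;

		if (!pfn_valid(pfn))
			continue;
		page = pfn_to_page(pfn);
		/* skip memmap entries that do not map back to this pfn/zone */
		if (!memmap_valid_within(pfn, page, zone))
			continue;
		/* ... inspect 'page' ... */
	}
}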
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 258197b76fb4..8bc969d8112d 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -23,6 +23,7 @@
23#include <linux/swapops.h> 23#include <linux/swapops.h>
24#include <linux/mmu_notifier.h> 24#include <linux/mmu_notifier.h>
25#include <linux/migrate.h> 25#include <linux/migrate.h>
26#include <linux/perf_event.h>
26#include <asm/uaccess.h> 27#include <asm/uaccess.h>
27#include <asm/pgtable.h> 28#include <asm/pgtable.h>
28#include <asm/cacheflush.h> 29#include <asm/cacheflush.h>
@@ -299,6 +300,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
299 error = mprotect_fixup(vma, &prev, nstart, tmp, newflags); 300 error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
300 if (error) 301 if (error)
301 goto out; 302 goto out;
303 perf_event_mmap(vma);
302 nstart = tmp; 304 nstart = tmp;
303 305
304 if (nstart < prev->vm_end) 306 if (nstart < prev->vm_end)
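A successful mprotect_fixup() now also reports the updated vma to perf via perf_event_mmap(). The user-space call that reaches this hunk, for reference (illustrative only):

#include <sys/mman.h>
#include <stdio.h>

int main(void)
{
	size_t len = 4096;
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	/* drop write permission; on success the kernel side above
	 * calls perf_event_mmap(vma) for the changed range */
	if (mprotect(p, len, PROT_READ))
		perror("mprotect");
	munmap(p, len);
	return 0;
}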
diff --git a/mm/mremap.c b/mm/mremap.c
index a39b7b91be46..97bff2547719 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -11,6 +11,7 @@
11#include <linux/hugetlb.h> 11#include <linux/hugetlb.h>
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/shm.h> 13#include <linux/shm.h>
14#include <linux/ksm.h>
14#include <linux/mman.h> 15#include <linux/mman.h>
15#include <linux/swap.h> 16#include <linux/swap.h>
16#include <linux/capability.h> 17#include <linux/capability.h>
@@ -85,8 +86,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
85 if (vma->vm_file) { 86 if (vma->vm_file) {
86 /* 87 /*
87 * Subtle point from Rajesh Venkatasubramanian: before 88 * Subtle point from Rajesh Venkatasubramanian: before
88 * moving file-based ptes, we must lock vmtruncate out, 89 * moving file-based ptes, we must lock truncate_pagecache
89 * since it might clean the dst vma before the src vma, 90 * out, since it might clean the dst vma before the src vma,
90 * and we propagate stale pages into the dst afterward. 91 * and we propagate stale pages into the dst afterward.
91 */ 92 */
92 mapping = vma->vm_file->f_mapping; 93 mapping = vma->vm_file->f_mapping;
@@ -174,6 +175,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
174 unsigned long excess = 0; 175 unsigned long excess = 0;
175 unsigned long hiwater_vm; 176 unsigned long hiwater_vm;
176 int split = 0; 177 int split = 0;
178 int err;
177 179
178 /* 180 /*
179 * We'd prefer to avoid failure later on in do_munmap: 181 * We'd prefer to avoid failure later on in do_munmap:
@@ -182,6 +184,18 @@ static unsigned long move_vma(struct vm_area_struct *vma,
182 if (mm->map_count >= sysctl_max_map_count - 3) 184 if (mm->map_count >= sysctl_max_map_count - 3)
183 return -ENOMEM; 185 return -ENOMEM;
184 186
187 /*
188 * Advise KSM to break any KSM pages in the area to be moved:
189 * it would be confusing if they were to turn up at the new
190 * location, where they happen to coincide with different KSM
191 * pages recently unmapped. But leave vma->vm_flags as it was,
192 * so KSM can come around to merge on vma and new_vma afterwards.
193 */
194 err = ksm_madvise(vma, old_addr, old_addr + old_len,
195 MADV_UNMERGEABLE, &vm_flags);
196 if (err)
197 return err;
198
185 new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); 199 new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
186 new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff); 200 new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff);
187 if (!new_vma) 201 if (!new_vma)
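move_vma() above now asks KSM to unmerge the old range before moving it, so merged pages cannot turn up at a coincidentally matching new address. A user-space sequence that would hit this path (not from the commit; assumes CONFIG_KSM and uses the common MADV_MERGEABLE value as a header fallback):

#define _GNU_SOURCE
#include <sys/mman.h>
#include <stdio.h>

#ifndef MADV_MERGEABLE
#define MADV_MERGEABLE 12	/* new in this kernel series */
#endif

int main(void)
{
	size_t len = 1UL << 20;
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	/* hand the area to KSM for merging */
	if (madvise(p, len, MADV_MERGEABLE))
		perror("madvise(MADV_MERGEABLE)");

	/* move_vma() will MADV_UNMERGEABLE the old range before moving it */
	p = mremap(p, len, len, MREMAP_MAYMOVE);
	if (p == MAP_FAILED)
		perror("mremap");
	return 0;
}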
diff --git a/mm/nommu.c b/mm/nommu.c
index 2fcf47d449b4..9876fa0c3ad3 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -33,6 +33,7 @@
33#include <asm/uaccess.h> 33#include <asm/uaccess.h>
34#include <asm/tlb.h> 34#include <asm/tlb.h>
35#include <asm/tlbflush.h> 35#include <asm/tlbflush.h>
36#include <asm/mmu_context.h>
36#include "internal.h" 37#include "internal.h"
37 38
38static inline __attribute__((format(printf, 1, 2))) 39static inline __attribute__((format(printf, 1, 2)))
@@ -56,20 +57,19 @@ void no_printk(const char *fmt, ...)
56 no_printk(KERN_DEBUG FMT"\n", ##__VA_ARGS__) 57 no_printk(KERN_DEBUG FMT"\n", ##__VA_ARGS__)
57#endif 58#endif
58 59
59#include "internal.h"
60
61void *high_memory; 60void *high_memory;
62struct page *mem_map; 61struct page *mem_map;
63unsigned long max_mapnr; 62unsigned long max_mapnr;
64unsigned long num_physpages; 63unsigned long num_physpages;
65atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0); 64unsigned long highest_memmap_pfn;
65struct percpu_counter vm_committed_as;
66int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ 66int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
67int sysctl_overcommit_ratio = 50; /* default is 50% */ 67int sysctl_overcommit_ratio = 50; /* default is 50% */
68int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; 68int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
69int sysctl_nr_trim_pages = 1; /* page trimming behaviour */ 69int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
70int heap_stack_gap = 0; 70int heap_stack_gap = 0;
71 71
72atomic_t mmap_pages_allocated; 72atomic_long_t mmap_pages_allocated;
73 73
74EXPORT_SYMBOL(mem_map); 74EXPORT_SYMBOL(mem_map);
75EXPORT_SYMBOL(num_physpages); 75EXPORT_SYMBOL(num_physpages);
@@ -79,50 +79,10 @@ static struct kmem_cache *vm_region_jar;
79struct rb_root nommu_region_tree = RB_ROOT; 79struct rb_root nommu_region_tree = RB_ROOT;
80DECLARE_RWSEM(nommu_region_sem); 80DECLARE_RWSEM(nommu_region_sem);
81 81
82struct vm_operations_struct generic_file_vm_ops = { 82const struct vm_operations_struct generic_file_vm_ops = {
83}; 83};
84 84
85/* 85/*
86 * Handle all mappings that got truncated by a "truncate()"
87 * system call.
88 *
89 * NOTE! We have to be ready to update the memory sharing
90 * between the file and the memory map for a potential last
91 * incomplete page. Ugly, but necessary.
92 */
93int vmtruncate(struct inode *inode, loff_t offset)
94{
95 struct address_space *mapping = inode->i_mapping;
96 unsigned long limit;
97
98 if (inode->i_size < offset)
99 goto do_expand;
100 i_size_write(inode, offset);
101
102 truncate_inode_pages(mapping, offset);
103 goto out_truncate;
104
105do_expand:
106 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
107 if (limit != RLIM_INFINITY && offset > limit)
108 goto out_sig;
109 if (offset > inode->i_sb->s_maxbytes)
110 goto out;
111 i_size_write(inode, offset);
112
113out_truncate:
114 if (inode->i_op->truncate)
115 inode->i_op->truncate(inode);
116 return 0;
117out_sig:
118 send_sig(SIGXFSZ, current, 0);
119out:
120 return -EFBIG;
121}
122
123EXPORT_SYMBOL(vmtruncate);
124
125/*
126 * Return the total memory allocated for this pointer, not 86 * Return the total memory allocated for this pointer, not
127 * just what the caller asked for. 87 * just what the caller asked for.
128 * 88 *
@@ -170,30 +130,29 @@ unsigned int kobjsize(const void *objp)
170} 130}
171 131
172int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 132int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
173 unsigned long start, int len, int flags, 133 unsigned long start, int nr_pages, unsigned int foll_flags,
174 struct page **pages, struct vm_area_struct **vmas) 134 struct page **pages, struct vm_area_struct **vmas)
175{ 135{
176 struct vm_area_struct *vma; 136 struct vm_area_struct *vma;
177 unsigned long vm_flags; 137 unsigned long vm_flags;
178 int i; 138 int i;
179 int write = !!(flags & GUP_FLAGS_WRITE);
180 int force = !!(flags & GUP_FLAGS_FORCE);
181 int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
182 139
183 /* calculate required read or write permissions. 140 /* calculate required read or write permissions.
184 * - if 'force' is set, we only require the "MAY" flags. 141 * If FOLL_FORCE is set, we only require the "MAY" flags.
185 */ 142 */
186 vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); 143 vm_flags = (foll_flags & FOLL_WRITE) ?
187 vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); 144 (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
145 vm_flags &= (foll_flags & FOLL_FORCE) ?
146 (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
188 147
189 for (i = 0; i < len; i++) { 148 for (i = 0; i < nr_pages; i++) {
190 vma = find_vma(mm, start); 149 vma = find_vma(mm, start);
191 if (!vma) 150 if (!vma)
192 goto finish_or_fault; 151 goto finish_or_fault;
193 152
194 /* protect what we can, including chardevs */ 153 /* protect what we can, including chardevs */
195 if (vma->vm_flags & (VM_IO | VM_PFNMAP) || 154 if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
196 (!ignore && !(vm_flags & vma->vm_flags))) 155 !(vm_flags & vma->vm_flags))
197 goto finish_or_fault; 156 goto finish_or_fault;
198 157
199 if (pages) { 158 if (pages) {
@@ -212,7 +171,6 @@ finish_or_fault:
212 return i ? : -EFAULT; 171 return i ? : -EFAULT;
213} 172}
214 173
215
216/* 174/*
217 * get a list of pages in an address range belonging to the specified process 175 * get a list of pages in an address range belonging to the specified process
218 * and indicate the VMA that covers each page 176 * and indicate the VMA that covers each page
@@ -221,22 +179,41 @@ finish_or_fault:
221 * - don't permit access to VMAs that don't support it, such as I/O mappings 179 * - don't permit access to VMAs that don't support it, such as I/O mappings
222 */ 180 */
223int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 181int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
224 unsigned long start, int len, int write, int force, 182 unsigned long start, int nr_pages, int write, int force,
225 struct page **pages, struct vm_area_struct **vmas) 183 struct page **pages, struct vm_area_struct **vmas)
226{ 184{
227 int flags = 0; 185 int flags = 0;
228 186
229 if (write) 187 if (write)
230 flags |= GUP_FLAGS_WRITE; 188 flags |= FOLL_WRITE;
231 if (force) 189 if (force)
232 flags |= GUP_FLAGS_FORCE; 190 flags |= FOLL_FORCE;
233 191
234 return __get_user_pages(tsk, mm, 192 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas);
235 start, len, flags,
236 pages, vmas);
237} 193}
238EXPORT_SYMBOL(get_user_pages); 194EXPORT_SYMBOL(get_user_pages);
239 195
196/**
197 * follow_pfn - look up PFN at a user virtual address
198 * @vma: memory mapping
199 * @address: user virtual address
200 * @pfn: location to store found PFN
201 *
202 * Only IO mappings and raw PFN mappings are allowed.
203 *
204 * Returns zero and the pfn at @pfn on success, -ve otherwise.
205 */
206int follow_pfn(struct vm_area_struct *vma, unsigned long address,
207 unsigned long *pfn)
208{
209 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
210 return -EINVAL;
211
212 *pfn = address >> PAGE_SHIFT;
213 return 0;
214}
215EXPORT_SYMBOL(follow_pfn);
216
240DEFINE_RWLOCK(vmlist_lock); 217DEFINE_RWLOCK(vmlist_lock);
241struct vm_struct *vmlist; 218struct vm_struct *vmlist;
242 219
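
The flag translation above collapses the old GUP_FLAGS_* booleans into the generic FOLL_WRITE/FOLL_FORCE bits and derives the vm_flags a VMA must carry before the walk may proceed. The standalone sketch below models just that mask arithmetic in user space; the FOLL_* and VM_* constants are local stand-ins mirroring typical kernel values, not the kernel headers themselves.

	/* Hypothetical user-space model of the nommu GUP permission check. */
	#include <stdio.h>

	#define FOLL_WRITE  0x01
	#define FOLL_FORCE  0x10

	#define VM_READ     0x0001
	#define VM_WRITE    0x0002
	#define VM_MAYREAD  0x0010
	#define VM_MAYWRITE 0x0020

	/* Mirror of the vm_flags computation in __get_user_pages(). */
	static unsigned long required_vm_flags(unsigned int foll_flags)
	{
		unsigned long vm_flags;

		vm_flags = (foll_flags & FOLL_WRITE) ?
			(VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
		vm_flags &= (foll_flags & FOLL_FORCE) ?
			(VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
		return vm_flags;
	}

	int main(void)
	{
		/* A read-only mapping that could be made writable (VM_MAYWRITE set). */
		unsigned long vma_flags = VM_READ | VM_MAYREAD | VM_MAYWRITE;

		unsigned int plain_write = FOLL_WRITE;
		unsigned int forced_write = FOLL_WRITE | FOLL_FORCE;

		printf("plain write allowed:  %s\n",
		       (required_vm_flags(plain_write) & vma_flags) ? "yes" : "no");
		printf("forced write allowed: %s\n",
		       (required_vm_flags(forced_write) & vma_flags) ? "yes" : "no");
		return 0;
	}

With FOLL_FORCE, only the "MAY" bits are required, so the forced write against the read-only-but-MAYWRITE mapping succeeds while the plain write is refused.
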
@@ -463,12 +440,11 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
463 */ 440 */
464void __init mmap_init(void) 441void __init mmap_init(void)
465{ 442{
466 vm_region_jar = kmem_cache_create("vm_region_jar", 443 int ret;
467 sizeof(struct vm_region), 0, 444
468 SLAB_PANIC, NULL); 445 ret = percpu_counter_init(&vm_committed_as, 0);
469 vm_area_cachep = kmem_cache_create("vm_area_struct", 446 VM_BUG_ON(ret);
470 sizeof(struct vm_area_struct), 0, 447 vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC);
471 SLAB_PANIC, NULL);
472} 448}
473 449
474/* 450/*
@@ -486,27 +462,24 @@ static noinline void validate_nommu_regions(void)
486 return; 462 return;
487 463
488 last = rb_entry(lastp, struct vm_region, vm_rb); 464 last = rb_entry(lastp, struct vm_region, vm_rb);
489 if (unlikely(last->vm_end <= last->vm_start)) 465 BUG_ON(unlikely(last->vm_end <= last->vm_start));
490 BUG(); 466 BUG_ON(unlikely(last->vm_top < last->vm_end));
491 if (unlikely(last->vm_top < last->vm_end))
492 BUG();
493 467
494 while ((p = rb_next(lastp))) { 468 while ((p = rb_next(lastp))) {
495 region = rb_entry(p, struct vm_region, vm_rb); 469 region = rb_entry(p, struct vm_region, vm_rb);
496 last = rb_entry(lastp, struct vm_region, vm_rb); 470 last = rb_entry(lastp, struct vm_region, vm_rb);
497 471
498 if (unlikely(region->vm_end <= region->vm_start)) 472 BUG_ON(unlikely(region->vm_end <= region->vm_start));
499 BUG(); 473 BUG_ON(unlikely(region->vm_top < region->vm_end));
500 if (unlikely(region->vm_top < region->vm_end)) 474 BUG_ON(unlikely(region->vm_start < last->vm_top));
501 BUG();
502 if (unlikely(region->vm_start < last->vm_top))
503 BUG();
504 475
505 lastp = p; 476 lastp = p;
506 } 477 }
507} 478}
508#else 479#else
509#define validate_nommu_regions() do {} while(0) 480static void validate_nommu_regions(void)
481{
482}
510#endif 483#endif
511 484
512/* 485/*
@@ -519,8 +492,6 @@ static void add_nommu_region(struct vm_region *region)
519 492
520 validate_nommu_regions(); 493 validate_nommu_regions();
521 494
522 BUG_ON(region->vm_start & ~PAGE_MASK);
523
524 parent = NULL; 495 parent = NULL;
525 p = &nommu_region_tree.rb_node; 496 p = &nommu_region_tree.rb_node;
526 while (*p) { 497 while (*p) {
@@ -563,16 +534,17 @@ static void free_page_series(unsigned long from, unsigned long to)
563 struct page *page = virt_to_page(from); 534 struct page *page = virt_to_page(from);
564 535
565 kdebug("- free %lx", from); 536 kdebug("- free %lx", from);
566 atomic_dec(&mmap_pages_allocated); 537 atomic_long_dec(&mmap_pages_allocated);
567 if (page_count(page) != 1) 538 if (page_count(page) != 1)
568 kdebug("free page %p [%d]", page, page_count(page)); 539 kdebug("free page %p: refcount not one: %d",
540 page, page_count(page));
569 put_page(page); 541 put_page(page);
570 } 542 }
571} 543}
572 544
573/* 545/*
574 * release a reference to a region 546 * release a reference to a region
575 * - the caller must hold the region semaphore, which this releases 547 * - the caller must hold the region semaphore for writing, which this releases
576 * - the region may not have been added to the tree yet, in which case vm_top 548 * - the region may not have been added to the tree yet, in which case vm_top
577 * will equal vm_start 549 * will equal vm_start
578 */ 550 */
@@ -613,6 +585,22 @@ static void put_nommu_region(struct vm_region *region)
613} 585}
614 586
615/* 587/*
588 * update protection on a vma
589 */
590static void protect_vma(struct vm_area_struct *vma, unsigned long flags)
591{
592#ifdef CONFIG_MPU
593 struct mm_struct *mm = vma->vm_mm;
594 long start = vma->vm_start & PAGE_MASK;
595 while (start < vma->vm_end) {
596 protect_page(mm, start, flags);
597 start += PAGE_SIZE;
598 }
599 update_protections(mm);
600#endif
601}
602
603/*
616 * add a VMA into a process's mm_struct in the appropriate place in the list 604 * add a VMA into a process's mm_struct in the appropriate place in the list
617 * and tree and add to the address space's page tree also if not an anonymous 605 * and tree and add to the address space's page tree also if not an anonymous
618 * page 606 * page
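
protect_vma() applies protections one page at a time and then commits them, but only when CONFIG_MPU is set; protect_page() and update_protections() are MPU-specific helpers. As a rough user-space analogue (an assumption for illustration, not the kernel path), the sketch below walks an anonymous mapping page by page and calls mprotect() on each page before touching the region.

	/* User-space analogue of a page-granular protection walk. */
	#include <stdio.h>
	#include <string.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int main(void)
	{
		long psize = sysconf(_SC_PAGESIZE);
		size_t len = 4 * psize;
		char *start, *p;

		start = mmap(NULL, len, PROT_NONE,
			     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (start == MAP_FAILED) {
			perror("mmap");
			return 1;
		}

		/* Grant read/write one page at a time, like the loop in protect_vma(). */
		for (p = start; p < start + len; p += psize) {
			if (mprotect(p, psize, PROT_READ | PROT_WRITE)) {
				perror("mprotect");
				return 1;
			}
		}

		memset(start, 0xaa, len);	/* safe now that every page is writable */
		printf("protected and touched %zu bytes in %ld-byte pages\n",
		       len, psize);
		munmap(start, len);
		return 0;
	}
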
@@ -631,6 +619,8 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
631 mm->map_count++; 619 mm->map_count++;
632 vma->vm_mm = mm; 620 vma->vm_mm = mm;
633 621
622 protect_vma(vma, vma->vm_flags);
623
634 /* add the VMA to the mapping */ 624 /* add the VMA to the mapping */
635 if (vma->vm_file) { 625 if (vma->vm_file) {
636 mapping = vma->vm_file->f_mapping; 626 mapping = vma->vm_file->f_mapping;
@@ -693,6 +683,8 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
693 683
694 kenter("%p", vma); 684 kenter("%p", vma);
695 685
686 protect_vma(vma, 0);
687
696 mm->map_count--; 688 mm->map_count--;
697 if (mm->mmap_cache == vma) 689 if (mm->mmap_cache == vma)
698 mm->mmap_cache = NULL; 690 mm->mmap_cache = NULL;
@@ -834,7 +826,7 @@ static int validate_mmap_request(struct file *file,
834 int ret; 826 int ret;
835 827
836 /* do the simple checks first */ 828 /* do the simple checks first */
837 if (flags & MAP_FIXED || addr) { 829 if (flags & MAP_FIXED) {
838 printk(KERN_DEBUG 830 printk(KERN_DEBUG
839 "%d: Can't do fixed-address/overlay mmap of RAM\n", 831 "%d: Can't do fixed-address/overlay mmap of RAM\n",
840 current->pid); 832 current->pid);
@@ -905,6 +897,10 @@ static int validate_mmap_request(struct file *file,
905 if (!file->f_op->read) 897 if (!file->f_op->read)
906 capabilities &= ~BDI_CAP_MAP_COPY; 898 capabilities &= ~BDI_CAP_MAP_COPY;
907 899
900 /* The file shall have been opened with read permission. */
901 if (!(file->f_mode & FMODE_READ))
902 return -EACCES;
903
908 if (flags & MAP_SHARED) { 904 if (flags & MAP_SHARED) {
909 /* do checks for writing, appending and locking */ 905 /* do checks for writing, appending and locking */
910 if ((prot & PROT_WRITE) && 906 if ((prot & PROT_WRITE) &&
@@ -1038,7 +1034,7 @@ static int do_mmap_shared_file(struct vm_area_struct *vma)
1038 ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); 1034 ret = vma->vm_file->f_op->mmap(vma->vm_file, vma);
1039 if (ret == 0) { 1035 if (ret == 0) {
1040 vma->vm_region->vm_top = vma->vm_region->vm_end; 1036 vma->vm_region->vm_top = vma->vm_region->vm_end;
1041 return ret; 1037 return 0;
1042 } 1038 }
1043 if (ret != -ENOSYS) 1039 if (ret != -ENOSYS)
1044 return ret; 1040 return ret;
@@ -1055,7 +1051,8 @@ static int do_mmap_shared_file(struct vm_area_struct *vma)
1055 */ 1051 */
1056static int do_mmap_private(struct vm_area_struct *vma, 1052static int do_mmap_private(struct vm_area_struct *vma,
1057 struct vm_region *region, 1053 struct vm_region *region,
1058 unsigned long len) 1054 unsigned long len,
1055 unsigned long capabilities)
1059{ 1056{
1060 struct page *pages; 1057 struct page *pages;
1061 unsigned long total, point, n, rlen; 1058 unsigned long total, point, n, rlen;
@@ -1066,13 +1063,13 @@ static int do_mmap_private(struct vm_area_struct *vma,
1066 * shared mappings on devices or memory 1063 * shared mappings on devices or memory
1067 * - VM_MAYSHARE will be set if it may attempt to share 1064 * - VM_MAYSHARE will be set if it may attempt to share
1068 */ 1065 */
1069 if (vma->vm_file) { 1066 if (capabilities & BDI_CAP_MAP_DIRECT) {
1070 ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); 1067 ret = vma->vm_file->f_op->mmap(vma->vm_file, vma);
1071 if (ret == 0) { 1068 if (ret == 0) {
1072 /* shouldn't return success if we're not sharing */ 1069 /* shouldn't return success if we're not sharing */
1073 BUG_ON(!(vma->vm_flags & VM_MAYSHARE)); 1070 BUG_ON(!(vma->vm_flags & VM_MAYSHARE));
1074 vma->vm_region->vm_top = vma->vm_region->vm_end; 1071 vma->vm_region->vm_top = vma->vm_region->vm_end;
1075 return ret; 1072 return 0;
1076 } 1073 }
1077 if (ret != -ENOSYS) 1074 if (ret != -ENOSYS)
1078 return ret; 1075 return ret;
@@ -1096,7 +1093,7 @@ static int do_mmap_private(struct vm_area_struct *vma,
1096 goto enomem; 1093 goto enomem;
1097 1094
1098 total = 1 << order; 1095 total = 1 << order;
1099 atomic_add(total, &mmap_pages_allocated); 1096 atomic_long_add(total, &mmap_pages_allocated);
1100 1097
1101 point = rlen >> PAGE_SHIFT; 1098 point = rlen >> PAGE_SHIFT;
1102 1099
@@ -1107,7 +1104,7 @@ static int do_mmap_private(struct vm_area_struct *vma,
1107 order = ilog2(total - point); 1104 order = ilog2(total - point);
1108 n = 1 << order; 1105 n = 1 << order;
1109 kdebug("shave %lu/%lu @%lu", n, total - point, total); 1106 kdebug("shave %lu/%lu @%lu", n, total - point, total);
1110 atomic_sub(n, &mmap_pages_allocated); 1107 atomic_long_sub(n, &mmap_pages_allocated);
1111 total -= n; 1108 total -= n;
1112 set_page_refcounted(pages + total); 1109 set_page_refcounted(pages + total);
1113 __free_pages(pages + total, order); 1110 __free_pages(pages + total, order);
@@ -1185,9 +1182,6 @@ unsigned long do_mmap_pgoff(struct file *file,
1185 1182
1186 kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff); 1183 kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff);
1187 1184
1188 if (!(flags & MAP_FIXED))
1189 addr = round_hint_to_min(addr);
1190
1191 /* decide whether we should attempt the mapping, and if so what sort of 1185 /* decide whether we should attempt the mapping, and if so what sort of
1192 * mapping */ 1186 * mapping */
1193 ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, 1187 ret = validate_mmap_request(file, addr, len, prot, flags, pgoff,
@@ -1197,6 +1191,9 @@ unsigned long do_mmap_pgoff(struct file *file,
1197 return ret; 1191 return ret;
1198 } 1192 }
1199 1193
1194 /* we ignore the address hint */
1195 addr = 0;
1196
1200 /* we've determined that we can make the mapping, now translate what we 1197 /* we've determined that we can make the mapping, now translate what we
1201 * now know into VMA flags */ 1198 * now know into VMA flags */
1202 vm_flags = determine_vm_flags(file, prot, flags, capabilities); 1199 vm_flags = determine_vm_flags(file, prot, flags, capabilities);
@@ -1310,7 +1307,7 @@ unsigned long do_mmap_pgoff(struct file *file,
1310 * - this is the hook for quasi-memory character devices to 1307 * - this is the hook for quasi-memory character devices to
1311 * tell us the location of a shared mapping 1308 * tell us the location of a shared mapping
1312 */ 1309 */
1313 if (file && file->f_op->get_unmapped_area) { 1310 if (capabilities & BDI_CAP_MAP_DIRECT) {
1314 addr = file->f_op->get_unmapped_area(file, addr, len, 1311 addr = file->f_op->get_unmapped_area(file, addr, len,
1315 pgoff, flags); 1312 pgoff, flags);
1316 if (IS_ERR((void *) addr)) { 1313 if (IS_ERR((void *) addr)) {
@@ -1335,14 +1332,15 @@ unsigned long do_mmap_pgoff(struct file *file,
1335 1332
1336 vma->vm_region = region; 1333 vma->vm_region = region;
1337 1334
1338 /* set up the mapping */ 1335 /* set up the mapping
1336 * - the region is filled in if BDI_CAP_MAP_DIRECT is still set
1337 */
1339 if (file && vma->vm_flags & VM_SHARED) 1338 if (file && vma->vm_flags & VM_SHARED)
1340 ret = do_mmap_shared_file(vma); 1339 ret = do_mmap_shared_file(vma);
1341 else 1340 else
1342 ret = do_mmap_private(vma, region, len); 1341 ret = do_mmap_private(vma, region, len, capabilities);
1343 if (ret < 0) 1342 if (ret < 0)
1344 goto error_put_region; 1343 goto error_just_free;
1345
1346 add_nommu_region(region); 1344 add_nommu_region(region);
1347 1345
1348 /* okay... we have a mapping; now we have to register it */ 1346 /* okay... we have a mapping; now we have to register it */
@@ -1361,25 +1359,14 @@ share:
1361 kleave(" = %lx", result); 1359 kleave(" = %lx", result);
1362 return result; 1360 return result;
1363 1361
1364error_put_region:
1365 __put_nommu_region(region);
1366 if (vma) {
1367 if (vma->vm_file) {
1368 fput(vma->vm_file);
1369 if (vma->vm_flags & VM_EXECUTABLE)
1370 removed_exe_file_vma(vma->vm_mm);
1371 }
1372 kmem_cache_free(vm_area_cachep, vma);
1373 }
1374 kleave(" = %d [pr]", ret);
1375 return ret;
1376
1377error_just_free: 1362error_just_free:
1378 up_write(&nommu_region_sem); 1363 up_write(&nommu_region_sem);
1379error: 1364error:
1380 fput(region->vm_file); 1365 if (region->vm_file)
1366 fput(region->vm_file);
1381 kmem_cache_free(vm_region_jar, region); 1367 kmem_cache_free(vm_region_jar, region);
1382 fput(vma->vm_file); 1368 if (vma->vm_file)
1369 fput(vma->vm_file);
1383 if (vma->vm_flags & VM_EXECUTABLE) 1370 if (vma->vm_flags & VM_EXECUTABLE)
1384 removed_exe_file_vma(vma->vm_mm); 1371 removed_exe_file_vma(vma->vm_mm);
1385 kmem_cache_free(vm_area_cachep, vma); 1372 kmem_cache_free(vm_area_cachep, vma);
@@ -1536,10 +1523,15 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
1536 /* find the first potentially overlapping VMA */ 1523 /* find the first potentially overlapping VMA */
1537 vma = find_vma(mm, start); 1524 vma = find_vma(mm, start);
1538 if (!vma) { 1525 if (!vma) {
1539 printk(KERN_WARNING 1526 static int limit = 0;
1540 "munmap of memory not mmapped by process %d (%s):" 1527 if (limit < 5) {
1541 " 0x%lx-0x%lx\n", 1528 printk(KERN_WARNING
1542 current->pid, current->comm, start, start + len - 1); 1529 "munmap of memory not mmapped by process %d"
1530 " (%s): 0x%lx-0x%lx\n",
1531 current->pid, current->comm,
1532 start, start + len - 1);
1533 limit++;
1534 }
1543 return -EINVAL; 1535 return -EINVAL;
1544 } 1536 }
1545 1537
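
The munmap warning above is now throttled with a file-local counter so a buggy application cannot flood the log; only the first five offending calls are reported. A user-space sketch of the same pattern (the names here are illustrative):

	/* Rate-capped warning: report only the first few occurrences. */
	#include <stdio.h>

	static void warn_bad_unmap(unsigned long start, unsigned long len)
	{
		static int limit;	/* persists across calls, like the kernel's */

		if (limit < 5) {
			fprintf(stderr, "bogus munmap: 0x%lx-0x%lx\n",
				start, start + len - 1);
			limit++;
		}
	}

	int main(void)
	{
		int i;

		for (i = 0; i < 10; i++)	/* only 5 lines reach the log */
			warn_bad_unmap(0x1000 * (i + 1), 0x1000);
		return 0;
	}
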
@@ -1849,12 +1841,9 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
1849 if (mm) 1841 if (mm)
1850 allowed -= mm->total_vm / 32; 1842 allowed -= mm->total_vm / 32;
1851 1843
1852 /* 1844 if (percpu_counter_read_positive(&vm_committed_as) < allowed)
1853 * cast `allowed' as a signed long because vm_committed_space
1854 * sometimes has a negative value
1855 */
1856 if (atomic_long_read(&vm_committed_space) < (long)allowed)
1857 return 0; 1845 return 0;
1846
1858error: 1847error:
1859 vm_unacct_memory(pages); 1848 vm_unacct_memory(pages);
1860 1849
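
The accounting switch replaces a global atomic (which could legitimately go negative and therefore needed the signed cast) with a per-CPU counter read through percpu_counter_read_positive(), which clamps at zero. The sketch below isolates the strict-accounting heuristic itself; the page counts and ratio are made-up stand-ins for totalram_pages, total_swap_pages, sysctl_overcommit_ratio and the committed counter.

	/* Sketch of the strict "enough memory?" accounting check. */
	#include <stdio.h>

	static long clamp_positive(long v)
	{
		return v < 0 ? 0 : v;	/* what percpu_counter_read_positive() gives us */
	}

	int main(void)
	{
		unsigned long totalram_pages = 262144;	/* 1 GiB of 4 KiB pages (example) */
		unsigned long total_swap_pages = 131072;
		unsigned long total_vm = 8192;		/* this process's mapped pages */
		int overcommit_ratio = 50;		/* sysctl_overcommit_ratio */
		long committed = 150000;		/* pages already committed */

		unsigned long allowed;

		allowed = totalram_pages * overcommit_ratio / 100 + total_swap_pages;
		allowed -= total_vm / 32;	/* leave the caller a little headroom */

		if ((unsigned long)clamp_positive(committed) < allowed)
			printf("commit OK: %ld committed < %lu allowed\n",
			       committed, allowed);
		else
			printf("commit refused\n");
		return 0;
	}
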
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index d3b9bac085b5..ea2147dabba6 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -34,6 +34,23 @@ int sysctl_oom_dump_tasks;
34static DEFINE_SPINLOCK(zone_scan_lock); 34static DEFINE_SPINLOCK(zone_scan_lock);
35/* #define DEBUG */ 35/* #define DEBUG */
36 36
37/*
 38 * Do the allowed memory nodes of any thread in the target process overlap ours?
39 */
40static int has_intersects_mems_allowed(struct task_struct *tsk)
41{
42 struct task_struct *t;
43
44 t = tsk;
45 do {
46 if (cpuset_mems_allowed_intersects(current, t))
47 return 1;
48 t = next_thread(t);
49 } while (t != tsk);
50
51 return 0;
52}
53
37/** 54/**
38 * badness - calculate a numeric value for how bad this task has been 55 * badness - calculate a numeric value for how bad this task has been
39 * @p: task struct of which task we should calculate 56 * @p: task struct of which task we should calculate
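
has_intersects_mems_allowed() walks the thread group as a circular list: start at the given task, advance with next_thread(), and stop when the walk comes back to the starting task. The fragment below models that traversal with an ordinary circular singly-linked list; the structures and the intersects() predicate are invented for illustration.

	/* Model of a do/while walk over a circular thread list. */
	#include <stdio.h>

	struct task {
		int allowed_nodes;	/* bitmask of memory nodes, for the example */
		struct task *next;	/* next_thread(): circular, back to the start */
	};

	static int intersects(const struct task *a, const struct task *b)
	{
		return (a->allowed_nodes & b->allowed_nodes) != 0;
	}

	static int any_thread_intersects(const struct task *current, struct task *tsk)
	{
		struct task *t = tsk;

		do {
			if (intersects(current, t))
				return 1;
			t = t->next;
		} while (t != tsk);

		return 0;
	}

	int main(void)
	{
		struct task current = { .allowed_nodes = 0x1 };
		struct task t1 = { .allowed_nodes = 0x4 };
		struct task t2 = { .allowed_nodes = 0x5 };	/* overlaps node 0 */

		t1.next = &t2;
		t2.next = &t1;		/* two threads of one process, circular */

		printf("overlap: %s\n",
		       any_thread_intersects(&current, &t1) ? "yes" : "no");
		return 0;
	}
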
@@ -58,6 +75,13 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
58 unsigned long points, cpu_time, run_time; 75 unsigned long points, cpu_time, run_time;
59 struct mm_struct *mm; 76 struct mm_struct *mm;
60 struct task_struct *child; 77 struct task_struct *child;
78 int oom_adj = p->signal->oom_adj;
79 struct task_cputime task_time;
80 unsigned long utime;
81 unsigned long stime;
82
83 if (oom_adj == OOM_DISABLE)
84 return 0;
61 85
62 task_lock(p); 86 task_lock(p);
63 mm = p->mm; 87 mm = p->mm;
@@ -79,7 +103,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
79 /* 103 /*
80 * swapoff can easily use up all memory, so kill those first. 104 * swapoff can easily use up all memory, so kill those first.
81 */ 105 */
82 if (p->flags & PF_SWAPOFF) 106 if (p->flags & PF_OOM_ORIGIN)
83 return ULONG_MAX; 107 return ULONG_MAX;
84 108
85 /* 109 /*
@@ -102,8 +126,11 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
102 * of seconds. There is no particular reason for this other than 126 * of seconds. There is no particular reason for this other than
103 * that it turned out to work very well in practice. 127 * that it turned out to work very well in practice.
104 */ 128 */
105 cpu_time = (cputime_to_jiffies(p->utime) + cputime_to_jiffies(p->stime)) 129 thread_group_cputime(p, &task_time);
106 >> (SHIFT_HZ + 3); 130 utime = cputime_to_jiffies(task_time.utime);
131 stime = cputime_to_jiffies(task_time.stime);
132 cpu_time = (utime + stime) >> (SHIFT_HZ + 3);
133
107 134
108 if (uptime >= p->start_time.tv_sec) 135 if (uptime >= p->start_time.tv_sec)
109 run_time = (uptime - p->start_time.tv_sec) >> 10; 136 run_time = (uptime - p->start_time.tv_sec) >> 10;
@@ -144,19 +171,19 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
144 * because p may have allocated or otherwise mapped memory on 171 * because p may have allocated or otherwise mapped memory on
145 * this node before. However it will be less likely. 172 * this node before. However it will be less likely.
146 */ 173 */
147 if (!cpuset_mems_allowed_intersects(current, p)) 174 if (!has_intersects_mems_allowed(p))
148 points /= 8; 175 points /= 8;
149 176
150 /* 177 /*
151 * Adjust the score by oomkilladj. 178 * Adjust the score by oom_adj.
152 */ 179 */
153 if (p->oomkilladj) { 180 if (oom_adj) {
154 if (p->oomkilladj > 0) { 181 if (oom_adj > 0) {
155 if (!points) 182 if (!points)
156 points = 1; 183 points = 1;
157 points <<= p->oomkilladj; 184 points <<= oom_adj;
158 } else 185 } else
159 points >>= -(p->oomkilladj); 186 points >>= -(oom_adj);
160 } 187 }
161 188
162#ifdef DEBUG 189#ifdef DEBUG
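
oom_adj now lives on the signal struct, so every thread of a process shares one value, and it scales the badness score by powers of two: a positive value left-shifts the score, a negative value right-shifts it, and a zero score is first bumped to 1 so a positive adjustment can still take effect. A small arithmetic sketch of that adjustment:

	/* How oom_adj scales a badness score by powers of two. */
	#include <stdio.h>

	static unsigned long adjust(unsigned long points, int oom_adj)
	{
		if (oom_adj > 0) {
			if (!points)
				points = 1;	/* let a positive adj bite even at 0 */
			points <<= oom_adj;
		} else if (oom_adj < 0) {
			points >>= -oom_adj;
		}
		return points;
	}

	int main(void)
	{
		printf("1000 @ +3 -> %lu\n", adjust(1000, 3));	/* 8000 */
		printf("1000 @ -3 -> %lu\n", adjust(1000, -3));	/* 125 */
		printf("   0 @ +5 -> %lu\n", adjust(0, 5));	/* 32 */
		return 0;
	}
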
@@ -200,13 +227,13 @@ static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist,
200static struct task_struct *select_bad_process(unsigned long *ppoints, 227static struct task_struct *select_bad_process(unsigned long *ppoints,
201 struct mem_cgroup *mem) 228 struct mem_cgroup *mem)
202{ 229{
203 struct task_struct *g, *p; 230 struct task_struct *p;
204 struct task_struct *chosen = NULL; 231 struct task_struct *chosen = NULL;
205 struct timespec uptime; 232 struct timespec uptime;
206 *ppoints = 0; 233 *ppoints = 0;
207 234
208 do_posix_clock_monotonic_gettime(&uptime); 235 do_posix_clock_monotonic_gettime(&uptime);
209 do_each_thread(g, p) { 236 for_each_process(p) {
210 unsigned long points; 237 unsigned long points;
211 238
212 /* 239 /*
@@ -251,7 +278,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
251 *ppoints = ULONG_MAX; 278 *ppoints = ULONG_MAX;
252 } 279 }
253 280
254 if (p->oomkilladj == OOM_DISABLE) 281 if (p->signal->oom_adj == OOM_DISABLE)
255 continue; 282 continue;
256 283
257 points = badness(p, uptime.tv_sec); 284 points = badness(p, uptime.tv_sec);
@@ -259,7 +286,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
259 chosen = p; 286 chosen = p;
260 *ppoints = points; 287 *ppoints = points;
261 } 288 }
262 } while_each_thread(g, p); 289 }
263 290
264 return chosen; 291 return chosen;
265} 292}
@@ -284,22 +311,28 @@ static void dump_tasks(const struct mem_cgroup *mem)
284 printk(KERN_INFO "[ pid ] uid tgid total_vm rss cpu oom_adj " 311 printk(KERN_INFO "[ pid ] uid tgid total_vm rss cpu oom_adj "
285 "name\n"); 312 "name\n");
286 do_each_thread(g, p) { 313 do_each_thread(g, p) {
287 /* 314 struct mm_struct *mm;
288 * total_vm and rss sizes do not exist for tasks with a 315
289 * detached mm so there's no need to report them.
290 */
291 if (!p->mm)
292 continue;
293 if (mem && !task_in_mem_cgroup(p, mem)) 316 if (mem && !task_in_mem_cgroup(p, mem))
294 continue; 317 continue;
295 if (!thread_group_leader(p)) 318 if (!thread_group_leader(p))
296 continue; 319 continue;
297 320
298 task_lock(p); 321 task_lock(p);
322 mm = p->mm;
323 if (!mm) {
324 /*
325 * total_vm and rss sizes do not exist for tasks with no
326 * mm so there's no need to report them; they can't be
327 * oom killed anyway.
328 */
329 task_unlock(p);
330 continue;
331 }
299 printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", 332 printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n",
300 p->pid, __task_cred(p)->uid, p->tgid, 333 p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm,
301 p->mm->total_vm, get_mm_rss(p->mm), (int)task_cpu(p), 334 get_mm_rss(mm), (int)task_cpu(p), p->signal->oom_adj,
302 p->oomkilladj, p->comm); 335 p->comm);
303 task_unlock(p); 336 task_unlock(p);
304 } while_each_thread(g, p); 337 } while_each_thread(g, p);
305} 338}
@@ -340,11 +373,6 @@ static void __oom_kill_task(struct task_struct *p, int verbose)
340 373
341static int oom_kill_task(struct task_struct *p) 374static int oom_kill_task(struct task_struct *p)
342{ 375{
343 struct mm_struct *mm;
344 struct task_struct *g, *q;
345
346 mm = p->mm;
347
348 /* WARNING: mm may not be dereferenced since we did not obtain its 376 /* WARNING: mm may not be dereferenced since we did not obtain its
349 * value from get_task_mm(p). This is OK since all we need to do is 377 * value from get_task_mm(p). This is OK since all we need to do is
350 * compare mm to q->mm below. 378 * compare mm to q->mm below.
@@ -353,30 +381,11 @@ static int oom_kill_task(struct task_struct *p)
353 * change to NULL at any time since we do not hold task_lock(p). 381 * change to NULL at any time since we do not hold task_lock(p).
354 * However, this is of no concern to us. 382 * However, this is of no concern to us.
355 */ 383 */
356 384 if (!p->mm || p->signal->oom_adj == OOM_DISABLE)
357 if (mm == NULL)
358 return 1; 385 return 1;
359 386
360 /*
361 * Don't kill the process if any threads are set to OOM_DISABLE
362 */
363 do_each_thread(g, q) {
364 if (q->mm == mm && q->oomkilladj == OOM_DISABLE)
365 return 1;
366 } while_each_thread(g, q);
367
368 __oom_kill_task(p, 1); 387 __oom_kill_task(p, 1);
369 388
370 /*
371 * kill all processes that share the ->mm (i.e. all threads),
372 * but are in a different thread group. Don't let them have access
373 * to memory reserves though, otherwise we might deplete all memory.
374 */
375 do_each_thread(g, q) {
376 if (q->mm == mm && !same_thread_group(q, p))
377 force_sig(SIGKILL, q);
378 } while_each_thread(g, q);
379
380 return 0; 389 return 0;
381} 390}
382 391
@@ -388,12 +397,14 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
388 397
389 if (printk_ratelimit()) { 398 if (printk_ratelimit()) {
390 printk(KERN_WARNING "%s invoked oom-killer: " 399 printk(KERN_WARNING "%s invoked oom-killer: "
391 "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", 400 "gfp_mask=0x%x, order=%d, oom_adj=%d\n",
392 current->comm, gfp_mask, order, current->oomkilladj); 401 current->comm, gfp_mask, order,
402 current->signal->oom_adj);
393 task_lock(current); 403 task_lock(current);
394 cpuset_print_task_mems_allowed(current); 404 cpuset_print_task_mems_allowed(current);
395 task_unlock(current); 405 task_unlock(current);
396 dump_stack(); 406 dump_stack();
407 mem_cgroup_print_oom_info(mem, current);
397 show_mem(); 408 show_mem();
398 if (sysctl_oom_dump_tasks) 409 if (sysctl_oom_dump_tasks)
399 dump_tasks(mem); 410 dump_tasks(mem);
@@ -513,34 +524,32 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
513 */ 524 */
514static void __out_of_memory(gfp_t gfp_mask, int order) 525static void __out_of_memory(gfp_t gfp_mask, int order)
515{ 526{
516 if (sysctl_oom_kill_allocating_task) { 527 struct task_struct *p;
517 oom_kill_process(current, gfp_mask, order, 0, NULL, 528 unsigned long points;
518 "Out of memory (oom_kill_allocating_task)");
519
520 } else {
521 unsigned long points;
522 struct task_struct *p;
523
524retry:
525 /*
526 * Rambo mode: Shoot down a process and hope it solves whatever
527 * issues we may have.
528 */
529 p = select_bad_process(&points, NULL);
530 529
531 if (PTR_ERR(p) == -1UL) 530 if (sysctl_oom_kill_allocating_task)
531 if (!oom_kill_process(current, gfp_mask, order, 0, NULL,
532 "Out of memory (oom_kill_allocating_task)"))
532 return; 533 return;
534retry:
535 /*
536 * Rambo mode: Shoot down a process and hope it solves whatever
537 * issues we may have.
538 */
539 p = select_bad_process(&points, NULL);
533 540
534 /* Found nothing?!?! Either we hang forever, or we panic. */ 541 if (PTR_ERR(p) == -1UL)
535 if (!p) { 542 return;
536 read_unlock(&tasklist_lock);
537 panic("Out of memory and no killable processes...\n");
538 }
539 543
540 if (oom_kill_process(p, gfp_mask, order, points, NULL, 544 /* Found nothing?!?! Either we hang forever, or we panic. */
541 "Out of memory")) 545 if (!p) {
542 goto retry; 546 read_unlock(&tasklist_lock);
547 panic("Out of memory and no killable processes...\n");
543 } 548 }
549
550 if (oom_kill_process(p, gfp_mask, order, points, NULL,
551 "Out of memory"))
552 goto retry;
544} 553}
545 554
546/* 555/*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 30351f0063ac..0b19943ecf8b 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -36,15 +36,6 @@
36#include <linux/pagevec.h> 36#include <linux/pagevec.h>
37 37
38/* 38/*
39 * The maximum number of pages to writeout in a single bdflush/kupdate
40 * operation. We do this so we don't hold I_SYNC against an inode for
41 * enormous amounts of time, which would block a userspace task which has
42 * been forced to throttle against that inode. Also, the code reevaluates
43 * the dirty each time it has written this many pages.
44 */
45#define MAX_WRITEBACK_PAGES 1024
46
47/*
48 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited 39 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
49 * will look to see if it needs to force writeback or throttling. 40 * will look to see if it needs to force writeback or throttling.
50 */ 41 */
@@ -53,18 +44,21 @@ static long ratelimit_pages = 32;
53/* 44/*
54 * When balance_dirty_pages decides that the caller needs to perform some 45 * When balance_dirty_pages decides that the caller needs to perform some
55 * non-background writeback, this is how many pages it will attempt to write. 46 * non-background writeback, this is how many pages it will attempt to write.
 56 * It should be somewhat larger than RATELIMIT_PAGES to ensure that reasonably 47 * It should be somewhat larger than the number of pages just dirtied so that reasonably
57 * large amounts of I/O are submitted. 48 * large amounts of I/O are submitted.
58 */ 49 */
59static inline long sync_writeback_pages(void) 50static inline long sync_writeback_pages(unsigned long dirtied)
60{ 51{
61 return ratelimit_pages + ratelimit_pages / 2; 52 if (dirtied < ratelimit_pages)
53 dirtied = ratelimit_pages;
54
55 return dirtied + dirtied / 2;
62} 56}
63 57
64/* The following parameters are exported via /proc/sys/vm */ 58/* The following parameters are exported via /proc/sys/vm */
65 59
66/* 60/*
67 * Start background writeback (via pdflush) at this percentage 61 * Start background writeback (via writeback threads) at this percentage
68 */ 62 */
69int dirty_background_ratio = 10; 63int dirty_background_ratio = 10;
70 64
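
With the chunk size now derived from how much the caller actually dirtied, sync_writeback_pages() clamps the count up to ratelimit_pages and then adds 50%, so a throttled task always submits a reasonably large batch of I/O. The arithmetic, isolated:

	/* sync_writeback_pages(): never write less than ~1.5 * ratelimit_pages. */
	#include <stdio.h>

	static long ratelimit_pages = 32;

	static long sync_writeback_pages(unsigned long dirtied)
	{
		if (dirtied < (unsigned long)ratelimit_pages)
			dirtied = ratelimit_pages;

		return dirtied + dirtied / 2;
	}

	int main(void)
	{
		printf("dirtied 10   -> write %ld\n", sync_writeback_pages(10));   /* 48 */
		printf("dirtied 32   -> write %ld\n", sync_writeback_pages(32));   /* 48 */
		printf("dirtied 1000 -> write %ld\n", sync_writeback_pages(1000)); /* 1500 */
		return 0;
	}
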
@@ -94,12 +88,12 @@ unsigned long vm_dirty_bytes;
94/* 88/*
95 * The interval between `kupdate'-style writebacks 89 * The interval between `kupdate'-style writebacks
96 */ 90 */
97unsigned int dirty_writeback_interval = 5 * 100; /* sentiseconds */ 91unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */
98 92
99/* 93/*
100 * The longest time for which data is allowed to remain dirty 94 * The longest time for which data is allowed to remain dirty
101 */ 95 */
102unsigned int dirty_expire_interval = 30 * 100; /* sentiseconds */ 96unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */
103 97
104/* 98/*
105 * Flag that makes the machine dump writes/reads and block dirtyings. 99 * Flag that makes the machine dump writes/reads and block dirtyings.
@@ -117,8 +111,6 @@ EXPORT_SYMBOL(laptop_mode);
117/* End of sysctl-exported parameters */ 111/* End of sysctl-exported parameters */
118 112
119 113
120static void background_writeout(unsigned long _min_pages);
121
122/* 114/*
123 * Scale the writeback cache size proportional to the relative writeout speeds. 115 * Scale the writeback cache size proportional to the relative writeout speeds.
124 * 116 *
@@ -166,37 +158,37 @@ static void update_completion_period(void)
166} 158}
167 159
168int dirty_background_ratio_handler(struct ctl_table *table, int write, 160int dirty_background_ratio_handler(struct ctl_table *table, int write,
169 struct file *filp, void __user *buffer, size_t *lenp, 161 void __user *buffer, size_t *lenp,
170 loff_t *ppos) 162 loff_t *ppos)
171{ 163{
172 int ret; 164 int ret;
173 165
174 ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); 166 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
175 if (ret == 0 && write) 167 if (ret == 0 && write)
176 dirty_background_bytes = 0; 168 dirty_background_bytes = 0;
177 return ret; 169 return ret;
178} 170}
179 171
180int dirty_background_bytes_handler(struct ctl_table *table, int write, 172int dirty_background_bytes_handler(struct ctl_table *table, int write,
181 struct file *filp, void __user *buffer, size_t *lenp, 173 void __user *buffer, size_t *lenp,
182 loff_t *ppos) 174 loff_t *ppos)
183{ 175{
184 int ret; 176 int ret;
185 177
186 ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); 178 ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
187 if (ret == 0 && write) 179 if (ret == 0 && write)
188 dirty_background_ratio = 0; 180 dirty_background_ratio = 0;
189 return ret; 181 return ret;
190} 182}
191 183
192int dirty_ratio_handler(struct ctl_table *table, int write, 184int dirty_ratio_handler(struct ctl_table *table, int write,
193 struct file *filp, void __user *buffer, size_t *lenp, 185 void __user *buffer, size_t *lenp,
194 loff_t *ppos) 186 loff_t *ppos)
195{ 187{
196 int old_ratio = vm_dirty_ratio; 188 int old_ratio = vm_dirty_ratio;
197 int ret; 189 int ret;
198 190
199 ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); 191 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
200 if (ret == 0 && write && vm_dirty_ratio != old_ratio) { 192 if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
201 update_completion_period(); 193 update_completion_period();
202 vm_dirty_bytes = 0; 194 vm_dirty_bytes = 0;
@@ -206,13 +198,13 @@ int dirty_ratio_handler(struct ctl_table *table, int write,
206 198
207 199
208int dirty_bytes_handler(struct ctl_table *table, int write, 200int dirty_bytes_handler(struct ctl_table *table, int write,
209 struct file *filp, void __user *buffer, size_t *lenp, 201 void __user *buffer, size_t *lenp,
210 loff_t *ppos) 202 loff_t *ppos)
211{ 203{
212 unsigned long old_bytes = vm_dirty_bytes; 204 unsigned long old_bytes = vm_dirty_bytes;
213 int ret; 205 int ret;
214 206
215 ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); 207 ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
216 if (ret == 0 && write && vm_dirty_bytes != old_bytes) { 208 if (ret == 0 && write && vm_dirty_bytes != old_bytes) {
217 update_completion_period(); 209 update_completion_period();
218 vm_dirty_ratio = 0; 210 vm_dirty_ratio = 0;
@@ -265,18 +257,19 @@ static void bdi_writeout_fraction(struct backing_dev_info *bdi,
265 * This avoids exceeding the total dirty_limit when the floating averages 257 * This avoids exceeding the total dirty_limit when the floating averages
266 * fluctuate too quickly. 258 * fluctuate too quickly.
267 */ 259 */
268static void 260static void clip_bdi_dirty_limit(struct backing_dev_info *bdi,
269clip_bdi_dirty_limit(struct backing_dev_info *bdi, long dirty, long *pbdi_dirty) 261 unsigned long dirty, unsigned long *pbdi_dirty)
270{ 262{
271 long avail_dirty; 263 unsigned long avail_dirty;
272 264
273 avail_dirty = dirty - 265 avail_dirty = global_page_state(NR_FILE_DIRTY) +
274 (global_page_state(NR_FILE_DIRTY) +
275 global_page_state(NR_WRITEBACK) + 266 global_page_state(NR_WRITEBACK) +
276 global_page_state(NR_UNSTABLE_NFS) + 267 global_page_state(NR_UNSTABLE_NFS) +
277 global_page_state(NR_WRITEBACK_TEMP)); 268 global_page_state(NR_WRITEBACK_TEMP);
278 269
279 if (avail_dirty < 0) 270 if (avail_dirty < dirty)
271 avail_dirty = dirty - avail_dirty;
272 else
280 avail_dirty = 0; 273 avail_dirty = 0;
281 274
282 avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) + 275 avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) +
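
Switching avail_dirty to unsigned long means the old "did it go negative?" test no longer works, so the subtraction is reordered: compute what is already dirty or in flight first, and only subtract it from the limit when it is actually smaller. A compact sketch of the unsigned-safe form (the page counts are made up):

	/* Unsigned-safe headroom calculation, as in clip_bdi_dirty_limit(). */
	#include <stdio.h>

	static unsigned long headroom(unsigned long dirty_limit, unsigned long in_flight)
	{
		/*
		 * The naive "dirty_limit - in_flight" would wrap to a huge value
		 * when in_flight exceeds the limit; test before subtracting.
		 */
		if (in_flight < dirty_limit)
			return dirty_limit - in_flight;
		return 0;
	}

	int main(void)
	{
		printf("limit 1000, in flight  400 -> headroom %lu\n", headroom(1000, 400));
		printf("limit 1000, in flight 1600 -> headroom %lu\n", headroom(1000, 1600));
		return 0;
	}
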
@@ -299,10 +292,10 @@ static inline void task_dirties_fraction(struct task_struct *tsk,
299 * 292 *
300 * dirty -= (dirty/8) * p_{t} 293 * dirty -= (dirty/8) * p_{t}
301 */ 294 */
302static void task_dirty_limit(struct task_struct *tsk, long *pdirty) 295static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty)
303{ 296{
304 long numerator, denominator; 297 long numerator, denominator;
305 long dirty = *pdirty; 298 unsigned long dirty = *pdirty;
306 u64 inv = dirty >> 3; 299 u64 inv = dirty >> 3;
307 300
308 task_dirties_fraction(tsk, &numerator, &denominator); 301 task_dirties_fraction(tsk, &numerator, &denominator);
@@ -319,15 +312,13 @@ static void task_dirty_limit(struct task_struct *tsk, long *pdirty)
319/* 312/*
320 * 313 *
321 */ 314 */
322static DEFINE_SPINLOCK(bdi_lock);
323static unsigned int bdi_min_ratio; 315static unsigned int bdi_min_ratio;
324 316
325int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) 317int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
326{ 318{
327 int ret = 0; 319 int ret = 0;
328 unsigned long flags;
329 320
330 spin_lock_irqsave(&bdi_lock, flags); 321 spin_lock_bh(&bdi_lock);
331 if (min_ratio > bdi->max_ratio) { 322 if (min_ratio > bdi->max_ratio) {
332 ret = -EINVAL; 323 ret = -EINVAL;
333 } else { 324 } else {
@@ -339,27 +330,26 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
339 ret = -EINVAL; 330 ret = -EINVAL;
340 } 331 }
341 } 332 }
342 spin_unlock_irqrestore(&bdi_lock, flags); 333 spin_unlock_bh(&bdi_lock);
343 334
344 return ret; 335 return ret;
345} 336}
346 337
347int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) 338int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
348{ 339{
349 unsigned long flags;
350 int ret = 0; 340 int ret = 0;
351 341
352 if (max_ratio > 100) 342 if (max_ratio > 100)
353 return -EINVAL; 343 return -EINVAL;
354 344
355 spin_lock_irqsave(&bdi_lock, flags); 345 spin_lock_bh(&bdi_lock);
356 if (bdi->min_ratio > max_ratio) { 346 if (bdi->min_ratio > max_ratio) {
357 ret = -EINVAL; 347 ret = -EINVAL;
358 } else { 348 } else {
359 bdi->max_ratio = max_ratio; 349 bdi->max_ratio = max_ratio;
360 bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100; 350 bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100;
361 } 351 }
362 spin_unlock_irqrestore(&bdi_lock, flags); 352 spin_unlock_bh(&bdi_lock);
363 353
364 return ret; 354 return ret;
365} 355}
@@ -393,7 +383,8 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
393 struct zone *z = 383 struct zone *z =
394 &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; 384 &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
395 385
396 x += zone_page_state(z, NR_FREE_PAGES) + zone_lru_pages(z); 386 x += zone_page_state(z, NR_FREE_PAGES) +
387 zone_reclaimable_pages(z);
397 } 388 }
398 /* 389 /*
399 * Make sure that the number of highmem pages is never larger 390 * Make sure that the number of highmem pages is never larger
@@ -417,7 +408,7 @@ unsigned long determine_dirtyable_memory(void)
417{ 408{
418 unsigned long x; 409 unsigned long x;
419 410
420 x = global_page_state(NR_FREE_PAGES) + global_lru_pages(); 411 x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
421 412
422 if (!vm_highmem_is_dirtyable) 413 if (!vm_highmem_is_dirtyable)
423 x -= highmem_dirtyable_memory(x); 414 x -= highmem_dirtyable_memory(x);
@@ -486,10 +477,11 @@ get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
486 * balance_dirty_pages() must be called by processes which are generating dirty 477 * balance_dirty_pages() must be called by processes which are generating dirty
487 * data. It looks at the number of dirty pages in the machine and will force 478 * data. It looks at the number of dirty pages in the machine and will force
488 * the caller to perform writeback if the system is over `vm_dirty_ratio'. 479 * the caller to perform writeback if the system is over `vm_dirty_ratio'.
489 * If we're over `background_thresh' then pdflush is woken to perform some 480 * If we're over `background_thresh' then the writeback threads are woken to
490 * writeout. 481 * perform some writeout.
491 */ 482 */
492static void balance_dirty_pages(struct address_space *mapping) 483static void balance_dirty_pages(struct address_space *mapping,
484 unsigned long write_chunk)
493{ 485{
494 long nr_reclaimable, bdi_nr_reclaimable; 486 long nr_reclaimable, bdi_nr_reclaimable;
495 long nr_writeback, bdi_nr_writeback; 487 long nr_writeback, bdi_nr_writeback;
@@ -497,7 +489,7 @@ static void balance_dirty_pages(struct address_space *mapping)
497 unsigned long dirty_thresh; 489 unsigned long dirty_thresh;
498 unsigned long bdi_thresh; 490 unsigned long bdi_thresh;
499 unsigned long pages_written = 0; 491 unsigned long pages_written = 0;
500 unsigned long write_chunk = sync_writeback_pages(); 492 unsigned long pause = 1;
501 493
502 struct backing_dev_info *bdi = mapping->backing_dev_info; 494 struct backing_dev_info *bdi = mapping->backing_dev_info;
503 495
@@ -540,9 +532,12 @@ static void balance_dirty_pages(struct address_space *mapping)
540 * filesystems (i.e. NFS) in which data may have been 532 * filesystems (i.e. NFS) in which data may have been
541 * written to the server's write cache, but has not yet 533 * written to the server's write cache, but has not yet
542 * been flushed to permanent storage. 534 * been flushed to permanent storage.
535 * Only move pages to writeback if this bdi is over its
536 * threshold otherwise wait until the disk writes catch
537 * up.
543 */ 538 */
544 if (bdi_nr_reclaimable) { 539 if (bdi_nr_reclaimable > bdi_thresh) {
545 writeback_inodes(&wbc); 540 writeback_inodes_wbc(&wbc);
546 pages_written += write_chunk - wbc.nr_to_write; 541 pages_written += write_chunk - wbc.nr_to_write;
547 get_dirty_limits(&background_thresh, &dirty_thresh, 542 get_dirty_limits(&background_thresh, &dirty_thresh,
548 &bdi_thresh, bdi); 543 &bdi_thresh, bdi);
@@ -571,7 +566,16 @@ static void balance_dirty_pages(struct address_space *mapping)
571 if (pages_written >= write_chunk) 566 if (pages_written >= write_chunk)
572 break; /* We've done our duty */ 567 break; /* We've done our duty */
573 568
574 congestion_wait(WRITE, HZ/10); 569 __set_current_state(TASK_INTERRUPTIBLE);
570 io_schedule_timeout(pause);
571
572 /*
573 * Increase the delay for each loop, up to our previous
574 * default of taking a 100ms nap.
575 */
576 pause <<= 1;
577 if (pause > HZ / 10)
578 pause = HZ / 10;
575 } 579 }
576 580
577 if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh && 581 if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
@@ -579,7 +583,7 @@ static void balance_dirty_pages(struct address_space *mapping)
579 bdi->dirty_exceeded = 0; 583 bdi->dirty_exceeded = 0;
580 584
581 if (writeback_in_progress(bdi)) 585 if (writeback_in_progress(bdi))
582 return; /* pdflush is already working this queue */ 586 return;
583 587
584 /* 588 /*
585 * In laptop mode, we wait until hitting the higher threshold before 589 * In laptop mode, we wait until hitting the higher threshold before
@@ -590,10 +594,10 @@ static void balance_dirty_pages(struct address_space *mapping)
590 * background_thresh, to keep the amount of dirty memory low. 594 * background_thresh, to keep the amount of dirty memory low.
591 */ 595 */
592 if ((laptop_mode && pages_written) || 596 if ((laptop_mode && pages_written) ||
593 (!laptop_mode && (global_page_state(NR_FILE_DIRTY) 597 (!laptop_mode && ((global_page_state(NR_FILE_DIRTY)
594 + global_page_state(NR_UNSTABLE_NFS) 598 + global_page_state(NR_UNSTABLE_NFS))
595 > background_thresh))) 599 > background_thresh)))
596 pdflush_operation(background_writeout, 0); 600 bdi_start_writeback(bdi, NULL, 0);
597} 601}
598 602
599void set_page_dirty_balance(struct page *page, int page_mkwrite) 603void set_page_dirty_balance(struct page *page, int page_mkwrite)
@@ -606,6 +610,8 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite)
606 } 610 }
607} 611}
608 612
613static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0;
614
609/** 615/**
610 * balance_dirty_pages_ratelimited_nr - balance dirty memory state 616 * balance_dirty_pages_ratelimited_nr - balance dirty memory state
611 * @mapping: address_space which was dirtied 617 * @mapping: address_space which was dirtied
@@ -623,7 +629,6 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite)
623void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, 629void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
624 unsigned long nr_pages_dirtied) 630 unsigned long nr_pages_dirtied)
625{ 631{
626 static DEFINE_PER_CPU(unsigned long, ratelimits) = 0;
627 unsigned long ratelimit; 632 unsigned long ratelimit;
628 unsigned long *p; 633 unsigned long *p;
629 634
@@ -636,12 +641,13 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
636 * tasks in balance_dirty_pages(). Period. 641 * tasks in balance_dirty_pages(). Period.
637 */ 642 */
638 preempt_disable(); 643 preempt_disable();
639 p = &__get_cpu_var(ratelimits); 644 p = &__get_cpu_var(bdp_ratelimits);
640 *p += nr_pages_dirtied; 645 *p += nr_pages_dirtied;
641 if (unlikely(*p >= ratelimit)) { 646 if (unlikely(*p >= ratelimit)) {
647 ratelimit = sync_writeback_pages(*p);
642 *p = 0; 648 *p = 0;
643 preempt_enable(); 649 preempt_enable();
644 balance_dirty_pages(mapping); 650 balance_dirty_pages(mapping, ratelimit);
645 return; 651 return;
646 } 652 }
647 preempt_enable(); 653 preempt_enable();
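
balance_dirty_pages_ratelimited_nr() batches work in a per-CPU counter: each dirtying call just bumps the counter, and only when it crosses the ratelimit does the task drop into the expensive balancing path, now with a write chunk derived from how much was actually accumulated. The sketch below models the same batching with a thread-local counter; the flush() callback stands in for balance_dirty_pages().

	/* Thread-local batching in the style of bdp_ratelimits. */
	#include <stdio.h>

	static unsigned long ratelimit = 8;		/* pages dirtied before we act */
	static __thread unsigned long bdp_ratelimit;	/* per-thread, like per-CPU */

	static void flush(unsigned long accumulated)
	{
		printf("  -> balance, write chunk derived from %lu dirtied pages\n",
		       accumulated);
	}

	static void dirtied(unsigned long nr_pages)
	{
		bdp_ratelimit += nr_pages;
		if (bdp_ratelimit >= ratelimit) {
			unsigned long batch = bdp_ratelimit;

			bdp_ratelimit = 0;	/* reset before the slow path */
			flush(batch);
		}
	}

	int main(void)
	{
		int i;

		for (i = 0; i < 20; i++) {
			printf("dirty 3 pages\n");
			dirtied(3);
		}
		return 0;
	}
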
@@ -665,7 +671,7 @@ void throttle_vm_writeout(gfp_t gfp_mask)
665 if (global_page_state(NR_UNSTABLE_NFS) + 671 if (global_page_state(NR_UNSTABLE_NFS) +
666 global_page_state(NR_WRITEBACK) <= dirty_thresh) 672 global_page_state(NR_WRITEBACK) <= dirty_thresh)
667 break; 673 break;
668 congestion_wait(WRITE, HZ/10); 674 congestion_wait(BLK_RW_ASYNC, HZ/10);
669 675
670 /* 676 /*
671 * The caller might hold locks which can prevent IO completion 677 * The caller might hold locks which can prevent IO completion
@@ -677,153 +683,35 @@ void throttle_vm_writeout(gfp_t gfp_mask)
677 } 683 }
678} 684}
679 685
680/*
681 * writeback at least _min_pages, and keep writing until the amount of dirty
682 * memory is less than the background threshold, or until we're all clean.
683 */
684static void background_writeout(unsigned long _min_pages)
685{
686 long min_pages = _min_pages;
687 struct writeback_control wbc = {
688 .bdi = NULL,
689 .sync_mode = WB_SYNC_NONE,
690 .older_than_this = NULL,
691 .nr_to_write = 0,
692 .nonblocking = 1,
693 .range_cyclic = 1,
694 };
695
696 for ( ; ; ) {
697 unsigned long background_thresh;
698 unsigned long dirty_thresh;
699
700 get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
701 if (global_page_state(NR_FILE_DIRTY) +
702 global_page_state(NR_UNSTABLE_NFS) < background_thresh
703 && min_pages <= 0)
704 break;
705 wbc.more_io = 0;
706 wbc.encountered_congestion = 0;
707 wbc.nr_to_write = MAX_WRITEBACK_PAGES;
708 wbc.pages_skipped = 0;
709 writeback_inodes(&wbc);
710 min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
711 if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
712 /* Wrote less than expected */
713 if (wbc.encountered_congestion || wbc.more_io)
714 congestion_wait(WRITE, HZ/10);
715 else
716 break;
717 }
718 }
719}
720
721/*
722 * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
723 * the whole world. Returns 0 if a pdflush thread was dispatched. Returns
724 * -1 if all pdflush threads were busy.
725 */
726int wakeup_pdflush(long nr_pages)
727{
728 if (nr_pages == 0)
729 nr_pages = global_page_state(NR_FILE_DIRTY) +
730 global_page_state(NR_UNSTABLE_NFS);
731 return pdflush_operation(background_writeout, nr_pages);
732}
733
734static void wb_timer_fn(unsigned long unused);
735static void laptop_timer_fn(unsigned long unused); 686static void laptop_timer_fn(unsigned long unused);
736 687
737static DEFINE_TIMER(wb_timer, wb_timer_fn, 0, 0);
738static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0); 688static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
739 689
740/* 690/*
741 * Periodic writeback of "old" data.
742 *
743 * Define "old": the first time one of an inode's pages is dirtied, we mark the
744 * dirtying-time in the inode's address_space. So this periodic writeback code
745 * just walks the superblock inode list, writing back any inodes which are
746 * older than a specific point in time.
747 *
748 * Try to run once per dirty_writeback_interval. But if a writeback event
749 * takes longer than a dirty_writeback_interval interval, then leave a
750 * one-second gap.
751 *
752 * older_than_this takes precedence over nr_to_write. So we'll only write back
753 * all dirty pages if they are all attached to "old" mappings.
754 */
755static void wb_kupdate(unsigned long arg)
756{
757 unsigned long oldest_jif;
758 unsigned long start_jif;
759 unsigned long next_jif;
760 long nr_to_write;
761 struct writeback_control wbc = {
762 .bdi = NULL,
763 .sync_mode = WB_SYNC_NONE,
764 .older_than_this = &oldest_jif,
765 .nr_to_write = 0,
766 .nonblocking = 1,
767 .for_kupdate = 1,
768 .range_cyclic = 1,
769 };
770
771 sync_supers();
772
773 oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval);
774 start_jif = jiffies;
775 next_jif = start_jif + msecs_to_jiffies(dirty_writeback_interval * 10);
776 nr_to_write = global_page_state(NR_FILE_DIRTY) +
777 global_page_state(NR_UNSTABLE_NFS) +
778 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
779 while (nr_to_write > 0) {
780 wbc.more_io = 0;
781 wbc.encountered_congestion = 0;
782 wbc.nr_to_write = MAX_WRITEBACK_PAGES;
783 writeback_inodes(&wbc);
784 if (wbc.nr_to_write > 0) {
785 if (wbc.encountered_congestion || wbc.more_io)
786 congestion_wait(WRITE, HZ/10);
787 else
788 break; /* All the old data is written */
789 }
790 nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
791 }
792 if (time_before(next_jif, jiffies + HZ))
793 next_jif = jiffies + HZ;
794 if (dirty_writeback_interval)
795 mod_timer(&wb_timer, next_jif);
796}
797
798/*
799 * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs 691 * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
800 */ 692 */
801int dirty_writeback_centisecs_handler(ctl_table *table, int write, 693int dirty_writeback_centisecs_handler(ctl_table *table, int write,
802 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 694 void __user *buffer, size_t *length, loff_t *ppos)
803{ 695{
804 proc_dointvec(table, write, file, buffer, length, ppos); 696 proc_dointvec(table, write, buffer, length, ppos);
805 if (dirty_writeback_interval)
806 mod_timer(&wb_timer, jiffies +
807 msecs_to_jiffies(dirty_writeback_interval * 10));
808 else
809 del_timer(&wb_timer);
810 return 0; 697 return 0;
811} 698}
812 699
813static void wb_timer_fn(unsigned long unused) 700static void do_laptop_sync(struct work_struct *work)
814{ 701{
815 if (pdflush_operation(wb_kupdate, 0) < 0) 702 wakeup_flusher_threads(0);
816 mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */ 703 kfree(work);
817}
818
819static void laptop_flush(unsigned long unused)
820{
821 sys_sync();
822} 704}
823 705
824static void laptop_timer_fn(unsigned long unused) 706static void laptop_timer_fn(unsigned long unused)
825{ 707{
826 pdflush_operation(laptop_flush, 0); 708 struct work_struct *work;
709
710 work = kmalloc(sizeof(*work), GFP_ATOMIC);
711 if (work) {
712 INIT_WORK(work, do_laptop_sync);
713 schedule_work(work);
714 }
827} 715}
828 716
829/* 717/*
@@ -906,8 +794,6 @@ void __init page_writeback_init(void)
906{ 794{
907 int shift; 795 int shift;
908 796
909 mod_timer(&wb_timer,
910 jiffies + msecs_to_jiffies(dirty_writeback_interval * 10));
911 writeback_set_ratelimit(); 797 writeback_set_ratelimit();
912 register_cpu_notifier(&ratelimit_nb); 798 register_cpu_notifier(&ratelimit_nb);
913 799
@@ -935,7 +821,6 @@ int write_cache_pages(struct address_space *mapping,
935 struct writeback_control *wbc, writepage_t writepage, 821 struct writeback_control *wbc, writepage_t writepage,
936 void *data) 822 void *data)
937{ 823{
938 struct backing_dev_info *bdi = mapping->backing_dev_info;
939 int ret = 0; 824 int ret = 0;
940 int done = 0; 825 int done = 0;
941 struct pagevec pvec; 826 struct pagevec pvec;
@@ -948,11 +833,6 @@ int write_cache_pages(struct address_space *mapping,
948 int range_whole = 0; 833 int range_whole = 0;
949 long nr_to_write = wbc->nr_to_write; 834 long nr_to_write = wbc->nr_to_write;
950 835
951 if (wbc->nonblocking && bdi_write_congested(bdi)) {
952 wbc->encountered_congestion = 1;
953 return 0;
954 }
955
956 pagevec_init(&pvec, 0); 836 pagevec_init(&pvec, 0);
957 if (wbc->range_cyclic) { 837 if (wbc->range_cyclic) {
958 writeback_index = mapping->writeback_index; /* prev offset */ 838 writeback_index = mapping->writeback_index; /* prev offset */
@@ -1071,12 +951,6 @@ continue_unlock:
1071 break; 951 break;
1072 } 952 }
1073 } 953 }
1074
1075 if (wbc->nonblocking && bdi_write_congested(bdi)) {
1076 wbc->encountered_congestion = 1;
1077 done = 1;
1078 break;
1079 }
1080 } 954 }
1081 pagevec_release(&pvec); 955 pagevec_release(&pvec);
1082 cond_resched(); 956 cond_resched();
@@ -1141,12 +1015,10 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
1141 1015
1142 if (wbc->nr_to_write <= 0) 1016 if (wbc->nr_to_write <= 0)
1143 return 0; 1017 return 0;
1144 wbc->for_writepages = 1;
1145 if (mapping->a_ops->writepages) 1018 if (mapping->a_ops->writepages)
1146 ret = mapping->a_ops->writepages(mapping, wbc); 1019 ret = mapping->a_ops->writepages(mapping, wbc);
1147 else 1020 else
1148 ret = generic_writepages(mapping, wbc); 1021 ret = generic_writepages(mapping, wbc);
1149 wbc->for_writepages = 0;
1150 return ret; 1022 return ret;
1151} 1023}
1152 1024
@@ -1270,6 +1142,13 @@ int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
1270EXPORT_SYMBOL(redirty_page_for_writepage); 1142EXPORT_SYMBOL(redirty_page_for_writepage);
1271 1143
1272/* 1144/*
1145 * Dirty a page.
1146 *
1147 * For pages with a mapping this should be done under the page lock
 1148 * for the benefit of asynchronous memory error handling, which prefers a
 1149 * consistent dirty state. This rule can be broken in some special cases,
 1150 * but it is better not to.
1151 *
1273 * If the mapping doesn't provide a set_page_dirty a_op, then 1152 * If the mapping doesn't provide a set_page_dirty a_op, then
1274 * just fall through and assume that it wants buffer_heads. 1153 * just fall through and assume that it wants buffer_heads.
1275 */ 1154 */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0284e528748d..2bc2ac63f41e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -23,6 +23,7 @@
23#include <linux/bootmem.h> 23#include <linux/bootmem.h>
24#include <linux/compiler.h> 24#include <linux/compiler.h>
25#include <linux/kernel.h> 25#include <linux/kernel.h>
26#include <linux/kmemcheck.h>
26#include <linux/module.h> 27#include <linux/module.h>
27#include <linux/suspend.h> 28#include <linux/suspend.h>
28#include <linux/pagevec.h> 29#include <linux/pagevec.h>
@@ -46,6 +47,8 @@
46#include <linux/page-isolation.h> 47#include <linux/page-isolation.h>
47#include <linux/page_cgroup.h> 48#include <linux/page_cgroup.h>
48#include <linux/debugobjects.h> 49#include <linux/debugobjects.h>
50#include <linux/kmemleak.h>
51#include <trace/events/kmem.h>
49 52
50#include <asm/tlbflush.h> 53#include <asm/tlbflush.h>
51#include <asm/div64.h> 54#include <asm/div64.h>
@@ -69,8 +72,8 @@ EXPORT_SYMBOL(node_states);
69 72
70unsigned long totalram_pages __read_mostly; 73unsigned long totalram_pages __read_mostly;
71unsigned long totalreserve_pages __read_mostly; 74unsigned long totalreserve_pages __read_mostly;
72unsigned long highest_memmap_pfn __read_mostly;
73int percpu_pagelist_fraction; 75int percpu_pagelist_fraction;
76gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
74 77
75#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 78#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
76int pageblock_order __read_mostly; 79int pageblock_order __read_mostly;
@@ -120,8 +123,8 @@ static char * const zone_names[MAX_NR_ZONES] = {
120 123
121int min_free_kbytes = 1024; 124int min_free_kbytes = 1024;
122 125
123unsigned long __meminitdata nr_kernel_pages; 126static unsigned long __meminitdata nr_kernel_pages;
124unsigned long __meminitdata nr_all_pages; 127static unsigned long __meminitdata nr_all_pages;
125static unsigned long __meminitdata dma_reserve; 128static unsigned long __meminitdata dma_reserve;
126 129
127#ifdef CONFIG_ARCH_POPULATES_NODE_MAP 130#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
@@ -149,10 +152,6 @@ static unsigned long __meminitdata dma_reserve;
149 static int __meminitdata nr_nodemap_entries; 152 static int __meminitdata nr_nodemap_entries;
150 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; 153 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
151 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; 154 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
152#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
153 static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES];
154 static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES];
155#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
156 static unsigned long __initdata required_kernelcore; 155 static unsigned long __initdata required_kernelcore;
157 static unsigned long __initdata required_movablecore; 156 static unsigned long __initdata required_movablecore;
158 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; 157 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
@@ -164,17 +163,25 @@ static unsigned long __meminitdata dma_reserve;
164 163
165#if MAX_NUMNODES > 1 164#if MAX_NUMNODES > 1
166int nr_node_ids __read_mostly = MAX_NUMNODES; 165int nr_node_ids __read_mostly = MAX_NUMNODES;
166int nr_online_nodes __read_mostly = 1;
167EXPORT_SYMBOL(nr_node_ids); 167EXPORT_SYMBOL(nr_node_ids);
168EXPORT_SYMBOL(nr_online_nodes);
168#endif 169#endif
169 170
170int page_group_by_mobility_disabled __read_mostly; 171int page_group_by_mobility_disabled __read_mostly;
171 172
172static void set_pageblock_migratetype(struct page *page, int migratetype) 173static void set_pageblock_migratetype(struct page *page, int migratetype)
173{ 174{
175
176 if (unlikely(page_group_by_mobility_disabled))
177 migratetype = MIGRATE_UNMOVABLE;
178
174 set_pageblock_flags_group(page, (unsigned long)migratetype, 179 set_pageblock_flags_group(page, (unsigned long)migratetype,
175 PB_migrate, PB_migrate_end); 180 PB_migrate, PB_migrate_end);
176} 181}
177 182
183bool oom_killer_disabled __read_mostly;
184
178#ifdef CONFIG_DEBUG_VM 185#ifdef CONFIG_DEBUG_VM
179static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 186static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
180{ 187{
@@ -227,6 +234,12 @@ static void bad_page(struct page *page)
227 static unsigned long nr_shown; 234 static unsigned long nr_shown;
228 static unsigned long nr_unshown; 235 static unsigned long nr_unshown;
229 236
237 /* Don't complain about poisoned pages */
238 if (PageHWPoison(page)) {
239 __ClearPageBuddy(page);
240 return;
241 }
242
230 /* 243 /*
231 * Allow a burst of 60 reports, then keep quiet for that minute; 244 * Allow a burst of 60 reports, then keep quiet for that minute;
232 * or allow a steady drip of one report per second. 245 * or allow a steady drip of one report per second.
@@ -297,23 +310,6 @@ void prep_compound_page(struct page *page, unsigned long order)
297 } 310 }
298} 311}
299 312
300#ifdef CONFIG_HUGETLBFS
301void prep_compound_gigantic_page(struct page *page, unsigned long order)
302{
303 int i;
304 int nr_pages = 1 << order;
305 struct page *p = page + 1;
306
307 set_compound_page_dtor(page, free_compound_page);
308 set_compound_order(page, order);
309 __SetPageHead(page);
310 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
311 __SetPageTail(p);
312 p->first_page = page;
313 }
314}
315#endif
316
317static int destroy_compound_page(struct page *page, unsigned long order) 313static int destroy_compound_page(struct page *page, unsigned long order)
318{ 314{
319 int i; 315 int i;
@@ -331,7 +327,7 @@ static int destroy_compound_page(struct page *page, unsigned long order)
331 for (i = 1; i < nr_pages; i++) { 327 for (i = 1; i < nr_pages; i++) {
332 struct page *p = page + i; 328 struct page *p = page + i;
333 329
334 if (unlikely(!PageTail(p) | (p->first_page != page))) { 330 if (unlikely(!PageTail(p) || (p->first_page != page))) {
335 bad_page(page); 331 bad_page(page);
336 bad++; 332 bad++;
337 } 333 }
@@ -420,7 +416,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
420 return 0; 416 return 0;
421 417
422 if (PageBuddy(buddy) && page_order(buddy) == order) { 418 if (PageBuddy(buddy) && page_order(buddy) == order) {
423 BUG_ON(page_count(buddy) != 0); 419 VM_BUG_ON(page_count(buddy) != 0);
424 return 1; 420 return 1;
425 } 421 }
426 return 0; 422 return 0;
@@ -451,22 +447,22 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
451 */ 447 */
452 448
453static inline void __free_one_page(struct page *page, 449static inline void __free_one_page(struct page *page,
454 struct zone *zone, unsigned int order) 450 struct zone *zone, unsigned int order,
451 int migratetype)
455{ 452{
456 unsigned long page_idx; 453 unsigned long page_idx;
457 int order_size = 1 << order;
458 int migratetype = get_pageblock_migratetype(page);
459 454
460 if (unlikely(PageCompound(page))) 455 if (unlikely(PageCompound(page)))
461 if (unlikely(destroy_compound_page(page, order))) 456 if (unlikely(destroy_compound_page(page, order)))
462 return; 457 return;
463 458
459 VM_BUG_ON(migratetype == -1);
460
464 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 461 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
465 462
466 VM_BUG_ON(page_idx & (order_size - 1)); 463 VM_BUG_ON(page_idx & ((1 << order) - 1));
467 VM_BUG_ON(bad_range(zone, page)); 464 VM_BUG_ON(bad_range(zone, page));
468 465
469 __mod_zone_page_state(zone, NR_FREE_PAGES, order_size);
470 while (order < MAX_ORDER-1) { 466 while (order < MAX_ORDER-1) {
471 unsigned long combined_idx; 467 unsigned long combined_idx;
472 struct page *buddy; 468 struct page *buddy;
@@ -490,12 +486,26 @@ static inline void __free_one_page(struct page *page,
490 zone->free_area[order].nr_free++; 486 zone->free_area[order].nr_free++;
491} 487}
492 488
489#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
490/*
491 * free_page_mlock() -- clean up attempts to free an mlocked() page.
492 * Page should not be on lru, so no need to fix that up.
493 * free_pages_check() will verify...
494 */
495static inline void free_page_mlock(struct page *page)
496{
497 __dec_zone_page_state(page, NR_MLOCK);
498 __count_vm_event(UNEVICTABLE_MLOCKFREED);
499}
500#else
501static void free_page_mlock(struct page *page) { }
502#endif
503
493static inline int free_pages_check(struct page *page) 504static inline int free_pages_check(struct page *page)
494{ 505{
495 free_page_mlock(page);
496 if (unlikely(page_mapcount(page) | 506 if (unlikely(page_mapcount(page) |
497 (page->mapping != NULL) | 507 (page->mapping != NULL) |
498 (page_count(page) != 0) | 508 (atomic_read(&page->_count) != 0) |
499 (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) { 509 (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) {
500 bad_page(page); 510 bad_page(page);
501 return 1; 511 return 1;
@@ -506,7 +516,7 @@ static inline int free_pages_check(struct page *page)
506} 516}
507 517
508/* 518/*
509 * Frees a list of pages. 519 * Frees a number of pages from the PCP lists
510 * Assumes all pages on list are in same zone, and of same order. 520 * Assumes all pages on list are in same zone, and of same order.
511 * count is the number of pages to free. 521 * count is the number of pages to free.
512 * 522 *
@@ -516,30 +526,55 @@ static inline int free_pages_check(struct page *page)
516 * And clear the zone's pages_scanned counter, to hold off the "all pages are 526 * And clear the zone's pages_scanned counter, to hold off the "all pages are
517 * pinned" detection logic. 527 * pinned" detection logic.
518 */ 528 */
519static void free_pages_bulk(struct zone *zone, int count, 529static void free_pcppages_bulk(struct zone *zone, int count,
520 struct list_head *list, int order) 530 struct per_cpu_pages *pcp)
521{ 531{
532 int migratetype = 0;
533 int batch_free = 0;
534
522 spin_lock(&zone->lock); 535 spin_lock(&zone->lock);
523 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); 536 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
524 zone->pages_scanned = 0; 537 zone->pages_scanned = 0;
525 while (count--) { 538
539 __mod_zone_page_state(zone, NR_FREE_PAGES, count);
540 while (count) {
526 struct page *page; 541 struct page *page;
542 struct list_head *list;
527 543
528 VM_BUG_ON(list_empty(list)); 544 /*
529 page = list_entry(list->prev, struct page, lru); 545 * Remove pages from lists in a round-robin fashion. A
530 /* have to delete it as __free_one_page list manipulates */ 546 * batch_free count is maintained that is incremented when an
531 list_del(&page->lru); 547 * empty list is encountered. This is so more pages are freed
532 __free_one_page(page, zone, order); 548 * off fuller lists instead of spinning excessively around empty
549 * lists
550 */
551 do {
552 batch_free++;
553 if (++migratetype == MIGRATE_PCPTYPES)
554 migratetype = 0;
555 list = &pcp->lists[migratetype];
556 } while (list_empty(list));
557
558 do {
559 page = list_entry(list->prev, struct page, lru);
560 /* must delete as __free_one_page list manipulates */
561 list_del(&page->lru);
562 __free_one_page(page, zone, 0, migratetype);
563 trace_mm_page_pcpu_drain(page, 0, migratetype);
564 } while (--count && --batch_free && !list_empty(list));
533 } 565 }
534 spin_unlock(&zone->lock); 566 spin_unlock(&zone->lock);
535} 567}
536 568
537static void free_one_page(struct zone *zone, struct page *page, int order) 569static void free_one_page(struct zone *zone, struct page *page, int order,
570 int migratetype)
538{ 571{
539 spin_lock(&zone->lock); 572 spin_lock(&zone->lock);
540 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); 573 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
541 zone->pages_scanned = 0; 574 zone->pages_scanned = 0;
542 __free_one_page(page, zone, order); 575
576 __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
577 __free_one_page(page, zone, order, migratetype);
543 spin_unlock(&zone->lock); 578 spin_unlock(&zone->lock);
544} 579}
545 580
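
free_pcppages_bulk() above drains the per-cpu pages round-robin across the migratetype lists, and batch_free grows each time an empty list is skipped so that fuller lists give up more pages per pass. A small standalone model of just that selection loop; three plain arrays stand in for the MIGRATE_PCPTYPES lists and the counts are illustrative:

    #include <stdio.h>

    #define NTYPES 3   /* stands in for MIGRATE_PCPTYPES */

    int main(void)
    {
        int list[NTYPES] = { 5, 0, 2 };   /* pages queued per pcp list (illustrative) */
        int count = 6;                    /* pages to free; must not exceed the total queued */
        int migratetype = 0, batch_free = 0;

        while (count) {
            /* Skip empty lists, bumping batch_free for each one skipped. */
            do {
                batch_free++;
                if (++migratetype == NTYPES)
                    migratetype = 0;
            } while (list[migratetype] == 0);

            /* Free up to batch_free pages from the list we landed on. */
            do {
                list[migratetype]--;
                printf("freed one page from list %d\n", migratetype);
            } while (--count && --batch_free && list[migratetype]);
        }
        return 0;
    }

As in the kernel loop, the caller must not ask for more pages than are actually queued, or the inner skip loop would spin forever.
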
@@ -548,6 +583,9 @@ static void __free_pages_ok(struct page *page, unsigned int order)
548 unsigned long flags; 583 unsigned long flags;
549 int i; 584 int i;
550 int bad = 0; 585 int bad = 0;
586 int wasMlocked = __TestClearPageMlocked(page);
587
588 kmemcheck_free_shadow(page, order);
551 589
552 for (i = 0 ; i < (1 << order) ; ++i) 590 for (i = 0 ; i < (1 << order) ; ++i)
553 bad += free_pages_check(page + i); 591 bad += free_pages_check(page + i);
@@ -563,8 +601,11 @@ static void __free_pages_ok(struct page *page, unsigned int order)
563 kernel_map_pages(page, 1 << order, 0); 601 kernel_map_pages(page, 1 << order, 0);
564 602
565 local_irq_save(flags); 603 local_irq_save(flags);
604 if (unlikely(wasMlocked))
605 free_page_mlock(page);
566 __count_vm_events(PGFREE, 1 << order); 606 __count_vm_events(PGFREE, 1 << order);
567 free_one_page(page_zone(page), page, order); 607 free_one_page(page_zone(page), page, order,
608 get_pageblock_migratetype(page));
568 local_irq_restore(flags); 609 local_irq_restore(flags);
569} 610}
570 611
@@ -631,15 +672,27 @@ static inline void expand(struct zone *zone, struct page *page,
631/* 672/*
632 * This page is about to be returned from the page allocator 673 * This page is about to be returned from the page allocator
633 */ 674 */
634static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) 675static inline int check_new_page(struct page *page)
635{ 676{
636 if (unlikely(page_mapcount(page) | 677 if (unlikely(page_mapcount(page) |
637 (page->mapping != NULL) | 678 (page->mapping != NULL) |
638 (page_count(page) != 0) | 679 (atomic_read(&page->_count) != 0) |
639 (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) { 680 (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) {
640 bad_page(page); 681 bad_page(page);
641 return 1; 682 return 1;
642 } 683 }
684 return 0;
685}
686
687static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
688{
689 int i;
690
691 for (i = 0; i < (1 << order); i++) {
692 struct page *p = page + i;
693 if (unlikely(check_new_page(p)))
694 return 1;
695 }
643 696
644 set_page_private(page, 0); 697 set_page_private(page, 0);
645 set_page_refcounted(page); 698 set_page_refcounted(page);
@@ -660,7 +713,8 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
660 * Go through the free lists for the given migratetype and remove 713 * Go through the free lists for the given migratetype and remove
661 * the smallest available page from the freelists 714 * the smallest available page from the freelists
662 */ 715 */
663static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, 716static inline
717struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
664 int migratetype) 718 int migratetype)
665{ 719{
666 unsigned int current_order; 720 unsigned int current_order;
@@ -678,7 +732,6 @@ static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
678 list_del(&page->lru); 732 list_del(&page->lru);
679 rmv_page_order(page); 733 rmv_page_order(page);
680 area->nr_free--; 734 area->nr_free--;
681 __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order));
682 expand(zone, page, order, current_order, area, migratetype); 735 expand(zone, page, order, current_order, area, migratetype);
683 return page; 736 return page;
684 } 737 }
@@ -768,9 +821,20 @@ static int move_freepages_block(struct zone *zone, struct page *page,
768 return move_freepages(zone, start_page, end_page, migratetype); 821 return move_freepages(zone, start_page, end_page, migratetype);
769} 822}
770 823
824static void change_pageblock_range(struct page *pageblock_page,
825 int start_order, int migratetype)
826{
827 int nr_pageblocks = 1 << (start_order - pageblock_order);
828
829 while (nr_pageblocks--) {
830 set_pageblock_migratetype(pageblock_page, migratetype);
831 pageblock_page += pageblock_nr_pages;
832 }
833}
834
771/* Remove an element from the buddy allocator from the fallback list */ 835/* Remove an element from the buddy allocator from the fallback list */
772static struct page *__rmqueue_fallback(struct zone *zone, int order, 836static inline struct page *
773 int start_migratetype) 837__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
774{ 838{
775 struct free_area * area; 839 struct free_area * area;
776 int current_order; 840 int current_order;
@@ -802,13 +866,15 @@ static struct page *__rmqueue_fallback(struct zone *zone, int order,
802 * aggressive about taking ownership of free pages 866 * aggressive about taking ownership of free pages
803 */ 867 */
804 if (unlikely(current_order >= (pageblock_order >> 1)) || 868 if (unlikely(current_order >= (pageblock_order >> 1)) ||
805 start_migratetype == MIGRATE_RECLAIMABLE) { 869 start_migratetype == MIGRATE_RECLAIMABLE ||
870 page_group_by_mobility_disabled) {
806 unsigned long pages; 871 unsigned long pages;
807 pages = move_freepages_block(zone, page, 872 pages = move_freepages_block(zone, page,
808 start_migratetype); 873 start_migratetype);
809 874
810 /* Claim the whole block if over half of it is free */ 875 /* Claim the whole block if over half of it is free */
811 if (pages >= (1 << (pageblock_order-1))) 876 if (pages >= (1 << (pageblock_order-1)) ||
877 page_group_by_mobility_disabled)
812 set_pageblock_migratetype(page, 878 set_pageblock_migratetype(page,
813 start_migratetype); 879 start_migratetype);
814 880
@@ -818,20 +884,22 @@ static struct page *__rmqueue_fallback(struct zone *zone, int order,
818 /* Remove the page from the freelists */ 884 /* Remove the page from the freelists */
819 list_del(&page->lru); 885 list_del(&page->lru);
820 rmv_page_order(page); 886 rmv_page_order(page);
821 __mod_zone_page_state(zone, NR_FREE_PAGES,
822 -(1UL << order));
823 887
824 if (current_order == pageblock_order) 888 /* Take ownership for orders >= pageblock_order */
825 set_pageblock_migratetype(page, 889 if (current_order >= pageblock_order)
890 change_pageblock_range(page, current_order,
826 start_migratetype); 891 start_migratetype);
827 892
828 expand(zone, page, order, current_order, area, migratetype); 893 expand(zone, page, order, current_order, area, migratetype);
894
895 trace_mm_page_alloc_extfrag(page, order, current_order,
896 start_migratetype, migratetype);
897
829 return page; 898 return page;
830 } 899 }
831 } 900 }
832 901
833 /* Use MIGRATE_RESERVE rather than fail an allocation */ 902 return NULL;
834 return __rmqueue_smallest(zone, order, MIGRATE_RESERVE);
835} 903}
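
Two things change in the fallback path above: __rmqueue_fallback() now returns NULL instead of dipping into MIGRATE_RESERVE itself, and for orders at or above pageblock_order every pageblock spanned by the stolen range is retagged via change_pageblock_range(), not just the first one. A tiny standalone model of that retagging arithmetic; the pageblock order and starting pfn are illustrative:

    #include <stdio.h>

    #define PAGEBLOCK_ORDER     10                       /* illustrative value */
    #define PAGEBLOCK_NR_PAGES  (1UL << PAGEBLOCK_ORDER)

    static void change_pageblock_range(unsigned long first_pfn, int start_order, int migratetype)
    {
        int nr_pageblocks = 1 << (start_order - PAGEBLOCK_ORDER);

        while (nr_pageblocks--) {
            printf("pageblock at pfn %lu -> migratetype %d\n", first_pfn, migratetype);
            first_pfn += PAGEBLOCK_NR_PAGES;             /* step to the next pageblock */
        }
    }

    int main(void)
    {
        /* An order-12 chunk spans 1 << (12 - 10) = 4 pageblocks. */
        change_pageblock_range(0x100000, PAGEBLOCK_ORDER + 2, 1);
        return 0;
    }
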
836 904
837/* 905/*
@@ -843,11 +911,24 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order,
843{ 911{
844 struct page *page; 912 struct page *page;
845 913
914retry_reserve:
846 page = __rmqueue_smallest(zone, order, migratetype); 915 page = __rmqueue_smallest(zone, order, migratetype);
847 916
848 if (unlikely(!page)) 917 if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
849 page = __rmqueue_fallback(zone, order, migratetype); 918 page = __rmqueue_fallback(zone, order, migratetype);
850 919
920 /*
921 * Use MIGRATE_RESERVE rather than fail an allocation. goto
922 * is used because __rmqueue_smallest is an inline function
923 * and we want just one call site
924 */
925 if (!page) {
926 migratetype = MIGRATE_RESERVE;
927 goto retry_reserve;
928 }
929 }
930
931 trace_mm_page_alloc_zone_locked(page, order, migratetype);
851 return page; 932 return page;
852} 933}
853 934
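
With the fallback no longer touching the reserve, __rmqueue() above retries with MIGRATE_RESERVE itself through the retry_reserve label, keeping __rmqueue_smallest() at a single call site so it can be inlined. A compact model of that two-step fallback; the free lists are stubbed and the type names only mirror the kernel's:

    #include <stdio.h>

    enum { MOVABLE, RECLAIMABLE, UNMOVABLE, RESERVE, NR_TYPES };

    static int freecount[NR_TYPES] = { 0, 0, 0, 3 };   /* only the reserve has pages (illustrative) */

    static int rmqueue_smallest(int type)        /* returns 1 if a page was taken */
    {
        if (freecount[type] > 0) { freecount[type]--; return 1; }
        return 0;
    }

    static int rmqueue_fallback(int type)        /* steal from another type; empty here */
    {
        (void)type;
        return 0;
    }

    static int rmqueue(int migratetype)
    {
    retry_reserve:
        if (rmqueue_smallest(migratetype))
            return 1;
        if (migratetype != RESERVE) {
            if (rmqueue_fallback(migratetype))
                return 1;
            /* Last resort: dip into MIGRATE_RESERVE rather than fail. */
            migratetype = RESERVE;
            goto retry_reserve;
        }
        return 0;
    }

    int main(void)
    {
        printf("allocation %s\n", rmqueue(MOVABLE) ? "succeeded from reserve" : "failed");
        return 0;
    }
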
@@ -858,7 +939,7 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order,
858 */ 939 */
859static int rmqueue_bulk(struct zone *zone, unsigned int order, 940static int rmqueue_bulk(struct zone *zone, unsigned int order,
860 unsigned long count, struct list_head *list, 941 unsigned long count, struct list_head *list,
861 int migratetype) 942 int migratetype, int cold)
862{ 943{
863 int i; 944 int i;
864 945
@@ -877,10 +958,14 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
877 * merge IO requests if the physical pages are ordered 958 * merge IO requests if the physical pages are ordered
878 * properly. 959 * properly.
879 */ 960 */
880 list_add(&page->lru, list); 961 if (likely(cold == 0))
962 list_add(&page->lru, list);
963 else
964 list_add_tail(&page->lru, list);
881 set_page_private(page, migratetype); 965 set_page_private(page, migratetype);
882 list = &page->lru; 966 list = &page->lru;
883 } 967 }
968 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
884 spin_unlock(&zone->lock); 969 spin_unlock(&zone->lock);
885 return i; 970 return i;
886} 971}
@@ -904,7 +989,7 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
904 to_drain = pcp->batch; 989 to_drain = pcp->batch;
905 else 990 else
906 to_drain = pcp->count; 991 to_drain = pcp->count;
907 free_pages_bulk(zone, to_drain, &pcp->list, 0); 992 free_pcppages_bulk(zone, to_drain, pcp);
908 pcp->count -= to_drain; 993 pcp->count -= to_drain;
909 local_irq_restore(flags); 994 local_irq_restore(flags);
910} 995}
@@ -930,7 +1015,7 @@ static void drain_pages(unsigned int cpu)
930 1015
931 pcp = &pset->pcp; 1016 pcp = &pset->pcp;
932 local_irq_save(flags); 1017 local_irq_save(flags);
933 free_pages_bulk(zone, pcp->count, &pcp->list, 0); 1018 free_pcppages_bulk(zone, pcp->count, pcp);
934 pcp->count = 0; 1019 pcp->count = 0;
935 local_irq_restore(flags); 1020 local_irq_restore(flags);
936 } 1021 }
@@ -996,6 +1081,10 @@ static void free_hot_cold_page(struct page *page, int cold)
996 struct zone *zone = page_zone(page); 1081 struct zone *zone = page_zone(page);
997 struct per_cpu_pages *pcp; 1082 struct per_cpu_pages *pcp;
998 unsigned long flags; 1083 unsigned long flags;
1084 int migratetype;
1085 int wasMlocked = __TestClearPageMlocked(page);
1086
1087 kmemcheck_free_shadow(page, 0);
999 1088
1000 if (PageAnon(page)) 1089 if (PageAnon(page))
1001 page->mapping = NULL; 1090 page->mapping = NULL;
@@ -1010,32 +1099,49 @@ static void free_hot_cold_page(struct page *page, int cold)
1010 kernel_map_pages(page, 1, 0); 1099 kernel_map_pages(page, 1, 0);
1011 1100
1012 pcp = &zone_pcp(zone, get_cpu())->pcp; 1101 pcp = &zone_pcp(zone, get_cpu())->pcp;
1102 migratetype = get_pageblock_migratetype(page);
1103 set_page_private(page, migratetype);
1013 local_irq_save(flags); 1104 local_irq_save(flags);
1105 if (unlikely(wasMlocked))
1106 free_page_mlock(page);
1014 __count_vm_event(PGFREE); 1107 __count_vm_event(PGFREE);
1108
1109 /*
1110 * We only track unmovable, reclaimable and movable on pcp lists.
1111 * Free ISOLATE pages back to the allocator because they are being
1112 * offlined but treat RESERVE as movable pages so we can get those
1113 * areas back if necessary. Otherwise, we may have to free
1114 * excessively into the page allocator
1115 */
1116 if (migratetype >= MIGRATE_PCPTYPES) {
1117 if (unlikely(migratetype == MIGRATE_ISOLATE)) {
1118 free_one_page(zone, page, 0, migratetype);
1119 goto out;
1120 }
1121 migratetype = MIGRATE_MOVABLE;
1122 }
1123
1015 if (cold) 1124 if (cold)
1016 list_add_tail(&page->lru, &pcp->list); 1125 list_add_tail(&page->lru, &pcp->lists[migratetype]);
1017 else 1126 else
1018 list_add(&page->lru, &pcp->list); 1127 list_add(&page->lru, &pcp->lists[migratetype]);
1019 set_page_private(page, get_pageblock_migratetype(page));
1020 pcp->count++; 1128 pcp->count++;
1021 if (pcp->count >= pcp->high) { 1129 if (pcp->count >= pcp->high) {
1022 free_pages_bulk(zone, pcp->batch, &pcp->list, 0); 1130 free_pcppages_bulk(zone, pcp->batch, pcp);
1023 pcp->count -= pcp->batch; 1131 pcp->count -= pcp->batch;
1024 } 1132 }
1133
1134out:
1025 local_irq_restore(flags); 1135 local_irq_restore(flags);
1026 put_cpu(); 1136 put_cpu();
1027} 1137}
1028 1138
1029void free_hot_page(struct page *page) 1139void free_hot_page(struct page *page)
1030{ 1140{
1141 trace_mm_page_free_direct(page, 0);
1031 free_hot_cold_page(page, 0); 1142 free_hot_cold_page(page, 0);
1032} 1143}
1033 1144
1034void free_cold_page(struct page *page)
1035{
1036 free_hot_cold_page(page, 1);
1037}
1038
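
The free path above now routes by pageblock migratetype: ISOLATE pages go straight back to the buddy lists, RESERVE pages are queued as movable, only the three MIGRATE_PCPTYPES get their own per-cpu list, and cold pages are queued at the tail so hot pages are reused first. A minimal model of that routing decision; the enum mirrors the ordering the patch relies on, but the code itself is purely illustrative:

    #include <stdio.h>

    /* Order matters: the first three are the per-cpu (pcp) types. */
    enum { UNMOVABLE, RECLAIMABLE, MOVABLE, PCPTYPES, RESERVE = PCPTYPES, ISOLATE, NR_TYPES };

    static const char *route_free(int migratetype, int cold)
    {
        static char buf[64];

        if (migratetype >= PCPTYPES) {
            if (migratetype == ISOLATE)
                return "freed straight to the buddy lists";
            migratetype = MOVABLE;          /* RESERVE is queued as movable */
        }
        snprintf(buf, sizeof(buf), "queued on pcp list %d at the %s",
                 migratetype, cold ? "tail (cold)" : "head (hot)");
        return buf;
    }

    int main(void)
    {
        printf("movable, hot:  %s\n", route_free(MOVABLE, 0));
        printf("reserve, cold: %s\n", route_free(RESERVE, 1));
        printf("isolate:       %s\n", route_free(ISOLATE, 0));
        return 0;
    }
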
1039/* 1145/*
1040 * split_page takes a non-compound higher-order page, and splits it into 1146 * split_page takes a non-compound higher-order page, and splits it into
1041 * n (1<<order) sub-pages: page[0..n] 1147 * n (1<<order) sub-pages: page[0..n]
@@ -1050,6 +1156,16 @@ void split_page(struct page *page, unsigned int order)
1050 1156
1051 VM_BUG_ON(PageCompound(page)); 1157 VM_BUG_ON(PageCompound(page));
1052 VM_BUG_ON(!page_count(page)); 1158 VM_BUG_ON(!page_count(page));
1159
1160#ifdef CONFIG_KMEMCHECK
1161 /*
1162 * Split shadow pages too, because free(page[0]) would
1163 * otherwise free the whole shadow.
1164 */
1165 if (kmemcheck_page_is_tracked(page))
1166 split_page(virt_to_page(page[0].shadow), order);
1167#endif
1168
1053 for (i = 1; i < (1 << order); i++) 1169 for (i = 1; i < (1 << order); i++)
1054 set_page_refcounted(page + i); 1170 set_page_refcounted(page + i);
1055} 1171}
@@ -1059,52 +1175,57 @@ void split_page(struct page *page, unsigned int order)
1059 * we cheat by calling it from here, in the order > 0 path. Saves a branch 1175 * we cheat by calling it from here, in the order > 0 path. Saves a branch
1060 * or two. 1176 * or two.
1061 */ 1177 */
1062static struct page *buffered_rmqueue(struct zone *preferred_zone, 1178static inline
1063 struct zone *zone, int order, gfp_t gfp_flags) 1179struct page *buffered_rmqueue(struct zone *preferred_zone,
1180 struct zone *zone, int order, gfp_t gfp_flags,
1181 int migratetype)
1064{ 1182{
1065 unsigned long flags; 1183 unsigned long flags;
1066 struct page *page; 1184 struct page *page;
1067 int cold = !!(gfp_flags & __GFP_COLD); 1185 int cold = !!(gfp_flags & __GFP_COLD);
1068 int cpu; 1186 int cpu;
1069 int migratetype = allocflags_to_migratetype(gfp_flags);
1070 1187
1071again: 1188again:
1072 cpu = get_cpu(); 1189 cpu = get_cpu();
1073 if (likely(order == 0)) { 1190 if (likely(order == 0)) {
1074 struct per_cpu_pages *pcp; 1191 struct per_cpu_pages *pcp;
1192 struct list_head *list;
1075 1193
1076 pcp = &zone_pcp(zone, cpu)->pcp; 1194 pcp = &zone_pcp(zone, cpu)->pcp;
1195 list = &pcp->lists[migratetype];
1077 local_irq_save(flags); 1196 local_irq_save(flags);
1078 if (!pcp->count) { 1197 if (list_empty(list)) {
1079 pcp->count = rmqueue_bulk(zone, 0, 1198 pcp->count += rmqueue_bulk(zone, 0,
1080 pcp->batch, &pcp->list, migratetype); 1199 pcp->batch, list,
1081 if (unlikely(!pcp->count)) 1200 migratetype, cold);
1201 if (unlikely(list_empty(list)))
1082 goto failed; 1202 goto failed;
1083 } 1203 }
1084 1204
1085 /* Find a page of the appropriate migrate type */ 1205 if (cold)
1086 if (cold) { 1206 page = list_entry(list->prev, struct page, lru);
1087 list_for_each_entry_reverse(page, &pcp->list, lru) 1207 else
1088 if (page_private(page) == migratetype) 1208 page = list_entry(list->next, struct page, lru);
1089 break;
1090 } else {
1091 list_for_each_entry(page, &pcp->list, lru)
1092 if (page_private(page) == migratetype)
1093 break;
1094 }
1095
1096 /* Allocate more to the pcp list if necessary */
1097 if (unlikely(&page->lru == &pcp->list)) {
1098 pcp->count += rmqueue_bulk(zone, 0,
1099 pcp->batch, &pcp->list, migratetype);
1100 page = list_entry(pcp->list.next, struct page, lru);
1101 }
1102 1209
1103 list_del(&page->lru); 1210 list_del(&page->lru);
1104 pcp->count--; 1211 pcp->count--;
1105 } else { 1212 } else {
1213 if (unlikely(gfp_flags & __GFP_NOFAIL)) {
1214 /*
1215 * __GFP_NOFAIL is not to be used in new code.
1216 *
1217 * All __GFP_NOFAIL callers should be fixed so that they
1218 * properly detect and handle allocation failures.
1219 *
1220 * We most definitely don't want callers attempting to
1221 * allocate greater than order-1 page units with
1222 * __GFP_NOFAIL.
1223 */
1224 WARN_ON_ONCE(order > 1);
1225 }
1106 spin_lock_irqsave(&zone->lock, flags); 1226 spin_lock_irqsave(&zone->lock, flags);
1107 page = __rmqueue(zone, order, migratetype); 1227 page = __rmqueue(zone, order, migratetype);
1228 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
1108 spin_unlock(&zone->lock); 1229 spin_unlock(&zone->lock);
1109 if (!page) 1230 if (!page)
1110 goto failed; 1231 goto failed;
@@ -1126,10 +1247,15 @@ failed:
1126 return NULL; 1247 return NULL;
1127} 1248}
1128 1249
1129#define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ 1250/* The ALLOC_WMARK bits are used as an index to zone->watermark */
1130#define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */ 1251#define ALLOC_WMARK_MIN WMARK_MIN
1131#define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */ 1252#define ALLOC_WMARK_LOW WMARK_LOW
1132#define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */ 1253#define ALLOC_WMARK_HIGH WMARK_HIGH
1254#define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */
1255
1256/* Mask to get the watermark bits */
1257#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1)
1258
1133#define ALLOC_HARDER 0x10 /* try to alloc harder */ 1259#define ALLOC_HARDER 0x10 /* try to alloc harder */
1134#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ 1260#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
1135#define ALLOC_CPUSET 0x40 /* check for correct cpuset */ 1261#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
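
The watermark flags above stop being independent bits: ALLOC_WMARK_MIN/LOW/HIGH become the WMARK_* indices into zone->watermark[], and ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS - 1) extracts that index from alloc_flags, turning the old if/else chain into a single array lookup. A short sketch of the encoding; the watermark values are made up:

    #include <stdio.h>

    enum wmark { WMARK_MIN, WMARK_LOW, WMARK_HIGH, NR_WMARK };

    #define ALLOC_WMARK_MIN      WMARK_MIN
    #define ALLOC_WMARK_LOW      WMARK_LOW
    #define ALLOC_WMARK_HIGH     WMARK_HIGH
    #define ALLOC_NO_WATERMARKS  0x04                    /* must stay above NR_WMARK */
    #define ALLOC_WMARK_MASK     (ALLOC_NO_WATERMARKS - 1)
    #define ALLOC_HARDER         0x10
    #define ALLOC_HIGH           0x20

    int main(void)
    {
        unsigned long watermark[NR_WMARK] = { 128, 160, 192 };   /* illustrative page counts */
        int alloc_flags = ALLOC_WMARK_LOW | ALLOC_HIGH;

        if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
            /* One lookup replaces the old ALLOC_WMARK_MIN/LOW/HIGH if-chain. */
            unsigned long mark = watermark[alloc_flags & ALLOC_WMARK_MASK];
            printf("checking against watermark of %lu pages\n", mark);
        }
        return 0;
    }

The only constraint is that ALLOC_NO_WATERMARKS stays above NR_WMARK so it never collides with an index, which the BUILD_BUG_ON added in the zonelist-scan hunk below enforces.
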
@@ -1387,23 +1513,18 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1387 */ 1513 */
1388static struct page * 1514static struct page *
1389get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, 1515get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
1390 struct zonelist *zonelist, int high_zoneidx, int alloc_flags) 1516 struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
1517 struct zone *preferred_zone, int migratetype)
1391{ 1518{
1392 struct zoneref *z; 1519 struct zoneref *z;
1393 struct page *page = NULL; 1520 struct page *page = NULL;
1394 int classzone_idx; 1521 int classzone_idx;
1395 struct zone *zone, *preferred_zone; 1522 struct zone *zone;
1396 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ 1523 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
1397 int zlc_active = 0; /* set if using zonelist_cache */ 1524 int zlc_active = 0; /* set if using zonelist_cache */
1398 int did_zlc_setup = 0; /* just call zlc_setup() one time */ 1525 int did_zlc_setup = 0; /* just call zlc_setup() one time */
1399 1526
1400 (void)first_zones_zonelist(zonelist, high_zoneidx, nodemask,
1401 &preferred_zone);
1402 if (!preferred_zone)
1403 return NULL;
1404
1405 classzone_idx = zone_idx(preferred_zone); 1527 classzone_idx = zone_idx(preferred_zone);
1406
1407zonelist_scan: 1528zonelist_scan:
1408 /* 1529 /*
1409 * Scan zonelist, looking for a zone with enough free. 1530 * Scan zonelist, looking for a zone with enough free.
@@ -1418,31 +1539,49 @@ zonelist_scan:
1418 !cpuset_zone_allowed_softwall(zone, gfp_mask)) 1539 !cpuset_zone_allowed_softwall(zone, gfp_mask))
1419 goto try_next_zone; 1540 goto try_next_zone;
1420 1541
1542 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
1421 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { 1543 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
1422 unsigned long mark; 1544 unsigned long mark;
1423 if (alloc_flags & ALLOC_WMARK_MIN) 1545 int ret;
1424 mark = zone->pages_min; 1546
1425 else if (alloc_flags & ALLOC_WMARK_LOW) 1547 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
1426 mark = zone->pages_low; 1548 if (zone_watermark_ok(zone, order, mark,
1427 else 1549 classzone_idx, alloc_flags))
1428 mark = zone->pages_high; 1550 goto try_this_zone;
1429 if (!zone_watermark_ok(zone, order, mark, 1551
1430 classzone_idx, alloc_flags)) { 1552 if (zone_reclaim_mode == 0)
1431 if (!zone_reclaim_mode || 1553 goto this_zone_full;
1432 !zone_reclaim(zone, gfp_mask, order)) 1554
1555 ret = zone_reclaim(zone, gfp_mask, order);
1556 switch (ret) {
1557 case ZONE_RECLAIM_NOSCAN:
1558 /* did not scan */
1559 goto try_next_zone;
1560 case ZONE_RECLAIM_FULL:
1561 /* scanned but unreclaimable */
1562 goto this_zone_full;
1563 default:
1564 /* did we reclaim enough */
1565 if (!zone_watermark_ok(zone, order, mark,
1566 classzone_idx, alloc_flags))
1433 goto this_zone_full; 1567 goto this_zone_full;
1434 } 1568 }
1435 } 1569 }
1436 1570
1437 page = buffered_rmqueue(preferred_zone, zone, order, gfp_mask); 1571try_this_zone:
1572 page = buffered_rmqueue(preferred_zone, zone, order,
1573 gfp_mask, migratetype);
1438 if (page) 1574 if (page)
1439 break; 1575 break;
1440this_zone_full: 1576this_zone_full:
1441 if (NUMA_BUILD) 1577 if (NUMA_BUILD)
1442 zlc_mark_zone_full(zonelist, z); 1578 zlc_mark_zone_full(zonelist, z);
1443try_next_zone: 1579try_next_zone:
1444 if (NUMA_BUILD && !did_zlc_setup) { 1580 if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
1445 /* we do zlc_setup after the first zone is tried */ 1581 /*
1582 * we do zlc_setup after the first zone is tried but only
1583 * if there are multiple nodes to make it worthwhile
1584 */
1446 allowednodes = zlc_setup(zonelist, alloc_flags); 1585 allowednodes = zlc_setup(zonelist, alloc_flags);
1447 zlc_active = 1; 1586 zlc_active = 1;
1448 did_zlc_setup = 1; 1587 did_zlc_setup = 1;
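
When a zone sits below its watermark, the rewritten scan above distinguishes the zone_reclaim() outcomes: ZONE_RECLAIM_NOSCAN means reclaim never ran, so move on to the next zone; ZONE_RECLAIM_FULL means it scanned but found nothing, so mark the zone full; any other result gets the watermark re-checked. A compact standalone model of that decision; the return values mirror the names used in the patch but are otherwise illustrative:

    #include <stdio.h>

    enum { ZONE_RECLAIM_NOSCAN = -2, ZONE_RECLAIM_FULL = -1, ZONE_RECLAIM_SOME = 1 };

    static const char *pick_action(int watermark_ok, int zone_reclaim_mode, int reclaim_ret,
                                   int watermark_ok_after)
    {
        if (watermark_ok)
            return "try_this_zone";
        if (zone_reclaim_mode == 0)
            return "this_zone_full";
        switch (reclaim_ret) {
        case ZONE_RECLAIM_NOSCAN:
            return "try_next_zone";              /* did not scan */
        case ZONE_RECLAIM_FULL:
            return "this_zone_full";             /* scanned but unreclaimable */
        default:
            return watermark_ok_after ? "try_this_zone" : "this_zone_full";
        }
    }

    int main(void)
    {
        printf("%s\n", pick_action(1, 0, 0, 0));
        printf("%s\n", pick_action(0, 1, ZONE_RECLAIM_NOSCAN, 0));
        printf("%s\n", pick_action(0, 1, ZONE_RECLAIM_SOME, 1));
        return 0;
    }
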
@@ -1457,47 +1596,215 @@ try_next_zone:
1457 return page; 1596 return page;
1458} 1597}
1459 1598
1599static inline int
1600should_alloc_retry(gfp_t gfp_mask, unsigned int order,
1601 unsigned long pages_reclaimed)
1602{
1603 /* Do not loop if specifically requested */
1604 if (gfp_mask & __GFP_NORETRY)
1605 return 0;
1606
1607 /*
1608 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
1609 * means __GFP_NOFAIL, but that may not be true in other
1610 * implementations.
1611 */
1612 if (order <= PAGE_ALLOC_COSTLY_ORDER)
1613 return 1;
1614
1615 /*
1616 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
1617 * specified, then we retry until we no longer reclaim any pages
1618 * (above), or we've reclaimed an order of pages at least as
1619 * large as the allocation's order. In both cases, if the
1620 * allocation still fails, we stop retrying.
1621 */
1622 if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
1623 return 1;
1624
1625 /*
1626 * Don't let big-order allocations loop unless the caller
1627 * explicitly requests that.
1628 */
1629 if (gfp_mask & __GFP_NOFAIL)
1630 return 1;
1631
1632 return 0;
1633}
1634
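
should_alloc_retry() above turns the old open-coded do_retry logic into a pure predicate: never retry under __GFP_NORETRY, always retry orders up to PAGE_ALLOC_COSTLY_ORDER and __GFP_NOFAIL callers, and retry costly orders under __GFP_REPEAT only while fewer than 1 << order pages have been reclaimed. A runnable model of that decision table; the gfp bit values here are invented for the example:

    #include <stdio.h>

    #define GFP_NORETRY  0x1   /* illustrative bit values, not the kernel's */
    #define GFP_REPEAT   0x2
    #define GFP_NOFAIL   0x4

    #define PAGE_ALLOC_COSTLY_ORDER 3

    static int should_alloc_retry(unsigned gfp, unsigned order, unsigned long pages_reclaimed)
    {
        if (gfp & GFP_NORETRY)
            return 0;
        if (order <= PAGE_ALLOC_COSTLY_ORDER)           /* cheap orders keep trying */
            return 1;
        if ((gfp & GFP_REPEAT) && pages_reclaimed < (1UL << order))
            return 1;
        if (gfp & GFP_NOFAIL)                           /* caller refuses failure */
            return 1;
        return 0;
    }

    int main(void)
    {
        printf("order 0, no flags:                  %d\n", should_alloc_retry(0, 0, 0));
        printf("order 5, __GFP_REPEAT, 8 reclaimed: %d\n", should_alloc_retry(GFP_REPEAT, 5, 8));
        printf("order 5, __GFP_REPEAT, 64 reclaimed:%d\n", should_alloc_retry(GFP_REPEAT, 5, 64));
        printf("order 5, __GFP_NORETRY:             %d\n", should_alloc_retry(GFP_NORETRY, 5, 0));
        return 0;
    }
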
1635static inline struct page *
1636__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
1637 struct zonelist *zonelist, enum zone_type high_zoneidx,
1638 nodemask_t *nodemask, struct zone *preferred_zone,
1639 int migratetype)
1640{
1641 struct page *page;
1642
1643 /* Acquire the OOM killer lock for the zones in zonelist */
1644 if (!try_set_zone_oom(zonelist, gfp_mask)) {
1645 schedule_timeout_uninterruptible(1);
1646 return NULL;
1647 }
1648
1649 /*
1650 * Go through the zonelist yet one more time, keep very high watermark
1651 * here, this is only to catch a parallel oom killing, we must fail if
1652 * we're still under heavy pressure.
1653 */
1654 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
1655 order, zonelist, high_zoneidx,
1656 ALLOC_WMARK_HIGH|ALLOC_CPUSET,
1657 preferred_zone, migratetype);
1658 if (page)
1659 goto out;
1660
1661 /* The OOM killer will not help higher order allocs */
1662 if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_NOFAIL))
1663 goto out;
1664
1665 /* Exhausted what can be done so it's blamo time */
1666 out_of_memory(zonelist, gfp_mask, order);
1667
1668out:
1669 clear_zonelist_oom(zonelist, gfp_mask);
1670 return page;
1671}
1672
1673/* The really slow allocator path where we enter direct reclaim */
1674static inline struct page *
1675__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
1676 struct zonelist *zonelist, enum zone_type high_zoneidx,
1677 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
1678 int migratetype, unsigned long *did_some_progress)
1679{
1680 struct page *page = NULL;
1681 struct reclaim_state reclaim_state;
1682 struct task_struct *p = current;
1683
1684 cond_resched();
1685
1686 /* We now go into synchronous reclaim */
1687 cpuset_memory_pressure_bump();
1688 p->flags |= PF_MEMALLOC;
1689 lockdep_set_current_reclaim_state(gfp_mask);
1690 reclaim_state.reclaimed_slab = 0;
1691 p->reclaim_state = &reclaim_state;
1692
1693 *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
1694
1695 p->reclaim_state = NULL;
1696 lockdep_clear_current_reclaim_state();
1697 p->flags &= ~PF_MEMALLOC;
1698
1699 cond_resched();
1700
1701 if (order != 0)
1702 drain_all_pages();
1703
1704 if (likely(*did_some_progress))
1705 page = get_page_from_freelist(gfp_mask, nodemask, order,
1706 zonelist, high_zoneidx,
1707 alloc_flags, preferred_zone,
1708 migratetype);
1709 return page;
1710}
1711
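
__alloc_pages_direct_reclaim() above wraps try_to_free_pages() in the usual reclaim bracket: raise PF_MEMALLOC so allocations made while reclaiming never recurse into reclaim, point p->reclaim_state at local accounting, and undo both afterwards; the slow path later bails out early if PF_MEMALLOC is already set. A userspace sketch of that guard-and-restore pattern, with the reclaim work stubbed out and only the flag name borrowed from the kernel:

    #include <stdio.h>

    #define PF_MEMALLOC 0x1                 /* mirrors the kernel flag name only */

    struct task { unsigned flags; unsigned long *reclaim_state; };

    static unsigned long do_reclaim(struct task *p)
    {
        unsigned long reclaimed = 0;
        p->reclaim_state = &reclaimed;      /* let callees account what they free */
        p->flags |= PF_MEMALLOC;            /* guard: no nested direct reclaim */

        reclaimed = 42;                     /* stand-in for try_to_free_pages() */

        p->flags &= ~PF_MEMALLOC;
        p->reclaim_state = NULL;
        return reclaimed;
    }

    static void slowpath(struct task *p)
    {
        if (p->flags & PF_MEMALLOC) {       /* already reclaiming: do not recurse */
            printf("skip direct reclaim\n");
            return;
        }
        printf("reclaimed %lu pages\n", do_reclaim(p));
    }

    int main(void)
    {
        struct task p = { 0, NULL };
        slowpath(&p);
        p.flags |= PF_MEMALLOC;
        slowpath(&p);
        return 0;
    }
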
1460/* 1712/*
1461 * This is the 'heart' of the zoned buddy allocator. 1713 * This is called in the allocator slow-path if the allocation request is of
1714 * sufficient urgency to ignore watermarks and take other desperate measures
1462 */ 1715 */
1463struct page * 1716static inline struct page *
1464__alloc_pages_internal(gfp_t gfp_mask, unsigned int order, 1717__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
1465 struct zonelist *zonelist, nodemask_t *nodemask) 1718 struct zonelist *zonelist, enum zone_type high_zoneidx,
1719 nodemask_t *nodemask, struct zone *preferred_zone,
1720 int migratetype)
1721{
1722 struct page *page;
1723
1724 do {
1725 page = get_page_from_freelist(gfp_mask, nodemask, order,
1726 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
1727 preferred_zone, migratetype);
1728
1729 if (!page && gfp_mask & __GFP_NOFAIL)
1730 congestion_wait(BLK_RW_ASYNC, HZ/50);
1731 } while (!page && (gfp_mask & __GFP_NOFAIL));
1732
1733 return page;
1734}
1735
1736static inline
1737void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
1738 enum zone_type high_zoneidx)
1466{ 1739{
1467 const gfp_t wait = gfp_mask & __GFP_WAIT;
1468 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
1469 struct zoneref *z; 1740 struct zoneref *z;
1470 struct zone *zone; 1741 struct zone *zone;
1471 struct page *page;
1472 struct reclaim_state reclaim_state;
1473 struct task_struct *p = current;
1474 int do_retry;
1475 int alloc_flags;
1476 unsigned long did_some_progress;
1477 unsigned long pages_reclaimed = 0;
1478 1742
1479 lockdep_trace_alloc(gfp_mask); 1743 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
1744 wakeup_kswapd(zone, order);
1745}
1480 1746
1481 might_sleep_if(wait); 1747static inline int
1748gfp_to_alloc_flags(gfp_t gfp_mask)
1749{
1750 struct task_struct *p = current;
1751 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
1752 const gfp_t wait = gfp_mask & __GFP_WAIT;
1482 1753
1483 if (should_fail_alloc_page(gfp_mask, order)) 1754 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
1484 return NULL; 1755 BUILD_BUG_ON(__GFP_HIGH != ALLOC_HIGH);
1485 1756
1486restart: 1757 /*
1487 z = zonelist->_zonerefs; /* the list of zones suitable for gfp_mask */ 1758 * The caller may dip into page reserves a bit more if the caller
1759 * cannot run direct reclaim, or if the caller has realtime scheduling
1760 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
1761 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
1762 */
1763 alloc_flags |= (gfp_mask & __GFP_HIGH);
1488 1764
1489 if (unlikely(!z->zone)) { 1765 if (!wait) {
1766 alloc_flags |= ALLOC_HARDER;
1490 /* 1767 /*
1491 * Happens if we have an empty zonelist as a result of 1768 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
1492 * GFP_THISNODE being used on a memoryless node 1769 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1493 */ 1770 */
1494 return NULL; 1771 alloc_flags &= ~ALLOC_CPUSET;
1772 } else if (unlikely(rt_task(p)) && !in_interrupt())
1773 alloc_flags |= ALLOC_HARDER;
1774
1775 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
1776 if (!in_interrupt() &&
1777 ((p->flags & PF_MEMALLOC) ||
1778 unlikely(test_thread_flag(TIF_MEMDIE))))
1779 alloc_flags |= ALLOC_NO_WATERMARKS;
1495 } 1780 }
1496 1781
1497 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 1782 return alloc_flags;
1498 zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET); 1783}
1499 if (page) 1784
1500 goto got_pg; 1785static inline struct page *
1786__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
1787 struct zonelist *zonelist, enum zone_type high_zoneidx,
1788 nodemask_t *nodemask, struct zone *preferred_zone,
1789 int migratetype)
1790{
1791 const gfp_t wait = gfp_mask & __GFP_WAIT;
1792 struct page *page = NULL;
1793 int alloc_flags;
1794 unsigned long pages_reclaimed = 0;
1795 unsigned long did_some_progress;
1796 struct task_struct *p = current;
1797
1798 /*
1799 * In the slowpath, we sanity check order to avoid ever trying to
1800 * reclaim >= MAX_ORDER areas which will never succeed. Callers may
1801 * be using allocators in order of preference for an area that is
1802 * too large.
1803 */
1804 if (order >= MAX_ORDER) {
1805 WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
1806 return NULL;
1807 }
1501 1808
1502 /* 1809 /*
1503 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and 1810 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
@@ -1510,155 +1817,88 @@ restart:
1510 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) 1817 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
1511 goto nopage; 1818 goto nopage;
1512 1819
1513 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) 1820restart:
1514 wakeup_kswapd(zone, order); 1821 wake_all_kswapd(order, zonelist, high_zoneidx);
1515 1822
1516 /* 1823 /*
1517 * OK, we're below the kswapd watermark and have kicked background 1824 * OK, we're below the kswapd watermark and have kicked background
1518 * reclaim. Now things get more complex, so set up alloc_flags according 1825 * reclaim. Now things get more complex, so set up alloc_flags according
1519 * to how we want to proceed. 1826 * to how we want to proceed.
1520 *
1521 * The caller may dip into page reserves a bit more if the caller
1522 * cannot run direct reclaim, or if the caller has realtime scheduling
1523 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
1524 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
1525 */ 1827 */
1526 alloc_flags = ALLOC_WMARK_MIN; 1828 alloc_flags = gfp_to_alloc_flags(gfp_mask);
1527 if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
1528 alloc_flags |= ALLOC_HARDER;
1529 if (gfp_mask & __GFP_HIGH)
1530 alloc_flags |= ALLOC_HIGH;
1531 if (wait)
1532 alloc_flags |= ALLOC_CPUSET;
1533 1829
1534 /* 1830 /* This is the last chance, in general, before the goto nopage. */
1535 * Go through the zonelist again. Let __GFP_HIGH and allocations
1536 * coming from realtime tasks go deeper into reserves.
1537 *
1538 * This is the last chance, in general, before the goto nopage.
1539 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
1540 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1541 */
1542 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, 1831 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
1543 high_zoneidx, alloc_flags); 1832 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
1833 preferred_zone, migratetype);
1544 if (page) 1834 if (page)
1545 goto got_pg; 1835 goto got_pg;
1546 1836
1547 /* This allocation should allow future memory freeing. */
1548
1549rebalance: 1837rebalance:
1550 if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) 1838 /* Allocate without watermarks if the context allows */
1551 && !in_interrupt()) { 1839 if (alloc_flags & ALLOC_NO_WATERMARKS) {
1552 if (!(gfp_mask & __GFP_NOMEMALLOC)) { 1840 page = __alloc_pages_high_priority(gfp_mask, order,
1553nofail_alloc: 1841 zonelist, high_zoneidx, nodemask,
1554 /* go through the zonelist yet again, ignoring mins */ 1842 preferred_zone, migratetype);
1555 page = get_page_from_freelist(gfp_mask, nodemask, order, 1843 if (page)
1556 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS); 1844 goto got_pg;
1557 if (page)
1558 goto got_pg;
1559 if (gfp_mask & __GFP_NOFAIL) {
1560 congestion_wait(WRITE, HZ/50);
1561 goto nofail_alloc;
1562 }
1563 }
1564 goto nopage;
1565 } 1845 }
1566 1846
1567 /* Atomic allocations - we can't balance anything */ 1847 /* Atomic allocations - we can't balance anything */
1568 if (!wait) 1848 if (!wait)
1569 goto nopage; 1849 goto nopage;
1570 1850
1571 cond_resched(); 1851 /* Avoid recursion of direct reclaim */
1572 1852 if (p->flags & PF_MEMALLOC)
1573 /* We now go into synchronous reclaim */ 1853 goto nopage;
1574 cpuset_memory_pressure_bump();
1575 /*
1576 * The task's cpuset might have expanded its set of allowable nodes
1577 */
1578 cpuset_update_task_memory_state();
1579 p->flags |= PF_MEMALLOC;
1580
1581 lockdep_set_current_reclaim_state(gfp_mask);
1582 reclaim_state.reclaimed_slab = 0;
1583 p->reclaim_state = &reclaim_state;
1584 1854
1585 did_some_progress = try_to_free_pages(zonelist, order, 1855 /* Avoid allocations with no watermarks from looping endlessly */
1586 gfp_mask, nodemask); 1856 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
1857 goto nopage;
1587 1858
1588 p->reclaim_state = NULL; 1859 /* Try direct reclaim and then allocating */
1589 lockdep_clear_current_reclaim_state(); 1860 page = __alloc_pages_direct_reclaim(gfp_mask, order,
1590 p->flags &= ~PF_MEMALLOC; 1861 zonelist, high_zoneidx,
1862 nodemask,
1863 alloc_flags, preferred_zone,
1864 migratetype, &did_some_progress);
1865 if (page)
1866 goto got_pg;
1591 1867
1592 cond_resched(); 1868 /*
1869 * If we failed to make any progress reclaiming, then we are
1870 * running out of options and have to consider going OOM
1871 */
1872 if (!did_some_progress) {
1873 if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
1874 if (oom_killer_disabled)
1875 goto nopage;
1876 page = __alloc_pages_may_oom(gfp_mask, order,
1877 zonelist, high_zoneidx,
1878 nodemask, preferred_zone,
1879 migratetype);
1880 if (page)
1881 goto got_pg;
1593 1882
1594 if (order != 0) 1883 /*
1595 drain_all_pages(); 1884 * The OOM killer does not trigger for high-order
1885 * ~__GFP_NOFAIL allocations so if no progress is being
1886 * made, there are no other options and retrying is
1887 * unlikely to help.
1888 */
1889 if (order > PAGE_ALLOC_COSTLY_ORDER &&
1890 !(gfp_mask & __GFP_NOFAIL))
1891 goto nopage;
1596 1892
1597 if (likely(did_some_progress)) {
1598 page = get_page_from_freelist(gfp_mask, nodemask, order,
1599 zonelist, high_zoneidx, alloc_flags);
1600 if (page)
1601 goto got_pg;
1602 } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
1603 if (!try_set_zone_oom(zonelist, gfp_mask)) {
1604 schedule_timeout_uninterruptible(1);
1605 goto restart; 1893 goto restart;
1606 } 1894 }
1607
1608 /*
1609 * Go through the zonelist yet one more time, keep
1610 * very high watermark here, this is only to catch
1611 * a parallel oom killing, we must fail if we're still
1612 * under heavy pressure.
1613 */
1614 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
1615 order, zonelist, high_zoneidx,
1616 ALLOC_WMARK_HIGH|ALLOC_CPUSET);
1617 if (page) {
1618 clear_zonelist_oom(zonelist, gfp_mask);
1619 goto got_pg;
1620 }
1621
1622 /* The OOM killer will not help higher order allocs so fail */
1623 if (order > PAGE_ALLOC_COSTLY_ORDER) {
1624 clear_zonelist_oom(zonelist, gfp_mask);
1625 goto nopage;
1626 }
1627
1628 out_of_memory(zonelist, gfp_mask, order);
1629 clear_zonelist_oom(zonelist, gfp_mask);
1630 goto restart;
1631 } 1895 }
1632 1896
1633 /* 1897 /* Check if we should retry the allocation */
1634 * Don't let big-order allocations loop unless the caller explicitly
1635 * requests that. Wait for some write requests to complete then retry.
1636 *
1637 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
1638 * means __GFP_NOFAIL, but that may not be true in other
1639 * implementations.
1640 *
1641 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
1642 * specified, then we retry until we no longer reclaim any pages
1643 * (above), or we've reclaimed an order of pages at least as
1644 * large as the allocation's order. In both cases, if the
1645 * allocation still fails, we stop retrying.
1646 */
1647 pages_reclaimed += did_some_progress; 1898 pages_reclaimed += did_some_progress;
1648 do_retry = 0; 1899 if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
1649 if (!(gfp_mask & __GFP_NORETRY)) { 1900 /* Wait for some write requests to complete then retry */
1650 if (order <= PAGE_ALLOC_COSTLY_ORDER) { 1901 congestion_wait(BLK_RW_ASYNC, HZ/50);
1651 do_retry = 1;
1652 } else {
1653 if (gfp_mask & __GFP_REPEAT &&
1654 pages_reclaimed < (1 << order))
1655 do_retry = 1;
1656 }
1657 if (gfp_mask & __GFP_NOFAIL)
1658 do_retry = 1;
1659 }
1660 if (do_retry) {
1661 congestion_wait(WRITE, HZ/50);
1662 goto rebalance; 1902 goto rebalance;
1663 } 1903 }
1664 1904
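
The restructured slow path above computes its allocation flags once, up front, via gfp_to_alloc_flags(): start from ALLOC_WMARK_MIN | ALLOC_CPUSET, inherit __GFP_HIGH directly as ALLOC_HIGH, add ALLOC_HARDER and drop the cpuset check for atomic (!__GFP_WAIT) callers, add ALLOC_HARDER for realtime tasks, and grant ALLOC_NO_WATERMARKS to reclaimers and OOM victims unless __GFP_NOMEMALLOC forbids it. A reduced userspace model of that computation; the bit values and the task fields are illustrative, not the kernel's:

    #include <stdio.h>

    /* Illustrative bit values; only the relationships matter here. */
    #define GFP_WAIT        0x10
    #define GFP_HIGH        0x20
    #define GFP_NOMEMALLOC  0x40

    #define ALLOC_WMARK_MIN        0x00
    #define ALLOC_NO_WATERMARKS    0x04
    #define ALLOC_HARDER           0x10
    #define ALLOC_HIGH             0x20   /* kept equal to GFP_HIGH to save a branch */
    #define ALLOC_CPUSET           0x40

    struct task { int rt; int memalloc; };

    static int gfp_to_alloc_flags(unsigned gfp, const struct task *p)
    {
        int flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;

        flags |= (gfp & GFP_HIGH);                   /* __GFP_HIGH maps 1:1 to ALLOC_HIGH */

        if (!(gfp & GFP_WAIT)) {
            flags |= ALLOC_HARDER;                   /* atomic: push harder... */
            flags &= ~ALLOC_CPUSET;                  /* ...and ignore the cpuset */
        } else if (p->rt) {
            flags |= ALLOC_HARDER;                   /* realtime tasks push harder too */
        }

        if (!(gfp & GFP_NOMEMALLOC) && p->memalloc)
            flags |= ALLOC_NO_WATERMARKS;            /* reclaimers may ignore watermarks */

        return flags;
    }

    int main(void)
    {
        struct task normal = { 0, 0 }, reclaimer = { 0, 1 };
        printf("GFP_ATOMIC-ish: %#x\n", gfp_to_alloc_flags(GFP_HIGH, &normal));
        printf("GFP_KERNEL-ish: %#x\n", gfp_to_alloc_flags(GFP_WAIT, &normal));
        printf("reclaim ctx:    %#x\n", gfp_to_alloc_flags(GFP_WAIT, &reclaimer));
        return 0;
    }
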
@@ -1670,54 +1910,102 @@ nopage:
1670 dump_stack(); 1910 dump_stack();
1671 show_mem(); 1911 show_mem();
1672 } 1912 }
1913 return page;
1673got_pg: 1914got_pg:
1915 if (kmemcheck_enabled)
1916 kmemcheck_pagealloc_alloc(page, order, gfp_mask);
1674 return page; 1917 return page;
1918
1675} 1919}
1676EXPORT_SYMBOL(__alloc_pages_internal); 1920
1921/*
1922 * This is the 'heart' of the zoned buddy allocator.
1923 */
1924struct page *
1925__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
1926 struct zonelist *zonelist, nodemask_t *nodemask)
1927{
1928 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
1929 struct zone *preferred_zone;
1930 struct page *page;
1931 int migratetype = allocflags_to_migratetype(gfp_mask);
1932
1933 gfp_mask &= gfp_allowed_mask;
1934
1935 lockdep_trace_alloc(gfp_mask);
1936
1937 might_sleep_if(gfp_mask & __GFP_WAIT);
1938
1939 if (should_fail_alloc_page(gfp_mask, order))
1940 return NULL;
1941
1942 /*
1943 * Check the zones suitable for the gfp_mask contain at least one
1944 * valid zone. It's possible to have an empty zonelist as a result
1945 * of GFP_THISNODE and a memoryless node
1946 */
1947 if (unlikely(!zonelist->_zonerefs->zone))
1948 return NULL;
1949
1950 /* The preferred zone is used for statistics later */
1951 first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone);
1952 if (!preferred_zone)
1953 return NULL;
1954
1955 /* First allocation attempt */
1956 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
1957 zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET,
1958 preferred_zone, migratetype);
1959 if (unlikely(!page))
1960 page = __alloc_pages_slowpath(gfp_mask, order,
1961 zonelist, high_zoneidx, nodemask,
1962 preferred_zone, migratetype);
1963
1964 trace_mm_page_alloc(page, order, gfp_mask, migratetype);
1965 return page;
1966}
1967EXPORT_SYMBOL(__alloc_pages_nodemask);
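
__alloc_pages_nodemask() above is now only the fast path: resolve the preferred zone and migratetype once, try the freelists at the low watermark, and hand everything else to __alloc_pages_slowpath() on failure. A trimmed sketch of that control-flow split with stubbed helpers; none of these functions are the real kernel API, they only show the shape of the call sequence:

    #include <stdio.h>
    #include <stddef.h>

    struct page { int order; };

    static struct page *get_page_from_freelist(int order, int low_watermark)
    {
        static struct page pg;
        if (low_watermark)
            return NULL;            /* pretend the fast path misses */
        pg.order = order;
        return &pg;
    }

    static struct page *alloc_pages_slowpath(int order)
    {
        /* kswapd wakeup, direct reclaim, OOM handling and retries would live here */
        return get_page_from_freelist(order, 0);
    }

    static struct page *alloc_pages_nodemask(int order)
    {
        struct page *page;

        /* First attempt: cheap, watermark-respecting fast path. */
        page = get_page_from_freelist(order, 1);
        if (!page)
            page = alloc_pages_slowpath(order);
        return page;
    }

    int main(void)
    {
        struct page *p = alloc_pages_nodemask(2);
        printf("got %s page of order %d\n", p ? "a" : "no", p ? p->order : -1);
        return 0;
    }
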
1677 1968
1678/* 1969/*
1679 * Common helper functions. 1970 * Common helper functions.
1680 */ 1971 */
1681unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) 1972unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
1682{ 1973{
1683 struct page * page; 1974 struct page *page;
1975
1976 /*
1977 * __get_free_pages() returns a 32-bit address, which cannot represent
1978 * a highmem page
1979 */
1980 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
1981
1684 page = alloc_pages(gfp_mask, order); 1982 page = alloc_pages(gfp_mask, order);
1685 if (!page) 1983 if (!page)
1686 return 0; 1984 return 0;
1687 return (unsigned long) page_address(page); 1985 return (unsigned long) page_address(page);
1688} 1986}
1689
1690EXPORT_SYMBOL(__get_free_pages); 1987EXPORT_SYMBOL(__get_free_pages);
1691 1988
1692unsigned long get_zeroed_page(gfp_t gfp_mask) 1989unsigned long get_zeroed_page(gfp_t gfp_mask)
1693{ 1990{
1694 struct page * page; 1991 return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
1695
1696 /*
1697 * get_zeroed_page() returns a 32-bit address, which cannot represent
1698 * a highmem page
1699 */
1700 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
1701
1702 page = alloc_pages(gfp_mask | __GFP_ZERO, 0);
1703 if (page)
1704 return (unsigned long) page_address(page);
1705 return 0;
1706} 1992}
1707
1708EXPORT_SYMBOL(get_zeroed_page); 1993EXPORT_SYMBOL(get_zeroed_page);
1709 1994
1710void __pagevec_free(struct pagevec *pvec) 1995void __pagevec_free(struct pagevec *pvec)
1711{ 1996{
1712 int i = pagevec_count(pvec); 1997 int i = pagevec_count(pvec);
1713 1998
1714 while (--i >= 0) 1999 while (--i >= 0) {
2000 trace_mm_pagevec_free(pvec->pages[i], pvec->cold);
1715 free_hot_cold_page(pvec->pages[i], pvec->cold); 2001 free_hot_cold_page(pvec->pages[i], pvec->cold);
2002 }
1716} 2003}
1717 2004
1718void __free_pages(struct page *page, unsigned int order) 2005void __free_pages(struct page *page, unsigned int order)
1719{ 2006{
1720 if (put_page_testzero(page)) { 2007 if (put_page_testzero(page)) {
2008 trace_mm_page_free_direct(page, order);
1721 if (order == 0) 2009 if (order == 0)
1722 free_hot_page(page); 2010 free_hot_page(page);
1723 else 2011 else
@@ -1760,7 +2048,7 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
1760 unsigned long alloc_end = addr + (PAGE_SIZE << order); 2048 unsigned long alloc_end = addr + (PAGE_SIZE << order);
1761 unsigned long used = addr + PAGE_ALIGN(size); 2049 unsigned long used = addr + PAGE_ALIGN(size);
1762 2050
1763 split_page(virt_to_page(addr), order); 2051 split_page(virt_to_page((void *)addr), order);
1764 while (used < alloc_end) { 2052 while (used < alloc_end) {
1765 free_page(used); 2053 free_page(used);
1766 used += PAGE_SIZE; 2054 used += PAGE_SIZE;
@@ -1802,7 +2090,7 @@ static unsigned int nr_free_zone_pages(int offset)
1802 2090
1803 for_each_zone_zonelist(zone, z, zonelist, offset) { 2091 for_each_zone_zonelist(zone, z, zonelist, offset) {
1804 unsigned long size = zone->present_pages; 2092 unsigned long size = zone->present_pages;
1805 unsigned long high = zone->pages_high; 2093 unsigned long high = high_wmark_pages(zone);
1806 if (size > high) 2094 if (size > high)
1807 sum += size - high; 2095 sum += size - high;
1808 } 2096 }
@@ -1892,28 +2180,27 @@ void show_free_areas(void)
1892 } 2180 }
1893 } 2181 }
1894 2182
1895 printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n" 2183 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
1896 " inactive_file:%lu" 2184 " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
1897//TODO: check/adjust line lengths
1898#ifdef CONFIG_UNEVICTABLE_LRU
1899 " unevictable:%lu" 2185 " unevictable:%lu"
1900#endif
1901 " dirty:%lu writeback:%lu unstable:%lu\n" 2186 " dirty:%lu writeback:%lu unstable:%lu\n"
1902 " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", 2187 " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
2188 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n",
1903 global_page_state(NR_ACTIVE_ANON), 2189 global_page_state(NR_ACTIVE_ANON),
1904 global_page_state(NR_ACTIVE_FILE),
1905 global_page_state(NR_INACTIVE_ANON), 2190 global_page_state(NR_INACTIVE_ANON),
2191 global_page_state(NR_ISOLATED_ANON),
2192 global_page_state(NR_ACTIVE_FILE),
1906 global_page_state(NR_INACTIVE_FILE), 2193 global_page_state(NR_INACTIVE_FILE),
1907#ifdef CONFIG_UNEVICTABLE_LRU 2194 global_page_state(NR_ISOLATED_FILE),
1908 global_page_state(NR_UNEVICTABLE), 2195 global_page_state(NR_UNEVICTABLE),
1909#endif
1910 global_page_state(NR_FILE_DIRTY), 2196 global_page_state(NR_FILE_DIRTY),
1911 global_page_state(NR_WRITEBACK), 2197 global_page_state(NR_WRITEBACK),
1912 global_page_state(NR_UNSTABLE_NFS), 2198 global_page_state(NR_UNSTABLE_NFS),
1913 global_page_state(NR_FREE_PAGES), 2199 global_page_state(NR_FREE_PAGES),
1914 global_page_state(NR_SLAB_RECLAIMABLE) + 2200 global_page_state(NR_SLAB_RECLAIMABLE),
1915 global_page_state(NR_SLAB_UNRECLAIMABLE), 2201 global_page_state(NR_SLAB_UNRECLAIMABLE),
1916 global_page_state(NR_FILE_MAPPED), 2202 global_page_state(NR_FILE_MAPPED),
2203 global_page_state(NR_SHMEM),
1917 global_page_state(NR_PAGETABLE), 2204 global_page_state(NR_PAGETABLE),
1918 global_page_state(NR_BOUNCE)); 2205 global_page_state(NR_BOUNCE));
1919 2206
@@ -1930,26 +2217,51 @@ void show_free_areas(void)
1930 " inactive_anon:%lukB" 2217 " inactive_anon:%lukB"
1931 " active_file:%lukB" 2218 " active_file:%lukB"
1932 " inactive_file:%lukB" 2219 " inactive_file:%lukB"
1933#ifdef CONFIG_UNEVICTABLE_LRU
1934 " unevictable:%lukB" 2220 " unevictable:%lukB"
1935#endif 2221 " isolated(anon):%lukB"
2222 " isolated(file):%lukB"
1936 " present:%lukB" 2223 " present:%lukB"
2224 " mlocked:%lukB"
2225 " dirty:%lukB"
2226 " writeback:%lukB"
2227 " mapped:%lukB"
2228 " shmem:%lukB"
2229 " slab_reclaimable:%lukB"
2230 " slab_unreclaimable:%lukB"
2231 " kernel_stack:%lukB"
2232 " pagetables:%lukB"
2233 " unstable:%lukB"
2234 " bounce:%lukB"
2235 " writeback_tmp:%lukB"
1937 " pages_scanned:%lu" 2236 " pages_scanned:%lu"
1938 " all_unreclaimable? %s" 2237 " all_unreclaimable? %s"
1939 "\n", 2238 "\n",
1940 zone->name, 2239 zone->name,
1941 K(zone_page_state(zone, NR_FREE_PAGES)), 2240 K(zone_page_state(zone, NR_FREE_PAGES)),
1942 K(zone->pages_min), 2241 K(min_wmark_pages(zone)),
1943 K(zone->pages_low), 2242 K(low_wmark_pages(zone)),
1944 K(zone->pages_high), 2243 K(high_wmark_pages(zone)),
1945 K(zone_page_state(zone, NR_ACTIVE_ANON)), 2244 K(zone_page_state(zone, NR_ACTIVE_ANON)),
1946 K(zone_page_state(zone, NR_INACTIVE_ANON)), 2245 K(zone_page_state(zone, NR_INACTIVE_ANON)),
1947 K(zone_page_state(zone, NR_ACTIVE_FILE)), 2246 K(zone_page_state(zone, NR_ACTIVE_FILE)),
1948 K(zone_page_state(zone, NR_INACTIVE_FILE)), 2247 K(zone_page_state(zone, NR_INACTIVE_FILE)),
1949#ifdef CONFIG_UNEVICTABLE_LRU
1950 K(zone_page_state(zone, NR_UNEVICTABLE)), 2248 K(zone_page_state(zone, NR_UNEVICTABLE)),
1951#endif 2249 K(zone_page_state(zone, NR_ISOLATED_ANON)),
2250 K(zone_page_state(zone, NR_ISOLATED_FILE)),
1952 K(zone->present_pages), 2251 K(zone->present_pages),
2252 K(zone_page_state(zone, NR_MLOCK)),
2253 K(zone_page_state(zone, NR_FILE_DIRTY)),
2254 K(zone_page_state(zone, NR_WRITEBACK)),
2255 K(zone_page_state(zone, NR_FILE_MAPPED)),
2256 K(zone_page_state(zone, NR_SHMEM)),
2257 K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
2258 K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
2259 zone_page_state(zone, NR_KERNEL_STACK) *
2260 THREAD_SIZE / 1024,
2261 K(zone_page_state(zone, NR_PAGETABLE)),
2262 K(zone_page_state(zone, NR_UNSTABLE_NFS)),
2263 K(zone_page_state(zone, NR_BOUNCE)),
2264 K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
1953 zone->pages_scanned, 2265 zone->pages_scanned,
1954 (zone_is_all_unreclaimable(zone) ? "yes" : "no") 2266 (zone_is_all_unreclaimable(zone) ? "yes" : "no")
1955 ); 2267 );
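The per-zone printout above reads the new watermark[] array through accessor macros instead of the removed zone->pages_{min,low,high} fields. A minimal sketch of those accessors as this series defines them in include/linux/mmzone.h (the spelling here is illustrative):

enum zone_watermarks {
	WMARK_MIN,
	WMARK_LOW,
	WMARK_HIGH,
	NR_WMARK
};

/* each zone carries unsigned long watermark[NR_WMARK] */
#define min_wmark_pages(z)	((z)->watermark[WMARK_MIN])
#define low_wmark_pages(z)	((z)->watermark[WMARK_LOW])
#define high_wmark_pages(z)	((z)->watermark[WMARK_HIGH])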
@@ -2078,7 +2390,7 @@ early_param("numa_zonelist_order", setup_numa_zonelist_order);
2078 * sysctl handler for numa_zonelist_order 2390 * sysctl handler for numa_zonelist_order
2079 */ 2391 */
2080int numa_zonelist_order_handler(ctl_table *table, int write, 2392int numa_zonelist_order_handler(ctl_table *table, int write,
2081 struct file *file, void __user *buffer, size_t *length, 2393 void __user *buffer, size_t *length,
2082 loff_t *ppos) 2394 loff_t *ppos)
2083{ 2395{
2084 char saved_string[NUMA_ZONELIST_ORDER_LEN]; 2396 char saved_string[NUMA_ZONELIST_ORDER_LEN];
@@ -2087,7 +2399,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
2087 if (write) 2399 if (write)
2088 strncpy(saved_string, (char*)table->data, 2400 strncpy(saved_string, (char*)table->data,
2089 NUMA_ZONELIST_ORDER_LEN); 2401 NUMA_ZONELIST_ORDER_LEN);
2090 ret = proc_dostring(table, write, file, buffer, length, ppos); 2402 ret = proc_dostring(table, write, buffer, length, ppos);
2091 if (ret) 2403 if (ret)
2092 return ret; 2404 return ret;
2093 if (write) { 2405 if (write) {
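The removal of the struct file * argument seen in this hunk is a mechanical API change that repeats in every sysctl handler further down; the proc_dostring()/proc_dointvec*() helpers lose the same parameter. A before/after sketch of the handler prototype:

/* before */
int handler(ctl_table *table, int write, struct file *file,
	    void __user *buffer, size_t *length, loff_t *ppos);

/* after */
int handler(ctl_table *table, int write,
	    void __user *buffer, size_t *length, loff_t *ppos);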
@@ -2106,7 +2418,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
2106} 2418}
2107 2419
2108 2420
2109#define MAX_NODE_LOAD (num_online_nodes()) 2421#define MAX_NODE_LOAD (nr_online_nodes)
2110static int node_load[MAX_NUMNODES]; 2422static int node_load[MAX_NUMNODES];
2111 2423
2112/** 2424/**
@@ -2128,7 +2440,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
2128 int n, val; 2440 int n, val;
2129 int min_val = INT_MAX; 2441 int min_val = INT_MAX;
2130 int best_node = -1; 2442 int best_node = -1;
2131 node_to_cpumask_ptr(tmp, 0); 2443 const struct cpumask *tmp = cpumask_of_node(0);
2132 2444
2133 /* Use the local node if we haven't already */ 2445 /* Use the local node if we haven't already */
2134 if (!node_isset(node, *used_node_mask)) { 2446 if (!node_isset(node, *used_node_mask)) {
@@ -2149,8 +2461,8 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
2149 val += (n < node); 2461 val += (n < node);
2150 2462
2151 /* Give preference to headless and unused nodes */ 2463 /* Give preference to headless and unused nodes */
2152 node_to_cpumask_ptr_next(tmp, n); 2464 tmp = cpumask_of_node(n);
2153 if (!cpus_empty(*tmp)) 2465 if (!cpumask_empty(tmp))
2154 val += PENALTY_FOR_NODE_WITH_CPUS; 2466 val += PENALTY_FOR_NODE_WITH_CPUS;
2155 2467
2156 /* Slight preference for less loaded node */ 2468 /* Slight preference for less loaded node */
@@ -2315,11 +2627,10 @@ static void build_zonelists(pg_data_t *pgdat)
2315 2627
2316 /* NUMA-aware ordering of nodes */ 2628 /* NUMA-aware ordering of nodes */
2317 local_node = pgdat->node_id; 2629 local_node = pgdat->node_id;
2318 load = num_online_nodes(); 2630 load = nr_online_nodes;
2319 prev_node = local_node; 2631 prev_node = local_node;
2320 nodes_clear(used_mask); 2632 nodes_clear(used_mask);
2321 2633
2322 memset(node_load, 0, sizeof(node_load));
2323 memset(node_order, 0, sizeof(node_order)); 2634 memset(node_order, 0, sizeof(node_order));
2324 j = 0; 2635 j = 0;
2325 2636
@@ -2428,6 +2739,9 @@ static int __build_all_zonelists(void *dummy)
2428{ 2739{
2429 int nid; 2740 int nid;
2430 2741
2742#ifdef CONFIG_NUMA
2743 memset(node_load, 0, sizeof(node_load));
2744#endif
2431 for_each_online_node(nid) { 2745 for_each_online_node(nid) {
2432 pg_data_t *pgdat = NODE_DATA(nid); 2746 pg_data_t *pgdat = NODE_DATA(nid);
2433 2747
@@ -2466,7 +2780,7 @@ void build_all_zonelists(void)
2466 2780
2467 printk("Built %i zonelists in %s order, mobility grouping %s. " 2781 printk("Built %i zonelists in %s order, mobility grouping %s. "
2468 "Total pages: %ld\n", 2782 "Total pages: %ld\n",
2469 num_online_nodes(), 2783 nr_online_nodes,
2470 zonelist_order_name[current_zonelist_order], 2784 zonelist_order_name[current_zonelist_order],
2471 page_group_by_mobility_disabled ? "off" : "on", 2785 page_group_by_mobility_disabled ? "off" : "on",
2472 vm_total_pages); 2786 vm_total_pages);
@@ -2545,8 +2859,8 @@ static inline unsigned long wait_table_bits(unsigned long size)
2545 2859
2546/* 2860/*
2547 * Mark a number of pageblocks as MIGRATE_RESERVE. The number 2861 * Mark a number of pageblocks as MIGRATE_RESERVE. The number
2548 * of blocks reserved is based on zone->pages_min. The memory within the 2862 * of blocks reserved is based on min_wmark_pages(zone). The memory within
2549 * reserve will tend to store contiguous free pages. Setting min_free_kbytes 2863 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes
2550 * higher will lead to a bigger reserve which will get freed as contiguous 2864 * higher will lead to a bigger reserve which will get freed as contiguous
2551 * blocks as reclaim kicks in 2865 * blocks as reclaim kicks in
2552 */ 2866 */
@@ -2554,14 +2868,24 @@ static void setup_zone_migrate_reserve(struct zone *zone)
2554{ 2868{
2555 unsigned long start_pfn, pfn, end_pfn; 2869 unsigned long start_pfn, pfn, end_pfn;
2556 struct page *page; 2870 struct page *page;
2557 unsigned long reserve, block_migratetype; 2871 unsigned long block_migratetype;
2872 int reserve;
2558 2873
2559 /* Get the start pfn, end pfn and the number of blocks to reserve */ 2874 /* Get the start pfn, end pfn and the number of blocks to reserve */
2560 start_pfn = zone->zone_start_pfn; 2875 start_pfn = zone->zone_start_pfn;
2561 end_pfn = start_pfn + zone->spanned_pages; 2876 end_pfn = start_pfn + zone->spanned_pages;
2562 reserve = roundup(zone->pages_min, pageblock_nr_pages) >> 2877 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
2563 pageblock_order; 2878 pageblock_order;
2564 2879
2880 /*
2881 * Reserve blocks are generally in place to help high-order atomic
2882 * allocations that are short-lived. A min_free_kbytes value that
2883 * would result in more than 2 reserve blocks for atomic allocations
2884 * is assumed to be in place to help anti-fragmentation for the
2885 * future allocation of hugepages at runtime.
2886 */
2887 reserve = min(2, reserve);
2888
2565 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { 2889 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
2566 if (!pfn_valid(pfn)) 2890 if (!pfn_valid(pfn))
2567 continue; 2891 continue;
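To put numbers on the new cap: with 4 KiB pages and order-9 pageblocks (512 pages each), a zone whose min watermark works out to 16384 pages would previously have reserved 16384 / 512 = 32 MIGRATE_RESERVE pageblocks; reserve = min(2, reserve) trims that to at most 2 blocks, enough for short-lived high-order atomic allocations while leaving the rest of the watermark to ordinary anti-fragmentation. (Illustrative figures, not taken from a specific machine.)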
@@ -2681,6 +3005,7 @@ static void __meminit zone_init_free_lists(struct zone *zone)
2681 3005
2682static int zone_batchsize(struct zone *zone) 3006static int zone_batchsize(struct zone *zone)
2683{ 3007{
3008#ifdef CONFIG_MMU
2684 int batch; 3009 int batch;
2685 3010
2686 /* 3011 /*
@@ -2706,14 +3031,32 @@ static int zone_batchsize(struct zone *zone)
2706 * of pages of one half of the possible page colors 3031 * of pages of one half of the possible page colors
2707 * and the other with pages of the other colors. 3032 * and the other with pages of the other colors.
2708 */ 3033 */
2709 batch = (1 << (fls(batch + batch/2)-1)) - 1; 3034 batch = rounddown_pow_of_two(batch + batch/2) - 1;
2710 3035
2711 return batch; 3036 return batch;
3037
3038#else
3039 /* The deferral and batching of frees should be suppressed under NOMMU
3040 * conditions.
3041 *
3042 * The problem is that NOMMU needs to be able to allocate large chunks
3043 * of contiguous memory as there's no hardware page translation to
3044 * assemble apparent contiguous memory from discontiguous pages.
3045 *
3046 * Queueing large contiguous runs of pages for batching, however,
3047 * causes the pages to actually be freed in smaller chunks. As there
3048 * can be a significant delay between the individual batches being
3049 * recycled, this leads to the once large chunks of space being
3050 * fragmented and becoming unavailable for high-order allocations.
3051 */
3052 return 0;
3053#endif
2712} 3054}
2713 3055
2714static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 3056static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
2715{ 3057{
2716 struct per_cpu_pages *pcp; 3058 struct per_cpu_pages *pcp;
3059 int migratetype;
2717 3060
2718 memset(p, 0, sizeof(*p)); 3061 memset(p, 0, sizeof(*p));
2719 3062
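On the zone_batchsize() change above: rounddown_pow_of_two(x) is 1 << (fls(x) - 1) for any x > 0, so the new expression produces exactly the same batch values as the old fls()-based one; for example batch = 45 gives 45 + 22 = 67, which rounds down to 64 and yields 63 under either spelling. The only behavioural change in the hunk is the new #else branch, which returns a batch of 0 on NOMMU so freed pages go straight back to the buddy allocator instead of lingering on per-cpu lists.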
@@ -2721,7 +3064,8 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
2721 pcp->count = 0; 3064 pcp->count = 0;
2722 pcp->high = 6 * batch; 3065 pcp->high = 6 * batch;
2723 pcp->batch = max(1UL, 1 * batch); 3066 pcp->batch = max(1UL, 1 * batch);
2724 INIT_LIST_HEAD(&pcp->list); 3067 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
3068 INIT_LIST_HEAD(&pcp->lists[migratetype]);
2725} 3069}
2726 3070
2727/* 3071/*
@@ -2794,7 +3138,7 @@ bad:
2794 if (dzone == zone) 3138 if (dzone == zone)
2795 break; 3139 break;
2796 kfree(zone_pcp(dzone, cpu)); 3140 kfree(zone_pcp(dzone, cpu));
2797 zone_pcp(dzone, cpu) = NULL; 3141 zone_pcp(dzone, cpu) = &boot_pageset[cpu];
2798 } 3142 }
2799 return -ENOMEM; 3143 return -ENOMEM;
2800} 3144}
@@ -2809,7 +3153,7 @@ static inline void free_zone_pagesets(int cpu)
2809 /* Free per_cpu_pageset if it is slab allocated */ 3153 /* Free per_cpu_pageset if it is slab allocated */
2810 if (pset != &boot_pageset[cpu]) 3154 if (pset != &boot_pageset[cpu])
2811 kfree(pset); 3155 kfree(pset);
2812 zone_pcp(zone, cpu) = NULL; 3156 zone_pcp(zone, cpu) = &boot_pageset[cpu];
2813 } 3157 }
2814} 3158}
2815 3159
@@ -2899,6 +3243,32 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
2899 return 0; 3243 return 0;
2900} 3244}
2901 3245
3246static int __zone_pcp_update(void *data)
3247{
3248 struct zone *zone = data;
3249 int cpu;
3250 unsigned long batch = zone_batchsize(zone), flags;
3251
3252 for (cpu = 0; cpu < NR_CPUS; cpu++) {
3253 struct per_cpu_pageset *pset;
3254 struct per_cpu_pages *pcp;
3255
3256 pset = zone_pcp(zone, cpu);
3257 pcp = &pset->pcp;
3258
3259 local_irq_save(flags);
3260 free_pcppages_bulk(zone, pcp->count, pcp);
3261 setup_pageset(pset, batch);
3262 local_irq_restore(flags);
3263 }
3264 return 0;
3265}
3266
3267void zone_pcp_update(struct zone *zone)
3268{
3269 stop_machine(__zone_pcp_update, zone, NULL);
3270}
3271
2902static __meminit void zone_pcp_init(struct zone *zone) 3272static __meminit void zone_pcp_init(struct zone *zone)
2903{ 3273{
2904 int cpu; 3274 int cpu;
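The new zone_pcp_update() gives callers a way to recompute per-cpu batch/high values after a zone changes size; stop_machine() keeps every CPU out of the page allocator while __zone_pcp_update() drains and re-initialises the lists. A hedged sketch of the intended kind of call site (the surrounding function is illustrative, not part of this diff):

/* e.g. at the end of memory hot-add, once zone->present_pages has grown */
static void example_after_resize(struct zone *zone)
{
	zone_pcp_update(zone);
}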
@@ -3085,64 +3455,6 @@ void __init sparse_memory_present_with_active_regions(int nid)
3085} 3455}
3086 3456
3087/** 3457/**
3088 * push_node_boundaries - Push node boundaries to at least the requested boundary
3089 * @nid: The nid of the node to push the boundary for
3090 * @start_pfn: The start pfn of the node
3091 * @end_pfn: The end pfn of the node
3092 *
3093 * In reserve-based hot-add, mem_map is allocated that is unused until hotadd
3094 * time. Specifically, on x86_64, SRAT will report ranges that can potentially
3095 * be hotplugged even though no physical memory exists. This function allows
3096 * an arch to push out the node boundaries so mem_map is allocated that can
3097 * be used later.
3098 */
3099#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
3100void __init push_node_boundaries(unsigned int nid,
3101 unsigned long start_pfn, unsigned long end_pfn)
3102{
3103 mminit_dprintk(MMINIT_TRACE, "zoneboundary",
3104 "Entering push_node_boundaries(%u, %lu, %lu)\n",
3105 nid, start_pfn, end_pfn);
3106
3107 /* Initialise the boundary for this node if necessary */
3108 if (node_boundary_end_pfn[nid] == 0)
3109 node_boundary_start_pfn[nid] = -1UL;
3110
3111 /* Update the boundaries */
3112 if (node_boundary_start_pfn[nid] > start_pfn)
3113 node_boundary_start_pfn[nid] = start_pfn;
3114 if (node_boundary_end_pfn[nid] < end_pfn)
3115 node_boundary_end_pfn[nid] = end_pfn;
3116}
3117
3118/* If necessary, push the node boundary out for reserve hotadd */
3119static void __meminit account_node_boundary(unsigned int nid,
3120 unsigned long *start_pfn, unsigned long *end_pfn)
3121{
3122 mminit_dprintk(MMINIT_TRACE, "zoneboundary",
3123 "Entering account_node_boundary(%u, %lu, %lu)\n",
3124 nid, *start_pfn, *end_pfn);
3125
3126 /* Return if boundary information has not been provided */
3127 if (node_boundary_end_pfn[nid] == 0)
3128 return;
3129
3130 /* Check the boundaries and update if necessary */
3131 if (node_boundary_start_pfn[nid] < *start_pfn)
3132 *start_pfn = node_boundary_start_pfn[nid];
3133 if (node_boundary_end_pfn[nid] > *end_pfn)
3134 *end_pfn = node_boundary_end_pfn[nid];
3135}
3136#else
3137void __init push_node_boundaries(unsigned int nid,
3138 unsigned long start_pfn, unsigned long end_pfn) {}
3139
3140static void __meminit account_node_boundary(unsigned int nid,
3141 unsigned long *start_pfn, unsigned long *end_pfn) {}
3142#endif
3143
3144
3145/**
3146 * get_pfn_range_for_nid - Return the start and end page frames for a node 3458 * get_pfn_range_for_nid - Return the start and end page frames for a node
3147 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. 3459 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
3148 * @start_pfn: Passed by reference. On return, it will have the node start_pfn. 3460 * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
@@ -3167,9 +3479,6 @@ void __meminit get_pfn_range_for_nid(unsigned int nid,
3167 3479
3168 if (*start_pfn == -1UL) 3480 if (*start_pfn == -1UL)
3169 *start_pfn = 0; 3481 *start_pfn = 0;
3170
3171 /* Push the node boundaries out if requested */
3172 account_node_boundary(nid, start_pfn, end_pfn);
3173} 3482}
3174 3483
3175/* 3484/*
@@ -3534,7 +3843,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
3534 zone_pcp_init(zone); 3843 zone_pcp_init(zone);
3535 for_each_lru(l) { 3844 for_each_lru(l) {
3536 INIT_LIST_HEAD(&zone->lru[l].list); 3845 INIT_LIST_HEAD(&zone->lru[l].list);
3537 zone->lru[l].nr_scan = 0; 3846 zone->reclaim_stat.nr_saved_scan[l] = 0;
3538 } 3847 }
3539 zone->reclaim_stat.recent_rotated[0] = 0; 3848 zone->reclaim_stat.recent_rotated[0] = 0;
3540 zone->reclaim_stat.recent_rotated[1] = 0; 3849 zone->reclaim_stat.recent_rotated[1] = 0;
@@ -3775,10 +4084,6 @@ void __init remove_all_active_ranges(void)
3775{ 4084{
3776 memset(early_node_map, 0, sizeof(early_node_map)); 4085 memset(early_node_map, 0, sizeof(early_node_map));
3777 nr_nodemap_entries = 0; 4086 nr_nodemap_entries = 0;
3778#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
3779 memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn));
3780 memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn));
3781#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
3782} 4087}
3783 4088
3784/* Compare two active node_active_regions */ 4089/* Compare two active node_active_regions */
@@ -3865,6 +4170,8 @@ static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
3865 int i, nid; 4170 int i, nid;
3866 unsigned long usable_startpfn; 4171 unsigned long usable_startpfn;
3867 unsigned long kernelcore_node, kernelcore_remaining; 4172 unsigned long kernelcore_node, kernelcore_remaining;
4173 /* save the state before borrow the nodemask */
4174 nodemask_t saved_node_state = node_states[N_HIGH_MEMORY];
3868 unsigned long totalpages = early_calculate_totalpages(); 4175 unsigned long totalpages = early_calculate_totalpages();
3869 int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); 4176 int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]);
3870 4177
@@ -3892,7 +4199,7 @@ static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
3892 4199
3893 /* If kernelcore was not specified, there is no ZONE_MOVABLE */ 4200 /* If kernelcore was not specified, there is no ZONE_MOVABLE */
3894 if (!required_kernelcore) 4201 if (!required_kernelcore)
3895 return; 4202 goto out;
3896 4203
3897 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ 4204 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
3898 find_usable_zone_for_movable(); 4205 find_usable_zone_for_movable();
@@ -3991,6 +4298,10 @@ restart:
3991 for (nid = 0; nid < MAX_NUMNODES; nid++) 4298 for (nid = 0; nid < MAX_NUMNODES; nid++)
3992 zone_movable_pfn[nid] = 4299 zone_movable_pfn[nid] =
3993 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); 4300 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
4301
4302out:
4303 /* restore the node_state */
4304 node_states[N_HIGH_MEMORY] = saved_node_state;
3994} 4305}
3995 4306
3996/* Any regular memory on that node ? */ 4307/* Any regular memory on that node ? */
@@ -4209,8 +4520,8 @@ static void calculate_totalreserve_pages(void)
4209 max = zone->lowmem_reserve[j]; 4520 max = zone->lowmem_reserve[j];
4210 } 4521 }
4211 4522
4212 /* we treat pages_high as reserved pages. */ 4523 /* we treat the high watermark as reserved pages. */
4213 max += zone->pages_high; 4524 max += high_wmark_pages(zone);
4214 4525
4215 if (max > zone->present_pages) 4526 if (max > zone->present_pages)
4216 max = zone->present_pages; 4527 max = zone->present_pages;
@@ -4260,12 +4571,13 @@ static void setup_per_zone_lowmem_reserve(void)
4260} 4571}
4261 4572
4262/** 4573/**
4263 * setup_per_zone_pages_min - called when min_free_kbytes changes. 4574 * setup_per_zone_wmarks - called when min_free_kbytes changes
4575 * or when memory is hot-{added|removed}
4264 * 4576 *
4265 * Ensures that the pages_{min,low,high} values for each zone are set correctly 4577 * Ensures that the watermark[min,low,high] values for each zone are set
4266 * with respect to min_free_kbytes. 4578 * correctly with respect to min_free_kbytes.
4267 */ 4579 */
4268void setup_per_zone_pages_min(void) 4580void setup_per_zone_wmarks(void)
4269{ 4581{
4270 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); 4582 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
4271 unsigned long lowmem_pages = 0; 4583 unsigned long lowmem_pages = 0;
@@ -4290,7 +4602,7 @@ void setup_per_zone_pages_min(void)
4290 * need highmem pages, so cap pages_min to a small 4602 * need highmem pages, so cap pages_min to a small
4291 * value here. 4603 * value here.
4292 * 4604 *
4293 * The (pages_high-pages_low) and (pages_low-pages_min) 4605 * The (WMARK_HIGH-WMARK_LOW) and (WMARK_LOW-WMARK_MIN)
4294 * deltas control async page reclaim, and so should 4606 * deltas control async page reclaim, and so should
4295 * not be capped for highmem. 4607 * not be capped for highmem.
4296 */ 4608 */
@@ -4301,17 +4613,17 @@ void setup_per_zone_pages_min(void)
4301 min_pages = SWAP_CLUSTER_MAX; 4613 min_pages = SWAP_CLUSTER_MAX;
4302 if (min_pages > 128) 4614 if (min_pages > 128)
4303 min_pages = 128; 4615 min_pages = 128;
4304 zone->pages_min = min_pages; 4616 zone->watermark[WMARK_MIN] = min_pages;
4305 } else { 4617 } else {
4306 /* 4618 /*
4307 * If it's a lowmem zone, reserve a number of pages 4619 * If it's a lowmem zone, reserve a number of pages
4308 * proportionate to the zone's size. 4620 * proportionate to the zone's size.
4309 */ 4621 */
4310 zone->pages_min = tmp; 4622 zone->watermark[WMARK_MIN] = tmp;
4311 } 4623 }
4312 4624
4313 zone->pages_low = zone->pages_min + (tmp >> 2); 4625 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
4314 zone->pages_high = zone->pages_min + (tmp >> 1); 4626 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
4315 setup_zone_migrate_reserve(zone); 4627 setup_zone_migrate_reserve(zone);
4316 spin_unlock_irqrestore(&zone->lock, flags); 4628 spin_unlock_irqrestore(&zone->lock, flags);
4317 } 4629 }
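A worked example of the watermark arithmetic above, assuming a single 4 GiB lowmem zone, 4 KiB pages and min_free_kbytes = 65536: pages_min = 65536 >> 2 = 16384 pages, and because the zone holds all of lowmem its proportional share tmp is also 16384. That yields WMARK_MIN = 16384 pages (64 MiB), WMARK_LOW = 16384 + 4096 = 20480 (80 MiB) and WMARK_HIGH = 16384 + 8192 = 24576 (96 MiB). (Illustrative figures; tmp is computed earlier in the function, outside this hunk, from the zone's share of lowmem_pages.)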
@@ -4320,9 +4632,7 @@ void setup_per_zone_pages_min(void)
4320 calculate_totalreserve_pages(); 4632 calculate_totalreserve_pages();
4321} 4633}
4322 4634
4323/** 4635/*
4324 * setup_per_zone_inactive_ratio - called when min_free_kbytes changes.
4325 *
4326 * The inactive anon list should be small enough that the VM never has to 4636 * The inactive anon list should be small enough that the VM never has to
4327 * do too much work, but large enough that each inactive page has a chance 4637 * do too much work, but large enough that each inactive page has a chance
4328 * to be referenced again before it is swapped out. 4638 * to be referenced again before it is swapped out.
@@ -4343,21 +4653,26 @@ void setup_per_zone_pages_min(void)
4343 * 1TB 101 10GB 4653 * 1TB 101 10GB
4344 * 10TB 320 32GB 4654 * 10TB 320 32GB
4345 */ 4655 */
4346static void setup_per_zone_inactive_ratio(void) 4656void calculate_zone_inactive_ratio(struct zone *zone)
4347{ 4657{
4348 struct zone *zone; 4658 unsigned int gb, ratio;
4349 4659
4350 for_each_zone(zone) { 4660 /* Zone size in gigabytes */
4351 unsigned int gb, ratio; 4661 gb = zone->present_pages >> (30 - PAGE_SHIFT);
4352 4662 if (gb)
4353 /* Zone size in gigabytes */
4354 gb = zone->present_pages >> (30 - PAGE_SHIFT);
4355 ratio = int_sqrt(10 * gb); 4663 ratio = int_sqrt(10 * gb);
4356 if (!ratio) 4664 else
4357 ratio = 1; 4665 ratio = 1;
4358 4666
4359 zone->inactive_ratio = ratio; 4667 zone->inactive_ratio = ratio;
4360 } 4668}
4669
4670static void __init setup_per_zone_inactive_ratio(void)
4671{
4672 struct zone *zone;
4673
4674 for_each_zone(zone)
4675 calculate_zone_inactive_ratio(zone);
4361} 4676}
4362 4677
4363/* 4678/*
@@ -4384,7 +4699,7 @@ static void setup_per_zone_inactive_ratio(void)
4384 * 8192MB: 11584k 4699 * 8192MB: 11584k
4385 * 16384MB: 16384k 4700 * 16384MB: 16384k
4386 */ 4701 */
4387static int __init init_per_zone_pages_min(void) 4702static int __init init_per_zone_wmark_min(void)
4388{ 4703{
4389 unsigned long lowmem_kbytes; 4704 unsigned long lowmem_kbytes;
4390 4705
@@ -4395,12 +4710,12 @@ static int __init init_per_zone_pages_min(void)
4395 min_free_kbytes = 128; 4710 min_free_kbytes = 128;
4396 if (min_free_kbytes > 65536) 4711 if (min_free_kbytes > 65536)
4397 min_free_kbytes = 65536; 4712 min_free_kbytes = 65536;
4398 setup_per_zone_pages_min(); 4713 setup_per_zone_wmarks();
4399 setup_per_zone_lowmem_reserve(); 4714 setup_per_zone_lowmem_reserve();
4400 setup_per_zone_inactive_ratio(); 4715 setup_per_zone_inactive_ratio();
4401 return 0; 4716 return 0;
4402} 4717}
4403module_init(init_per_zone_pages_min) 4718module_init(init_per_zone_wmark_min)
4404 4719
4405/* 4720/*
4406 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 4721 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
@@ -4408,22 +4723,22 @@ module_init(init_per_zone_pages_min)
4408 * changes. 4723 * changes.
4409 */ 4724 */
4410int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 4725int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
4411 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 4726 void __user *buffer, size_t *length, loff_t *ppos)
4412{ 4727{
4413 proc_dointvec(table, write, file, buffer, length, ppos); 4728 proc_dointvec(table, write, buffer, length, ppos);
4414 if (write) 4729 if (write)
4415 setup_per_zone_pages_min(); 4730 setup_per_zone_wmarks();
4416 return 0; 4731 return 0;
4417} 4732}
4418 4733
4419#ifdef CONFIG_NUMA 4734#ifdef CONFIG_NUMA
4420int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, 4735int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
4421 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 4736 void __user *buffer, size_t *length, loff_t *ppos)
4422{ 4737{
4423 struct zone *zone; 4738 struct zone *zone;
4424 int rc; 4739 int rc;
4425 4740
4426 rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); 4741 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
4427 if (rc) 4742 if (rc)
4428 return rc; 4743 return rc;
4429 4744
@@ -4434,12 +4749,12 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
4434} 4749}
4435 4750
4436int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, 4751int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
4437 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 4752 void __user *buffer, size_t *length, loff_t *ppos)
4438{ 4753{
4439 struct zone *zone; 4754 struct zone *zone;
4440 int rc; 4755 int rc;
4441 4756
4442 rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); 4757 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
4443 if (rc) 4758 if (rc)
4444 return rc; 4759 return rc;
4445 4760
@@ -4456,13 +4771,13 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
4456 * whenever sysctl_lowmem_reserve_ratio changes. 4771 * whenever sysctl_lowmem_reserve_ratio changes.
4457 * 4772 *
4458 * The reserve ratio obviously has absolutely no relation with the 4773 * The reserve ratio obviously has absolutely no relation with the
4459 * pages_min watermarks. The lowmem reserve ratio can only make sense 4774 * minimum watermarks. The lowmem reserve ratio can only make sense
4460 * unless it is a function of the boot time zone sizes. 4775 * unless it is a function of the boot time zone sizes.
4461 */ 4776 */
4462int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, 4777int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
4463 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 4778 void __user *buffer, size_t *length, loff_t *ppos)
4464{ 4779{
4465 proc_dointvec_minmax(table, write, file, buffer, length, ppos); 4780 proc_dointvec_minmax(table, write, buffer, length, ppos);
4466 setup_per_zone_lowmem_reserve(); 4781 setup_per_zone_lowmem_reserve();
4467 return 0; 4782 return 0;
4468} 4783}
@@ -4474,16 +4789,16 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
4474 */ 4789 */
4475 4790
4476int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, 4791int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
4477 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 4792 void __user *buffer, size_t *length, loff_t *ppos)
4478{ 4793{
4479 struct zone *zone; 4794 struct zone *zone;
4480 unsigned int cpu; 4795 unsigned int cpu;
4481 int ret; 4796 int ret;
4482 4797
4483 ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos); 4798 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
4484 if (!write || (ret == -EINVAL)) 4799 if (!write || (ret == -EINVAL))
4485 return ret; 4800 return ret;
4486 for_each_zone(zone) { 4801 for_each_populated_zone(zone) {
4487 for_each_online_cpu(cpu) { 4802 for_each_online_cpu(cpu) {
4488 unsigned long high; 4803 unsigned long high;
4489 high = zone->present_pages / percpu_pagelist_fraction; 4804 high = zone->present_pages / percpu_pagelist_fraction;
@@ -4540,7 +4855,14 @@ void *__init alloc_large_system_hash(const char *tablename,
4540 numentries <<= (PAGE_SHIFT - scale); 4855 numentries <<= (PAGE_SHIFT - scale);
4541 4856
4542 /* Make sure we've got at least a 0-order allocation.. */ 4857 /* Make sure we've got at least a 0-order allocation.. */
4543 if (unlikely((numentries * bucketsize) < PAGE_SIZE)) 4858 if (unlikely(flags & HASH_SMALL)) {
4859 /* Makes no sense without HASH_EARLY */
4860 WARN_ON(!(flags & HASH_EARLY));
4861 if (!(numentries >> *_hash_shift)) {
4862 numentries = 1UL << *_hash_shift;
4863 BUG_ON(!numentries);
4864 }
4865 } else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
4544 numentries = PAGE_SIZE / bucketsize; 4866 numentries = PAGE_SIZE / bucketsize;
4545 } 4867 }
4546 numentries = roundup_pow_of_two(numentries); 4868 numentries = roundup_pow_of_two(numentries);
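The HASH_SMALL path added above lets early callers keep genuinely tiny tables: without it, a request with bucketsize 8 is always rounded up to PAGE_SIZE / 8 = 512 buckets on 4 KiB pages, whereas with HASH_SMALL and an initial *_hash_shift of 4 the table can stay at 1 << 4 = 16 buckets, only being bumped back up if the caller asked for fewer than that. The flag is meaningful only together with HASH_EARLY, hence the WARN_ON. (Bucket sizes here are illustrative.)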
@@ -4563,22 +4885,14 @@ void *__init alloc_large_system_hash(const char *tablename,
4563 else if (hashdist) 4885 else if (hashdist)
4564 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); 4886 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
4565 else { 4887 else {
4566 unsigned long order = get_order(size);
4567 table = (void*) __get_free_pages(GFP_ATOMIC, order);
4568 /* 4888 /*
4569 * If bucketsize is not a power-of-two, we may free 4889 * If bucketsize is not a power-of-two, we may free
4570 * some pages at the end of hash table. 4890 * some pages at the end of hash table which
4891 * alloc_pages_exact() automatically does
4571 */ 4892 */
4572 if (table) { 4893 if (get_order(size) < MAX_ORDER) {
4573 unsigned long alloc_end = (unsigned long)table + 4894 table = alloc_pages_exact(size, GFP_ATOMIC);
4574 (PAGE_SIZE << order); 4895 kmemleak_alloc(table, size, 1, GFP_ATOMIC);
4575 unsigned long used = (unsigned long)table +
4576 PAGE_ALIGN(size);
4577 split_page(virt_to_page(table), order);
4578 while (used < alloc_end) {
4579 free_page(used);
4580 used += PAGE_SIZE;
4581 }
4582 } 4896 }
4583 } 4897 }
4584 } while (!table && size > PAGE_SIZE && --log2qty); 4898 } while (!table && size > PAGE_SIZE && --log2qty);
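For reference, the split-and-trim loop deleted above is what alloc_pages_exact() now does on the caller's behalf. A simplified sketch of that helper (error handling and the kmemleak hook omitted; details may differ from the real mm/page_alloc.c implementation):

void *alloc_pages_exact_sketch(size_t size, gfp_t gfp_mask)
{
	unsigned int order = get_order(size);
	unsigned long addr = __get_free_pages(gfp_mask, order);

	if (addr) {
		unsigned long alloc_end = addr + (PAGE_SIZE << order);
		unsigned long used = addr + PAGE_ALIGN(size);

		/* break the high-order block into order-0 pages ... */
		split_page(virt_to_page((void *)addr), order);
		/* ... and return the tail pages beyond 'size' */
		while (used < alloc_end) {
			free_page(used);
			used += PAGE_SIZE;
		}
	}
	return (void *)addr;
}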
@@ -4690,13 +5004,16 @@ int set_migratetype_isolate(struct page *page)
4690 struct zone *zone; 5004 struct zone *zone;
4691 unsigned long flags; 5005 unsigned long flags;
4692 int ret = -EBUSY; 5006 int ret = -EBUSY;
5007 int zone_idx;
4693 5008
4694 zone = page_zone(page); 5009 zone = page_zone(page);
5010 zone_idx = zone_idx(zone);
4695 spin_lock_irqsave(&zone->lock, flags); 5011 spin_lock_irqsave(&zone->lock, flags);
4696 /* 5012 /*
4697 * In future, more migrate types will be able to be isolation target. 5013 * In future, more migrate types will be able to be isolation target.
4698 */ 5014 */
4699 if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE) 5015 if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE &&
5016 zone_idx != ZONE_MOVABLE)
4700 goto out; 5017 goto out;
4701 set_pageblock_migratetype(page, MIGRATE_ISOLATE); 5018 set_pageblock_migratetype(page, MIGRATE_ISOLATE);
4702 move_freepages_block(zone, page, MIGRATE_ISOLATE); 5019 move_freepages_block(zone, page, MIGRATE_ISOLATE);
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index ceecfbb143fa..3d535d594826 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -69,7 +69,7 @@ static int __init alloc_node_page_cgroup(int nid)
69 return 0; 69 return 0;
70} 70}
71 71
72void __init page_cgroup_init(void) 72void __init page_cgroup_init_flatmem(void)
73{ 73{
74 74
75 int nid, fail; 75 int nid, fail;
@@ -83,12 +83,12 @@ void __init page_cgroup_init(void)
83 goto fail; 83 goto fail;
84 } 84 }
85 printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); 85 printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
86 printk(KERN_INFO "please try cgroup_disable=memory option if you" 86 printk(KERN_INFO "please try 'cgroup_disable=memory' option if you"
87 " don't want\n"); 87 " don't want memory cgroups\n");
88 return; 88 return;
89fail: 89fail:
90 printk(KERN_CRIT "allocation of page_cgroup was failed.\n"); 90 printk(KERN_CRIT "allocation of page_cgroup failed.\n");
91 printk(KERN_CRIT "please try cgroup_disable=memory boot option\n"); 91 printk(KERN_CRIT "please try 'cgroup_disable=memory' boot option\n");
92 panic("Out of memory"); 92 panic("Out of memory");
93} 93}
94 94
@@ -99,6 +99,8 @@ struct page_cgroup *lookup_page_cgroup(struct page *page)
99 unsigned long pfn = page_to_pfn(page); 99 unsigned long pfn = page_to_pfn(page);
100 struct mem_section *section = __pfn_to_section(pfn); 100 struct mem_section *section = __pfn_to_section(pfn);
101 101
102 if (!section->page_cgroup)
103 return NULL;
102 return section->page_cgroup + pfn; 104 return section->page_cgroup + pfn;
103} 105}
104 106
@@ -113,15 +115,16 @@ static int __init_refok init_section_page_cgroup(unsigned long pfn)
113 if (!section->page_cgroup) { 115 if (!section->page_cgroup) {
114 nid = page_to_nid(pfn_to_page(pfn)); 116 nid = page_to_nid(pfn_to_page(pfn));
115 table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; 117 table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
116 if (slab_is_available()) { 118 VM_BUG_ON(!slab_is_available());
119 if (node_state(nid, N_HIGH_MEMORY)) {
117 base = kmalloc_node(table_size, 120 base = kmalloc_node(table_size,
118 GFP_KERNEL | __GFP_NOWARN, nid); 121 GFP_KERNEL | __GFP_NOWARN, nid);
119 if (!base) 122 if (!base)
120 base = vmalloc_node(table_size, nid); 123 base = vmalloc_node(table_size, nid);
121 } else { 124 } else {
122 base = __alloc_bootmem_node_nopanic(NODE_DATA(nid), 125 base = kmalloc(table_size, GFP_KERNEL | __GFP_NOWARN);
123 table_size, 126 if (!base)
124 PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); 127 base = vmalloc(table_size);
125 } 128 }
126 } else { 129 } else {
127 /* 130 /*
@@ -257,14 +260,14 @@ void __init page_cgroup_init(void)
257 fail = init_section_page_cgroup(pfn); 260 fail = init_section_page_cgroup(pfn);
258 } 261 }
259 if (fail) { 262 if (fail) {
260 printk(KERN_CRIT "try cgroup_disable=memory boot option\n"); 263 printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
261 panic("Out of memory"); 264 panic("Out of memory");
262 } else { 265 } else {
263 hotplug_memory_notifier(page_cgroup_callback, 0); 266 hotplug_memory_notifier(page_cgroup_callback, 0);
264 } 267 }
265 printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); 268 printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
266 printk(KERN_INFO "please try cgroup_disable=memory option if you don't" 269 printk(KERN_INFO "please try 'cgroup_disable=memory' option if you don't"
267 " want\n"); 270 " want memory cgroups\n");
268} 271}
269 272
270void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) 273void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
@@ -285,12 +288,8 @@ struct swap_cgroup_ctrl {
285 288
286struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; 289struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
287 290
288/*
289 * This 8bytes seems big..maybe we can reduce this when we can use "id" for
290 * cgroup rather than pointer.
291 */
292struct swap_cgroup { 291struct swap_cgroup {
293 struct mem_cgroup *val; 292 unsigned short id;
294}; 293};
295#define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup)) 294#define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup))
296#define SC_POS_MASK (SC_PER_PAGE - 1) 295#define SC_POS_MASK (SC_PER_PAGE - 1)
@@ -318,8 +317,6 @@ static int swap_cgroup_prepare(int type)
318 struct swap_cgroup_ctrl *ctrl; 317 struct swap_cgroup_ctrl *ctrl;
319 unsigned long idx, max; 318 unsigned long idx, max;
320 319
321 if (!do_swap_account)
322 return 0;
323 ctrl = &swap_cgroup_ctrl[type]; 320 ctrl = &swap_cgroup_ctrl[type];
324 321
325 for (idx = 0; idx < ctrl->length; idx++) { 322 for (idx = 0; idx < ctrl->length; idx++) {
@@ -342,10 +339,10 @@ not_enough_page:
342 * @ent: swap entry to be recorded into 339 * @ent: swap entry to be recorded into
343 * @mem: mem_cgroup to be recorded 340 * @mem: mem_cgroup to be recorded
344 * 341 *
345 * Returns old value on success, NULL on failure. 342 * Returns old value on success, 0 on failure.
346 * (Of course, old value can be NULL.) 343 * (Of course, old value can be 0.)
347 */ 344 */
348struct mem_cgroup *swap_cgroup_record(swp_entry_t ent, struct mem_cgroup *mem) 345unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
349{ 346{
350 int type = swp_type(ent); 347 int type = swp_type(ent);
351 unsigned long offset = swp_offset(ent); 348 unsigned long offset = swp_offset(ent);
@@ -354,18 +351,15 @@ struct mem_cgroup *swap_cgroup_record(swp_entry_t ent, struct mem_cgroup *mem)
354 struct swap_cgroup_ctrl *ctrl; 351 struct swap_cgroup_ctrl *ctrl;
355 struct page *mappage; 352 struct page *mappage;
356 struct swap_cgroup *sc; 353 struct swap_cgroup *sc;
357 struct mem_cgroup *old; 354 unsigned short old;
358
359 if (!do_swap_account)
360 return NULL;
361 355
362 ctrl = &swap_cgroup_ctrl[type]; 356 ctrl = &swap_cgroup_ctrl[type];
363 357
364 mappage = ctrl->map[idx]; 358 mappage = ctrl->map[idx];
365 sc = page_address(mappage); 359 sc = page_address(mappage);
366 sc += pos; 360 sc += pos;
367 old = sc->val; 361 old = sc->id;
368 sc->val = mem; 362 sc->id = id;
369 363
370 return old; 364 return old;
371} 365}
@@ -374,9 +368,9 @@ struct mem_cgroup *swap_cgroup_record(swp_entry_t ent, struct mem_cgroup *mem)
374 * lookup_swap_cgroup - lookup mem_cgroup tied to swap entry 368 * lookup_swap_cgroup - lookup mem_cgroup tied to swap entry
375 * @ent: swap entry to be looked up. 369 * @ent: swap entry to be looked up.
376 * 370 *
377 * Returns pointer to mem_cgroup on success, NULL on failure. 371 * Returns CSS ID of mem_cgroup on success, 0 on failure. (0 is an invalid ID)
378 */ 372 */
379struct mem_cgroup *lookup_swap_cgroup(swp_entry_t ent) 373unsigned short lookup_swap_cgroup(swp_entry_t ent)
380{ 374{
381 int type = swp_type(ent); 375 int type = swp_type(ent);
382 unsigned long offset = swp_offset(ent); 376 unsigned long offset = swp_offset(ent);
@@ -385,16 +379,13 @@ struct mem_cgroup *lookup_swap_cgroup(swp_entry_t ent)
385 struct swap_cgroup_ctrl *ctrl; 379 struct swap_cgroup_ctrl *ctrl;
386 struct page *mappage; 380 struct page *mappage;
387 struct swap_cgroup *sc; 381 struct swap_cgroup *sc;
388 struct mem_cgroup *ret; 382 unsigned short ret;
389
390 if (!do_swap_account)
391 return NULL;
392 383
393 ctrl = &swap_cgroup_ctrl[type]; 384 ctrl = &swap_cgroup_ctrl[type];
394 mappage = ctrl->map[idx]; 385 mappage = ctrl->map[idx];
395 sc = page_address(mappage); 386 sc = page_address(mappage);
396 sc += pos; 387 sc += pos;
397 ret = sc->val; 388 ret = sc->id;
398 return ret; 389 return ret;
399} 390}
400 391
@@ -430,13 +421,6 @@ int swap_cgroup_swapon(int type, unsigned long max_pages)
430 } 421 }
431 mutex_unlock(&swap_cgroup_mutex); 422 mutex_unlock(&swap_cgroup_mutex);
432 423
433 printk(KERN_INFO
434 "swap_cgroup: uses %ld bytes of vmalloc for pointer array space"
435 " and %ld bytes to hold mem_cgroup pointers on swap\n",
436 array_size, length * PAGE_SIZE);
437 printk(KERN_INFO
438 "swap_cgroup can be disabled by noswapaccount boot option.\n");
439
440 return 0; 424 return 0;
441nomem: 425nomem:
442 printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n"); 426 printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
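With this change a swap slot records a 2-byte CSS id rather than a mem_cgroup pointer, with 0 reserved to mean "no owner". A hedged sketch of how a memcg-side caller would use the reworked interface (css_id() and the debug print are illustrative assumptions, not part of this file):

/* record ownership of a swap slot; a return of 0 means it had no owner */
static void note_swap_owner(swp_entry_t ent, struct cgroup_subsys_state *css)
{
	unsigned short old = swap_cgroup_record(ent, css_id(css));

	if (old)
		pr_debug("swap slot previously owned by css id %u\n", old);
}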
diff --git a/mm/page_io.c b/mm/page_io.c
index 3023c475e041..c6f3e5071de3 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -120,7 +120,7 @@ out:
120 return ret; 120 return ret;
121} 121}
122 122
123int swap_readpage(struct file *file, struct page *page) 123int swap_readpage(struct page *page)
124{ 124{
125 struct bio *bio; 125 struct bio *bio;
126 int ret = 0; 126 int ret = 0;
diff --git a/mm/pdflush.c b/mm/pdflush.c
deleted file mode 100644
index 118905e3d788..000000000000
--- a/mm/pdflush.c
+++ /dev/null
@@ -1,251 +0,0 @@
1/*
2 * mm/pdflush.c - worker threads for writing back filesystem data
3 *
4 * Copyright (C) 2002, Linus Torvalds.
5 *
6 * 09Apr2002 Andrew Morton
7 * Initial version
8 * 29Feb2004 kaos@sgi.com
9 * Move worker thread creation to kthread to avoid chewing
10 * up stack space with nested calls to kernel_thread.
11 */
12
13#include <linux/sched.h>
14#include <linux/list.h>
15#include <linux/signal.h>
16#include <linux/spinlock.h>
17#include <linux/gfp.h>
18#include <linux/init.h>
19#include <linux/module.h>
20#include <linux/fs.h> /* Needed by writeback.h */
21#include <linux/writeback.h> /* Prototypes pdflush_operation() */
22#include <linux/kthread.h>
23#include <linux/cpuset.h>
24#include <linux/freezer.h>
25
26
27/*
28 * Minimum and maximum number of pdflush instances
29 */
30#define MIN_PDFLUSH_THREADS 2
31#define MAX_PDFLUSH_THREADS 8
32
33static void start_one_pdflush_thread(void);
34
35
36/*
37 * The pdflush threads are worker threads for writing back dirty data.
38 * Ideally, we'd like one thread per active disk spindle. But the disk
39 * topology is very hard to divine at this level. Instead, we take
40 * care in various places to prevent more than one pdflush thread from
41 * performing writeback against a single filesystem. pdflush threads
42 * have the PF_FLUSHER flag set in current->flags to aid in this.
43 */
44
45/*
46 * All the pdflush threads. Protected by pdflush_lock
47 */
48static LIST_HEAD(pdflush_list);
49static DEFINE_SPINLOCK(pdflush_lock);
50
51/*
52 * The count of currently-running pdflush threads. Protected
53 * by pdflush_lock.
54 *
55 * Readable by sysctl, but not writable. Published to userspace at
56 * /proc/sys/vm/nr_pdflush_threads.
57 */
58int nr_pdflush_threads = 0;
59
60/*
61 * The time at which the pdflush thread pool last went empty
62 */
63static unsigned long last_empty_jifs;
64
65/*
66 * The pdflush thread.
67 *
68 * Thread pool management algorithm:
69 *
70 * - The minimum and maximum number of pdflush instances are bound
71 * by MIN_PDFLUSH_THREADS and MAX_PDFLUSH_THREADS.
72 *
73 * - If there have been no idle pdflush instances for 1 second, create
74 * a new one.
75 *
76 * - If the least-recently-went-to-sleep pdflush thread has been asleep
77 * for more than one second, terminate a thread.
78 */
79
80/*
81 * A structure for passing work to a pdflush thread. Also for passing
82 * state information between pdflush threads. Protected by pdflush_lock.
83 */
84struct pdflush_work {
85 struct task_struct *who; /* The thread */
86 void (*fn)(unsigned long); /* A callback function */
87 unsigned long arg0; /* An argument to the callback */
88 struct list_head list; /* On pdflush_list, when idle */
89 unsigned long when_i_went_to_sleep;
90};
91
92static int __pdflush(struct pdflush_work *my_work)
93{
94 current->flags |= PF_FLUSHER | PF_SWAPWRITE;
95 set_freezable();
96 my_work->fn = NULL;
97 my_work->who = current;
98 INIT_LIST_HEAD(&my_work->list);
99
100 spin_lock_irq(&pdflush_lock);
101 nr_pdflush_threads++;
102 for ( ; ; ) {
103 struct pdflush_work *pdf;
104
105 set_current_state(TASK_INTERRUPTIBLE);
106 list_move(&my_work->list, &pdflush_list);
107 my_work->when_i_went_to_sleep = jiffies;
108 spin_unlock_irq(&pdflush_lock);
109 schedule();
110 try_to_freeze();
111 spin_lock_irq(&pdflush_lock);
112 if (!list_empty(&my_work->list)) {
113 /*
114 * Someone woke us up, but without removing our control
115 * structure from the global list. swsusp will do this
116 * in try_to_freeze()->refrigerator(). Handle it.
117 */
118 my_work->fn = NULL;
119 continue;
120 }
121 if (my_work->fn == NULL) {
122 printk("pdflush: bogus wakeup\n");
123 continue;
124 }
125 spin_unlock_irq(&pdflush_lock);
126
127 (*my_work->fn)(my_work->arg0);
128
129 /*
130 * Thread creation: For how long have there been zero
131 * available threads?
132 */
133 if (time_after(jiffies, last_empty_jifs + 1 * HZ)) {
134 /* unlocked list_empty() test is OK here */
135 if (list_empty(&pdflush_list)) {
136 /* unlocked test is OK here */
137 if (nr_pdflush_threads < MAX_PDFLUSH_THREADS)
138 start_one_pdflush_thread();
139 }
140 }
141
142 spin_lock_irq(&pdflush_lock);
143 my_work->fn = NULL;
144
145 /*
146 * Thread destruction: For how long has the sleepiest
147 * thread slept?
148 */
149 if (list_empty(&pdflush_list))
150 continue;
151 if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS)
152 continue;
153 pdf = list_entry(pdflush_list.prev, struct pdflush_work, list);
154 if (time_after(jiffies, pdf->when_i_went_to_sleep + 1 * HZ)) {
155 /* Limit exit rate */
156 pdf->when_i_went_to_sleep = jiffies;
157 break; /* exeunt */
158 }
159 }
160 nr_pdflush_threads--;
161 spin_unlock_irq(&pdflush_lock);
162 return 0;
163}
164
165/*
166 * Of course, my_work wants to be just a local in __pdflush(). It is
167 * separated out in this manner to hopefully prevent the compiler from
168 * performing unfortunate optimisations against the auto variables. Because
169 * these are visible to other tasks and CPUs. (No problem has actually
170 * been observed. This is just paranoia).
171 */
172static int pdflush(void *dummy)
173{
174 struct pdflush_work my_work;
175 cpumask_var_t cpus_allowed;
176
177 /*
178 * Since the caller doesn't even check kthread_run() worked, let's not
179 * freak out too much if this fails.
180 */
181 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
182 printk(KERN_WARNING "pdflush failed to allocate cpumask\n");
183 return 0;
184 }
185
186 /*
187 * pdflush can spend a lot of time doing encryption via dm-crypt. We
188 * don't want to do that at keventd's priority.
189 */
190 set_user_nice(current, 0);
191
192 /*
193 * Some configs put our parent kthread in a limited cpuset,
194 * which kthread() overrides, forcing cpus_allowed == cpu_all_mask.
195 * Our needs are more modest - cut back to our cpusets cpus_allowed.
196 * This is needed as pdflush's are dynamically created and destroyed.
197 * The boottime pdflush's are easily placed w/o these 2 lines.
198 */
199 cpuset_cpus_allowed(current, cpus_allowed);
200 set_cpus_allowed_ptr(current, cpus_allowed);
201 free_cpumask_var(cpus_allowed);
202
203 return __pdflush(&my_work);
204}
205
206/*
207 * Attempt to wake up a pdflush thread, and get it to do some work for you.
208 * Returns zero if it indeed managed to find a worker thread, and passed your
209 * payload to it.
210 */
211int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0)
212{
213 unsigned long flags;
214 int ret = 0;
215
216 BUG_ON(fn == NULL); /* Hard to diagnose if it's deferred */
217
218 spin_lock_irqsave(&pdflush_lock, flags);
219 if (list_empty(&pdflush_list)) {
220 ret = -1;
221 } else {
222 struct pdflush_work *pdf;
223
224 pdf = list_entry(pdflush_list.next, struct pdflush_work, list);
225 list_del_init(&pdf->list);
226 if (list_empty(&pdflush_list))
227 last_empty_jifs = jiffies;
228 pdf->fn = fn;
229 pdf->arg0 = arg0;
230 wake_up_process(pdf->who);
231 }
232 spin_unlock_irqrestore(&pdflush_lock, flags);
233
234 return ret;
235}
236
237static void start_one_pdflush_thread(void)
238{
239 kthread_run(pdflush, NULL, "pdflush");
240}
241
242static int __init pdflush_init(void)
243{
244 int i;
245
246 for (i = 0; i < MIN_PDFLUSH_THREADS; i++)
247 start_one_pdflush_thread();
248 return 0;
249}
250
251module_init(pdflush_init);
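For context on what this file removal takes away: callers queued writeback work by passing a callback to pdflush_operation() and fell back to doing the work themselves when no idle thread was available. An illustrative sketch of that (now removed) usage pattern; background_writeout stands in for a real callback:

static void background_writeout(unsigned long nr_pages)
{
	/* ... write back up to nr_pages dirty pages ... */
}

static void kick_writeback(unsigned long nr_pages)
{
	/* returns 0 if an idle pdflush thread accepted the work, -1 otherwise */
	if (pdflush_operation(background_writeout, nr_pages))
		background_writeout(nr_pages);	/* no thread free: do it directly */
}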
diff --git a/mm/percpu.c b/mm/percpu.c
index 1aa5d8fbca12..5adfc268b408 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -8,12 +8,13 @@
8 * 8 *
9 * This is percpu allocator which can handle both static and dynamic 9 * This is percpu allocator which can handle both static and dynamic
10 * areas. Percpu areas are allocated in chunks in vmalloc area. Each 10 * areas. Percpu areas are allocated in chunks in vmalloc area. Each
11 * chunk consists of num_possible_cpus() units and the first chunk 11 * chunk consists of a boot-time determined number of units and the
12 * is used for static percpu variables in the kernel image (special 12 * first chunk is used for static percpu variables in the kernel image
13 * boot time alloc/init handling necessary as these areas need to be 13 * (special boot time alloc/init handling necessary as these areas
14 * brought up before allocation services are running). Unit grows as 14 * need to be brought up before allocation services are running).
15 * necessary and all units grow or shrink in unison. When a chunk is 15 * Unit grows as necessary and all units grow or shrink in unison.
16 * filled up, another chunk is allocated. ie. in vmalloc area 16 * When a chunk is filled up, another chunk is allocated. ie. in
17 * vmalloc area
17 * 18 *
18 * c0 c1 c2 19 * c0 c1 c2
19 * ------------------- ------------------- ------------ 20 * ------------------- ------------------- ------------
@@ -22,11 +23,13 @@
22 * 23 *
23 * Allocation is done in offset-size areas of single unit space. Ie, 24 * Allocation is done in offset-size areas of single unit space. Ie,
24 * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0, 25 * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
25 * c1:u1, c1:u2 and c1:u3. Percpu access can be done by configuring 26 * c1:u1, c1:u2 and c1:u3. On UMA, units correspond directly to
26 * percpu base registers UNIT_SIZE apart. 27 * cpus. On NUMA, the mapping can be non-linear and even sparse.
28 * Percpu access can be done by configuring percpu base registers
29 * according to cpu to unit mapping and pcpu_unit_size.
27 * 30 *
28 * There are usually many small percpu allocations many of them as 31 * There are usually many small percpu allocations many of them being
29 * small as 4 bytes. The allocator organizes chunks into lists 32 * as small as 4 bytes. The allocator organizes chunks into lists
30 * according to free size and tries to allocate from the fullest one. 33 * according to free size and tries to allocate from the fullest one.
31 * Each chunk keeps the maximum contiguous area size hint which is 34 * Each chunk keeps the maximum contiguous area size hint which is
32 * guaranteed to be equal to or larger than the maximum contiguous 35 * guaranteed to be equal to or larger than the maximum contiguous
@@ -38,12 +41,12 @@
38 * region and negative allocated. Allocation inside a chunk is done 41 * region and negative allocated. Allocation inside a chunk is done
39 * by scanning this map sequentially and serving the first matching 42 * by scanning this map sequentially and serving the first matching
40 * entry. This is mostly copied from the percpu_modalloc() allocator. 43 * entry. This is mostly copied from the percpu_modalloc() allocator.
41 * Chunks are also linked into a rb tree to ease address to chunk 44 * Chunks can be determined from the address using the index field
42 * mapping during free. 45 * in the page struct. The index field contains a pointer to the chunk.
43 * 46 *
44 * To use this allocator, arch code should do the following. 47 * To use this allocator, arch code should do the following.
45 * 48 *
46 * - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA 49 * - drop CONFIG_HAVE_LEGACY_PER_CPU_AREA
47 * 50 *
48 * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate 51 * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
49 * regular address to percpu pointer and back if they need to be 52 * regular address to percpu pointer and back if they need to be
@@ -55,13 +58,14 @@
55 58
56#include <linux/bitmap.h> 59#include <linux/bitmap.h>
57#include <linux/bootmem.h> 60#include <linux/bootmem.h>
61#include <linux/err.h>
58#include <linux/list.h> 62#include <linux/list.h>
63#include <linux/log2.h>
59#include <linux/mm.h> 64#include <linux/mm.h>
60#include <linux/module.h> 65#include <linux/module.h>
61#include <linux/mutex.h> 66#include <linux/mutex.h>
62#include <linux/percpu.h> 67#include <linux/percpu.h>
63#include <linux/pfn.h> 68#include <linux/pfn.h>
64#include <linux/rbtree.h>
65#include <linux/slab.h> 69#include <linux/slab.h>
66#include <linux/spinlock.h> 70#include <linux/spinlock.h>
67#include <linux/vmalloc.h> 71#include <linux/vmalloc.h>
@@ -88,44 +92,71 @@
88 92
89struct pcpu_chunk { 93struct pcpu_chunk {
90 struct list_head list; /* linked to pcpu_slot lists */ 94 struct list_head list; /* linked to pcpu_slot lists */
91 struct rb_node rb_node; /* key is chunk->vm->addr */
92 int free_size; /* free bytes in the chunk */ 95 int free_size; /* free bytes in the chunk */
93 int contig_hint; /* max contiguous size hint */ 96 int contig_hint; /* max contiguous size hint */
94 struct vm_struct *vm; /* mapped vmalloc region */ 97 void *base_addr; /* base address of this chunk */
95 int map_used; /* # of map entries used */ 98 int map_used; /* # of map entries used */
96 int map_alloc; /* # of map entries allocated */ 99 int map_alloc; /* # of map entries allocated */
97 int *map; /* allocation map */ 100 int *map; /* allocation map */
101 struct vm_struct **vms; /* mapped vmalloc regions */
98 bool immutable; /* no [de]population allowed */ 102 bool immutable; /* no [de]population allowed */
99 struct page **page; /* points to page array */ 103 unsigned long populated[]; /* populated bitmap */
100 struct page *page_ar[]; /* #cpus * UNIT_PAGES */
101}; 104};
102 105
103static int pcpu_unit_pages __read_mostly; 106static int pcpu_unit_pages __read_mostly;
104static int pcpu_unit_size __read_mostly; 107static int pcpu_unit_size __read_mostly;
105static int pcpu_chunk_size __read_mostly; 108static int pcpu_nr_units __read_mostly;
109static int pcpu_atom_size __read_mostly;
106static int pcpu_nr_slots __read_mostly; 110static int pcpu_nr_slots __read_mostly;
107static size_t pcpu_chunk_struct_size __read_mostly; 111static size_t pcpu_chunk_struct_size __read_mostly;
108 112
113/* cpus with the lowest and highest unit numbers */
114static unsigned int pcpu_first_unit_cpu __read_mostly;
115static unsigned int pcpu_last_unit_cpu __read_mostly;
116
109/* the address of the first chunk which starts with the kernel static area */ 117/* the address of the first chunk which starts with the kernel static area */
110void *pcpu_base_addr __read_mostly; 118void *pcpu_base_addr __read_mostly;
111EXPORT_SYMBOL_GPL(pcpu_base_addr); 119EXPORT_SYMBOL_GPL(pcpu_base_addr);
112 120
113/* optional reserved chunk, only accessible for reserved allocations */ 121static const int *pcpu_unit_map __read_mostly; /* cpu -> unit */
122const unsigned long *pcpu_unit_offsets __read_mostly; /* cpu -> unit offset */
123
124/* group information, used for vm allocation */
125static int pcpu_nr_groups __read_mostly;
126static const unsigned long *pcpu_group_offsets __read_mostly;
127static const size_t *pcpu_group_sizes __read_mostly;
128
129/*
130 * The first chunk which always exists. Note that unlike other
131 * chunks, this one can be allocated and mapped in several different
132 * ways and thus often doesn't live in the vmalloc area.
133 */
134static struct pcpu_chunk *pcpu_first_chunk;
135
136/*
137 * Optional reserved chunk. This chunk reserves part of the first
138 * chunk and serves it for reserved allocations. The amount of
139 * reserved offset is in pcpu_reserved_chunk_limit. When reserved
140 * area doesn't exist, the following variables contain NULL and 0
141 * respectively.
142 */
114static struct pcpu_chunk *pcpu_reserved_chunk; 143static struct pcpu_chunk *pcpu_reserved_chunk;
115/* offset limit of the reserved chunk */
116static int pcpu_reserved_chunk_limit; 144static int pcpu_reserved_chunk_limit;
117 145
118/* 146/*
119 * Synchronization rules. 147 * Synchronization rules.
120 * 148 *
121 * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former 149 * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former
122 * protects allocation/reclaim paths, chunks and chunk->page arrays. 150 * protects allocation/reclaim paths, chunks, populated bitmap and
123 * The latter is a spinlock and protects the index data structures - 151 * vmalloc mapping. The latter is a spinlock and protects the index
124 * chunk slots, rbtree, chunks and area maps in chunks. 152 * data structures - chunk slots, chunks and area maps in chunks.
125 * 153 *
126 * During allocation, pcpu_alloc_mutex is kept locked all the time and 154 * During allocation, pcpu_alloc_mutex is kept locked all the time and
127 * pcpu_lock is grabbed and released as necessary. All actual memory 155 * pcpu_lock is grabbed and released as necessary. All actual memory
128 * allocations are done using GFP_KERNEL with pcpu_lock released. 156 * allocations are done using GFP_KERNEL with pcpu_lock released. In
157 * general, percpu memory can't be allocated with irq off but
158 * irqsave/restore are still used in alloc path so that it can be used
159 * from early init path - sched_init() specifically.
129 * 160 *
130 * Free path accesses and alters only the index data structures, so it 161 * Free path accesses and alters only the index data structures, so it
131 * can be safely called from atomic context. When memory needs to be 162 * can be safely called from atomic context. When memory needs to be
@@ -140,7 +171,6 @@ static DEFINE_MUTEX(pcpu_alloc_mutex); /* protects whole alloc and reclaim */
140static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */ 171static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */
141 172
142static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ 173static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
143static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */
144 174
145/* reclaim work to release fully free chunks, scheduled from free path */ 175/* reclaim work to release fully free chunks, scheduled from free path */
146static void pcpu_reclaim(struct work_struct *work); 176static void pcpu_reclaim(struct work_struct *work);
@@ -169,28 +199,65 @@ static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
169 199
170static int pcpu_page_idx(unsigned int cpu, int page_idx) 200static int pcpu_page_idx(unsigned int cpu, int page_idx)
171{ 201{
172 return cpu * pcpu_unit_pages + page_idx; 202 return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx;
173} 203}
174 204
175static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk, 205static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
176 unsigned int cpu, int page_idx) 206 unsigned int cpu, int page_idx)
177{ 207{
178 return &chunk->page[pcpu_page_idx(cpu, page_idx)]; 208 return (unsigned long)chunk->base_addr + pcpu_unit_offsets[cpu] +
209 (page_idx << PAGE_SHIFT);
179} 210}
180 211
181static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, 212static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
182 unsigned int cpu, int page_idx) 213 unsigned int cpu, int page_idx)
214{
215 /* must not be used on pre-mapped chunk */
216 WARN_ON(chunk->immutable);
217
218 return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx));
219}
220
221/* set the pointer to a chunk in a page struct */
222static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
223{
224 page->index = (unsigned long)pcpu;
225}
226
227/* obtain pointer to a chunk from a page struct */
228static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
229{
230 return (struct pcpu_chunk *)page->index;
231}
232
233static void pcpu_next_unpop(struct pcpu_chunk *chunk, int *rs, int *re, int end)
183{ 234{
184 return (unsigned long)chunk->vm->addr + 235 *rs = find_next_zero_bit(chunk->populated, end, *rs);
185 (pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT); 236 *re = find_next_bit(chunk->populated, end, *rs + 1);
186} 237}
187 238
188static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk, 239static void pcpu_next_pop(struct pcpu_chunk *chunk, int *rs, int *re, int end)
189 int page_idx)
190{ 240{
191 return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL; 241 *rs = find_next_bit(chunk->populated, end, *rs);
242 *re = find_next_zero_bit(chunk->populated, end, *rs + 1);
192} 243}
193 244
245/*
246 * (Un)populated page region iterators. Iterate over (un)populated
 247 * page regions between @start and @end in @chunk. @rs and @re should
248 * be integer variables and will be set to start and end page index of
249 * the current region.
250 */
251#define pcpu_for_each_unpop_region(chunk, rs, re, start, end) \
252 for ((rs) = (start), pcpu_next_unpop((chunk), &(rs), &(re), (end)); \
253 (rs) < (re); \
254 (rs) = (re) + 1, pcpu_next_unpop((chunk), &(rs), &(re), (end)))
255
256#define pcpu_for_each_pop_region(chunk, rs, re, start, end) \
257 for ((rs) = (start), pcpu_next_pop((chunk), &(rs), &(re), (end)); \
258 (rs) < (re); \
259 (rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end)))
260
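
The iterators above walk maximal runs of populated or unpopulated pages in chunk->populated. A minimal userspace sketch of the populated-run case, with a plain bool array standing in for the bitmap and a hand-rolled scan standing in for find_next_bit()/find_next_zero_bit(), behaves like this:

#include <stdio.h>
#include <stdbool.h>

#define NR_PAGES 16
static bool populated[NR_PAGES];	/* stand-in for chunk->populated */

/* scan for the next index in [from, end) whose state equals val */
static int next_with(bool val, int from, int end)
{
	while (from < end && populated[from] != val)
		from++;
	return from < end ? from : end;
}

/* mirrors pcpu_next_pop(): [*rs, *re) becomes the next populated run */
static void next_pop(int *rs, int *re, int end)
{
	*rs = next_with(true, *rs, end);
	*re = next_with(false, *rs + 1, end);
}

#define for_each_pop_region(rs, re, start, end)				\
	for ((rs) = (start), next_pop(&(rs), &(re), (end));		\
	     (rs) < (re);						\
	     (rs) = (re) + 1, next_pop(&(rs), &(re), (end)))

int main(void)
{
	int i, rs, re;

	for (i = 2; i <= 4; i++)	/* populate pages 2-4 and 9-11 */
		populated[i] = true;
	for (i = 9; i <= 11; i++)
		populated[i] = true;

	for_each_pop_region(rs, re, 0, NR_PAGES)
		printf("populated run: [%d, %d)\n", rs, re);	/* [2,5) and [9,12) */
	return 0;
}

Each iteration yields one maximal [rs, re) run, which is exactly what the populate and depopulate paths feed to the page allocation, mapping and freeing helpers.
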
194/** 261/**
195 * pcpu_mem_alloc - allocate memory 262 * pcpu_mem_alloc - allocate memory
196 * @size: bytes to allocate 263 * @size: bytes to allocate
@@ -257,152 +324,117 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
257 } 324 }
258} 325}
259 326
260static struct rb_node **pcpu_chunk_rb_search(void *addr,
261 struct rb_node **parentp)
262{
263 struct rb_node **p = &pcpu_addr_root.rb_node;
264 struct rb_node *parent = NULL;
265 struct pcpu_chunk *chunk;
266
267 while (*p) {
268 parent = *p;
269 chunk = rb_entry(parent, struct pcpu_chunk, rb_node);
270
271 if (addr < chunk->vm->addr)
272 p = &(*p)->rb_left;
273 else if (addr > chunk->vm->addr)
274 p = &(*p)->rb_right;
275 else
276 break;
277 }
278
279 if (parentp)
280 *parentp = parent;
281 return p;
282}
283
284/** 327/**
285 * pcpu_chunk_addr_search - search for chunk containing specified address 328 * pcpu_chunk_addr_search - determine chunk containing specified address
286 * @addr: address to search for 329 * @addr: address for which the chunk needs to be determined.
287 *
288 * Look for chunk which might contain @addr. More specifically, it
289 * searchs for the chunk with the highest start address which isn't
290 * beyond @addr.
291 *
292 * CONTEXT:
293 * pcpu_lock.
294 * 330 *
295 * RETURNS: 331 * RETURNS:
296 * The address of the found chunk. 332 * The address of the found chunk.
297 */ 333 */
298static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) 334static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
299{ 335{
300 struct rb_node *n, *parent; 336 void *first_start = pcpu_first_chunk->base_addr;
301 struct pcpu_chunk *chunk;
302 337
303 /* is it in the reserved chunk? */ 338 /* is it in the first chunk? */
304 if (pcpu_reserved_chunk) { 339 if (addr >= first_start && addr < first_start + pcpu_unit_size) {
305 void *start = pcpu_reserved_chunk->vm->addr; 340 /* is it in the reserved area? */
306 341 if (addr < first_start + pcpu_reserved_chunk_limit)
307 if (addr >= start && addr < start + pcpu_reserved_chunk_limit)
308 return pcpu_reserved_chunk; 342 return pcpu_reserved_chunk;
343 return pcpu_first_chunk;
309 } 344 }
310 345
311 /* nah... search the regular ones */ 346 /*
312 n = *pcpu_chunk_rb_search(addr, &parent); 347 * The address is relative to unit0 which might be unused and
313 if (!n) { 348 * thus unmapped. Offset the address to the unit space of the
314 /* no exactly matching chunk, the parent is the closest */ 349 * current processor before looking it up in the vmalloc
315 n = parent; 350 * space. Note that any possible cpu id can be used here, so
316 BUG_ON(!n); 351 * there's no need to worry about preemption or cpu hotplug.
317 } 352 */
318 chunk = rb_entry(n, struct pcpu_chunk, rb_node); 353 addr += pcpu_unit_offsets[raw_smp_processor_id()];
319 354 return pcpu_get_page_chunk(vmalloc_to_page(addr));
320 if (addr < chunk->vm->addr) {
321 /* the parent was the next one, look for the previous one */
322 n = rb_prev(n);
323 BUG_ON(!n);
324 chunk = rb_entry(n, struct pcpu_chunk, rb_node);
325 }
326
327 return chunk;
328} 355}
329 356
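
A simplified model of the new lookup: addresses inside the first chunk are classified purely by offset (reserved area first, then the rest of the first chunk), and everything else is resolved through the per-page back-pointer installed by pcpu_set_page_chunk(). The sketch below fakes the page lookup with a stub and uses hypothetical constants.

#include <stdio.h>

#define UNIT_SIZE	4096UL	/* hypothetical */
#define RESERVED_LIMIT	1024UL	/* hypothetical static + reserved bytes */

struct chunk { const char *name; };

static struct chunk reserved_chunk = { "reserved chunk" };
static struct chunk first_chunk    = { "first chunk" };
static struct chunk other_chunk    = { "some dynamic chunk" };

/* stub for vmalloc_to_page(addr)->index, i.e. pcpu_get_page_chunk() */
static struct chunk *page_backpointer(unsigned long addr)
{
	(void)addr;
	return &other_chunk;
}

static struct chunk *chunk_addr_search(unsigned long addr, unsigned long first_start)
{
	if (addr >= first_start && addr < first_start + UNIT_SIZE) {
		if (addr < first_start + RESERVED_LIMIT)
			return &reserved_chunk;
		return &first_chunk;
	}
	return page_backpointer(addr);
}

int main(void)
{
	unsigned long base = 0x100000;

	printf("%s\n", chunk_addr_search(base + 100,   base)->name);
	printf("%s\n", chunk_addr_search(base + 2000,  base)->name);
	printf("%s\n", chunk_addr_search(base + 99999, base)->name);
	return 0;
}
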
330/** 357/**
331 * pcpu_chunk_addr_insert - insert chunk into address rb tree 358 * pcpu_need_to_extend - determine whether chunk area map needs to be extended
332 * @new: chunk to insert 359 * @chunk: chunk of interest
333 * 360 *
334 * Insert @new into address rb tree. 361 * Determine whether area map of @chunk needs to be extended to
 362 * accommodate a new allocation.
335 * 363 *
336 * CONTEXT: 364 * CONTEXT:
337 * pcpu_lock. 365 * pcpu_lock.
366 *
367 * RETURNS:
368 * New target map allocation length if extension is necessary, 0
369 * otherwise.
338 */ 370 */
339static void pcpu_chunk_addr_insert(struct pcpu_chunk *new) 371static int pcpu_need_to_extend(struct pcpu_chunk *chunk)
340{ 372{
341 struct rb_node **p, *parent; 373 int new_alloc;
342 374
343 p = pcpu_chunk_rb_search(new->vm->addr, &parent); 375 if (chunk->map_alloc >= chunk->map_used + 2)
344 BUG_ON(*p); 376 return 0;
345 rb_link_node(&new->rb_node, parent, p); 377
346 rb_insert_color(&new->rb_node, &pcpu_addr_root); 378 new_alloc = PCPU_DFL_MAP_ALLOC;
379 while (new_alloc < chunk->map_used + 2)
380 new_alloc *= 2;
381
382 return new_alloc;
347} 383}
348 384
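
The growth rule: nothing to do while the map still has two spare slots (one allocation can split an area into at most three), otherwise double from the default until map_used + 2 fits. A standalone sketch of that arithmetic, assuming the default of 16 map entries:

#include <stdio.h>

#define PCPU_DFL_MAP_ALLOC 16	/* assumed default map length */

/* 0 if the map still has two spare slots, else the new (doubled) length */
static int need_to_extend(int map_alloc, int map_used)
{
	int new_alloc;

	if (map_alloc >= map_used + 2)
		return 0;

	new_alloc = PCPU_DFL_MAP_ALLOC;
	while (new_alloc < map_used + 2)
		new_alloc *= 2;
	return new_alloc;
}

int main(void)
{
	printf("%d\n", need_to_extend(16, 10));	/* 0   - enough room */
	printf("%d\n", need_to_extend(16, 15));	/* 32  - must grow   */
	printf("%d\n", need_to_extend(64, 63));	/* 128               */
	return 0;
}
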
349/** 385/**
350 * pcpu_extend_area_map - extend area map for allocation 386 * pcpu_extend_area_map - extend area map of a chunk
351 * @chunk: target chunk 387 * @chunk: chunk of interest
388 * @new_alloc: new target allocation length of the area map
352 * 389 *
353 * Extend area map of @chunk so that it can accomodate an allocation. 390 * Extend area map of @chunk to have @new_alloc entries.
354 * A single allocation can split an area into three areas, so this
355 * function makes sure that @chunk->map has at least two extra slots.
356 * 391 *
357 * CONTEXT: 392 * CONTEXT:
358 * pcpu_alloc_mutex, pcpu_lock. pcpu_lock is released and reacquired 393 * Does GFP_KERNEL allocation. Grabs and releases pcpu_lock.
359 * if area map is extended.
360 * 394 *
361 * RETURNS: 395 * RETURNS:
362 * 0 if noop, 1 if successfully extended, -errno on failure. 396 * 0 on success, -errno on failure.
363 */ 397 */
364static int pcpu_extend_area_map(struct pcpu_chunk *chunk) 398static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc)
365{ 399{
366 int new_alloc; 400 int *old = NULL, *new = NULL;
367 int *new; 401 size_t old_size = 0, new_size = new_alloc * sizeof(new[0]);
368 size_t size; 402 unsigned long flags;
369
370 /* has enough? */
371 if (chunk->map_alloc >= chunk->map_used + 2)
372 return 0;
373
374 spin_unlock_irq(&pcpu_lock);
375
376 new_alloc = PCPU_DFL_MAP_ALLOC;
377 while (new_alloc < chunk->map_used + 2)
378 new_alloc *= 2;
379 403
380 new = pcpu_mem_alloc(new_alloc * sizeof(new[0])); 404 new = pcpu_mem_alloc(new_size);
381 if (!new) { 405 if (!new)
382 spin_lock_irq(&pcpu_lock);
383 return -ENOMEM; 406 return -ENOMEM;
384 }
385 407
386 /* 408 /* acquire pcpu_lock and switch to new area map */
387 * Acquire pcpu_lock and switch to new area map. Only free 409 spin_lock_irqsave(&pcpu_lock, flags);
388 * could have happened inbetween, so map_used couldn't have
389 * grown.
390 */
391 spin_lock_irq(&pcpu_lock);
392 BUG_ON(new_alloc < chunk->map_used + 2);
393 410
394 size = chunk->map_alloc * sizeof(chunk->map[0]); 411 if (new_alloc <= chunk->map_alloc)
395 memcpy(new, chunk->map, size); 412 goto out_unlock;
413
414 old_size = chunk->map_alloc * sizeof(chunk->map[0]);
415 memcpy(new, chunk->map, old_size);
396 416
397 /* 417 /*
398 * map_alloc < PCPU_DFL_MAP_ALLOC indicates that the chunk is 418 * map_alloc < PCPU_DFL_MAP_ALLOC indicates that the chunk is
399 * one of the first chunks and still using static map. 419 * one of the first chunks and still using static map.
400 */ 420 */
401 if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC) 421 if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC)
402 pcpu_mem_free(chunk->map, size); 422 old = chunk->map;
403 423
404 chunk->map_alloc = new_alloc; 424 chunk->map_alloc = new_alloc;
405 chunk->map = new; 425 chunk->map = new;
426 new = NULL;
427
428out_unlock:
429 spin_unlock_irqrestore(&pcpu_lock, flags);
430
431 /*
432 * pcpu_mem_free() might end up calling vfree() which uses
433 * IRQ-unsafe lock and thus can't be called under pcpu_lock.
434 */
435 pcpu_mem_free(old, old_size);
436 pcpu_mem_free(new, new_size);
437
406 return 0; 438 return 0;
407} 439}
408 440
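
The function above follows a common pattern: allocate the larger buffer with no spinlock held, swap it in under the lock only if it is still needed, and defer freeing until the lock is dropped. A userspace analogy (pthreads, built with -lpthread) with a mutex standing in for pcpu_lock:

#include <pthread.h>
#include <stdlib.h>
#include <string.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;	/* ~ pcpu_lock */
static int *map;
static int map_alloc;

static int extend_map(int new_alloc)
{
	int *old = NULL;
	int *new = malloc(new_alloc * sizeof(*new));	/* no lock held here */

	if (!new)
		return -1;

	pthread_mutex_lock(&lock);
	if (new_alloc > map_alloc) {	/* someone may have grown it meanwhile */
		memcpy(new, map, map_alloc * sizeof(*map));
		old = map;
		map = new;
		map_alloc = new_alloc;
		new = NULL;
	}
	pthread_mutex_unlock(&lock);

	free(old);	/* never free while holding the lock */
	free(new);	/* unused buffer if we lost the race */
	return 0;
}

int main(void)
{
	map = calloc(4, sizeof(*map));
	map_alloc = 4;
	extend_map(8);
	free(map);
	return 0;
}
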
@@ -591,126 +623,327 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
591} 623}
592 624
593/** 625/**
594 * pcpu_unmap - unmap pages out of a pcpu_chunk 626 * pcpu_get_pages_and_bitmap - get temp pages array and bitmap
595 * @chunk: chunk of interest 627 * @chunk: chunk of interest
596 * @page_start: page index of the first page to unmap 628 * @bitmapp: output parameter for bitmap
597 * @page_end: page index of the last page to unmap + 1 629 * @may_alloc: may allocate the array
598 * @flush: whether to flush cache and tlb or not
599 * 630 *
600 * For each cpu, unmap pages [@page_start,@page_end) out of @chunk. 631 * Returns pointer to array of pointers to struct page and bitmap,
601 * If @flush is true, vcache is flushed before unmapping and tlb 632 * both of which can be indexed with pcpu_page_idx(). The returned
602 * after. 633 * array is cleared to zero and *@bitmapp is copied from
634 * @chunk->populated. Note that there is only one array and bitmap
635 * and access exclusion is the caller's responsibility.
636 *
637 * CONTEXT:
638 * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc.
639 * Otherwise, don't care.
640 *
641 * RETURNS:
642 * Pointer to temp pages array on success, NULL on failure.
603 */ 643 */
604static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, 644static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk,
605 bool flush) 645 unsigned long **bitmapp,
646 bool may_alloc)
606{ 647{
607 unsigned int last = num_possible_cpus() - 1; 648 static struct page **pages;
608 unsigned int cpu; 649 static unsigned long *bitmap;
650 size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]);
651 size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) *
652 sizeof(unsigned long);
653
654 if (!pages || !bitmap) {
655 if (may_alloc && !pages)
656 pages = pcpu_mem_alloc(pages_size);
657 if (may_alloc && !bitmap)
658 bitmap = pcpu_mem_alloc(bitmap_size);
659 if (!pages || !bitmap)
660 return NULL;
661 }
609 662
610 /* unmap must not be done on immutable chunk */ 663 memset(pages, 0, pages_size);
611 WARN_ON(chunk->immutable); 664 bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages);
612 665
613 /* 666 *bitmapp = bitmap;
614 * Each flushing trial can be very expensive, issue flush on 667 return pages;
615 * the whole region at once rather than doing it for each cpu. 668}
616 * This could be an overkill but is more scalable.
617 */
618 if (flush)
619 flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start),
620 pcpu_chunk_addr(chunk, last, page_end));
621 669
622 for_each_possible_cpu(cpu) 670/**
623 unmap_kernel_range_noflush( 671 * pcpu_free_pages - free pages which were allocated for @chunk
624 pcpu_chunk_addr(chunk, cpu, page_start), 672 * @chunk: chunk pages were allocated for
625 (page_end - page_start) << PAGE_SHIFT); 673 * @pages: array of pages to be freed, indexed by pcpu_page_idx()
626 674 * @populated: populated bitmap
627 /* ditto as flush_cache_vunmap() */ 675 * @page_start: page index of the first page to be freed
628 if (flush) 676 * @page_end: page index of the last page to be freed + 1
629 flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start), 677 *
630 pcpu_chunk_addr(chunk, last, page_end)); 678 * Free pages [@page_start and @page_end) in @pages for all units.
679 * The pages were allocated for @chunk.
680 */
681static void pcpu_free_pages(struct pcpu_chunk *chunk,
682 struct page **pages, unsigned long *populated,
683 int page_start, int page_end)
684{
685 unsigned int cpu;
686 int i;
687
688 for_each_possible_cpu(cpu) {
689 for (i = page_start; i < page_end; i++) {
690 struct page *page = pages[pcpu_page_idx(cpu, i)];
691
692 if (page)
693 __free_page(page);
694 }
695 }
631} 696}
632 697
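
Both helpers index one flat temporary array whose slot for (cpu, page) is unit * unit_pages + page, i.e. what pcpu_page_idx() computes. A tiny sketch with made-up sizes:

#include <stdio.h>
#include <stdlib.h>

#define NR_UNITS	4
#define UNIT_PAGES	8

/* same layout rule as pcpu_page_idx() */
static int page_idx(int unit, int page)
{
	return unit * UNIT_PAGES + page;
}

int main(void)
{
	/* one flat array covers every unit's pages */
	void **pages = calloc(NR_UNITS * UNIT_PAGES, sizeof(*pages));

	if (!pages)
		return 1;

	pages[page_idx(2, 5)] = &pages;	/* pretend page 5 of unit 2 was allocated */

	printf("slot for (unit 2, page 5): %d\n", page_idx(2, 5));
	printf("occupied: %s\n", pages[page_idx(2, 5)] ? "yes" : "no");

	free(pages);
	return 0;
}
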
633/** 698/**
634 * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk 699 * pcpu_alloc_pages - allocates pages for @chunk
635 * @chunk: chunk to depopulate 700 * @chunk: target chunk
636 * @off: offset to the area to depopulate 701 * @pages: array to put the allocated pages into, indexed by pcpu_page_idx()
637 * @size: size of the area to depopulate in bytes 702 * @populated: populated bitmap
638 * @flush: whether to flush cache and tlb or not 703 * @page_start: page index of the first page to be allocated
639 * 704 * @page_end: page index of the last page to be allocated + 1
640 * For each cpu, depopulate and unmap pages [@page_start,@page_end) 705 *
641 * from @chunk. If @flush is true, vcache is flushed before unmapping 706 * Allocate pages [@page_start,@page_end) into @pages for all units.
642 * and tlb after. 707 * The allocation is for @chunk. Percpu core doesn't care about the
643 * 708 * content of @pages and will pass it verbatim to pcpu_map_pages().
644 * CONTEXT:
645 * pcpu_alloc_mutex.
646 */ 709 */
647static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size, 710static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
648 bool flush) 711 struct page **pages, unsigned long *populated,
712 int page_start, int page_end)
649{ 713{
650 int page_start = PFN_DOWN(off); 714 const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
651 int page_end = PFN_UP(off + size);
652 int unmap_start = -1;
653 int uninitialized_var(unmap_end);
654 unsigned int cpu; 715 unsigned int cpu;
655 int i; 716 int i;
656 717
657 for (i = page_start; i < page_end; i++) { 718 for_each_possible_cpu(cpu) {
658 for_each_possible_cpu(cpu) { 719 for (i = page_start; i < page_end; i++) {
659 struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i); 720 struct page **pagep = &pages[pcpu_page_idx(cpu, i)];
721
722 *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0);
723 if (!*pagep) {
724 pcpu_free_pages(chunk, pages, populated,
725 page_start, page_end);
726 return -ENOMEM;
727 }
728 }
729 }
730 return 0;
731}
660 732
661 if (!*pagep) 733/**
662 continue; 734 * pcpu_pre_unmap_flush - flush cache prior to unmapping
735 * @chunk: chunk the regions to be flushed belongs to
736 * @page_start: page index of the first page to be flushed
737 * @page_end: page index of the last page to be flushed + 1
738 *
739 * Pages in [@page_start,@page_end) of @chunk are about to be
740 * unmapped. Flush cache. As each flushing trial can be very
741 * expensive, issue flush on the whole region at once rather than
 742 * doing it for each cpu. This could be overkill but is more
743 * scalable.
744 */
745static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk,
746 int page_start, int page_end)
747{
748 flush_cache_vunmap(
749 pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
750 pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
751}
663 752
664 __free_page(*pagep); 753static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
754{
755 unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT);
756}
665 757
666 /* 758/**
667 * If it's partial depopulation, it might get 759 * pcpu_unmap_pages - unmap pages out of a pcpu_chunk
668 * populated or depopulated again. Mark the 760 * @chunk: chunk of interest
669 * page gone. 761 * @pages: pages array which can be used to pass information to free
670 */ 762 * @populated: populated bitmap
671 *pagep = NULL; 763 * @page_start: page index of the first page to unmap
764 * @page_end: page index of the last page to unmap + 1
765 *
766 * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
767 * Corresponding elements in @pages were cleared by the caller and can
768 * be used to carry information to pcpu_free_pages() which will be
769 * called after all unmaps are finished. The caller should call
770 * proper pre/post flush functions.
771 */
772static void pcpu_unmap_pages(struct pcpu_chunk *chunk,
773 struct page **pages, unsigned long *populated,
774 int page_start, int page_end)
775{
776 unsigned int cpu;
777 int i;
672 778
673 unmap_start = unmap_start < 0 ? i : unmap_start; 779 for_each_possible_cpu(cpu) {
674 unmap_end = i + 1; 780 for (i = page_start; i < page_end; i++) {
781 struct page *page;
782
783 page = pcpu_chunk_page(chunk, cpu, i);
784 WARN_ON(!page);
785 pages[pcpu_page_idx(cpu, i)] = page;
675 } 786 }
787 __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start),
788 page_end - page_start);
676 } 789 }
677 790
678 if (unmap_start >= 0) 791 for (i = page_start; i < page_end; i++)
679 pcpu_unmap(chunk, unmap_start, unmap_end, flush); 792 __clear_bit(i, populated);
680} 793}
681 794
682/** 795/**
683 * pcpu_map - map pages into a pcpu_chunk 796 * pcpu_post_unmap_tlb_flush - flush TLB after unmapping
797 * @chunk: pcpu_chunk the regions to be flushed belong to
798 * @page_start: page index of the first page to be flushed
799 * @page_end: page index of the last page to be flushed + 1
800 *
801 * Pages [@page_start,@page_end) of @chunk have been unmapped. Flush
802 * TLB for the regions. This can be skipped if the area is to be
803 * returned to vmalloc as vmalloc will handle TLB flushing lazily.
804 *
805 * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
806 * for the whole region.
807 */
808static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
809 int page_start, int page_end)
810{
811 flush_tlb_kernel_range(
812 pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
813 pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
814}
815
816static int __pcpu_map_pages(unsigned long addr, struct page **pages,
817 int nr_pages)
818{
819 return map_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT,
820 PAGE_KERNEL, pages);
821}
822
823/**
824 * pcpu_map_pages - map pages into a pcpu_chunk
684 * @chunk: chunk of interest 825 * @chunk: chunk of interest
826 * @pages: pages array containing pages to be mapped
827 * @populated: populated bitmap
685 * @page_start: page index of the first page to map 828 * @page_start: page index of the first page to map
686 * @page_end: page index of the last page to map + 1 829 * @page_end: page index of the last page to map + 1
687 * 830 *
688 * For each cpu, map pages [@page_start,@page_end) into @chunk. 831 * For each cpu, map pages [@page_start,@page_end) into @chunk. The
689 * vcache is flushed afterwards. 832 * caller is responsible for calling pcpu_post_map_flush() after all
833 * mappings are complete.
834 *
835 * This function is responsible for setting corresponding bits in
836 * @chunk->populated bitmap and whatever is necessary for reverse
837 * lookup (addr -> chunk).
690 */ 838 */
691static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) 839static int pcpu_map_pages(struct pcpu_chunk *chunk,
840 struct page **pages, unsigned long *populated,
841 int page_start, int page_end)
692{ 842{
693 unsigned int last = num_possible_cpus() - 1; 843 unsigned int cpu, tcpu;
694 unsigned int cpu; 844 int i, err;
695 int err;
696
697 /* map must not be done on immutable chunk */
698 WARN_ON(chunk->immutable);
699 845
700 for_each_possible_cpu(cpu) { 846 for_each_possible_cpu(cpu) {
701 err = map_kernel_range_noflush( 847 err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start),
702 pcpu_chunk_addr(chunk, cpu, page_start), 848 &pages[pcpu_page_idx(cpu, page_start)],
703 (page_end - page_start) << PAGE_SHIFT, 849 page_end - page_start);
704 PAGE_KERNEL,
705 pcpu_chunk_pagep(chunk, cpu, page_start));
706 if (err < 0) 850 if (err < 0)
707 return err; 851 goto err;
852 }
853
854 /* mapping successful, link chunk and mark populated */
855 for (i = page_start; i < page_end; i++) {
856 for_each_possible_cpu(cpu)
857 pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)],
858 chunk);
859 __set_bit(i, populated);
708 } 860 }
709 861
710 /* flush at once, please read comments in pcpu_unmap() */
711 flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start),
712 pcpu_chunk_addr(chunk, last, page_end));
713 return 0; 862 return 0;
863
864err:
865 for_each_possible_cpu(tcpu) {
866 if (tcpu == cpu)
867 break;
868 __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start),
869 page_end - page_start);
870 }
871 return err;
872}
873
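
The error path above undoes only the cpus mapped before the failure: it walks the cpus again and stops at the one that failed. Reduced to its control flow, with stub map/unmap functions and a hypothetical failure on cpu 2:

#include <stdio.h>

#define NR_CPUS 4

static int map_one(int cpu)	{ return cpu == 2 ? -1 : 0; }	/* pretend cpu 2 fails */
static void unmap_one(int cpu)	{ printf("rolling back cpu %d\n", cpu); }

static int map_all(void)
{
	int cpu, tcpu, err;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		err = map_one(cpu);
		if (err)
			goto err;
	}
	return 0;

err:
	for (tcpu = 0; tcpu < NR_CPUS; tcpu++) {
		if (tcpu == cpu)
			break;		/* only cpus before the failure were mapped */
		unmap_one(tcpu);
	}
	return err;
}

int main(void)
{
	printf("map_all() = %d\n", map_all());
	return 0;
}
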
874/**
875 * pcpu_post_map_flush - flush cache after mapping
876 * @chunk: pcpu_chunk the regions to be flushed belong to
877 * @page_start: page index of the first page to be flushed
878 * @page_end: page index of the last page to be flushed + 1
879 *
880 * Pages [@page_start,@page_end) of @chunk have been mapped. Flush
881 * cache.
882 *
883 * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
884 * for the whole region.
885 */
886static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
887 int page_start, int page_end)
888{
889 flush_cache_vmap(
890 pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
891 pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
892}
893
894/**
895 * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
896 * @chunk: chunk to depopulate
897 * @off: offset to the area to depopulate
898 * @size: size of the area to depopulate in bytes
 899 *
 900 * For each cpu, depopulate and unmap pages [@page_start,@page_end)
 901 * from @chunk and free the backing pages.
904 *
905 * CONTEXT:
906 * pcpu_alloc_mutex.
907 */
908static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size)
909{
910 int page_start = PFN_DOWN(off);
911 int page_end = PFN_UP(off + size);
912 struct page **pages;
913 unsigned long *populated;
914 int rs, re;
915
916 /* quick path, check whether it's empty already */
917 pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
918 if (rs == page_start && re == page_end)
919 return;
920 break;
921 }
922
923 /* immutable chunks can't be depopulated */
924 WARN_ON(chunk->immutable);
925
926 /*
927 * If control reaches here, there must have been at least one
928 * successful population attempt so the temp pages array must
929 * be available now.
930 */
931 pages = pcpu_get_pages_and_bitmap(chunk, &populated, false);
932 BUG_ON(!pages);
933
934 /* unmap and free */
935 pcpu_pre_unmap_flush(chunk, page_start, page_end);
936
937 pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
938 pcpu_unmap_pages(chunk, pages, populated, rs, re);
939
940 /* no need to flush tlb, vmalloc will handle it lazily */
941
942 pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
943 pcpu_free_pages(chunk, pages, populated, rs, re);
944
945 /* commit new bitmap */
946 bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
714} 947}
715 948
716/** 949/**
@@ -727,57 +960,68 @@ static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
727 */ 960 */
728static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) 961static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
729{ 962{
730 const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
731 int page_start = PFN_DOWN(off); 963 int page_start = PFN_DOWN(off);
732 int page_end = PFN_UP(off + size); 964 int page_end = PFN_UP(off + size);
733 int map_start = -1; 965 int free_end = page_start, unmap_end = page_start;
734 int uninitialized_var(map_end); 966 struct page **pages;
967 unsigned long *populated;
735 unsigned int cpu; 968 unsigned int cpu;
736 int i; 969 int rs, re, rc;
737 970
738 for (i = page_start; i < page_end; i++) { 971 /* quick path, check whether all pages are already there */
739 if (pcpu_chunk_page_occupied(chunk, i)) { 972 pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) {
740 if (map_start >= 0) { 973 if (rs == page_start && re == page_end)
741 if (pcpu_map(chunk, map_start, map_end)) 974 goto clear;
742 goto err; 975 break;
743 map_start = -1; 976 }
744 }
745 continue;
746 }
747 977
748 map_start = map_start < 0 ? i : map_start; 978 /* need to allocate and map pages, this chunk can't be immutable */
749 map_end = i + 1; 979 WARN_ON(chunk->immutable);
750 980
751 for_each_possible_cpu(cpu) { 981 pages = pcpu_get_pages_and_bitmap(chunk, &populated, true);
752 struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i); 982 if (!pages)
983 return -ENOMEM;
753 984
754 *pagep = alloc_pages_node(cpu_to_node(cpu), 985 /* alloc and map */
755 alloc_mask, 0); 986 pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
756 if (!*pagep) 987 rc = pcpu_alloc_pages(chunk, pages, populated, rs, re);
757 goto err; 988 if (rc)
758 } 989 goto err_free;
990 free_end = re;
759 } 991 }
760 992
761 if (map_start >= 0 && pcpu_map(chunk, map_start, map_end)) 993 pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
762 goto err; 994 rc = pcpu_map_pages(chunk, pages, populated, rs, re);
995 if (rc)
996 goto err_unmap;
997 unmap_end = re;
998 }
999 pcpu_post_map_flush(chunk, page_start, page_end);
763 1000
1001 /* commit new bitmap */
1002 bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
1003clear:
764 for_each_possible_cpu(cpu) 1004 for_each_possible_cpu(cpu)
765 memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0, 1005 memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
766 size);
767
768 return 0; 1006 return 0;
769err: 1007
770 /* likely under heavy memory pressure, give memory back */ 1008err_unmap:
771 pcpu_depopulate_chunk(chunk, off, size, true); 1009 pcpu_pre_unmap_flush(chunk, page_start, unmap_end);
772 return -ENOMEM; 1010 pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end)
1011 pcpu_unmap_pages(chunk, pages, populated, rs, re);
1012 pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end);
1013err_free:
1014 pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end)
1015 pcpu_free_pages(chunk, pages, populated, rs, re);
1016 return rc;
773} 1017}
774 1018
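
Both populate and depopulate first convert the byte range [@off, @off + size) into a page range with PFN_DOWN()/PFN_UP(), so partial pages at either end are always covered. A small sketch of that rounding with 4 KiB pages:

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)

static unsigned long pfn_down(unsigned long x)	{ return x >> PAGE_SHIFT; }
static unsigned long pfn_up(unsigned long x)	{ return (x + PAGE_SIZE - 1) >> PAGE_SHIFT; }

int main(void)
{
	unsigned long off = 5000, size = 3000;	/* hypothetical area inside a chunk */

	/* bytes [5000, 8000) live on pages [1, 2) */
	printf("page_start = %lu\n", pfn_down(off));
	printf("page_end   = %lu\n", pfn_up(off + size));
	return 0;
}
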
775static void free_pcpu_chunk(struct pcpu_chunk *chunk) 1019static void free_pcpu_chunk(struct pcpu_chunk *chunk)
776{ 1020{
777 if (!chunk) 1021 if (!chunk)
778 return; 1022 return;
779 if (chunk->vm) 1023 if (chunk->vms)
780 free_vm_area(chunk->vm); 1024 pcpu_free_vm_areas(chunk->vms, pcpu_nr_groups);
781 pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0])); 1025 pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]));
782 kfree(chunk); 1026 kfree(chunk);
783} 1027}
@@ -793,10 +1037,11 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
793 chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); 1037 chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
794 chunk->map_alloc = PCPU_DFL_MAP_ALLOC; 1038 chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
795 chunk->map[chunk->map_used++] = pcpu_unit_size; 1039 chunk->map[chunk->map_used++] = pcpu_unit_size;
796 chunk->page = chunk->page_ar;
797 1040
798 chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL); 1041 chunk->vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes,
799 if (!chunk->vm) { 1042 pcpu_nr_groups, pcpu_atom_size,
1043 GFP_KERNEL);
1044 if (!chunk->vms) {
800 free_pcpu_chunk(chunk); 1045 free_pcpu_chunk(chunk);
801 return NULL; 1046 return NULL;
802 } 1047 }
@@ -804,6 +1049,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
804 INIT_LIST_HEAD(&chunk->list); 1049 INIT_LIST_HEAD(&chunk->list);
805 chunk->free_size = pcpu_unit_size; 1050 chunk->free_size = pcpu_unit_size;
806 chunk->contig_hint = pcpu_unit_size; 1051 chunk->contig_hint = pcpu_unit_size;
1052 chunk->base_addr = chunk->vms[0]->addr - pcpu_group_offsets[0];
807 1053
808 return chunk; 1054 return chunk;
809} 1055}
@@ -824,8 +1070,11 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
824 */ 1070 */
825static void *pcpu_alloc(size_t size, size_t align, bool reserved) 1071static void *pcpu_alloc(size_t size, size_t align, bool reserved)
826{ 1072{
1073 static int warn_limit = 10;
827 struct pcpu_chunk *chunk; 1074 struct pcpu_chunk *chunk;
828 int slot, off; 1075 const char *err;
1076 int slot, off, new_alloc;
1077 unsigned long flags;
829 1078
830 if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) { 1079 if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {
831 WARN(true, "illegal size (%zu) or align (%zu) for " 1080 WARN(true, "illegal size (%zu) or align (%zu) for "
@@ -834,17 +1083,31 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved)
834 } 1083 }
835 1084
836 mutex_lock(&pcpu_alloc_mutex); 1085 mutex_lock(&pcpu_alloc_mutex);
837 spin_lock_irq(&pcpu_lock); 1086 spin_lock_irqsave(&pcpu_lock, flags);
838 1087
839 /* serve reserved allocations from the reserved chunk if available */ 1088 /* serve reserved allocations from the reserved chunk if available */
840 if (reserved && pcpu_reserved_chunk) { 1089 if (reserved && pcpu_reserved_chunk) {
841 chunk = pcpu_reserved_chunk; 1090 chunk = pcpu_reserved_chunk;
842 if (size > chunk->contig_hint || 1091
843 pcpu_extend_area_map(chunk) < 0) 1092 if (size > chunk->contig_hint) {
1093 err = "alloc from reserved chunk failed";
844 goto fail_unlock; 1094 goto fail_unlock;
1095 }
1096
1097 while ((new_alloc = pcpu_need_to_extend(chunk))) {
1098 spin_unlock_irqrestore(&pcpu_lock, flags);
1099 if (pcpu_extend_area_map(chunk, new_alloc) < 0) {
1100 err = "failed to extend area map of reserved chunk";
1101 goto fail_unlock_mutex;
1102 }
1103 spin_lock_irqsave(&pcpu_lock, flags);
1104 }
1105
845 off = pcpu_alloc_area(chunk, size, align); 1106 off = pcpu_alloc_area(chunk, size, align);
846 if (off >= 0) 1107 if (off >= 0)
847 goto area_found; 1108 goto area_found;
1109
1110 err = "alloc from reserved chunk failed";
848 goto fail_unlock; 1111 goto fail_unlock;
849 } 1112 }
850 1113
@@ -855,13 +1118,20 @@ restart:
855 if (size > chunk->contig_hint) 1118 if (size > chunk->contig_hint)
856 continue; 1119 continue;
857 1120
858 switch (pcpu_extend_area_map(chunk)) { 1121 new_alloc = pcpu_need_to_extend(chunk);
859 case 0: 1122 if (new_alloc) {
860 break; 1123 spin_unlock_irqrestore(&pcpu_lock, flags);
861 case 1: 1124 if (pcpu_extend_area_map(chunk,
862 goto restart; /* pcpu_lock dropped, restart */ 1125 new_alloc) < 0) {
863 default: 1126 err = "failed to extend area map";
864 goto fail_unlock; 1127 goto fail_unlock_mutex;
1128 }
1129 spin_lock_irqsave(&pcpu_lock, flags);
1130 /*
1131 * pcpu_lock has been dropped, need to
1132 * restart cpu_slot list walking.
1133 */
1134 goto restart;
865 } 1135 }
866 1136
867 off = pcpu_alloc_area(chunk, size, align); 1137 off = pcpu_alloc_area(chunk, size, align);
@@ -871,35 +1141,45 @@ restart:
871 } 1141 }
872 1142
873 /* hmmm... no space left, create a new chunk */ 1143 /* hmmm... no space left, create a new chunk */
874 spin_unlock_irq(&pcpu_lock); 1144 spin_unlock_irqrestore(&pcpu_lock, flags);
875 1145
876 chunk = alloc_pcpu_chunk(); 1146 chunk = alloc_pcpu_chunk();
877 if (!chunk) 1147 if (!chunk) {
1148 err = "failed to allocate new chunk";
878 goto fail_unlock_mutex; 1149 goto fail_unlock_mutex;
1150 }
879 1151
880 spin_lock_irq(&pcpu_lock); 1152 spin_lock_irqsave(&pcpu_lock, flags);
881 pcpu_chunk_relocate(chunk, -1); 1153 pcpu_chunk_relocate(chunk, -1);
882 pcpu_chunk_addr_insert(chunk);
883 goto restart; 1154 goto restart;
884 1155
885area_found: 1156area_found:
886 spin_unlock_irq(&pcpu_lock); 1157 spin_unlock_irqrestore(&pcpu_lock, flags);
887 1158
888 /* populate, map and clear the area */ 1159 /* populate, map and clear the area */
889 if (pcpu_populate_chunk(chunk, off, size)) { 1160 if (pcpu_populate_chunk(chunk, off, size)) {
890 spin_lock_irq(&pcpu_lock); 1161 spin_lock_irqsave(&pcpu_lock, flags);
891 pcpu_free_area(chunk, off); 1162 pcpu_free_area(chunk, off);
1163 err = "failed to populate";
892 goto fail_unlock; 1164 goto fail_unlock;
893 } 1165 }
894 1166
895 mutex_unlock(&pcpu_alloc_mutex); 1167 mutex_unlock(&pcpu_alloc_mutex);
896 1168
897 return __addr_to_pcpu_ptr(chunk->vm->addr + off); 1169 /* return address relative to base address */
1170 return __addr_to_pcpu_ptr(chunk->base_addr + off);
898 1171
899fail_unlock: 1172fail_unlock:
900 spin_unlock_irq(&pcpu_lock); 1173 spin_unlock_irqrestore(&pcpu_lock, flags);
901fail_unlock_mutex: 1174fail_unlock_mutex:
902 mutex_unlock(&pcpu_alloc_mutex); 1175 mutex_unlock(&pcpu_alloc_mutex);
1176 if (warn_limit) {
1177 pr_warning("PERCPU: allocation failed, size=%zu align=%zu, "
1178 "%s\n", size, align, err);
1179 dump_stack();
1180 if (!--warn_limit)
1181 pr_info("PERCPU: limit reached, disable warning\n");
1182 }
903 return NULL; 1183 return NULL;
904} 1184}
905 1185
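
The failure path now prints a rate-limited diagnostic: full details for the first warn_limit failures, then a final note and silence. A standalone sketch of the throttling, using a limit of 3 instead of the patch's 10:

#include <stdio.h>

static int warn_limit = 3;	/* the patch uses 10 */

static void warn_alloc_failure(size_t size, size_t align, const char *err)
{
	if (!warn_limit)
		return;
	fprintf(stderr, "PERCPU: allocation failed, size=%zu align=%zu, %s\n",
		size, align, err);
	if (!--warn_limit)
		fprintf(stderr, "PERCPU: limit reached, disable warning\n");
}

int main(void)
{
	int i;

	for (i = 0; i < 5; i++)		/* only the first 3 are reported */
		warn_alloc_failure(64, 8, "failed to populate");
	return 0;
}
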
@@ -968,17 +1248,17 @@ static void pcpu_reclaim(struct work_struct *work)
968 if (chunk == list_first_entry(head, struct pcpu_chunk, list)) 1248 if (chunk == list_first_entry(head, struct pcpu_chunk, list))
969 continue; 1249 continue;
970 1250
971 rb_erase(&chunk->rb_node, &pcpu_addr_root);
972 list_move(&chunk->list, &todo); 1251 list_move(&chunk->list, &todo);
973 } 1252 }
974 1253
975 spin_unlock_irq(&pcpu_lock); 1254 spin_unlock_irq(&pcpu_lock);
976 mutex_unlock(&pcpu_alloc_mutex);
977 1255
978 list_for_each_entry_safe(chunk, next, &todo, list) { 1256 list_for_each_entry_safe(chunk, next, &todo, list) {
979 pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false); 1257 pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size);
980 free_pcpu_chunk(chunk); 1258 free_pcpu_chunk(chunk);
981 } 1259 }
1260
1261 mutex_unlock(&pcpu_alloc_mutex);
982} 1262}
983 1263
984/** 1264/**
@@ -1003,7 +1283,7 @@ void free_percpu(void *ptr)
1003 spin_lock_irqsave(&pcpu_lock, flags); 1283 spin_lock_irqsave(&pcpu_lock, flags);
1004 1284
1005 chunk = pcpu_chunk_addr_search(addr); 1285 chunk = pcpu_chunk_addr_search(addr);
1006 off = addr - chunk->vm->addr; 1286 off = addr - chunk->base_addr;
1007 1287
1008 pcpu_free_area(chunk, off); 1288 pcpu_free_area(chunk, off);
1009 1289
@@ -1022,30 +1302,299 @@ void free_percpu(void *ptr)
1022} 1302}
1023EXPORT_SYMBOL_GPL(free_percpu); 1303EXPORT_SYMBOL_GPL(free_percpu);
1024 1304
1305static inline size_t pcpu_calc_fc_sizes(size_t static_size,
1306 size_t reserved_size,
1307 ssize_t *dyn_sizep)
1308{
1309 size_t size_sum;
1310
1311 size_sum = PFN_ALIGN(static_size + reserved_size +
1312 (*dyn_sizep >= 0 ? *dyn_sizep : 0));
1313 if (*dyn_sizep != 0)
1314 *dyn_sizep = size_sum - static_size - reserved_size;
1315
1316 return size_sum;
1317}
1318
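
pcpu_calc_fc_sizes() page-aligns static + reserved + dynamic and, whenever a dynamic area was requested (non-zero or negative "auto" dyn_size), folds the alignment padding back into it. A sketch with hypothetical sizes:

#include <stdio.h>

#define PAGE_SIZE 4096UL

static unsigned long page_align(unsigned long x)
{
	return (x + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
}

/* mirrors pcpu_calc_fc_sizes(): negative dyn means "auto" */
static unsigned long calc_fc_sizes(unsigned long static_size,
				   unsigned long reserved_size, long *dyn_sizep)
{
	unsigned long size_sum =
		page_align(static_size + reserved_size +
			   (*dyn_sizep >= 0 ? *dyn_sizep : 0));

	if (*dyn_sizep != 0)
		*dyn_sizep = size_sum - static_size - reserved_size;
	return size_sum;
}

int main(void)
{
	long dyn = -1;					/* auto-sized dynamic area */
	unsigned long sum = calc_fc_sizes(30000, 8192, &dyn);

	printf("size_sum = %lu, dyn_size = %ld\n", sum, dyn);	/* 40960, 2768 */
	return 0;
}
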
1025/** 1319/**
1026 * pcpu_setup_first_chunk - initialize the first percpu chunk 1320 * pcpu_alloc_alloc_info - allocate percpu allocation info
1027 * @get_page_fn: callback to fetch page pointer 1321 * @nr_groups: the number of groups
1028 * @static_size: the size of static percpu area in bytes 1322 * @nr_units: the number of units
1323 *
1324 * Allocate ai which is large enough for @nr_groups groups containing
1325 * @nr_units units. The returned ai's groups[0].cpu_map points to the
1326 * cpu_map array which is long enough for @nr_units and filled with
1327 * NR_CPUS. It's the caller's responsibility to initialize cpu_map
1328 * pointer of other groups.
1329 *
1330 * RETURNS:
1331 * Pointer to the allocated pcpu_alloc_info on success, NULL on
1332 * failure.
1333 */
1334struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
1335 int nr_units)
1336{
1337 struct pcpu_alloc_info *ai;
1338 size_t base_size, ai_size;
1339 void *ptr;
1340 int unit;
1341
1342 base_size = ALIGN(sizeof(*ai) + nr_groups * sizeof(ai->groups[0]),
1343 __alignof__(ai->groups[0].cpu_map[0]));
1344 ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);
1345
1346 ptr = alloc_bootmem_nopanic(PFN_ALIGN(ai_size));
1347 if (!ptr)
1348 return NULL;
1349 ai = ptr;
1350 ptr += base_size;
1351
1352 ai->groups[0].cpu_map = ptr;
1353
1354 for (unit = 0; unit < nr_units; unit++)
1355 ai->groups[0].cpu_map[unit] = NR_CPUS;
1356
1357 ai->nr_groups = nr_groups;
1358 ai->__ai_size = PFN_ALIGN(ai_size);
1359
1360 return ai;
1361}
1362
1363/**
1364 * pcpu_free_alloc_info - free percpu allocation info
1365 * @ai: pcpu_alloc_info to free
1366 *
1367 * Free @ai which was allocated by pcpu_alloc_alloc_info().
1368 */
1369void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
1370{
1371 free_bootmem(__pa(ai), ai->__ai_size);
1372}
1373
1374/**
1375 * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
1029 * @reserved_size: the size of reserved percpu area in bytes 1376 * @reserved_size: the size of reserved percpu area in bytes
1030 * @dyn_size: free size for dynamic allocation in bytes, -1 for auto 1377 * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
1031 * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto 1378 * @atom_size: allocation atom size
1032 * @base_addr: mapped address, NULL for auto 1379 * @cpu_distance_fn: callback to determine distance between cpus, optional
1033 * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary 1380 *
1381 * This function determines grouping of units, their mappings to cpus
1382 * and other parameters considering needed percpu size, allocation
1383 * atom size and distances between CPUs.
1384 *
 1385 * Groups are always multiples of atom size and CPUs which are of
1386 * LOCAL_DISTANCE both ways are grouped together and share space for
1387 * units in the same group. The returned configuration is guaranteed
 1388 * to have CPUs on different nodes in different groups and >=75% usage
1389 * of allocated virtual address space.
1390 *
1391 * RETURNS:
1392 * On success, pointer to the new allocation_info is returned. On
1393 * failure, ERR_PTR value is returned.
1394 */
1395struct pcpu_alloc_info * __init pcpu_build_alloc_info(
1396 size_t reserved_size, ssize_t dyn_size,
1397 size_t atom_size,
1398 pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
1399{
1400 static int group_map[NR_CPUS] __initdata;
1401 static int group_cnt[NR_CPUS] __initdata;
1402 const size_t static_size = __per_cpu_end - __per_cpu_start;
1403 int group_cnt_max = 0, nr_groups = 1, nr_units = 0;
1404 size_t size_sum, min_unit_size, alloc_size;
1405 int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */
1406 int last_allocs, group, unit;
1407 unsigned int cpu, tcpu;
1408 struct pcpu_alloc_info *ai;
1409 unsigned int *cpu_map;
1410
1411 /* this function may be called multiple times */
1412 memset(group_map, 0, sizeof(group_map));
 1413 memset(group_cnt, 0, sizeof(group_cnt));
1414
1415 /*
1416 * Determine min_unit_size, alloc_size and max_upa such that
 1417 * alloc_size is a multiple of atom_size and is the smallest
 1418 * which can accommodate 4k aligned segments which are equal to
1419 * or larger than min_unit_size.
1420 */
1421 size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);
1422 min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
1423
1424 alloc_size = roundup(min_unit_size, atom_size);
1425 upa = alloc_size / min_unit_size;
1426 while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
1427 upa--;
1428 max_upa = upa;
1429
1430 /* group cpus according to their proximity */
1431 for_each_possible_cpu(cpu) {
1432 group = 0;
1433 next_group:
1434 for_each_possible_cpu(tcpu) {
1435 if (cpu == tcpu)
1436 break;
1437 if (group_map[tcpu] == group && cpu_distance_fn &&
1438 (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
1439 cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
1440 group++;
1441 nr_groups = max(nr_groups, group + 1);
1442 goto next_group;
1443 }
1444 }
1445 group_map[cpu] = group;
1446 group_cnt[group]++;
1447 group_cnt_max = max(group_cnt_max, group_cnt[group]);
1448 }
1449
1450 /*
1451 * Expand unit size until address space usage goes over 75%
1452 * and then as much as possible without using more address
1453 * space.
1454 */
1455 last_allocs = INT_MAX;
1456 for (upa = max_upa; upa; upa--) {
1457 int allocs = 0, wasted = 0;
1458
1459 if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
1460 continue;
1461
1462 for (group = 0; group < nr_groups; group++) {
1463 int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
1464 allocs += this_allocs;
1465 wasted += this_allocs * upa - group_cnt[group];
1466 }
1467
1468 /*
 1469 * Don't accept if wastage is over 1/3. The
1470 * greater-than comparison ensures upa==1 always
1471 * passes the following check.
1472 */
1473 if (wasted > num_possible_cpus() / 3)
1474 continue;
1475
1476 /* and then don't consume more memory */
1477 if (allocs > last_allocs)
1478 break;
1479 last_allocs = allocs;
1480 best_upa = upa;
1481 }
1482 upa = best_upa;
1483
1484 /* allocate and fill alloc_info */
1485 for (group = 0; group < nr_groups; group++)
1486 nr_units += roundup(group_cnt[group], upa);
1487
1488 ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
1489 if (!ai)
1490 return ERR_PTR(-ENOMEM);
1491 cpu_map = ai->groups[0].cpu_map;
1492
1493 for (group = 0; group < nr_groups; group++) {
1494 ai->groups[group].cpu_map = cpu_map;
1495 cpu_map += roundup(group_cnt[group], upa);
1496 }
1497
1498 ai->static_size = static_size;
1499 ai->reserved_size = reserved_size;
1500 ai->dyn_size = dyn_size;
1501 ai->unit_size = alloc_size / upa;
1502 ai->atom_size = atom_size;
1503 ai->alloc_size = alloc_size;
1504
1505 for (group = 0, unit = 0; group_cnt[group]; group++) {
1506 struct pcpu_group_info *gi = &ai->groups[group];
1507
1508 /*
1509 * Initialize base_offset as if all groups are located
1510 * back-to-back. The caller should update this to
1511 * reflect actual allocation.
1512 */
1513 gi->base_offset = unit * ai->unit_size;
1514
1515 for_each_possible_cpu(cpu)
1516 if (group_map[cpu] == group)
1517 gi->cpu_map[gi->nr_units++] = cpu;
1518 gi->nr_units = roundup(gi->nr_units, upa);
1519 unit += gi->nr_units;
1520 }
1521 BUG_ON(unit != nr_units);
1522
1523 return ai;
1524}
1525
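
The core of the sizing logic is the units-per-allocation (upa) search: round the per-cpu requirement up to the allocation atom, then lower upa until the atom divides evenly into page-aligned units. A standalone sketch with a hypothetical 44 KiB requirement and a 2 MB atom:

#include <stdio.h>

#define PAGE_SIZE 4096UL

static unsigned long roundup_to(unsigned long x, unsigned long a)
{
	return (x + a - 1) / a * a;
}

int main(void)
{
	unsigned long min_unit_size = 45056;	/* 44 KiB needed per cpu (hypothetical) */
	unsigned long atom_size = 2UL << 20;	/* 2 MB allocation atom, e.g. a large page */
	unsigned long alloc_size = roundup_to(min_unit_size, atom_size);
	unsigned long upa = alloc_size / min_unit_size;

	/* each unit must be a whole, page-aligned fraction of the atom */
	while (alloc_size % upa || (alloc_size / upa) & (PAGE_SIZE - 1))
		upa--;

	printf("alloc_size=%lu upa=%lu unit_size=%lu\n",
	       alloc_size, upa, alloc_size / upa);	/* 2097152, 32, 65536 */
	return 0;
}

The full pcpu_build_alloc_info() then repeats this per candidate upa and keeps the one that wastes less than a third of the units while not growing the number of allocations.
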
1526/**
1527 * pcpu_dump_alloc_info - print out information about pcpu_alloc_info
1528 * @lvl: loglevel
1529 * @ai: allocation info to dump
1530 *
1531 * Print out information about @ai using loglevel @lvl.
1532 */
1533static void pcpu_dump_alloc_info(const char *lvl,
1534 const struct pcpu_alloc_info *ai)
1535{
1536 int group_width = 1, cpu_width = 1, width;
1537 char empty_str[] = "--------";
1538 int alloc = 0, alloc_end = 0;
1539 int group, v;
1540 int upa, apl; /* units per alloc, allocs per line */
1541
1542 v = ai->nr_groups;
1543 while (v /= 10)
1544 group_width++;
1545
1546 v = num_possible_cpus();
1547 while (v /= 10)
1548 cpu_width++;
1549 empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0';
1550
1551 upa = ai->alloc_size / ai->unit_size;
1552 width = upa * (cpu_width + 1) + group_width + 3;
1553 apl = rounddown_pow_of_two(max(60 / width, 1));
1554
1555 printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu",
1556 lvl, ai->static_size, ai->reserved_size, ai->dyn_size,
1557 ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size);
1558
1559 for (group = 0; group < ai->nr_groups; group++) {
1560 const struct pcpu_group_info *gi = &ai->groups[group];
1561 int unit = 0, unit_end = 0;
1562
1563 BUG_ON(gi->nr_units % upa);
1564 for (alloc_end += gi->nr_units / upa;
1565 alloc < alloc_end; alloc++) {
1566 if (!(alloc % apl)) {
1567 printk("\n");
1568 printk("%spcpu-alloc: ", lvl);
1569 }
1570 printk("[%0*d] ", group_width, group);
1571
1572 for (unit_end += upa; unit < unit_end; unit++)
1573 if (gi->cpu_map[unit] != NR_CPUS)
1574 printk("%0*d ", cpu_width,
1575 gi->cpu_map[unit]);
1576 else
1577 printk("%s ", empty_str);
1578 }
1579 }
1580 printk("\n");
1581}
1582
1583/**
1584 * pcpu_setup_first_chunk - initialize the first percpu chunk
 1585 * @ai: pcpu_alloc_info describing how the percpu area is shaped
1586 * @base_addr: mapped address
1034 * 1587 *
1035 * Initialize the first percpu chunk which contains the kernel static 1588 * Initialize the first percpu chunk which contains the kernel static
 1036 * percpu area. This function is to be called from arch percpu area 1589 * percpu area. This function is to be called from arch percpu area
1037 * setup path. The first two parameters are mandatory. The rest are 1590 * setup path.
1038 * optional. 1591 *
1039 * 1592 * @ai contains all information necessary to initialize the first
1040 * @get_page_fn() should return pointer to percpu page given cpu 1593 * chunk and prime the dynamic percpu allocator.
1041 * number and page number. It should at least return enough pages to 1594 *
1042 * cover the static area. The returned pages for static area should 1595 * @ai->static_size is the size of static percpu area.
1043 * have been initialized with valid data. If @unit_size is specified, 1596 *
1044 * it can also return pages after the static area. NULL return 1597 * @ai->reserved_size, if non-zero, specifies the amount of bytes to
1045 * indicates end of pages for the cpu. Note that @get_page_fn() must
1046 * return the same number of pages for all cpus.
1047 *
1048 * @reserved_size, if non-zero, specifies the amount of bytes to
1049 * reserve after the static area in the first chunk. This reserves 1598 * reserve after the static area in the first chunk. This reserves
1050 * the first chunk such that it's available only through reserved 1599 * the first chunk such that it's available only through reserved
1051 * percpu allocation. This is primarily used to serve module percpu 1600 * percpu allocation. This is primarily used to serve module percpu
@@ -1053,22 +1602,29 @@ EXPORT_SYMBOL_GPL(free_percpu);
1053 * limited offset range for symbol relocations to guarantee module 1602 * limited offset range for symbol relocations to guarantee module
1054 * percpu symbols fall inside the relocatable range. 1603 * percpu symbols fall inside the relocatable range.
1055 * 1604 *
1056 * @dyn_size, if non-negative, determines the number of bytes 1605 * @ai->dyn_size determines the number of bytes available for dynamic
1057 * available for dynamic allocation in the first chunk. Specifying 1606 * allocation in the first chunk. The area between @ai->static_size +
1058 * non-negative value makes percpu leave alone the area beyond 1607 * @ai->reserved_size + @ai->dyn_size and @ai->unit_size is unused.
1059 * @static_size + @reserved_size + @dyn_size. 1608 *
1609 * @ai->unit_size specifies unit size and must be aligned to PAGE_SIZE
1610 * and equal to or larger than @ai->static_size + @ai->reserved_size +
1611 * @ai->dyn_size.
1612 *
1613 * @ai->atom_size is the allocation atom size and used as alignment
1614 * for vm areas.
1060 * 1615 *
 1061 * @unit_size, if non-negative, specifies unit size and must be 1616 * @ai->alloc_size is the allocation size and always a multiple of
1062 * aligned to PAGE_SIZE and equal to or larger than @static_size + 1617 * @ai->atom_size. This is larger than @ai->atom_size if
1063 * @reserved_size + if non-negative, @dyn_size. 1618 * @ai->unit_size is larger than @ai->atom_size.
1064 * 1619 *
1065 * Non-null @base_addr means that the caller already allocated virtual 1620 * @ai->nr_groups and @ai->groups describe virtual memory layout of
1066 * region for the first chunk and mapped it. percpu must not mess 1621 * percpu areas. Units which should be colocated are put into the
1067 * with the chunk. Note that @base_addr with 0 @unit_size or non-NULL 1622 * same group. Dynamic VM areas will be allocated according to these
1068 * @populate_pte_fn doesn't make any sense. 1623 * groupings. If @ai->nr_groups is zero, a single group containing
1624 * all units is assumed.
1069 * 1625 *
1070 * @populate_pte_fn is used to populate the pagetable. NULL means the 1626 * The caller should have mapped the first chunk at @base_addr and
1071 * caller already populated the pagetable. 1627 * copied static data to each unit.
1072 * 1628 *
1073 * If the first chunk ends up with both reserved and dynamic areas, it 1629 * If the first chunk ends up with both reserved and dynamic areas, it
1074 * is served by two chunks - one to serve the core static and reserved 1630 * is served by two chunks - one to serve the core static and reserved
@@ -1078,49 +1634,98 @@ EXPORT_SYMBOL_GPL(free_percpu);
1078 * and available for dynamic allocation like any other chunks. 1634 * and available for dynamic allocation like any other chunks.
1079 * 1635 *
1080 * RETURNS: 1636 * RETURNS:
1081 * The determined pcpu_unit_size which can be used to initialize 1637 * 0 on success, -errno on failure.
1082 * percpu access.
1083 */ 1638 */
1084size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, 1639int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1085 size_t static_size, size_t reserved_size, 1640 void *base_addr)
1086 ssize_t dyn_size, ssize_t unit_size,
1087 void *base_addr,
1088 pcpu_populate_pte_fn_t populate_pte_fn)
1089{ 1641{
1090 static struct vm_struct first_vm; 1642 static char cpus_buf[4096] __initdata;
1091 static int smap[2], dmap[2]; 1643 static int smap[2], dmap[2];
1092 size_t size_sum = static_size + reserved_size + 1644 size_t dyn_size = ai->dyn_size;
1093 (dyn_size >= 0 ? dyn_size : 0); 1645 size_t size_sum = ai->static_size + ai->reserved_size + dyn_size;
1094 struct pcpu_chunk *schunk, *dchunk = NULL; 1646 struct pcpu_chunk *schunk, *dchunk = NULL;
1647 unsigned long *group_offsets;
1648 size_t *group_sizes;
1649 unsigned long *unit_off;
1095 unsigned int cpu; 1650 unsigned int cpu;
1096 int nr_pages; 1651 int *unit_map;
1097 int err, i; 1652 int group, unit, i;
1098 1653
1099 /* santiy checks */ 1654 cpumask_scnprintf(cpus_buf, sizeof(cpus_buf), cpu_possible_mask);
1655
1656#define PCPU_SETUP_BUG_ON(cond) do { \
1657 if (unlikely(cond)) { \
1658 pr_emerg("PERCPU: failed to initialize, %s", #cond); \
1659 pr_emerg("PERCPU: cpu_possible_mask=%s\n", cpus_buf); \
1660 pcpu_dump_alloc_info(KERN_EMERG, ai); \
1661 BUG(); \
1662 } \
1663} while (0)
1664
1665 /* sanity checks */
1100 BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC || 1666 BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
1101 ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC); 1667 ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC);
1102 BUG_ON(!static_size); 1668 PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
1103 if (unit_size >= 0) { 1669 PCPU_SETUP_BUG_ON(!ai->static_size);
1104 BUG_ON(unit_size < size_sum); 1670 PCPU_SETUP_BUG_ON(!base_addr);
1105 BUG_ON(unit_size & ~PAGE_MASK); 1671 PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
1106 BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE); 1672 PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK);
1107 } else 1673 PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
1108 BUG_ON(base_addr); 1674
1109 BUG_ON(base_addr && populate_pte_fn); 1675 /* process group information and build config tables accordingly */
1110 1676 group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0]));
1111 if (unit_size >= 0) 1677 group_sizes = alloc_bootmem(ai->nr_groups * sizeof(group_sizes[0]));
1112 pcpu_unit_pages = unit_size >> PAGE_SHIFT; 1678 unit_map = alloc_bootmem(nr_cpu_ids * sizeof(unit_map[0]));
1113 else 1679 unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0]));
1114 pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT, 1680
1115 PFN_UP(size_sum)); 1681 for (cpu = 0; cpu < nr_cpu_ids; cpu++)
1682 unit_map[cpu] = UINT_MAX;
1683 pcpu_first_unit_cpu = NR_CPUS;
1684
1685 for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
1686 const struct pcpu_group_info *gi = &ai->groups[group];
1687
1688 group_offsets[group] = gi->base_offset;
1689 group_sizes[group] = gi->nr_units * ai->unit_size;
1690
1691 for (i = 0; i < gi->nr_units; i++) {
1692 cpu = gi->cpu_map[i];
1693 if (cpu == NR_CPUS)
1694 continue;
1116 1695
1117 pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; 1696 PCPU_SETUP_BUG_ON(cpu > nr_cpu_ids);
1118 pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; 1697 PCPU_SETUP_BUG_ON(!cpu_possible(cpu));
1119 pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) 1698 PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX);
1120 + num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *); 1699
1700 unit_map[cpu] = unit + i;
1701 unit_off[cpu] = gi->base_offset + i * ai->unit_size;
1702
1703 if (pcpu_first_unit_cpu == NR_CPUS)
1704 pcpu_first_unit_cpu = cpu;
1705 }
1706 }
1707 pcpu_last_unit_cpu = cpu;
1708 pcpu_nr_units = unit;
1709
1710 for_each_possible_cpu(cpu)
1711 PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX);
1712
1713 /* we're done parsing the input, undefine BUG macro and dump config */
1714#undef PCPU_SETUP_BUG_ON
1715 pcpu_dump_alloc_info(KERN_INFO, ai);
1121 1716
1122 if (dyn_size < 0) 1717 pcpu_nr_groups = ai->nr_groups;
1123 dyn_size = pcpu_unit_size - static_size - reserved_size; 1718 pcpu_group_offsets = group_offsets;
1719 pcpu_group_sizes = group_sizes;
1720 pcpu_unit_map = unit_map;
1721 pcpu_unit_offsets = unit_off;
1722
1723 /* determine basic parameters */
1724 pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT;
1725 pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
1726 pcpu_atom_size = ai->atom_size;
1727 pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +
1728 BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long);
1124 1729
1125 /* 1730 /*
1126 * Allocate chunk slots. The additional last slot is for 1731 * Allocate chunk slots. The additional last slot is for
@@ -1140,187 +1745,368 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
1140 */ 1745 */
1141 schunk = alloc_bootmem(pcpu_chunk_struct_size); 1746 schunk = alloc_bootmem(pcpu_chunk_struct_size);
1142 INIT_LIST_HEAD(&schunk->list); 1747 INIT_LIST_HEAD(&schunk->list);
1143 schunk->vm = &first_vm; 1748 schunk->base_addr = base_addr;
1144 schunk->map = smap; 1749 schunk->map = smap;
1145 schunk->map_alloc = ARRAY_SIZE(smap); 1750 schunk->map_alloc = ARRAY_SIZE(smap);
1146 schunk->page = schunk->page_ar; 1751 schunk->immutable = true;
1752 bitmap_fill(schunk->populated, pcpu_unit_pages);
1147 1753
1148 if (reserved_size) { 1754 if (ai->reserved_size) {
1149 schunk->free_size = reserved_size; 1755 schunk->free_size = ai->reserved_size;
1150 pcpu_reserved_chunk = schunk; /* not for dynamic alloc */ 1756 pcpu_reserved_chunk = schunk;
1757 pcpu_reserved_chunk_limit = ai->static_size + ai->reserved_size;
1151 } else { 1758 } else {
1152 schunk->free_size = dyn_size; 1759 schunk->free_size = dyn_size;
1153 dyn_size = 0; /* dynamic area covered */ 1760 dyn_size = 0; /* dynamic area covered */
1154 } 1761 }
1155 schunk->contig_hint = schunk->free_size; 1762 schunk->contig_hint = schunk->free_size;
1156 1763
1157 schunk->map[schunk->map_used++] = -static_size; 1764 schunk->map[schunk->map_used++] = -ai->static_size;
1158 if (schunk->free_size) 1765 if (schunk->free_size)
1159 schunk->map[schunk->map_used++] = schunk->free_size; 1766 schunk->map[schunk->map_used++] = schunk->free_size;
1160 1767
1161 pcpu_reserved_chunk_limit = static_size + schunk->free_size;
1162
1163 /* init dynamic chunk if necessary */ 1768 /* init dynamic chunk if necessary */
1164 if (dyn_size) { 1769 if (dyn_size) {
1165 dchunk = alloc_bootmem(sizeof(struct pcpu_chunk)); 1770 dchunk = alloc_bootmem(pcpu_chunk_struct_size);
1166 INIT_LIST_HEAD(&dchunk->list); 1771 INIT_LIST_HEAD(&dchunk->list);
1167 dchunk->vm = &first_vm; 1772 dchunk->base_addr = base_addr;
1168 dchunk->map = dmap; 1773 dchunk->map = dmap;
1169 dchunk->map_alloc = ARRAY_SIZE(dmap); 1774 dchunk->map_alloc = ARRAY_SIZE(dmap);
1170 dchunk->page = schunk->page_ar; /* share page map with schunk */ 1775 dchunk->immutable = true;
1776 bitmap_fill(dchunk->populated, pcpu_unit_pages);
1171 1777
1172 dchunk->contig_hint = dchunk->free_size = dyn_size; 1778 dchunk->contig_hint = dchunk->free_size = dyn_size;
1173 dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit; 1779 dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit;
1174 dchunk->map[dchunk->map_used++] = dchunk->free_size; 1780 dchunk->map[dchunk->map_used++] = dchunk->free_size;
1175 } 1781 }
1176 1782
1177 /* allocate vm address */
1178 first_vm.flags = VM_ALLOC;
1179 first_vm.size = pcpu_chunk_size;
1180
1181 if (!base_addr)
1182 vm_area_register_early(&first_vm, PAGE_SIZE);
1183 else {
1184 /*
1185 * Pages already mapped. No need to remap into
1186 * vmalloc area. In this case the first chunks can't
1187 * be mapped or unmapped by percpu and are marked
1188 * immutable.
1189 */
1190 first_vm.addr = base_addr;
1191 schunk->immutable = true;
1192 if (dchunk)
1193 dchunk->immutable = true;
1194 }
1195
1196 /* assign pages */
1197 nr_pages = -1;
1198 for_each_possible_cpu(cpu) {
1199 for (i = 0; i < pcpu_unit_pages; i++) {
1200 struct page *page = get_page_fn(cpu, i);
1201
1202 if (!page)
1203 break;
1204 *pcpu_chunk_pagep(schunk, cpu, i) = page;
1205 }
1206
1207 BUG_ON(i < PFN_UP(static_size));
1208
1209 if (nr_pages < 0)
1210 nr_pages = i;
1211 else
1212 BUG_ON(nr_pages != i);
1213 }
1214
1215 /* map them */
1216 if (populate_pte_fn) {
1217 for_each_possible_cpu(cpu)
1218 for (i = 0; i < nr_pages; i++)
1219 populate_pte_fn(pcpu_chunk_addr(schunk,
1220 cpu, i));
1221
1222 err = pcpu_map(schunk, 0, nr_pages);
1223 if (err)
1224 panic("failed to setup static percpu area, err=%d\n",
1225 err);
1226 }
1227
1228 /* link the first chunk in */ 1783 /* link the first chunk in */
1229 if (!dchunk) { 1784 pcpu_first_chunk = dchunk ?: schunk;
1230 pcpu_chunk_relocate(schunk, -1); 1785 pcpu_chunk_relocate(pcpu_first_chunk, -1);
1231 pcpu_chunk_addr_insert(schunk);
1232 } else {
1233 pcpu_chunk_relocate(dchunk, -1);
1234 pcpu_chunk_addr_insert(dchunk);
1235 }
1236 1786
1237 /* we're done */ 1787 /* we're done */
1238 pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0); 1788 pcpu_base_addr = base_addr;
1239 return pcpu_unit_size; 1789 return 0;
1240} 1790}
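
The map[] setup above follows the first-chunk allocator's sign convention: each map entry is a region length in bytes, negative while the region is allocated (here the static area) and positive while it is free, with contig_hint caching the largest free entry. A minimal user-space sketch of that convention, illustrative only and not the kernel's own helper:

/* sketch: negative map entries are allocated regions, positive ones are free */
#include <stdio.h>

static int contig_hint(const int *map, int used)
{
	int i, best = 0;

	for (i = 0; i < used; i++)
		if (map[i] > best)
			best = map[i];
	return best;
}

int main(void)
{
	/* e.g. an 8k static area (allocated) followed by 20k of dynamic space */
	int map[] = { -8192, 20480 };

	printf("contig_hint = %d\n", contig_hint(map, 2));
	return 0;
}
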
1241 1791
1242/* 1792const char *pcpu_fc_names[PCPU_FC_NR] __initdata = {
1243 * Embedding first chunk setup helper. 1793 [PCPU_FC_AUTO] = "auto",
1244 */ 1794 [PCPU_FC_EMBED] = "embed",
1245static void *pcpue_ptr __initdata; 1795 [PCPU_FC_PAGE] = "page",
1246static size_t pcpue_size __initdata; 1796};
1247static size_t pcpue_unit_size __initdata;
1248 1797
1249static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) 1798enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO;
1250{
1251 size_t off = (size_t)pageno << PAGE_SHIFT;
1252 1799
1253 if (off >= pcpue_size) 1800static int __init percpu_alloc_setup(char *str)
1254 return NULL; 1801{
1802 if (0)
1803 /* nada */;
1804#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
1805 else if (!strcmp(str, "embed"))
1806 pcpu_chosen_fc = PCPU_FC_EMBED;
1807#endif
1808#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
1809 else if (!strcmp(str, "page"))
1810 pcpu_chosen_fc = PCPU_FC_PAGE;
1811#endif
1812 else
1813 pr_warning("PERCPU: unknown allocator %s specified\n", str);
1255 1814
1256 return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off); 1815 return 0;
1257} 1816}
1817early_param("percpu_alloc", percpu_alloc_setup);
1258 1818
1819#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \
1820 !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
1259/** 1821/**
1260 * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem 1822 * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
1261 * @static_size: the size of static percpu area in bytes
1262 * @reserved_size: the size of reserved percpu area in bytes 1823 * @reserved_size: the size of reserved percpu area in bytes
1263 * @dyn_size: free size for dynamic allocation in bytes, -1 for auto 1824 * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
1264 * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto 1825 * @atom_size: allocation atom size
1826 * @cpu_distance_fn: callback to determine distance between cpus, optional
1827 * @alloc_fn: function to allocate percpu page
 1828 * @free_fn: function to free percpu page
1265 * 1829 *
1266 * This is a helper to ease setting up embedded first percpu chunk and 1830 * This is a helper to ease setting up embedded first percpu chunk and
1267 * can be called where pcpu_setup_first_chunk() is expected. 1831 * can be called where pcpu_setup_first_chunk() is expected.
1268 * 1832 *
1269 * If this function is used to setup the first chunk, it is allocated 1833 * If this function is used to setup the first chunk, it is allocated
1270 * as a contiguous area using bootmem allocator and used as-is without 1834 * by calling @alloc_fn and used as-is without being mapped into
1271 * being mapped into vmalloc area. This enables the first chunk to 1835 * vmalloc area. Allocations are always whole multiples of @atom_size
1272 * piggy back on the linear physical mapping which often uses larger 1836 * aligned to @atom_size.
1273 * page size. 1837 *
1838 * This enables the first chunk to piggy back on the linear physical
1839 * mapping which often uses larger page size. Please note that this
1840 * can result in very sparse cpu->unit mapping on NUMA machines thus
1841 * requiring large vmalloc address space. Don't use this allocator if
1842 * vmalloc space is not orders of magnitude larger than distances
 1843 * between node memory addresses (i.e. 32-bit NUMA machines).
1274 * 1844 *
1275 * When @dyn_size is positive, dynamic area might be larger than 1845 * When @dyn_size is positive, dynamic area might be larger than
1276 * specified to fill page alignment. Also, when @dyn_size is auto, 1846 * specified to fill page alignment. When @dyn_size is auto,
1277 * @dyn_size does not fill the whole first chunk but only what's 1847 * @dyn_size is just big enough to fill page alignment after static
1278 * necessary for page alignment after static and reserved areas. 1848 * and reserved areas.
1279 * 1849 *
1280 * If the needed size is smaller than the minimum or specified unit 1850 * If the needed size is smaller than the minimum or specified unit
1281 * size, the leftover is returned to the bootmem allocator. 1851 * size, the leftover is returned using @free_fn.
1282 * 1852 *
1283 * RETURNS: 1853 * RETURNS:
1284 * The determined pcpu_unit_size which can be used to initialize 1854 * 0 on success, -errno on failure.
1285 * percpu access on success, -errno on failure.
1286 */ 1855 */
1287ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, 1856int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size,
1288 ssize_t dyn_size, ssize_t unit_size) 1857 size_t atom_size,
1858 pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
1859 pcpu_fc_alloc_fn_t alloc_fn,
1860 pcpu_fc_free_fn_t free_fn)
1289{ 1861{
1290 unsigned int cpu; 1862 void *base = (void *)ULONG_MAX;
1863 void **areas = NULL;
1864 struct pcpu_alloc_info *ai;
1865 size_t size_sum, areas_size, max_distance;
1866 int group, i, rc;
1867
1868 ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
1869 cpu_distance_fn);
1870 if (IS_ERR(ai))
1871 return PTR_ERR(ai);
1872
1873 size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
1874 areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));
1875
1876 areas = alloc_bootmem_nopanic(areas_size);
1877 if (!areas) {
1878 rc = -ENOMEM;
1879 goto out_free;
1880 }
1291 1881
1292 /* determine parameters and allocate */ 1882 /* allocate, copy and determine base address */
1293 pcpue_size = PFN_ALIGN(static_size + reserved_size + 1883 for (group = 0; group < ai->nr_groups; group++) {
1294 (dyn_size >= 0 ? dyn_size : 0)); 1884 struct pcpu_group_info *gi = &ai->groups[group];
1295 if (dyn_size != 0) 1885 unsigned int cpu = NR_CPUS;
1296 dyn_size = pcpue_size - static_size - reserved_size; 1886 void *ptr;
1297 1887
1298 if (unit_size >= 0) { 1888 for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++)
1299 BUG_ON(unit_size < pcpue_size); 1889 cpu = gi->cpu_map[i];
1300 pcpue_unit_size = unit_size; 1890 BUG_ON(cpu == NR_CPUS);
1301 } else 1891
1302 pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); 1892 /* allocate space for the whole group */
1303 1893 ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size);
1304 pcpue_ptr = __alloc_bootmem_nopanic( 1894 if (!ptr) {
1305 num_possible_cpus() * pcpue_unit_size, 1895 rc = -ENOMEM;
1306 PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); 1896 goto out_free_areas;
1307 if (!pcpue_ptr) 1897 }
1308 return -ENOMEM; 1898 areas[group] = ptr;
1309 1899
1310 /* return the leftover and copy */ 1900 base = min(ptr, base);
1311 for_each_possible_cpu(cpu) { 1901
1312 void *ptr = pcpue_ptr + cpu * pcpue_unit_size; 1902 for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
1903 if (gi->cpu_map[i] == NR_CPUS) {
1904 /* unused unit, free whole */
1905 free_fn(ptr, ai->unit_size);
1906 continue;
1907 }
1908 /* copy and return the unused part */
1909 memcpy(ptr, __per_cpu_load, ai->static_size);
1910 free_fn(ptr + size_sum, ai->unit_size - size_sum);
1911 }
1912 }
1913
1914 /* base address is now known, determine group base offsets */
1915 max_distance = 0;
1916 for (group = 0; group < ai->nr_groups; group++) {
1917 ai->groups[group].base_offset = areas[group] - base;
1918 max_distance = max_t(size_t, max_distance,
1919 ai->groups[group].base_offset);
1920 }
1921 max_distance += ai->unit_size;
1922
1923 /* warn if maximum distance is further than 75% of vmalloc space */
1924 if (max_distance > (VMALLOC_END - VMALLOC_START) * 3 / 4) {
1925 pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc "
1926 "space 0x%lx\n",
1927 max_distance, VMALLOC_END - VMALLOC_START);
1928#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
1929 /* and fail if we have fallback */
1930 rc = -EINVAL;
1931 goto out_free;
1932#endif
1933 }
1934
1935 pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n",
1936 PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size,
1937 ai->dyn_size, ai->unit_size);
1938
1939 rc = pcpu_setup_first_chunk(ai, base);
1940 goto out_free;
1941
1942out_free_areas:
1943 for (group = 0; group < ai->nr_groups; group++)
1944 free_fn(areas[group],
1945 ai->groups[group].nr_units * ai->unit_size);
1946out_free:
1947 pcpu_free_alloc_info(ai);
1948 if (areas)
1949 free_bootmem(__pa(areas), areas_size);
1950 return rc;
1951}
1952#endif /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK ||
1953 !CONFIG_HAVE_SETUP_PER_CPU_AREA */
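
The embed helper above allocates one block per group and records each block's distance from the lowest one as base_offset; a unit's address is then base + base_offset + unit * unit_size, and max_distance (the largest base_offset plus one unit) is what gets compared against the vmalloc span. A small sketch of that arithmetic, using made-up addresses and sizes:

/* sketch: hypothetical group layout for pcpu_embed_first_chunk() */
#include <stdio.h>

struct group {
	unsigned long base_offset;	/* areas[group] - base */
	int nr_units;
};

static unsigned long unit_addr(unsigned long base, const struct group *g,
			       int unit, unsigned long unit_size)
{
	return base + g->base_offset + (unsigned long)unit * unit_size;
}

int main(void)
{
	/* made-up numbers: one NUMA group 2MB above the lowest allocation */
	struct group g = { .base_offset = 0x200000, .nr_units = 4 };
	unsigned long base = 0xc0000000UL, unit_size = 0x10000;

	printf("unit 2 lives at %#lx\n", unit_addr(base, &g, 2, unit_size));
	printf("max_distance >= %#lx\n", g.base_offset + unit_size);
	return 0;
}
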
1954
1955#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
1956/**
1957 * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
1958 * @reserved_size: the size of reserved percpu area in bytes
1959 * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
 1960 * @free_fn: function to free percpu page, always called with PAGE_SIZE
1961 * @populate_pte_fn: function to populate pte
1962 *
1963 * This is a helper to ease setting up page-remapped first percpu
1964 * chunk and can be called where pcpu_setup_first_chunk() is expected.
1965 *
1966 * This is the basic allocator. Static percpu area is allocated
1967 * page-by-page into vmalloc area.
1968 *
1969 * RETURNS:
1970 * 0 on success, -errno on failure.
1971 */
1972int __init pcpu_page_first_chunk(size_t reserved_size,
1973 pcpu_fc_alloc_fn_t alloc_fn,
1974 pcpu_fc_free_fn_t free_fn,
1975 pcpu_fc_populate_pte_fn_t populate_pte_fn)
1976{
1977 static struct vm_struct vm;
1978 struct pcpu_alloc_info *ai;
1979 char psize_str[16];
1980 int unit_pages;
1981 size_t pages_size;
1982 struct page **pages;
1983 int unit, i, j, rc;
1984
1985 snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);
1986
1987 ai = pcpu_build_alloc_info(reserved_size, -1, PAGE_SIZE, NULL);
1988 if (IS_ERR(ai))
1989 return PTR_ERR(ai);
1990 BUG_ON(ai->nr_groups != 1);
1991 BUG_ON(ai->groups[0].nr_units != num_possible_cpus());
1992
1993 unit_pages = ai->unit_size >> PAGE_SHIFT;
1994
1995 /* unaligned allocations can't be freed, round up to page size */
1996 pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
1997 sizeof(pages[0]));
1998 pages = alloc_bootmem(pages_size);
1999
2000 /* allocate pages */
2001 j = 0;
2002 for (unit = 0; unit < num_possible_cpus(); unit++)
2003 for (i = 0; i < unit_pages; i++) {
2004 unsigned int cpu = ai->groups[0].cpu_map[unit];
2005 void *ptr;
2006
2007 ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);
2008 if (!ptr) {
2009 pr_warning("PERCPU: failed to allocate %s page "
2010 "for cpu%u\n", psize_str, cpu);
2011 goto enomem;
2012 }
2013 pages[j++] = virt_to_page(ptr);
2014 }
2015
2016 /* allocate vm area, map the pages and copy static data */
2017 vm.flags = VM_ALLOC;
2018 vm.size = num_possible_cpus() * ai->unit_size;
2019 vm_area_register_early(&vm, PAGE_SIZE);
2020
2021 for (unit = 0; unit < num_possible_cpus(); unit++) {
2022 unsigned long unit_addr =
2023 (unsigned long)vm.addr + unit * ai->unit_size;
2024
2025 for (i = 0; i < unit_pages; i++)
2026 populate_pte_fn(unit_addr + (i << PAGE_SHIFT));
2027
2028 /* pte already populated, the following shouldn't fail */
2029 rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages],
2030 unit_pages);
2031 if (rc < 0)
2032 panic("failed to map percpu area, err=%d\n", rc);
1313 2033
1314 free_bootmem(__pa(ptr + pcpue_size), 2034 /*
1315 pcpue_unit_size - pcpue_size); 2035 * FIXME: Archs with virtual cache should flush local
1316 memcpy(ptr, __per_cpu_load, static_size); 2036 * cache for the linear mapping here - something
2037 * equivalent to flush_cache_vmap() on the local cpu.
2038 * flush_cache_vmap() can't be used as most supporting
2039 * data structures are not set up yet.
2040 */
2041
2042 /* copy static data */
2043 memcpy((void *)unit_addr, __per_cpu_load, ai->static_size);
1317 } 2044 }
1318 2045
1319 /* we're ready, commit */ 2046 /* we're ready, commit */
1320 pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n", 2047 pr_info("PERCPU: %d %s pages/cpu @%p s%zu r%zu d%zu\n",
1321 pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size); 2048 unit_pages, psize_str, vm.addr, ai->static_size,
2049 ai->reserved_size, ai->dyn_size);
2050
2051 rc = pcpu_setup_first_chunk(ai, vm.addr);
2052 goto out_free_ar;
2053
2054enomem:
2055 while (--j >= 0)
2056 free_fn(page_address(pages[j]), PAGE_SIZE);
2057 rc = -ENOMEM;
2058out_free_ar:
2059 free_bootmem(__pa(pages), pages_size);
2060 pcpu_free_alloc_info(ai);
2061 return rc;
2062}
2063#endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */
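
pcpu_page_first_chunk() keeps one flat pages[] array, so page i of unit u sits at index u * unit_pages + i and is mapped at vm.addr + u * unit_size + i * PAGE_SIZE. A throwaway sketch of that indexing, with hypothetical sizes:

/* sketch: flat pages[] indexing used above, with hypothetical sizes */
#include <stdio.h>

int main(void)
{
	unsigned long vm_addr = 0xf0000000UL;	/* pretend vmalloc base */
	unsigned long page_size = 4096, unit_pages = 8;
	unsigned long unit_size = unit_pages * page_size;
	unsigned long unit = 1, i = 3;
	unsigned long j = unit * unit_pages + i;	/* index into pages[] */

	printf("pages[%lu] maps at %#lx\n", j,
	       vm_addr + unit * unit_size + i * page_size);
	return 0;
}
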
2064
2065/*
2066 * Generic percpu area setup.
2067 *
2068 * The embedding helper is used because its behavior closely resembles
2069 * the original non-dynamic generic percpu area setup. This is
2070 * important because many archs have addressing restrictions and might
2071 * fail if the percpu area is located far away from the previous
2072 * location. As an added bonus, in non-NUMA cases, embedding is
2073 * generally a good idea TLB-wise because percpu area can piggy back
2074 * on the physical linear memory mapping which uses large page
2075 * mappings on applicable archs.
2076 */
2077#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
2078unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
2079EXPORT_SYMBOL(__per_cpu_offset);
1322 2080
1323 return pcpu_setup_first_chunk(pcpue_get_page, static_size, 2081static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
1324 reserved_size, dyn_size, 2082 size_t align)
1325 pcpue_unit_size, pcpue_ptr, NULL); 2083{
2084 return __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS));
2085}
2086
2087static void __init pcpu_dfl_fc_free(void *ptr, size_t size)
2088{
2089 free_bootmem(__pa(ptr), size);
2090}
2091
2092void __init setup_per_cpu_areas(void)
2093{
2094 unsigned long delta;
2095 unsigned int cpu;
2096 int rc;
2097
2098 /*
2099 * Always reserve area for module percpu variables. That's
2100 * what the legacy allocator did.
2101 */
2102 rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
2103 PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL,
2104 pcpu_dfl_fc_alloc, pcpu_dfl_fc_free);
2105 if (rc < 0)
 2106 panic("Failed to initialize percpu areas.");
2107
2108 delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
2109 for_each_possible_cpu(cpu)
2110 __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
1326} 2111}
2112#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
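
setup_per_cpu_areas() above reduces every cpu's per-cpu base to a single offset: the delta between the linked per-cpu image (__per_cpu_start) and the first chunk, plus that cpu's unit offset. A user-space sketch with invented addresses showing what each __per_cpu_offset[cpu] ends up holding:

/* sketch: what __per_cpu_offset[] ends up holding, with invented addresses */
#include <stdio.h>

int main(void)
{
	unsigned long per_cpu_start = 0x1000;	/* hypothetical link address */
	unsigned long base_addr = 0x8000;	/* hypothetical first-chunk base */
	unsigned long unit_off[2] = { 0x0, 0x2000 };
	unsigned long delta = base_addr - per_cpu_start;
	int cpu;

	for (cpu = 0; cpu < 2; cpu++)
		printf("__per_cpu_offset[%d] = %#lx\n", cpu, delta + unit_off[cpu]);
	return 0;
}
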
diff --git a/mm/quicklist.c b/mm/quicklist.c
index 8dbb6805ef35..6633965bb27b 100644
--- a/mm/quicklist.c
+++ b/mm/quicklist.c
@@ -19,7 +19,7 @@
19#include <linux/module.h> 19#include <linux/module.h>
20#include <linux/quicklist.h> 20#include <linux/quicklist.h>
21 21
22DEFINE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK]; 22DEFINE_PER_CPU(struct quicklist [CONFIG_NR_QUICK], quicklist);
23 23
24#define FRACTION_OF_NODE_MEM 16 24#define FRACTION_OF_NODE_MEM 16
25 25
@@ -29,7 +29,6 @@ static unsigned long max_pages(unsigned long min_pages)
29 int node = numa_node_id(); 29 int node = numa_node_id();
30 struct zone *zones = NODE_DATA(node)->node_zones; 30 struct zone *zones = NODE_DATA(node)->node_zones;
31 int num_cpus_on_node; 31 int num_cpus_on_node;
32 node_to_cpumask_ptr(cpumask_on_node, node);
33 32
34 node_free_pages = 33 node_free_pages =
35#ifdef CONFIG_ZONE_DMA 34#ifdef CONFIG_ZONE_DMA
@@ -42,7 +41,7 @@ static unsigned long max_pages(unsigned long min_pages)
42 41
43 max = node_free_pages / FRACTION_OF_NODE_MEM; 42 max = node_free_pages / FRACTION_OF_NODE_MEM;
44 43
45 num_cpus_on_node = cpus_weight_nr(*cpumask_on_node); 44 num_cpus_on_node = cpumask_weight(cpumask_of_node(node));
46 max /= num_cpus_on_node; 45 max /= num_cpus_on_node;
47 46
48 return max(max, min_pages); 47 return max(max, min_pages);
diff --git a/mm/readahead.c b/mm/readahead.c
index 9ce303d4b810..aa1aa2345235 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -31,6 +31,42 @@ EXPORT_SYMBOL_GPL(file_ra_state_init);
31 31
32#define list_to_page(head) (list_entry((head)->prev, struct page, lru)) 32#define list_to_page(head) (list_entry((head)->prev, struct page, lru))
33 33
34/*
35 * see if a page needs releasing upon read_cache_pages() failure
36 * - the caller of read_cache_pages() may have set PG_private or PG_fscache
37 * before calling, such as the NFS fs marking pages that are cached locally
38 * on disk, thus we need to give the fs a chance to clean up in the event of
39 * an error
40 */
41static void read_cache_pages_invalidate_page(struct address_space *mapping,
42 struct page *page)
43{
44 if (page_has_private(page)) {
45 if (!trylock_page(page))
46 BUG();
47 page->mapping = mapping;
48 do_invalidatepage(page, 0);
49 page->mapping = NULL;
50 unlock_page(page);
51 }
52 page_cache_release(page);
53}
54
55/*
56 * release a list of pages, invalidating them first if need be
57 */
58static void read_cache_pages_invalidate_pages(struct address_space *mapping,
59 struct list_head *pages)
60{
61 struct page *victim;
62
63 while (!list_empty(pages)) {
64 victim = list_to_page(pages);
65 list_del(&victim->lru);
66 read_cache_pages_invalidate_page(mapping, victim);
67 }
68}
69
34/** 70/**
35 * read_cache_pages - populate an address space with some pages & start reads against them 71 * read_cache_pages - populate an address space with some pages & start reads against them
36 * @mapping: the address_space 72 * @mapping: the address_space
@@ -52,14 +88,14 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages,
52 list_del(&page->lru); 88 list_del(&page->lru);
53 if (add_to_page_cache_lru(page, mapping, 89 if (add_to_page_cache_lru(page, mapping,
54 page->index, GFP_KERNEL)) { 90 page->index, GFP_KERNEL)) {
55 page_cache_release(page); 91 read_cache_pages_invalidate_page(mapping, page);
56 continue; 92 continue;
57 } 93 }
58 page_cache_release(page); 94 page_cache_release(page);
59 95
60 ret = filler(data, page); 96 ret = filler(data, page);
61 if (unlikely(ret)) { 97 if (unlikely(ret)) {
62 put_pages_list(pages); 98 read_cache_pages_invalidate_pages(mapping, pages);
63 break; 99 break;
64 } 100 }
65 task_io_account_read(PAGE_CACHE_SIZE); 101 task_io_account_read(PAGE_CACHE_SIZE);
@@ -97,15 +133,12 @@ out:
97} 133}
98 134
99/* 135/*
100 * do_page_cache_readahead actually reads a chunk of disk. It allocates all 136 * __do_page_cache_readahead() actually reads a chunk of disk. It allocates all
101 * the pages first, then submits them all for I/O. This avoids the very bad 137 * the pages first, then submits them all for I/O. This avoids the very bad
102 * behaviour which would occur if page allocations are causing VM writeback. 138 * behaviour which would occur if page allocations are causing VM writeback.
103 * We really don't want to intermingle reads and writes like that. 139 * We really don't want to intermingle reads and writes like that.
104 * 140 *
105 * Returns the number of pages requested, or the maximum amount of I/O allowed. 141 * Returns the number of pages requested, or the maximum amount of I/O allowed.
106 *
107 * do_page_cache_readahead() returns -1 if it encountered request queue
108 * congestion.
109 */ 142 */
110static int 143static int
111__do_page_cache_readahead(struct address_space *mapping, struct file *filp, 144__do_page_cache_readahead(struct address_space *mapping, struct file *filp,
@@ -174,6 +207,7 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
174 if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages)) 207 if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
175 return -EINVAL; 208 return -EINVAL;
176 209
210 nr_to_read = max_sane_readahead(nr_to_read);
177 while (nr_to_read) { 211 while (nr_to_read) {
178 int err; 212 int err;
179 213
@@ -195,22 +229,6 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
195} 229}
196 230
197/* 231/*
198 * This version skips the IO if the queue is read-congested, and will tell the
199 * block layer to abandon the readahead if request allocation would block.
200 *
201 * force_page_cache_readahead() will ignore queue congestion and will block on
202 * request queues.
203 */
204int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
205 pgoff_t offset, unsigned long nr_to_read)
206{
207 if (bdi_read_congested(mapping->backing_dev_info))
208 return -1;
209
210 return __do_page_cache_readahead(mapping, filp, offset, nr_to_read, 0);
211}
212
213/*
214 * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a 232 * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a
215 * sensible upper limit. 233 * sensible upper limit.
216 */ 234 */
@@ -223,7 +241,7 @@ unsigned long max_sane_readahead(unsigned long nr)
223/* 241/*
224 * Submit IO for the read-ahead request in file_ra_state. 242 * Submit IO for the read-ahead request in file_ra_state.
225 */ 243 */
226static unsigned long ra_submit(struct file_ra_state *ra, 244unsigned long ra_submit(struct file_ra_state *ra,
227 struct address_space *mapping, struct file *filp) 245 struct address_space *mapping, struct file *filp)
228{ 246{
229 int actual; 247 int actual;
@@ -312,6 +330,59 @@ static unsigned long get_next_ra_size(struct file_ra_state *ra,
312 */ 330 */
313 331
314/* 332/*
333 * Count contiguously cached pages from @offset-1 to @offset-@max,
334 * this count is a conservative estimation of
335 * - length of the sequential read sequence, or
336 * - thrashing threshold in memory tight systems
337 */
338static pgoff_t count_history_pages(struct address_space *mapping,
339 struct file_ra_state *ra,
340 pgoff_t offset, unsigned long max)
341{
342 pgoff_t head;
343
344 rcu_read_lock();
345 head = radix_tree_prev_hole(&mapping->page_tree, offset - 1, max);
346 rcu_read_unlock();
347
348 return offset - 1 - head;
349}
350
351/*
352 * page cache context based read-ahead
353 */
354static int try_context_readahead(struct address_space *mapping,
355 struct file_ra_state *ra,
356 pgoff_t offset,
357 unsigned long req_size,
358 unsigned long max)
359{
360 pgoff_t size;
361
362 size = count_history_pages(mapping, ra, offset, max);
363
364 /*
365 * no history pages:
366 * it could be a random read
367 */
368 if (!size)
369 return 0;
370
371 /*
372 * starts from beginning of file:
373 * it is a strong indication of long-run stream (or whole-file-read)
374 */
375 if (size >= offset)
376 size *= 2;
377
378 ra->start = offset;
379 ra->size = get_init_ra_size(size + req_size, max);
380 ra->async_size = ra->size;
381
382 return 1;
383}
384
385/*
315 * A minimal readahead algorithm for trivial sequential/random reads. 386 * A minimal readahead algorithm for trivial sequential/random reads.
316 */ 387 */
317static unsigned long 388static unsigned long
@@ -320,34 +391,26 @@ ondemand_readahead(struct address_space *mapping,
320 bool hit_readahead_marker, pgoff_t offset, 391 bool hit_readahead_marker, pgoff_t offset,
321 unsigned long req_size) 392 unsigned long req_size)
322{ 393{
323 int max = ra->ra_pages; /* max readahead pages */ 394 unsigned long max = max_sane_readahead(ra->ra_pages);
324 pgoff_t prev_offset; 395
325 int sequential; 396 /*
397 * start of file
398 */
399 if (!offset)
400 goto initial_readahead;
326 401
327 /* 402 /*
328 * It's the expected callback offset, assume sequential access. 403 * It's the expected callback offset, assume sequential access.
329 * Ramp up sizes, and push forward the readahead window. 404 * Ramp up sizes, and push forward the readahead window.
330 */ 405 */
331 if (offset && (offset == (ra->start + ra->size - ra->async_size) || 406 if ((offset == (ra->start + ra->size - ra->async_size) ||
332 offset == (ra->start + ra->size))) { 407 offset == (ra->start + ra->size))) {
333 ra->start += ra->size; 408 ra->start += ra->size;
334 ra->size = get_next_ra_size(ra, max); 409 ra->size = get_next_ra_size(ra, max);
335 ra->async_size = ra->size; 410 ra->async_size = ra->size;
336 goto readit; 411 goto readit;
337 } 412 }
338 413
339 prev_offset = ra->prev_pos >> PAGE_CACHE_SHIFT;
340 sequential = offset - prev_offset <= 1UL || req_size > max;
341
342 /*
343 * Standalone, small read.
344 * Read as is, and do not pollute the readahead state.
345 */
346 if (!hit_readahead_marker && !sequential) {
347 return __do_page_cache_readahead(mapping, filp,
348 offset, req_size, 0);
349 }
350
351 /* 414 /*
352 * Hit a marked page without valid readahead state. 415 * Hit a marked page without valid readahead state.
353 * E.g. interleaved reads. 416 * E.g. interleaved reads.
@@ -358,7 +421,7 @@ ondemand_readahead(struct address_space *mapping,
358 pgoff_t start; 421 pgoff_t start;
359 422
360 rcu_read_lock(); 423 rcu_read_lock();
361 start = radix_tree_next_hole(&mapping->page_tree, offset,max+1); 424 start = radix_tree_next_hole(&mapping->page_tree, offset+1,max);
362 rcu_read_unlock(); 425 rcu_read_unlock();
363 426
364 if (!start || start - offset > max) 427 if (!start || start - offset > max)
@@ -366,23 +429,53 @@ ondemand_readahead(struct address_space *mapping,
366 429
367 ra->start = start; 430 ra->start = start;
368 ra->size = start - offset; /* old async_size */ 431 ra->size = start - offset; /* old async_size */
432 ra->size += req_size;
369 ra->size = get_next_ra_size(ra, max); 433 ra->size = get_next_ra_size(ra, max);
370 ra->async_size = ra->size; 434 ra->async_size = ra->size;
371 goto readit; 435 goto readit;
372 } 436 }
373 437
374 /* 438 /*
375 * It may be one of 439 * oversize read
376 * - first read on start of file
377 * - sequential cache miss
378 * - oversize random read
379 * Start readahead for it.
380 */ 440 */
441 if (req_size > max)
442 goto initial_readahead;
443
444 /*
445 * sequential cache miss
446 */
447 if (offset - (ra->prev_pos >> PAGE_CACHE_SHIFT) <= 1UL)
448 goto initial_readahead;
449
450 /*
 451 * Query the page cache and look for the traces (cached history pages)
452 * that a sequential stream would leave behind.
453 */
454 if (try_context_readahead(mapping, ra, offset, req_size, max))
455 goto readit;
456
457 /*
458 * standalone, small random read
459 * Read as is, and do not pollute the readahead state.
460 */
461 return __do_page_cache_readahead(mapping, filp, offset, req_size, 0);
462
463initial_readahead:
381 ra->start = offset; 464 ra->start = offset;
382 ra->size = get_init_ra_size(req_size, max); 465 ra->size = get_init_ra_size(req_size, max);
383 ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size; 466 ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;
384 467
385readit: 468readit:
469 /*
470 * Will this read hit the readahead marker made by itself?
471 * If so, trigger the readahead marker hit now, and merge
 472 * the resulting next readahead window into the current one.
473 */
474 if (offset == ra->start && ra->size == ra->async_size) {
475 ra->async_size = get_next_ra_size(ra, max);
476 ra->size += ra->async_size;
477 }
478
386 return ra_submit(ra, mapping, filp); 479 return ra_submit(ra, mapping, filp);
387} 480}
388 481
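
On a sequential hit, ondemand_readahead() above slides the window past the previous one and lets it ramp up towards max before resubmitting. A user-space sketch of that advance; get_next() below is only a doubling approximation of get_next_ra_size(), which is not part of this hunk:

/* sketch of the sequential-hit window advance */
#include <stdio.h>

struct ra_state {
	unsigned long start, size, async_size;
};

static unsigned long get_next(unsigned long cur, unsigned long max)
{
	unsigned long next = cur * 2;		/* crude doubling ramp-up */

	return next < max ? next : max;
}

int main(void)
{
	struct ra_state ra = { .start = 0, .size = 4, .async_size = 4 };
	unsigned long max = 32;
	int hit;

	for (hit = 0; hit < 4; hit++) {
		/* pretend the read hit start + size - async_size */
		ra.start += ra.size;
		ra.size = get_next(ra.size, max);
		ra.async_size = ra.size;
		printf("window: start=%lu size=%lu async=%lu\n",
		       ra.start, ra.size, ra.async_size);
	}
	return 0;
}
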
diff --git a/mm/rmap.c b/mm/rmap.c
index 16521664010d..dd43373a483f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -14,7 +14,7 @@
14 * Original design by Rik van Riel <riel@conectiva.com.br> 2001 14 * Original design by Rik van Riel <riel@conectiva.com.br> 2001
15 * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004 15 * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
16 * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004 16 * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
17 * Contributions by Hugh Dickins <hugh@veritas.com> 2003, 2004 17 * Contributions by Hugh Dickins 2003, 2004
18 */ 18 */
19 19
20/* 20/*
@@ -36,6 +36,11 @@
36 * mapping->tree_lock (widely used, in set_page_dirty, 36 * mapping->tree_lock (widely used, in set_page_dirty,
37 * in arch-dependent flush_dcache_mmap_lock, 37 * in arch-dependent flush_dcache_mmap_lock,
38 * within inode_lock in __sync_single_inode) 38 * within inode_lock in __sync_single_inode)
39 *
40 * (code doesn't rely on that order so it could be switched around)
41 * ->tasklist_lock
42 * anon_vma->lock (memory_failure, collect_procs_anon)
43 * pte map lock
39 */ 44 */
40 45
41#include <linux/mm.h> 46#include <linux/mm.h>
@@ -191,7 +196,7 @@ void __init anon_vma_init(void)
191 * Getting a lock on a stable anon_vma from a page off the LRU is 196 * Getting a lock on a stable anon_vma from a page off the LRU is
 192 * tricky: page_lock_anon_vma relies on RCU to guard against the races. 197 * tricky: page_lock_anon_vma relies on RCU to guard against the races.
193 */ 198 */
194static struct anon_vma *page_lock_anon_vma(struct page *page) 199struct anon_vma *page_lock_anon_vma(struct page *page)
195{ 200{
196 struct anon_vma *anon_vma; 201 struct anon_vma *anon_vma;
197 unsigned long anon_mapping; 202 unsigned long anon_mapping;
@@ -211,7 +216,7 @@ out:
211 return NULL; 216 return NULL;
212} 217}
213 218
214static void page_unlock_anon_vma(struct anon_vma *anon_vma) 219void page_unlock_anon_vma(struct anon_vma *anon_vma)
215{ 220{
216 spin_unlock(&anon_vma->lock); 221 spin_unlock(&anon_vma->lock);
217 rcu_read_unlock(); 222 rcu_read_unlock();
@@ -237,8 +242,8 @@ vma_address(struct page *page, struct vm_area_struct *vma)
237} 242}
238 243
239/* 244/*
240 * At what user virtual address is page expected in vma? checking that the 245 * At what user virtual address is page expected in vma?
241 * page matches the vma: currently only used on anon pages, by unuse_vma; 246 * checking that the page matches the vma.
242 */ 247 */
243unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) 248unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
244{ 249{
@@ -311,7 +316,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
311 * if the page is not mapped into the page tables of this VMA. Only 316 * if the page is not mapped into the page tables of this VMA. Only
312 * valid for normal file or anonymous VMAs. 317 * valid for normal file or anonymous VMAs.
313 */ 318 */
314static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) 319int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
315{ 320{
316 unsigned long address; 321 unsigned long address;
317 pte_t *pte; 322 pte_t *pte;
@@ -333,7 +338,9 @@ static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
333 * repeatedly from either page_referenced_anon or page_referenced_file. 338 * repeatedly from either page_referenced_anon or page_referenced_file.
334 */ 339 */
335static int page_referenced_one(struct page *page, 340static int page_referenced_one(struct page *page,
336 struct vm_area_struct *vma, unsigned int *mapcount) 341 struct vm_area_struct *vma,
342 unsigned int *mapcount,
343 unsigned long *vm_flags)
337{ 344{
338 struct mm_struct *mm = vma->vm_mm; 345 struct mm_struct *mm = vma->vm_mm;
339 unsigned long address; 346 unsigned long address;
@@ -356,6 +363,7 @@ static int page_referenced_one(struct page *page,
356 */ 363 */
357 if (vma->vm_flags & VM_LOCKED) { 364 if (vma->vm_flags & VM_LOCKED) {
358 *mapcount = 1; /* break early from loop */ 365 *mapcount = 1; /* break early from loop */
366 *vm_flags |= VM_LOCKED;
359 goto out_unmap; 367 goto out_unmap;
360 } 368 }
361 369
@@ -381,11 +389,14 @@ out_unmap:
381 (*mapcount)--; 389 (*mapcount)--;
382 pte_unmap_unlock(pte, ptl); 390 pte_unmap_unlock(pte, ptl);
383out: 391out:
392 if (referenced)
393 *vm_flags |= vma->vm_flags;
384 return referenced; 394 return referenced;
385} 395}
386 396
387static int page_referenced_anon(struct page *page, 397static int page_referenced_anon(struct page *page,
388 struct mem_cgroup *mem_cont) 398 struct mem_cgroup *mem_cont,
399 unsigned long *vm_flags)
389{ 400{
390 unsigned int mapcount; 401 unsigned int mapcount;
391 struct anon_vma *anon_vma; 402 struct anon_vma *anon_vma;
@@ -405,7 +416,8 @@ static int page_referenced_anon(struct page *page,
405 */ 416 */
406 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) 417 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
407 continue; 418 continue;
408 referenced += page_referenced_one(page, vma, &mapcount); 419 referenced += page_referenced_one(page, vma,
420 &mapcount, vm_flags);
409 if (!mapcount) 421 if (!mapcount)
410 break; 422 break;
411 } 423 }
@@ -418,6 +430,7 @@ static int page_referenced_anon(struct page *page,
418 * page_referenced_file - referenced check for object-based rmap 430 * page_referenced_file - referenced check for object-based rmap
419 * @page: the page we're checking references on. 431 * @page: the page we're checking references on.
420 * @mem_cont: target memory controller 432 * @mem_cont: target memory controller
 433 * @vm_flags: collect the vm_flags of VMAs which actually referenced the page
421 * 434 *
422 * For an object-based mapped page, find all the places it is mapped and 435 * For an object-based mapped page, find all the places it is mapped and
423 * check/clear the referenced flag. This is done by following the page->mapping 436 * check/clear the referenced flag. This is done by following the page->mapping
@@ -427,7 +440,8 @@ static int page_referenced_anon(struct page *page,
427 * This function is only called from page_referenced for object-based pages. 440 * This function is only called from page_referenced for object-based pages.
428 */ 441 */
429static int page_referenced_file(struct page *page, 442static int page_referenced_file(struct page *page,
430 struct mem_cgroup *mem_cont) 443 struct mem_cgroup *mem_cont,
444 unsigned long *vm_flags)
431{ 445{
432 unsigned int mapcount; 446 unsigned int mapcount;
433 struct address_space *mapping = page->mapping; 447 struct address_space *mapping = page->mapping;
@@ -467,7 +481,8 @@ static int page_referenced_file(struct page *page,
467 */ 481 */
468 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) 482 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
469 continue; 483 continue;
470 referenced += page_referenced_one(page, vma, &mapcount); 484 referenced += page_referenced_one(page, vma,
485 &mapcount, vm_flags);
471 if (!mapcount) 486 if (!mapcount)
472 break; 487 break;
473 } 488 }
@@ -481,29 +496,35 @@ static int page_referenced_file(struct page *page,
481 * @page: the page to test 496 * @page: the page to test
482 * @is_locked: caller holds lock on the page 497 * @is_locked: caller holds lock on the page
483 * @mem_cont: target memory controller 498 * @mem_cont: target memory controller
 499 * @vm_flags: collect the vm_flags of VMAs which actually referenced the page
484 * 500 *
485 * Quick test_and_clear_referenced for all mappings to a page, 501 * Quick test_and_clear_referenced for all mappings to a page,
486 * returns the number of ptes which referenced the page. 502 * returns the number of ptes which referenced the page.
487 */ 503 */
488int page_referenced(struct page *page, int is_locked, 504int page_referenced(struct page *page,
489 struct mem_cgroup *mem_cont) 505 int is_locked,
506 struct mem_cgroup *mem_cont,
507 unsigned long *vm_flags)
490{ 508{
491 int referenced = 0; 509 int referenced = 0;
492 510
493 if (TestClearPageReferenced(page)) 511 if (TestClearPageReferenced(page))
494 referenced++; 512 referenced++;
495 513
514 *vm_flags = 0;
496 if (page_mapped(page) && page->mapping) { 515 if (page_mapped(page) && page->mapping) {
497 if (PageAnon(page)) 516 if (PageAnon(page))
498 referenced += page_referenced_anon(page, mem_cont); 517 referenced += page_referenced_anon(page, mem_cont,
518 vm_flags);
499 else if (is_locked) 519 else if (is_locked)
500 referenced += page_referenced_file(page, mem_cont); 520 referenced += page_referenced_file(page, mem_cont,
521 vm_flags);
501 else if (!trylock_page(page)) 522 else if (!trylock_page(page))
502 referenced++; 523 referenced++;
503 else { 524 else {
504 if (page->mapping) 525 if (page->mapping)
505 referenced += 526 referenced += page_referenced_file(page,
506 page_referenced_file(page, mem_cont); 527 mem_cont, vm_flags);
507 unlock_page(page); 528 unlock_page(page);
508 } 529 }
509 } 530 }
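
page_referenced() now also reports, through the new vm_flags out-parameter, the union of vm_flags from the VMAs that actually referenced the page, so a caller can react to, say, VM_LOCKED or VM_EXEC mappings without another rmap walk. A user-space mock of that calling convention; the flag values are made up to resemble, not reproduce, the kernel's VM_* bits:

/* mock of the new out-parameter convention; flag values are illustrative */
#include <stdio.h>

#define MOCK_VM_EXEC	0x00000004UL
#define MOCK_VM_LOCKED	0x00002000UL

static int mock_page_referenced(unsigned long *vm_flags)
{
	*vm_flags = 0;
	*vm_flags |= MOCK_VM_EXEC;	/* first referencing vma was executable */
	*vm_flags |= MOCK_VM_LOCKED;	/* second one was mlocked */
	return 2;			/* two ptes referenced the page */
}

int main(void)
{
	unsigned long vm_flags;
	int referenced = mock_page_referenced(&vm_flags);

	if (vm_flags & MOCK_VM_LOCKED)
		printf("an mlocked mapping referenced the page (%d refs)\n",
		       referenced);
	return 0;
}
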
@@ -688,31 +709,12 @@ void page_add_new_anon_rmap(struct page *page,
688 */ 709 */
689void page_add_file_rmap(struct page *page) 710void page_add_file_rmap(struct page *page)
690{ 711{
691 if (atomic_inc_and_test(&page->_mapcount)) 712 if (atomic_inc_and_test(&page->_mapcount)) {
692 __inc_zone_page_state(page, NR_FILE_MAPPED); 713 __inc_zone_page_state(page, NR_FILE_MAPPED);
714 mem_cgroup_update_mapped_file_stat(page, 1);
715 }
693} 716}
694 717
695#ifdef CONFIG_DEBUG_VM
696/**
697 * page_dup_rmap - duplicate pte mapping to a page
698 * @page: the page to add the mapping to
699 * @vma: the vm area being duplicated
700 * @address: the user virtual address mapped
701 *
702 * For copy_page_range only: minimal extract from page_add_file_rmap /
703 * page_add_anon_rmap, avoiding unnecessary tests (already checked) so it's
704 * quicker.
705 *
706 * The caller needs to hold the pte lock.
707 */
708void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address)
709{
710 if (PageAnon(page))
711 __page_check_anon_rmap(page, vma, address);
712 atomic_inc(&page->_mapcount);
713}
714#endif
715
716/** 718/**
717 * page_remove_rmap - take down pte mapping from a page 719 * page_remove_rmap - take down pte mapping from a page
718 * @page: page to remove mapping from 720 * @page: page to remove mapping from
@@ -721,33 +723,37 @@ void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long
721 */ 723 */
722void page_remove_rmap(struct page *page) 724void page_remove_rmap(struct page *page)
723{ 725{
724 if (atomic_add_negative(-1, &page->_mapcount)) { 726 /* page still mapped by someone else? */
725 /* 727 if (!atomic_add_negative(-1, &page->_mapcount))
726 * Now that the last pte has gone, s390 must transfer dirty 728 return;
727 * flag from storage key to struct page. We can usually skip 729
728 * this if the page is anon, so about to be freed; but perhaps 730 /*
729 * not if it's in swapcache - there might be another pte slot 731 * Now that the last pte has gone, s390 must transfer dirty
730 * containing the swap entry, but page not yet written to swap. 732 * flag from storage key to struct page. We can usually skip
731 */ 733 * this if the page is anon, so about to be freed; but perhaps
732 if ((!PageAnon(page) || PageSwapCache(page)) && 734 * not if it's in swapcache - there might be another pte slot
733 page_test_dirty(page)) { 735 * containing the swap entry, but page not yet written to swap.
734 page_clear_dirty(page); 736 */
735 set_page_dirty(page); 737 if ((!PageAnon(page) || PageSwapCache(page)) && page_test_dirty(page)) {
736 } 738 page_clear_dirty(page);
737 if (PageAnon(page)) 739 set_page_dirty(page);
738 mem_cgroup_uncharge_page(page); 740 }
739 __dec_zone_page_state(page, 741 if (PageAnon(page)) {
740 PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED); 742 mem_cgroup_uncharge_page(page);
741 /* 743 __dec_zone_page_state(page, NR_ANON_PAGES);
742 * It would be tidy to reset the PageAnon mapping here, 744 } else {
743 * but that might overwrite a racing page_add_anon_rmap 745 __dec_zone_page_state(page, NR_FILE_MAPPED);
744 * which increments mapcount after us but sets mapping
745 * before us: so leave the reset to free_hot_cold_page,
746 * and remember that it's only reliable while mapped.
747 * Leaving it set also helps swapoff to reinstate ptes
748 * faster for those pages still in swapcache.
749 */
750 } 746 }
747 mem_cgroup_update_mapped_file_stat(page, -1);
748 /*
749 * It would be tidy to reset the PageAnon mapping here,
750 * but that might overwrite a racing page_add_anon_rmap
751 * which increments mapcount after us but sets mapping
752 * before us: so leave the reset to free_hot_cold_page,
753 * and remember that it's only reliable while mapped.
754 * Leaving it set also helps swapoff to reinstate ptes
755 * faster for those pages still in swapcache.
756 */
751} 757}
752 758
753/* 759/*
@@ -755,7 +761,7 @@ void page_remove_rmap(struct page *page)
755 * repeatedly from either try_to_unmap_anon or try_to_unmap_file. 761 * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
756 */ 762 */
757static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, 763static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
758 int migration) 764 enum ttu_flags flags)
759{ 765{
760 struct mm_struct *mm = vma->vm_mm; 766 struct mm_struct *mm = vma->vm_mm;
761 unsigned long address; 767 unsigned long address;
@@ -777,11 +783,13 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
777 * If it's recently referenced (perhaps page_referenced 783 * If it's recently referenced (perhaps page_referenced
778 * skipped over this mm) then we should reactivate it. 784 * skipped over this mm) then we should reactivate it.
779 */ 785 */
780 if (!migration) { 786 if (!(flags & TTU_IGNORE_MLOCK)) {
781 if (vma->vm_flags & VM_LOCKED) { 787 if (vma->vm_flags & VM_LOCKED) {
782 ret = SWAP_MLOCK; 788 ret = SWAP_MLOCK;
783 goto out_unmap; 789 goto out_unmap;
784 } 790 }
791 }
792 if (!(flags & TTU_IGNORE_ACCESS)) {
785 if (ptep_clear_flush_young_notify(vma, address, pte)) { 793 if (ptep_clear_flush_young_notify(vma, address, pte)) {
786 ret = SWAP_FAIL; 794 ret = SWAP_FAIL;
787 goto out_unmap; 795 goto out_unmap;
@@ -799,7 +807,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
799 /* Update high watermark before we lower rss */ 807 /* Update high watermark before we lower rss */
800 update_hiwater_rss(mm); 808 update_hiwater_rss(mm);
801 809
802 if (PageAnon(page)) { 810 if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
811 if (PageAnon(page))
812 dec_mm_counter(mm, anon_rss);
813 else
814 dec_mm_counter(mm, file_rss);
815 set_pte_at(mm, address, pte,
816 swp_entry_to_pte(make_hwpoison_entry(page)));
817 } else if (PageAnon(page)) {
803 swp_entry_t entry = { .val = page_private(page) }; 818 swp_entry_t entry = { .val = page_private(page) };
804 819
805 if (PageSwapCache(page)) { 820 if (PageSwapCache(page)) {
@@ -821,12 +836,12 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
821 * pte. do_swap_page() will wait until the migration 836 * pte. do_swap_page() will wait until the migration
822 * pte is removed and then restart fault handling. 837 * pte is removed and then restart fault handling.
823 */ 838 */
824 BUG_ON(!migration); 839 BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION);
825 entry = make_migration_entry(page, pte_write(pteval)); 840 entry = make_migration_entry(page, pte_write(pteval));
826 } 841 }
827 set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); 842 set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
828 BUG_ON(pte_file(*pte)); 843 BUG_ON(pte_file(*pte));
829 } else if (PAGE_MIGRATION && migration) { 844 } else if (PAGE_MIGRATION && (TTU_ACTION(flags) == TTU_MIGRATION)) {
830 /* Establish migration entry for a file page */ 845 /* Establish migration entry for a file page */
831 swp_entry_t entry; 846 swp_entry_t entry;
832 entry = make_migration_entry(page, pte_write(pteval)); 847 entry = make_migration_entry(page, pte_write(pteval));
@@ -995,12 +1010,13 @@ static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma)
995 * vm_flags for that VMA. That should be OK, because that vma shouldn't be 1010 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
996 * 'LOCKED. 1011 * 'LOCKED.
997 */ 1012 */
998static int try_to_unmap_anon(struct page *page, int unlock, int migration) 1013static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
999{ 1014{
1000 struct anon_vma *anon_vma; 1015 struct anon_vma *anon_vma;
1001 struct vm_area_struct *vma; 1016 struct vm_area_struct *vma;
1002 unsigned int mlocked = 0; 1017 unsigned int mlocked = 0;
1003 int ret = SWAP_AGAIN; 1018 int ret = SWAP_AGAIN;
1019 int unlock = TTU_ACTION(flags) == TTU_MUNLOCK;
1004 1020
1005 if (MLOCK_PAGES && unlikely(unlock)) 1021 if (MLOCK_PAGES && unlikely(unlock))
1006 ret = SWAP_SUCCESS; /* default for try_to_munlock() */ 1022 ret = SWAP_SUCCESS; /* default for try_to_munlock() */
@@ -1016,7 +1032,7 @@ static int try_to_unmap_anon(struct page *page, int unlock, int migration)
1016 continue; /* must visit all unlocked vmas */ 1032 continue; /* must visit all unlocked vmas */
1017 ret = SWAP_MLOCK; /* saw at least one mlocked vma */ 1033 ret = SWAP_MLOCK; /* saw at least one mlocked vma */
1018 } else { 1034 } else {
1019 ret = try_to_unmap_one(page, vma, migration); 1035 ret = try_to_unmap_one(page, vma, flags);
1020 if (ret == SWAP_FAIL || !page_mapped(page)) 1036 if (ret == SWAP_FAIL || !page_mapped(page))
1021 break; 1037 break;
1022 } 1038 }
@@ -1040,8 +1056,7 @@ static int try_to_unmap_anon(struct page *page, int unlock, int migration)
1040/** 1056/**
1041 * try_to_unmap_file - unmap/unlock file page using the object-based rmap method 1057 * try_to_unmap_file - unmap/unlock file page using the object-based rmap method
1042 * @page: the page to unmap/unlock 1058 * @page: the page to unmap/unlock
1043 * @unlock: request for unlock rather than unmap [unlikely] 1059 * @flags: action and flags
1044 * @migration: unmapping for migration - ignored if @unlock
1045 * 1060 *
1046 * Find all the mappings of a page using the mapping pointer and the vma chains 1061 * Find all the mappings of a page using the mapping pointer and the vma chains
1047 * contained in the address_space struct it points to. 1062 * contained in the address_space struct it points to.
@@ -1053,7 +1068,7 @@ static int try_to_unmap_anon(struct page *page, int unlock, int migration)
1053 * vm_flags for that VMA. That should be OK, because that vma shouldn't be 1068 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
1054 * 'LOCKED. 1069 * 'LOCKED.
1055 */ 1070 */
1056static int try_to_unmap_file(struct page *page, int unlock, int migration) 1071static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1057{ 1072{
1058 struct address_space *mapping = page->mapping; 1073 struct address_space *mapping = page->mapping;
1059 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 1074 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -1065,6 +1080,7 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration)
1065 unsigned long max_nl_size = 0; 1080 unsigned long max_nl_size = 0;
1066 unsigned int mapcount; 1081 unsigned int mapcount;
1067 unsigned int mlocked = 0; 1082 unsigned int mlocked = 0;
1083 int unlock = TTU_ACTION(flags) == TTU_MUNLOCK;
1068 1084
1069 if (MLOCK_PAGES && unlikely(unlock)) 1085 if (MLOCK_PAGES && unlikely(unlock))
1070 ret = SWAP_SUCCESS; /* default for try_to_munlock() */ 1086 ret = SWAP_SUCCESS; /* default for try_to_munlock() */
@@ -1077,7 +1093,7 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration)
1077 continue; /* must visit all vmas */ 1093 continue; /* must visit all vmas */
1078 ret = SWAP_MLOCK; 1094 ret = SWAP_MLOCK;
1079 } else { 1095 } else {
1080 ret = try_to_unmap_one(page, vma, migration); 1096 ret = try_to_unmap_one(page, vma, flags);
1081 if (ret == SWAP_FAIL || !page_mapped(page)) 1097 if (ret == SWAP_FAIL || !page_mapped(page))
1082 goto out; 1098 goto out;
1083 } 1099 }
@@ -1102,7 +1118,8 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration)
1102 ret = SWAP_MLOCK; /* leave mlocked == 0 */ 1118 ret = SWAP_MLOCK; /* leave mlocked == 0 */
1103 goto out; /* no need to look further */ 1119 goto out; /* no need to look further */
1104 } 1120 }
1105 if (!MLOCK_PAGES && !migration && (vma->vm_flags & VM_LOCKED)) 1121 if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) &&
1122 (vma->vm_flags & VM_LOCKED))
1106 continue; 1123 continue;
1107 cursor = (unsigned long) vma->vm_private_data; 1124 cursor = (unsigned long) vma->vm_private_data;
1108 if (cursor > max_nl_cursor) 1125 if (cursor > max_nl_cursor)
@@ -1136,7 +1153,7 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration)
1136 do { 1153 do {
1137 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 1154 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
1138 shared.vm_set.list) { 1155 shared.vm_set.list) {
1139 if (!MLOCK_PAGES && !migration && 1156 if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) &&
1140 (vma->vm_flags & VM_LOCKED)) 1157 (vma->vm_flags & VM_LOCKED))
1141 continue; 1158 continue;
1142 cursor = (unsigned long) vma->vm_private_data; 1159 cursor = (unsigned long) vma->vm_private_data;
@@ -1176,7 +1193,7 @@ out:
1176/** 1193/**
1177 * try_to_unmap - try to remove all page table mappings to a page 1194 * try_to_unmap - try to remove all page table mappings to a page
1178 * @page: the page to get unmapped 1195 * @page: the page to get unmapped
1179 * @migration: migration flag 1196 * @flags: action and flags
1180 * 1197 *
1181 * Tries to remove all the page table entries which are mapping this 1198 * Tries to remove all the page table entries which are mapping this
1182 * page, used in the pageout path. Caller must hold the page lock. 1199 * page, used in the pageout path. Caller must hold the page lock.
@@ -1187,22 +1204,21 @@ out:
1187 * SWAP_FAIL - the page is unswappable 1204 * SWAP_FAIL - the page is unswappable
1188 * SWAP_MLOCK - page is mlocked. 1205 * SWAP_MLOCK - page is mlocked.
1189 */ 1206 */
1190int try_to_unmap(struct page *page, int migration) 1207int try_to_unmap(struct page *page, enum ttu_flags flags)
1191{ 1208{
1192 int ret; 1209 int ret;
1193 1210
1194 BUG_ON(!PageLocked(page)); 1211 BUG_ON(!PageLocked(page));
1195 1212
1196 if (PageAnon(page)) 1213 if (PageAnon(page))
1197 ret = try_to_unmap_anon(page, 0, migration); 1214 ret = try_to_unmap_anon(page, flags);
1198 else 1215 else
1199 ret = try_to_unmap_file(page, 0, migration); 1216 ret = try_to_unmap_file(page, flags);
1200 if (ret != SWAP_MLOCK && !page_mapped(page)) 1217 if (ret != SWAP_MLOCK && !page_mapped(page))
1201 ret = SWAP_SUCCESS; 1218 ret = SWAP_SUCCESS;
1202 return ret; 1219 return ret;
1203} 1220}
1204 1221
1205#ifdef CONFIG_UNEVICTABLE_LRU
1206/** 1222/**
1207 * try_to_munlock - try to munlock a page 1223 * try_to_munlock - try to munlock a page
1208 * @page: the page to be munlocked 1224 * @page: the page to be munlocked
@@ -1222,8 +1238,8 @@ int try_to_munlock(struct page *page)
1222 VM_BUG_ON(!PageLocked(page) || PageLRU(page)); 1238 VM_BUG_ON(!PageLocked(page) || PageLRU(page));
1223 1239
1224 if (PageAnon(page)) 1240 if (PageAnon(page))
1225 return try_to_unmap_anon(page, 1, 0); 1241 return try_to_unmap_anon(page, TTU_MUNLOCK);
1226 else 1242 else
1227 return try_to_unmap_file(page, 1, 0); 1243 return try_to_unmap_file(page, TTU_MUNLOCK);
1228} 1244}
1229#endif 1245
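
The hunks above replace the old unlock/migration integer pair with a single ttu_flags argument: the low bits pick the action, the high bits are modifiers, and TTU_ACTION() masks the action back out. A compilable sketch of one plausible encoding; the authoritative definitions live in include/linux/rmap.h:

/* illustrative encoding only, not the kernel header */
#include <stdio.h>

enum ttu_flags {
	TTU_UNMAP	= 0,		/* plain unmap */
	TTU_MIGRATION	= 1,		/* unmap for migration */
	TTU_MUNLOCK	= 2,		/* munlock mode */
	TTU_ACTION_MASK	= 0xff,

	TTU_IGNORE_MLOCK	= (1 << 8),	/* ignore VM_LOCKED vmas */
	TTU_IGNORE_ACCESS	= (1 << 9),	/* don't reactivate on young pte */
	TTU_IGNORE_HWPOISON	= (1 << 10),	/* poisoned page may be unmapped */
};

#define TTU_ACTION(x)	((x) & TTU_ACTION_MASK)

int main(void)
{
	int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK;

	printf("action=%d, ignores mlock=%d\n",
	       TTU_ACTION(flags), !!(flags & TTU_IGNORE_MLOCK));
	return 0;
}
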
diff --git a/mm/shmem.c b/mm/shmem.c
index d94d2e9146bc..356dd99566ec 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -24,6 +24,7 @@
24#include <linux/init.h> 24#include <linux/init.h>
25#include <linux/vfs.h> 25#include <linux/vfs.h>
26#include <linux/mount.h> 26#include <linux/mount.h>
27#include <linux/pagemap.h>
27#include <linux/file.h> 28#include <linux/file.h>
28#include <linux/mm.h> 29#include <linux/mm.h>
29#include <linux/module.h> 30#include <linux/module.h>
@@ -43,13 +44,11 @@ static struct vfsmount *shm_mnt;
43#include <linux/exportfs.h> 44#include <linux/exportfs.h>
44#include <linux/generic_acl.h> 45#include <linux/generic_acl.h>
45#include <linux/mman.h> 46#include <linux/mman.h>
46#include <linux/pagemap.h>
47#include <linux/string.h> 47#include <linux/string.h>
48#include <linux/slab.h> 48#include <linux/slab.h>
49#include <linux/backing-dev.h> 49#include <linux/backing-dev.h>
50#include <linux/shmem_fs.h> 50#include <linux/shmem_fs.h>
51#include <linux/writeback.h> 51#include <linux/writeback.h>
52#include <linux/vfs.h>
53#include <linux/blkdev.h> 52#include <linux/blkdev.h>
54#include <linux/security.h> 53#include <linux/security.h>
55#include <linux/swapops.h> 54#include <linux/swapops.h>
@@ -65,13 +64,28 @@ static struct vfsmount *shm_mnt;
65#include <asm/div64.h> 64#include <asm/div64.h>
66#include <asm/pgtable.h> 65#include <asm/pgtable.h>
67 66
67/*
68 * The maximum size of a shmem/tmpfs file is limited by the maximum size of
69 * its triple-indirect swap vector - see illustration at shmem_swp_entry().
70 *
71 * With 4kB page size, maximum file size is just over 2TB on a 32-bit kernel,
72 * but one eighth of that on a 64-bit kernel. With 8kB page size, maximum
73 * file size is just over 4TB on a 64-bit kernel, but 16TB on a 32-bit kernel,
74 * MAX_LFS_FILESIZE being then more restrictive than swap vector layout.
75 *
76 * We use / and * instead of shifts in the definitions below, so that the swap
77 * vector can be tested with small even values (e.g. 20) for ENTRIES_PER_PAGE.
78 */
68#define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long)) 79#define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long))
69#define ENTRIES_PER_PAGEPAGE (ENTRIES_PER_PAGE*ENTRIES_PER_PAGE) 80#define ENTRIES_PER_PAGEPAGE ((unsigned long long)ENTRIES_PER_PAGE*ENTRIES_PER_PAGE)
70#define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) 81
82#define SHMSWP_MAX_INDEX (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1))
83#define SHMSWP_MAX_BYTES (SHMSWP_MAX_INDEX << PAGE_CACHE_SHIFT)
71 84
72#define SHMEM_MAX_INDEX (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1)) 85#define SHMEM_MAX_BYTES min_t(unsigned long long, SHMSWP_MAX_BYTES, MAX_LFS_FILESIZE)
73#define SHMEM_MAX_BYTES ((unsigned long long)SHMEM_MAX_INDEX << PAGE_CACHE_SHIFT) 86#define SHMEM_MAX_INDEX ((unsigned long)((SHMEM_MAX_BYTES+1) >> PAGE_CACHE_SHIFT))
74 87
88#define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512)
75#define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT) 89#define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT)
76 90
77/* info->flags needs VM_flags to handle pagein/truncate races efficiently */ 91/* info->flags needs VM_flags to handle pagein/truncate races efficiently */
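
The size comment above can be checked with a few lines of arithmetic; the sketch below assumes SHMEM_NR_DIRECT is 16 and PAGE_CACHE_SIZE is 4096 bytes, both taken from outside this hunk:

/* quick check of the maximum-file-size comment; constants are assumptions */
#include <stdio.h>

int main(void)
{
	unsigned long long page = 4096, nr_direct = 16;
	int bits;

	for (bits = 32; bits <= 64; bits += 32) {
		unsigned long long epp = page / (bits / 8);	/* ENTRIES_PER_PAGE */
		unsigned long long eppp = epp * epp;		/* ENTRIES_PER_PAGEPAGE */
		unsigned long long idx = nr_direct + (eppp / 2) * (epp + 1);

		printf("%d-bit: about %llu GiB\n", bits, idx * page >> 30);
	}
	return 0;
}

It prints roughly 2050 GiB for the 32-bit layout and 256 GiB for the 64-bit one, matching the "just over 2TB" and "one eighth of that" figures in the comment.
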
@@ -204,7 +218,7 @@ static const struct file_operations shmem_file_operations;
204static const struct inode_operations shmem_inode_operations; 218static const struct inode_operations shmem_inode_operations;
205static const struct inode_operations shmem_dir_inode_operations; 219static const struct inode_operations shmem_dir_inode_operations;
206static const struct inode_operations shmem_special_inode_operations; 220static const struct inode_operations shmem_special_inode_operations;
207static struct vm_operations_struct shmem_vm_ops; 221static const struct vm_operations_struct shmem_vm_ops;
208 222
209static struct backing_dev_info shmem_backing_dev_info __read_mostly = { 223static struct backing_dev_info shmem_backing_dev_info __read_mostly = {
210 .ra_pages = 0, /* No readahead */ 224 .ra_pages = 0, /* No readahead */
@@ -1032,8 +1046,9 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1032 * sync from ever calling shmem_writepage; but a stacking filesystem 1046 * sync from ever calling shmem_writepage; but a stacking filesystem
1033 * may use the ->writepage of its underlying filesystem, in which case 1047 * may use the ->writepage of its underlying filesystem, in which case
1034 * tmpfs should write out to swap only in response to memory pressure, 1048 * tmpfs should write out to swap only in response to memory pressure,
1035 * and not for pdflush or sync. However, in those cases, we do still 1049 * and not for the writeback threads or sync. However, in those cases,
1036 * want to check if there's a redundant swappage to be discarded. 1050 * we do still want to check if there's a redundant swappage to be
1051 * discarded.
1037 */ 1052 */
1038 if (wbc->for_reclaim) 1053 if (wbc->for_reclaim)
1039 swap = get_swap_page(); 1054 swap = get_swap_page();
@@ -1082,7 +1097,11 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1082 shmem_swp_unmap(entry); 1097 shmem_swp_unmap(entry);
1083unlock: 1098unlock:
1084 spin_unlock(&info->lock); 1099 spin_unlock(&info->lock);
1085 swap_free(swap); 1100 /*
1101 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
1102 * clear SWAP_HAS_CACHE flag.
1103 */
1104 swapcache_free(swap, NULL);
1086redirty: 1105redirty:
1087 set_page_dirty(page); 1106 set_page_dirty(page);
1088 if (wbc->for_reclaim) 1107 if (wbc->for_reclaim)
@@ -1325,8 +1344,12 @@ repeat:
1325 shmem_swp_unmap(entry); 1344 shmem_swp_unmap(entry);
1326 spin_unlock(&info->lock); 1345 spin_unlock(&info->lock);
1327 if (error == -ENOMEM) { 1346 if (error == -ENOMEM) {
1328 /* allow reclaim from this memory cgroup */ 1347 /*
1329 error = mem_cgroup_shrink_usage(swappage, 1348 * reclaim from proper memory cgroup and
1349 * call memcg's OOM if needed.
1350 */
1351 error = mem_cgroup_shmem_charge_fallback(
1352 swappage,
1330 current->mm, 1353 current->mm,
1331 gfp); 1354 gfp);
1332 if (error) { 1355 if (error) {
@@ -1539,6 +1562,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, int mode,
1539 spin_lock_init(&info->lock); 1562 spin_lock_init(&info->lock);
1540 info->flags = flags & VM_NORESERVE; 1563 info->flags = flags & VM_NORESERVE;
1541 INIT_LIST_HEAD(&info->swaplist); 1564 INIT_LIST_HEAD(&info->swaplist);
1565 cache_no_acl(inode);
1542 1566
1543 switch (mode & S_IFMT) { 1567 switch (mode & S_IFMT) {
1544 default: 1568 default:
@@ -1610,8 +1634,8 @@ shmem_write_end(struct file *file, struct address_space *mapping,
1610 if (pos + copied > inode->i_size) 1634 if (pos + copied > inode->i_size)
1611 i_size_write(inode, pos + copied); 1635 i_size_write(inode, pos + copied);
1612 1636
1613 unlock_page(page);
1614 set_page_dirty(page); 1637 set_page_dirty(page);
1638 unlock_page(page);
1615 page_cache_release(page); 1639 page_cache_release(page);
1616 1640
1617 return copied; 1641 return copied;
@@ -1948,13 +1972,13 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
1948 iput(inode); 1972 iput(inode);
1949 return error; 1973 return error;
1950 } 1974 }
1951 unlock_page(page);
1952 inode->i_mapping->a_ops = &shmem_aops; 1975 inode->i_mapping->a_ops = &shmem_aops;
1953 inode->i_op = &shmem_symlink_inode_operations; 1976 inode->i_op = &shmem_symlink_inode_operations;
1954 kaddr = kmap_atomic(page, KM_USER0); 1977 kaddr = kmap_atomic(page, KM_USER0);
1955 memcpy(kaddr, symname, len); 1978 memcpy(kaddr, symname, len);
1956 kunmap_atomic(kaddr, KM_USER0); 1979 kunmap_atomic(kaddr, KM_USER0);
1957 set_page_dirty(page); 1980 set_page_dirty(page);
1981 unlock_page(page);
1958 page_cache_release(page); 1982 page_cache_release(page);
1959 } 1983 }
1960 if (dir->i_mode & S_ISGID) 1984 if (dir->i_mode & S_ISGID)
@@ -2278,8 +2302,7 @@ static void shmem_put_super(struct super_block *sb)
2278 sb->s_fs_info = NULL; 2302 sb->s_fs_info = NULL;
2279} 2303}
2280 2304
2281static int shmem_fill_super(struct super_block *sb, 2305int shmem_fill_super(struct super_block *sb, void *data, int silent)
2282 void *data, int silent)
2283{ 2306{
2284 struct inode *inode; 2307 struct inode *inode;
2285 struct dentry *root; 2308 struct dentry *root;
@@ -2287,17 +2310,14 @@ static int shmem_fill_super(struct super_block *sb,
2287 int err = -ENOMEM; 2310 int err = -ENOMEM;
2288 2311
2289 /* Round up to L1_CACHE_BYTES to resist false sharing */ 2312 /* Round up to L1_CACHE_BYTES to resist false sharing */
2290 sbinfo = kmalloc(max((int)sizeof(struct shmem_sb_info), 2313 sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info),
2291 L1_CACHE_BYTES), GFP_KERNEL); 2314 L1_CACHE_BYTES), GFP_KERNEL);
2292 if (!sbinfo) 2315 if (!sbinfo)
2293 return -ENOMEM; 2316 return -ENOMEM;
2294 2317
2295 sbinfo->max_blocks = 0;
2296 sbinfo->max_inodes = 0;
2297 sbinfo->mode = S_IRWXUGO | S_ISVTX; 2318 sbinfo->mode = S_IRWXUGO | S_ISVTX;
2298 sbinfo->uid = current_fsuid(); 2319 sbinfo->uid = current_fsuid();
2299 sbinfo->gid = current_fsgid(); 2320 sbinfo->gid = current_fsgid();
2300 sbinfo->mpol = NULL;
2301 sb->s_fs_info = sbinfo; 2321 sb->s_fs_info = sbinfo;
2302 2322
2303#ifdef CONFIG_TMPFS 2323#ifdef CONFIG_TMPFS
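
Switching the sbinfo allocation from kmalloc() to kzalloc() is what allows the explicit max_blocks, max_inodes and mpol initialisations above to be dropped. A userspace analogue of the same simplification, with calloc() standing in for kzalloc() and an invented sb_info struct:

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <unistd.h>

struct sb_info {
	unsigned long max_blocks;	/* default 0    - no explicit init needed */
	unsigned long max_inodes;	/* default 0    - no explicit init needed */
	void *mpol;			/* default NULL - no explicit init needed */
	mode_t mode;
	uid_t uid;
	gid_t gid;
};

static struct sb_info *sb_info_new(void)
{
	/* calloc() plays the role of kzalloc(): the whole struct starts out
	 * zeroed, so only fields with non-zero defaults are assigned. */
	struct sb_info *s = calloc(1, sizeof(*s));

	if (!s)
		return NULL;
	s->mode = 01777;		/* S_IRWXUGO | S_ISVTX */
	s->uid = getuid();
	s->gid = getgid();
	return s;
}

int main(void)
{
	struct sb_info *s = sb_info_new();

	if (!s)
		return 1;
	printf("max_blocks=%lu mpol=%p mode=%o\n",
	       s->max_blocks, s->mpol, (unsigned int)s->mode);
	free(s);
	return 0;
}
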
@@ -2369,7 +2389,6 @@ static void shmem_destroy_inode(struct inode *inode)
2369 /* only struct inode is valid if it's an inline symlink */ 2389 /* only struct inode is valid if it's an inline symlink */
2370 mpol_free_shared_policy(&SHMEM_I(inode)->policy); 2390 mpol_free_shared_policy(&SHMEM_I(inode)->policy);
2371 } 2391 }
2372 shmem_acl_destroy_inode(inode);
2373 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); 2392 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
2374} 2393}
2375 2394
@@ -2378,10 +2397,6 @@ static void init_once(void *foo)
2378 struct shmem_inode_info *p = (struct shmem_inode_info *) foo; 2397 struct shmem_inode_info *p = (struct shmem_inode_info *) foo;
2379 2398
2380 inode_init_once(&p->vfs_inode); 2399 inode_init_once(&p->vfs_inode);
2381#ifdef CONFIG_TMPFS_POSIX_ACL
2382 p->i_acl = NULL;
2383 p->i_default_acl = NULL;
2384#endif
2385} 2400}
2386 2401
2387static int init_inodecache(void) 2402static int init_inodecache(void)
@@ -2406,6 +2421,7 @@ static const struct address_space_operations shmem_aops = {
2406 .write_end = shmem_write_end, 2421 .write_end = shmem_write_end,
2407#endif 2422#endif
2408 .migratepage = migrate_page, 2423 .migratepage = migrate_page,
2424 .error_remove_page = generic_error_remove_page,
2409}; 2425};
2410 2426
2411static const struct file_operations shmem_file_operations = { 2427static const struct file_operations shmem_file_operations = {
@@ -2431,7 +2447,7 @@ static const struct inode_operations shmem_inode_operations = {
2431 .getxattr = generic_getxattr, 2447 .getxattr = generic_getxattr,
2432 .listxattr = generic_listxattr, 2448 .listxattr = generic_listxattr,
2433 .removexattr = generic_removexattr, 2449 .removexattr = generic_removexattr,
2434 .permission = shmem_permission, 2450 .check_acl = shmem_check_acl,
2435#endif 2451#endif
2436 2452
2437}; 2453};
@@ -2454,7 +2470,7 @@ static const struct inode_operations shmem_dir_inode_operations = {
2454 .getxattr = generic_getxattr, 2470 .getxattr = generic_getxattr,
2455 .listxattr = generic_listxattr, 2471 .listxattr = generic_listxattr,
2456 .removexattr = generic_removexattr, 2472 .removexattr = generic_removexattr,
2457 .permission = shmem_permission, 2473 .check_acl = shmem_check_acl,
2458#endif 2474#endif
2459}; 2475};
2460 2476
@@ -2465,7 +2481,7 @@ static const struct inode_operations shmem_special_inode_operations = {
2465 .getxattr = generic_getxattr, 2481 .getxattr = generic_getxattr,
2466 .listxattr = generic_listxattr, 2482 .listxattr = generic_listxattr,
2467 .removexattr = generic_removexattr, 2483 .removexattr = generic_removexattr,
2468 .permission = shmem_permission, 2484 .check_acl = shmem_check_acl,
2469#endif 2485#endif
2470}; 2486};
2471 2487
@@ -2482,7 +2498,7 @@ static const struct super_operations shmem_ops = {
2482 .put_super = shmem_put_super, 2498 .put_super = shmem_put_super,
2483}; 2499};
2484 2500
2485static struct vm_operations_struct shmem_vm_ops = { 2501static const struct vm_operations_struct shmem_vm_ops = {
2486 .fault = shmem_fault, 2502 .fault = shmem_fault,
2487#ifdef CONFIG_NUMA 2503#ifdef CONFIG_NUMA
2488 .set_policy = shmem_set_policy, 2504 .set_policy = shmem_set_policy,
@@ -2504,7 +2520,7 @@ static struct file_system_type tmpfs_fs_type = {
2504 .kill_sb = kill_litter_super, 2520 .kill_sb = kill_litter_super,
2505}; 2521};
2506 2522
2507static int __init init_tmpfs(void) 2523int __init init_tmpfs(void)
2508{ 2524{
2509 int error; 2525 int error;
2510 2526
@@ -2561,7 +2577,7 @@ static struct file_system_type tmpfs_fs_type = {
2561 .kill_sb = kill_litter_super, 2577 .kill_sb = kill_litter_super,
2562}; 2578};
2563 2579
2564static int __init init_tmpfs(void) 2580int __init init_tmpfs(void)
2565{ 2581{
2566 BUG_ON(register_filesystem(&tmpfs_fs_type) != 0); 2582 BUG_ON(register_filesystem(&tmpfs_fs_type) != 0);
2567 2583
@@ -2576,12 +2592,17 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
2576 return 0; 2592 return 0;
2577} 2593}
2578 2594
2595int shmem_lock(struct file *file, int lock, struct user_struct *user)
2596{
2597 return 0;
2598}
2599
2579#define shmem_vm_ops generic_file_vm_ops 2600#define shmem_vm_ops generic_file_vm_ops
2580#define shmem_file_operations ramfs_file_operations 2601#define shmem_file_operations ramfs_file_operations
2581#define shmem_get_inode(sb, mode, dev, flags) ramfs_get_inode(sb, mode, dev) 2602#define shmem_get_inode(sb, mode, dev, flags) ramfs_get_inode(sb, mode, dev)
2582#define shmem_acct_size(flags, size) 0 2603#define shmem_acct_size(flags, size) 0
2583#define shmem_unacct_size(flags, size) do {} while (0) 2604#define shmem_unacct_size(flags, size) do {} while (0)
2584#define SHMEM_MAX_BYTES LLONG_MAX 2605#define SHMEM_MAX_BYTES MAX_LFS_FILESIZE
2585 2606
2586#endif /* CONFIG_SHMEM */ 2607#endif /* CONFIG_SHMEM */
2587 2608
@@ -2593,7 +2614,7 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
2593 * @size: size to be set for the file 2614 * @size: size to be set for the file
2594 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size 2615 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
2595 */ 2616 */
2596struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) 2617struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
2597{ 2618{
2598 int error; 2619 int error;
2599 struct file *file; 2620 struct file *file;
@@ -2640,6 +2661,7 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
2640 if (error) 2661 if (error)
2641 goto close_file; 2662 goto close_file;
2642#endif 2663#endif
2664 ima_counts_get(file);
2643 return file; 2665 return file;
2644 2666
2645close_file: 2667close_file:
@@ -2665,12 +2687,9 @@ int shmem_zero_setup(struct vm_area_struct *vma)
2665 if (IS_ERR(file)) 2687 if (IS_ERR(file))
2666 return PTR_ERR(file); 2688 return PTR_ERR(file);
2667 2689
2668 ima_shm_check(file);
2669 if (vma->vm_file) 2690 if (vma->vm_file)
2670 fput(vma->vm_file); 2691 fput(vma->vm_file);
2671 vma->vm_file = file; 2692 vma->vm_file = file;
2672 vma->vm_ops = &shmem_vm_ops; 2693 vma->vm_ops = &shmem_vm_ops;
2673 return 0; 2694 return 0;
2674} 2695}
2675
2676module_init(init_tmpfs)
diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c
index 8e5aadd7dcd6..df2c87fdae50 100644
--- a/mm/shmem_acl.c
+++ b/mm/shmem_acl.c
@@ -22,11 +22,11 @@ shmem_get_acl(struct inode *inode, int type)
22 spin_lock(&inode->i_lock); 22 spin_lock(&inode->i_lock);
23 switch(type) { 23 switch(type) {
24 case ACL_TYPE_ACCESS: 24 case ACL_TYPE_ACCESS:
25 acl = posix_acl_dup(SHMEM_I(inode)->i_acl); 25 acl = posix_acl_dup(inode->i_acl);
26 break; 26 break;
27 27
28 case ACL_TYPE_DEFAULT: 28 case ACL_TYPE_DEFAULT:
29 acl = posix_acl_dup(SHMEM_I(inode)->i_default_acl); 29 acl = posix_acl_dup(inode->i_default_acl);
30 break; 30 break;
31 } 31 }
32 spin_unlock(&inode->i_lock); 32 spin_unlock(&inode->i_lock);
@@ -45,13 +45,13 @@ shmem_set_acl(struct inode *inode, int type, struct posix_acl *acl)
45 spin_lock(&inode->i_lock); 45 spin_lock(&inode->i_lock);
46 switch(type) { 46 switch(type) {
47 case ACL_TYPE_ACCESS: 47 case ACL_TYPE_ACCESS:
48 free = SHMEM_I(inode)->i_acl; 48 free = inode->i_acl;
49 SHMEM_I(inode)->i_acl = posix_acl_dup(acl); 49 inode->i_acl = posix_acl_dup(acl);
50 break; 50 break;
51 51
52 case ACL_TYPE_DEFAULT: 52 case ACL_TYPE_DEFAULT:
53 free = SHMEM_I(inode)->i_default_acl; 53 free = inode->i_default_acl;
54 SHMEM_I(inode)->i_default_acl = posix_acl_dup(acl); 54 inode->i_default_acl = posix_acl_dup(acl);
55 break; 55 break;
56 } 56 }
57 spin_unlock(&inode->i_lock); 57 spin_unlock(&inode->i_lock);
@@ -155,26 +155,9 @@ shmem_acl_init(struct inode *inode, struct inode *dir)
155} 155}
156 156
157/** 157/**
158 * shmem_acl_destroy_inode - destroy acls hanging off the in-memory inode
159 *
160 * This is done before destroying the actual inode.
161 */
162
163void
164shmem_acl_destroy_inode(struct inode *inode)
165{
166 if (SHMEM_I(inode)->i_acl)
167 posix_acl_release(SHMEM_I(inode)->i_acl);
168 SHMEM_I(inode)->i_acl = NULL;
169 if (SHMEM_I(inode)->i_default_acl)
170 posix_acl_release(SHMEM_I(inode)->i_default_acl);
171 SHMEM_I(inode)->i_default_acl = NULL;
172}
173
174/**
175 * shmem_check_acl - check_acl() callback for generic_permission() 158 * shmem_check_acl - check_acl() callback for generic_permission()
176 */ 159 */
177static int 160int
178shmem_check_acl(struct inode *inode, int mask) 161shmem_check_acl(struct inode *inode, int mask)
179{ 162{
180 struct posix_acl *acl = shmem_get_acl(inode, ACL_TYPE_ACCESS); 163 struct posix_acl *acl = shmem_get_acl(inode, ACL_TYPE_ACCESS);
@@ -186,12 +169,3 @@ shmem_check_acl(struct inode *inode, int mask)
186 } 169 }
187 return -EAGAIN; 170 return -EAGAIN;
188} 171}
189
190/**
191 * shmem_permission - permission() inode operation
192 */
193int
194shmem_permission(struct inode *inode, int mask)
195{
196 return generic_permission(inode, mask, shmem_check_acl);
197}
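
With the ACLs now living in the generic inode, the shmem_permission() wrapper disappears and the VFS invokes the ->check_acl hook from its own permission path. The sketch below illustrates that callback shape with invented names rather than the real kernel prototypes: the hook returns -EAGAIN when it has nothing to say (as shmem_check_acl does above when no ACL is attached) and a definite 0 or -EACCES otherwise.

#include <errno.h>
#include <stdio.h>

struct inode;				/* opaque in this sketch */

typedef int (*check_acl_fn)(struct inode *inode, int mask);

#define MAY_WRITE 0x2
#define MAY_READ  0x4

/* Generic permission check that defers to an optional ACL hook. */
static int check_permission(struct inode *inode, int mask, check_acl_fn check_acl)
{
	if (check_acl) {
		int err = check_acl(inode, mask);

		if (err != -EAGAIN)	/* the hook gave a definite answer */
			return err;
	}
	return -EACCES;			/* this toy falls back to "denied" */
}

/* Example hook: allow reads, deny writes, like a read-only ACL. */
static int demo_check_acl(struct inode *inode, int mask)
{
	(void)inode;
	return (mask & MAY_WRITE) ? -EACCES : 0;
}

int main(void)
{
	printf("read : %d\n", check_permission(NULL, MAY_READ, demo_check_acl));
	printf("write: %d\n", check_permission(NULL, MAY_WRITE, demo_check_acl));
	return 0;
}
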
diff --git a/mm/slab.c b/mm/slab.c
index 825c606f691d..7dfa481c96ba 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -102,16 +102,19 @@
102#include <linux/cpu.h> 102#include <linux/cpu.h>
103#include <linux/sysctl.h> 103#include <linux/sysctl.h>
104#include <linux/module.h> 104#include <linux/module.h>
105#include <linux/kmemtrace.h>
105#include <linux/rcupdate.h> 106#include <linux/rcupdate.h>
106#include <linux/string.h> 107#include <linux/string.h>
107#include <linux/uaccess.h> 108#include <linux/uaccess.h>
108#include <linux/nodemask.h> 109#include <linux/nodemask.h>
110#include <linux/kmemleak.h>
109#include <linux/mempolicy.h> 111#include <linux/mempolicy.h>
110#include <linux/mutex.h> 112#include <linux/mutex.h>
111#include <linux/fault-inject.h> 113#include <linux/fault-inject.h>
112#include <linux/rtmutex.h> 114#include <linux/rtmutex.h>
113#include <linux/reciprocal_div.h> 115#include <linux/reciprocal_div.h>
114#include <linux/debugobjects.h> 116#include <linux/debugobjects.h>
117#include <linux/kmemcheck.h>
115 118
116#include <asm/cacheflush.h> 119#include <asm/cacheflush.h>
117#include <asm/tlbflush.h> 120#include <asm/tlbflush.h>
@@ -177,13 +180,13 @@
177 SLAB_STORE_USER | \ 180 SLAB_STORE_USER | \
178 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ 181 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
179 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ 182 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
180 SLAB_DEBUG_OBJECTS) 183 SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK)
181#else 184#else
182# define CREATE_MASK (SLAB_HWCACHE_ALIGN | \ 185# define CREATE_MASK (SLAB_HWCACHE_ALIGN | \
183 SLAB_CACHE_DMA | \ 186 SLAB_CACHE_DMA | \
184 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ 187 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
185 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ 188 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
186 SLAB_DEBUG_OBJECTS) 189 SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK)
187#endif 190#endif
188 191
189/* 192/*
@@ -314,7 +317,7 @@ static int drain_freelist(struct kmem_cache *cache,
314 struct kmem_list3 *l3, int tofree); 317 struct kmem_list3 *l3, int tofree);
315static void free_block(struct kmem_cache *cachep, void **objpp, int len, 318static void free_block(struct kmem_cache *cachep, void **objpp, int len,
316 int node); 319 int node);
317static int enable_cpucache(struct kmem_cache *cachep); 320static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp);
318static void cache_reap(struct work_struct *unused); 321static void cache_reap(struct work_struct *unused);
319 322
320/* 323/*
@@ -372,87 +375,6 @@ static void kmem_list3_init(struct kmem_list3 *parent)
372 MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ 375 MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \
373 } while (0) 376 } while (0)
374 377
375/*
376 * struct kmem_cache
377 *
378 * manages a cache.
379 */
380
381struct kmem_cache {
382/* 1) per-cpu data, touched during every alloc/free */
383 struct array_cache *array[NR_CPUS];
384/* 2) Cache tunables. Protected by cache_chain_mutex */
385 unsigned int batchcount;
386 unsigned int limit;
387 unsigned int shared;
388
389 unsigned int buffer_size;
390 u32 reciprocal_buffer_size;
391/* 3) touched by every alloc & free from the backend */
392
393 unsigned int flags; /* constant flags */
394 unsigned int num; /* # of objs per slab */
395
396/* 4) cache_grow/shrink */
397 /* order of pgs per slab (2^n) */
398 unsigned int gfporder;
399
400 /* force GFP flags, e.g. GFP_DMA */
401 gfp_t gfpflags;
402
403 size_t colour; /* cache colouring range */
404 unsigned int colour_off; /* colour offset */
405 struct kmem_cache *slabp_cache;
406 unsigned int slab_size;
407 unsigned int dflags; /* dynamic flags */
408
409 /* constructor func */
410 void (*ctor)(void *obj);
411
412/* 5) cache creation/removal */
413 const char *name;
414 struct list_head next;
415
416/* 6) statistics */
417#if STATS
418 unsigned long num_active;
419 unsigned long num_allocations;
420 unsigned long high_mark;
421 unsigned long grown;
422 unsigned long reaped;
423 unsigned long errors;
424 unsigned long max_freeable;
425 unsigned long node_allocs;
426 unsigned long node_frees;
427 unsigned long node_overflow;
428 atomic_t allochit;
429 atomic_t allocmiss;
430 atomic_t freehit;
431 atomic_t freemiss;
432#endif
433#if DEBUG
434 /*
435 * If debugging is enabled, then the allocator can add additional
436 * fields and/or padding to every object. buffer_size contains the total
437 * object size including these internal fields, the following two
438 * variables contain the offset to the user object and its size.
439 */
440 int obj_offset;
441 int obj_size;
442#endif
443 /*
444 * We put nodelists[] at the end of kmem_cache, because we want to size
445 * this array to nr_node_ids slots instead of MAX_NUMNODES
446 * (see kmem_cache_init())
447 * We still use [MAX_NUMNODES] and not [1] or [0] because cache_cache
448 * is statically defined, so we reserve the max number of nodes.
449 */
450 struct kmem_list3 *nodelists[MAX_NUMNODES];
451 /*
452 * Do not add fields after nodelists[]
453 */
454};
455
456#define CFLGS_OFF_SLAB (0x80000000UL) 378#define CFLGS_OFF_SLAB (0x80000000UL)
457#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) 379#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB)
458 380
@@ -568,6 +490,14 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
568 490
569#endif 491#endif
570 492
493#ifdef CONFIG_KMEMTRACE
494size_t slab_buffer_size(struct kmem_cache *cachep)
495{
496 return cachep->buffer_size;
497}
498EXPORT_SYMBOL(slab_buffer_size);
499#endif
500
571/* 501/*
572 * Do not go above this order unless 0 objects fit into the slab. 502 * Do not go above this order unless 0 objects fit into the slab.
573 */ 503 */
@@ -743,6 +673,7 @@ static enum {
743 NONE, 673 NONE,
744 PARTIAL_AC, 674 PARTIAL_AC,
745 PARTIAL_L3, 675 PARTIAL_L3,
676 EARLY,
746 FULL 677 FULL
747} g_cpucache_up; 678} g_cpucache_up;
748 679
@@ -751,7 +682,7 @@ static enum {
751 */ 682 */
752int slab_is_available(void) 683int slab_is_available(void)
753{ 684{
754 return g_cpucache_up == FULL; 685 return g_cpucache_up >= EARLY;
755} 686}
756 687
757static DEFINE_PER_CPU(struct delayed_work, reap_work); 688static DEFINE_PER_CPU(struct delayed_work, reap_work);
@@ -881,7 +812,6 @@ static void __slab_error(const char *function, struct kmem_cache *cachep,
881 */ 812 */
882 813
883static int use_alien_caches __read_mostly = 1; 814static int use_alien_caches __read_mostly = 1;
884static int numa_platform __read_mostly = 1;
885static int __init noaliencache_setup(char *s) 815static int __init noaliencache_setup(char *s)
886{ 816{
887 use_alien_caches = 0; 817 use_alien_caches = 0;
@@ -949,12 +879,20 @@ static void __cpuinit start_cpu_timer(int cpu)
949} 879}
950 880
951static struct array_cache *alloc_arraycache(int node, int entries, 881static struct array_cache *alloc_arraycache(int node, int entries,
952 int batchcount) 882 int batchcount, gfp_t gfp)
953{ 883{
954 int memsize = sizeof(void *) * entries + sizeof(struct array_cache); 884 int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
955 struct array_cache *nc = NULL; 885 struct array_cache *nc = NULL;
956 886
957 nc = kmalloc_node(memsize, GFP_KERNEL, node); 887 nc = kmalloc_node(memsize, gfp, node);
888 /*
889 * The array_cache structures contain pointers to free object.
890 * However, when such objects are allocated or transfered to another
891 * cache the pointers are not cleared and they could be counted as
892 * valid references during a kmemleak scan. Therefore, kmemleak must
893 * not scan such objects.
894 */
895 kmemleak_no_scan(nc);
958 if (nc) { 896 if (nc) {
959 nc->avail = 0; 897 nc->avail = 0;
960 nc->limit = entries; 898 nc->limit = entries;
@@ -994,7 +932,7 @@ static int transfer_objects(struct array_cache *to,
994#define drain_alien_cache(cachep, alien) do { } while (0) 932#define drain_alien_cache(cachep, alien) do { } while (0)
995#define reap_alien(cachep, l3) do { } while (0) 933#define reap_alien(cachep, l3) do { } while (0)
996 934
997static inline struct array_cache **alloc_alien_cache(int node, int limit) 935static inline struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
998{ 936{
999 return (struct array_cache **)BAD_ALIEN_MAGIC; 937 return (struct array_cache **)BAD_ALIEN_MAGIC;
1000} 938}
@@ -1025,7 +963,7 @@ static inline void *____cache_alloc_node(struct kmem_cache *cachep,
1025static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); 963static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
1026static void *alternate_node_alloc(struct kmem_cache *, gfp_t); 964static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
1027 965
1028static struct array_cache **alloc_alien_cache(int node, int limit) 966static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
1029{ 967{
1030 struct array_cache **ac_ptr; 968 struct array_cache **ac_ptr;
1031 int memsize = sizeof(void *) * nr_node_ids; 969 int memsize = sizeof(void *) * nr_node_ids;
@@ -1033,14 +971,14 @@ static struct array_cache **alloc_alien_cache(int node, int limit)
1033 971
1034 if (limit > 1) 972 if (limit > 1)
1035 limit = 12; 973 limit = 12;
1036 ac_ptr = kmalloc_node(memsize, GFP_KERNEL, node); 974 ac_ptr = kmalloc_node(memsize, gfp, node);
1037 if (ac_ptr) { 975 if (ac_ptr) {
1038 for_each_node(i) { 976 for_each_node(i) {
1039 if (i == node || !node_online(i)) { 977 if (i == node || !node_online(i)) {
1040 ac_ptr[i] = NULL; 978 ac_ptr[i] = NULL;
1041 continue; 979 continue;
1042 } 980 }
1043 ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d); 981 ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp);
1044 if (!ac_ptr[i]) { 982 if (!ac_ptr[i]) {
1045 for (i--; i >= 0; i--) 983 for (i--; i >= 0; i--)
1046 kfree(ac_ptr[i]); 984 kfree(ac_ptr[i]);
@@ -1160,7 +1098,7 @@ static void __cpuinit cpuup_canceled(long cpu)
1160 struct kmem_cache *cachep; 1098 struct kmem_cache *cachep;
1161 struct kmem_list3 *l3 = NULL; 1099 struct kmem_list3 *l3 = NULL;
1162 int node = cpu_to_node(cpu); 1100 int node = cpu_to_node(cpu);
1163 node_to_cpumask_ptr(mask, node); 1101 const struct cpumask *mask = cpumask_of_node(node);
1164 1102
1165 list_for_each_entry(cachep, &cache_chain, next) { 1103 list_for_each_entry(cachep, &cache_chain, next) {
1166 struct array_cache *nc; 1104 struct array_cache *nc;
@@ -1273,20 +1211,20 @@ static int __cpuinit cpuup_prepare(long cpu)
1273 struct array_cache **alien = NULL; 1211 struct array_cache **alien = NULL;
1274 1212
1275 nc = alloc_arraycache(node, cachep->limit, 1213 nc = alloc_arraycache(node, cachep->limit,
1276 cachep->batchcount); 1214 cachep->batchcount, GFP_KERNEL);
1277 if (!nc) 1215 if (!nc)
1278 goto bad; 1216 goto bad;
1279 if (cachep->shared) { 1217 if (cachep->shared) {
1280 shared = alloc_arraycache(node, 1218 shared = alloc_arraycache(node,
1281 cachep->shared * cachep->batchcount, 1219 cachep->shared * cachep->batchcount,
1282 0xbaadf00d); 1220 0xbaadf00d, GFP_KERNEL);
1283 if (!shared) { 1221 if (!shared) {
1284 kfree(nc); 1222 kfree(nc);
1285 goto bad; 1223 goto bad;
1286 } 1224 }
1287 } 1225 }
1288 if (use_alien_caches) { 1226 if (use_alien_caches) {
1289 alien = alloc_alien_cache(node, cachep->limit); 1227 alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL);
1290 if (!alien) { 1228 if (!alien) {
1291 kfree(shared); 1229 kfree(shared);
1292 kfree(nc); 1230 kfree(nc);
@@ -1390,10 +1328,9 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
1390{ 1328{
1391 struct kmem_list3 *ptr; 1329 struct kmem_list3 *ptr;
1392 1330
1393 ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid); 1331 ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_NOWAIT, nodeid);
1394 BUG_ON(!ptr); 1332 BUG_ON(!ptr);
1395 1333
1396 local_irq_disable();
1397 memcpy(ptr, list, sizeof(struct kmem_list3)); 1334 memcpy(ptr, list, sizeof(struct kmem_list3));
1398 /* 1335 /*
1399 * Do not assume that spinlocks can be initialized via memcpy: 1336 * Do not assume that spinlocks can be initialized via memcpy:
@@ -1402,7 +1339,6 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
1402 1339
1403 MAKE_ALL_LISTS(cachep, ptr, nodeid); 1340 MAKE_ALL_LISTS(cachep, ptr, nodeid);
1404 cachep->nodelists[nodeid] = ptr; 1341 cachep->nodelists[nodeid] = ptr;
1405 local_irq_enable();
1406} 1342}
1407 1343
1408/* 1344/*
@@ -1434,10 +1370,8 @@ void __init kmem_cache_init(void)
1434 int order; 1370 int order;
1435 int node; 1371 int node;
1436 1372
1437 if (num_possible_nodes() == 1) { 1373 if (num_possible_nodes() == 1)
1438 use_alien_caches = 0; 1374 use_alien_caches = 0;
1439 numa_platform = 0;
1440 }
1441 1375
1442 for (i = 0; i < NUM_INIT_LISTS; i++) { 1376 for (i = 0; i < NUM_INIT_LISTS; i++) {
1443 kmem_list3_init(&initkmem_list3[i]); 1377 kmem_list3_init(&initkmem_list3[i]);
@@ -1450,7 +1384,7 @@ void __init kmem_cache_init(void)
1450 * Fragmentation resistance on low memory - only use bigger 1384 * Fragmentation resistance on low memory - only use bigger
1451 * page orders on machines with more than 32MB of memory. 1385 * page orders on machines with more than 32MB of memory.
1452 */ 1386 */
1453 if (num_physpages > (32 << 20) >> PAGE_SHIFT) 1387 if (totalram_pages > (32 << 20) >> PAGE_SHIFT)
1454 slab_break_gfp_order = BREAK_GFP_ORDER_HI; 1388 slab_break_gfp_order = BREAK_GFP_ORDER_HI;
1455 1389
1456 /* Bootstrap is tricky, because several objects are allocated 1390 /* Bootstrap is tricky, because several objects are allocated
@@ -1566,9 +1500,8 @@ void __init kmem_cache_init(void)
1566 { 1500 {
1567 struct array_cache *ptr; 1501 struct array_cache *ptr;
1568 1502
1569 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); 1503 ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
1570 1504
1571 local_irq_disable();
1572 BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache); 1505 BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache);
1573 memcpy(ptr, cpu_cache_get(&cache_cache), 1506 memcpy(ptr, cpu_cache_get(&cache_cache),
1574 sizeof(struct arraycache_init)); 1507 sizeof(struct arraycache_init));
@@ -1578,11 +1511,9 @@ void __init kmem_cache_init(void)
1578 spin_lock_init(&ptr->lock); 1511 spin_lock_init(&ptr->lock);
1579 1512
1580 cache_cache.array[smp_processor_id()] = ptr; 1513 cache_cache.array[smp_processor_id()] = ptr;
1581 local_irq_enable();
1582 1514
1583 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); 1515 ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
1584 1516
1585 local_irq_disable();
1586 BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep) 1517 BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep)
1587 != &initarray_generic.cache); 1518 != &initarray_generic.cache);
1588 memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep), 1519 memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep),
@@ -1594,7 +1525,6 @@ void __init kmem_cache_init(void)
1594 1525
1595 malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] = 1526 malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
1596 ptr; 1527 ptr;
1597 local_irq_enable();
1598 } 1528 }
1599 /* 5) Replace the bootstrap kmem_list3's */ 1529 /* 5) Replace the bootstrap kmem_list3's */
1600 { 1530 {
@@ -1613,23 +1543,26 @@ void __init kmem_cache_init(void)
1613 } 1543 }
1614 } 1544 }
1615 1545
1616 /* 6) resize the head arrays to their final sizes */ 1546 g_cpucache_up = EARLY;
1617 { 1547}
1618 struct kmem_cache *cachep;
1619 mutex_lock(&cache_chain_mutex);
1620 list_for_each_entry(cachep, &cache_chain, next)
1621 if (enable_cpucache(cachep))
1622 BUG();
1623 mutex_unlock(&cache_chain_mutex);
1624 }
1625 1548
1626 /* Annotate slab for lockdep -- annotate the malloc caches */ 1549void __init kmem_cache_init_late(void)
1627 init_lock_keys(); 1550{
1551 struct kmem_cache *cachep;
1628 1552
1553 /* 6) resize the head arrays to their final sizes */
1554 mutex_lock(&cache_chain_mutex);
1555 list_for_each_entry(cachep, &cache_chain, next)
1556 if (enable_cpucache(cachep, GFP_NOWAIT))
1557 BUG();
1558 mutex_unlock(&cache_chain_mutex);
1629 1559
1630 /* Done! */ 1560 /* Done! */
1631 g_cpucache_up = FULL; 1561 g_cpucache_up = FULL;
1632 1562
1563 /* Annotate slab for lockdep -- annotate the malloc caches */
1564 init_lock_keys();
1565
1633 /* 1566 /*
1634 * Register a cpu startup notifier callback that initializes 1567 * Register a cpu startup notifier callback that initializes
1635 * cpu_cache_get for all new cpus 1568 * cpu_cache_get for all new cpus
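
The new EARLY state lets slab_is_available() report true as soon as kmem_cache_init() has working caches, while the head-array resizing, lockdep annotation and the FULL transition move into kmem_cache_init_late(). A small sketch of that staged-bootstrap pattern (the subsys_* names are illustrative):

#include <assert.h>
#include <stdio.h>

/* Bootstrap progresses monotonically through these states. */
enum boot_state { NONE, PARTIAL, EARLY, FULL };

static enum boot_state state = NONE;

/* "Available" now means "allocations work", not "fully tuned". */
static int subsys_is_available(void)
{
	return state >= EARLY;
}

static void subsys_init(void)
{
	state = PARTIAL;
	/* ... set up just enough to serve requests ... */
	state = EARLY;
}

static void subsys_init_late(void)
{
	assert(subsys_is_available());
	/* ... resize caches, annotate locks, register notifiers ... */
	state = FULL;
}

int main(void)
{
	printf("available before init: %d\n", subsys_is_available());
	subsys_init();
	printf("available after early init: %d\n", subsys_is_available());
	subsys_init_late();
	printf("fully up: %d\n", state == FULL);
	return 0;
}
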
@@ -1680,7 +1613,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1680 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1613 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1681 flags |= __GFP_RECLAIMABLE; 1614 flags |= __GFP_RECLAIMABLE;
1682 1615
1683 page = alloc_pages_node(nodeid, flags, cachep->gfporder); 1616 page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
1684 if (!page) 1617 if (!page)
1685 return NULL; 1618 return NULL;
1686 1619
@@ -1693,6 +1626,16 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1693 NR_SLAB_UNRECLAIMABLE, nr_pages); 1626 NR_SLAB_UNRECLAIMABLE, nr_pages);
1694 for (i = 0; i < nr_pages; i++) 1627 for (i = 0; i < nr_pages; i++)
1695 __SetPageSlab(page + i); 1628 __SetPageSlab(page + i);
1629
1630 if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
1631 kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid);
1632
1633 if (cachep->ctor)
1634 kmemcheck_mark_uninitialized_pages(page, nr_pages);
1635 else
1636 kmemcheck_mark_unallocated_pages(page, nr_pages);
1637 }
1638
1696 return page_address(page); 1639 return page_address(page);
1697} 1640}
1698 1641
@@ -1705,6 +1648,8 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
1705 struct page *page = virt_to_page(addr); 1648 struct page *page = virt_to_page(addr);
1706 const unsigned long nr_freed = i; 1649 const unsigned long nr_freed = i;
1707 1650
1651 kmemcheck_free_shadow(page, cachep->gfporder);
1652
1708 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1653 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1709 sub_zone_page_state(page_zone(page), 1654 sub_zone_page_state(page_zone(page),
1710 NR_SLAB_RECLAIMABLE, nr_freed); 1655 NR_SLAB_RECLAIMABLE, nr_freed);
@@ -2055,10 +2000,10 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
2055 return left_over; 2000 return left_over;
2056} 2001}
2057 2002
2058static int __init_refok setup_cpu_cache(struct kmem_cache *cachep) 2003static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2059{ 2004{
2060 if (g_cpucache_up == FULL) 2005 if (g_cpucache_up == FULL)
2061 return enable_cpucache(cachep); 2006 return enable_cpucache(cachep, gfp);
2062 2007
2063 if (g_cpucache_up == NONE) { 2008 if (g_cpucache_up == NONE) {
2064 /* 2009 /*
@@ -2080,7 +2025,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep)
2080 g_cpucache_up = PARTIAL_AC; 2025 g_cpucache_up = PARTIAL_AC;
2081 } else { 2026 } else {
2082 cachep->array[smp_processor_id()] = 2027 cachep->array[smp_processor_id()] =
2083 kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); 2028 kmalloc(sizeof(struct arraycache_init), gfp);
2084 2029
2085 if (g_cpucache_up == PARTIAL_AC) { 2030 if (g_cpucache_up == PARTIAL_AC) {
2086 set_up_list3s(cachep, SIZE_L3); 2031 set_up_list3s(cachep, SIZE_L3);
@@ -2090,7 +2035,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep)
2090 for_each_online_node(node) { 2035 for_each_online_node(node) {
2091 cachep->nodelists[node] = 2036 cachep->nodelists[node] =
2092 kmalloc_node(sizeof(struct kmem_list3), 2037 kmalloc_node(sizeof(struct kmem_list3),
2093 GFP_KERNEL, node); 2038 gfp, node);
2094 BUG_ON(!cachep->nodelists[node]); 2039 BUG_ON(!cachep->nodelists[node]);
2095 kmem_list3_init(cachep->nodelists[node]); 2040 kmem_list3_init(cachep->nodelists[node]);
2096 } 2041 }
@@ -2144,6 +2089,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2144{ 2089{
2145 size_t left_over, slab_size, ralign; 2090 size_t left_over, slab_size, ralign;
2146 struct kmem_cache *cachep = NULL, *pc; 2091 struct kmem_cache *cachep = NULL, *pc;
2092 gfp_t gfp;
2147 2093
2148 /* 2094 /*
2149 * Sanity checks... these are all serious usage bugs. 2095 * Sanity checks... these are all serious usage bugs.
@@ -2159,8 +2105,10 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2159 * We use cache_chain_mutex to ensure a consistent view of 2105 * We use cache_chain_mutex to ensure a consistent view of
2160 * cpu_online_mask as well. Please see cpuup_callback 2106 * cpu_online_mask as well. Please see cpuup_callback
2161 */ 2107 */
2162 get_online_cpus(); 2108 if (slab_is_available()) {
2163 mutex_lock(&cache_chain_mutex); 2109 get_online_cpus();
2110 mutex_lock(&cache_chain_mutex);
2111 }
2164 2112
2165 list_for_each_entry(pc, &cache_chain, next) { 2113 list_for_each_entry(pc, &cache_chain, next) {
2166 char tmp; 2114 char tmp;
@@ -2269,8 +2217,13 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2269 */ 2217 */
2270 align = ralign; 2218 align = ralign;
2271 2219
2220 if (slab_is_available())
2221 gfp = GFP_KERNEL;
2222 else
2223 gfp = GFP_NOWAIT;
2224
2272 /* Get cache's description obj. */ 2225 /* Get cache's description obj. */
2273 cachep = kmem_cache_zalloc(&cache_cache, GFP_KERNEL); 2226 cachep = kmem_cache_zalloc(&cache_cache, gfp);
2274 if (!cachep) 2227 if (!cachep)
2275 goto oops; 2228 goto oops;
2276 2229
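
The slab_is_available() checks above exist because kmem_cache_create() can now be called before the allocator is fully up, when sleeping allocations and the cpu-hotplug/cache-chain locks have to be avoided. A hedged sketch of driving both decisions from one availability test (the FLAG_* values are invented, not gfp_t constants):

#include <stdio.h>

#define FLAG_CAN_SLEEP	0x1	/* stands in for GFP_KERNEL */
#define FLAG_NO_WAIT	0x2	/* stands in for GFP_NOWAIT */

static int subsys_available;	/* 0 during early boot, 1 afterwards */

/* Pick flags the way the patch picks gfp: sleepable only once sleeping
 * and taking the usual locks is safe. */
static unsigned int alloc_flags(void)
{
	return subsys_available ? FLAG_CAN_SLEEP : FLAG_NO_WAIT;
}

static void create_cache(const char *name)
{
	unsigned int flags = alloc_flags();

	if (subsys_available) {
		/* take the shared-list lock, pin cpu hotplug, ... */
	}
	printf("creating %s with %s allocations\n", name,
	       flags & FLAG_CAN_SLEEP ? "sleepable" : "non-blocking");
	if (subsys_available) {
		/* drop the locks/references taken above */
	}
}

int main(void)
{
	create_cache("early-boot-cache");	/* before the subsystem is up */
	subsys_available = 1;
	create_cache("runtime-cache");
	return 0;
}
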
@@ -2344,6 +2297,15 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2344 /* really off slab. No need for manual alignment */ 2297 /* really off slab. No need for manual alignment */
2345 slab_size = 2298 slab_size =
2346 cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab); 2299 cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
2300
2301#ifdef CONFIG_PAGE_POISONING
2302 /* If we're going to use the generic kernel_map_pages()
2303 * poisoning, then it's going to smash the contents of
2304 * the redzone and userword anyhow, so switch them off.
2305 */
2306 if (size % PAGE_SIZE == 0 && flags & SLAB_POISON)
2307 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
2308#endif
2347 } 2309 }
2348 2310
2349 cachep->colour_off = cache_line_size(); 2311 cachep->colour_off = cache_line_size();
@@ -2373,7 +2335,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2373 cachep->ctor = ctor; 2335 cachep->ctor = ctor;
2374 cachep->name = name; 2336 cachep->name = name;
2375 2337
2376 if (setup_cpu_cache(cachep)) { 2338 if (setup_cpu_cache(cachep, gfp)) {
2377 __kmem_cache_destroy(cachep); 2339 __kmem_cache_destroy(cachep);
2378 cachep = NULL; 2340 cachep = NULL;
2379 goto oops; 2341 goto oops;
@@ -2385,8 +2347,10 @@ oops:
2385 if (!cachep && (flags & SLAB_PANIC)) 2347 if (!cachep && (flags & SLAB_PANIC))
2386 panic("kmem_cache_create(): failed to create slab `%s'\n", 2348 panic("kmem_cache_create(): failed to create slab `%s'\n",
2387 name); 2349 name);
2388 mutex_unlock(&cache_chain_mutex); 2350 if (slab_is_available()) {
2389 put_online_cpus(); 2351 mutex_unlock(&cache_chain_mutex);
2352 put_online_cpus();
2353 }
2390 return cachep; 2354 return cachep;
2391} 2355}
2392EXPORT_SYMBOL(kmem_cache_create); 2356EXPORT_SYMBOL(kmem_cache_create);
@@ -2583,7 +2547,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep)
2583 } 2547 }
2584 2548
2585 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) 2549 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
2586 synchronize_rcu(); 2550 rcu_barrier();
2587 2551
2588 __kmem_cache_destroy(cachep); 2552 __kmem_cache_destroy(cachep);
2589 mutex_unlock(&cache_chain_mutex); 2553 mutex_unlock(&cache_chain_mutex);
@@ -2612,6 +2576,14 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
2612 /* Slab management obj is off-slab. */ 2576 /* Slab management obj is off-slab. */
2613 slabp = kmem_cache_alloc_node(cachep->slabp_cache, 2577 slabp = kmem_cache_alloc_node(cachep->slabp_cache,
2614 local_flags, nodeid); 2578 local_flags, nodeid);
2579 /*
2580 * If the first object in the slab is leaked (it's allocated
2581 * but no one has a reference to it), we want to make sure
2582 * kmemleak does not treat the ->s_mem pointer as a reference
2583 * to the object. Otherwise we will not report the leak.
2584 */
2585 kmemleak_scan_area(slabp, offsetof(struct slab, list),
2586 sizeof(struct list_head), local_flags);
2615 if (!slabp) 2587 if (!slabp)
2616 return NULL; 2588 return NULL;
2617 } else { 2589 } else {
@@ -3132,6 +3104,12 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3132 STATS_INC_ALLOCMISS(cachep); 3104 STATS_INC_ALLOCMISS(cachep);
3133 objp = cache_alloc_refill(cachep, flags); 3105 objp = cache_alloc_refill(cachep, flags);
3134 } 3106 }
3107 /*
3108 * To avoid a false negative, if an object that is in one of the
3109 * per-CPU caches is leaked, we need to make sure kmemleak doesn't
3110 * treat the array pointers as a reference to the object.
3111 */
3112 kmemleak_erase(&ac->entry[ac->avail]);
3135 return objp; 3113 return objp;
3136} 3114}
3137 3115
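
kmemleak_erase() above clears the just-handed-out pointer from the per-CPU entry array so that a leak of that object cannot be masked by a stale copy. The toy scan below only illustrates why such a stale pointer hides a leak; it is not the kmemleak algorithm itself.

#include <stdio.h>
#include <stdlib.h>

#define NBLOCKS 3
#define NROOTS  4

/* Report which blocks are still "referenced" by some slot in roots[]. */
static void scan(void **roots, void **blocks)
{
	for (int i = 0; i < NBLOCKS; i++) {
		int referenced = 0;

		for (int j = 0; j < NROOTS; j++)
			if (roots[j] == blocks[i])
				referenced = 1;
		printf("block %d: %s\n", i,
		       referenced ? "looks referenced" : "reported as leaked");
	}
}

int main(void)
{
	void *blocks[NBLOCKS];
	void *roots[NROOTS] = { 0 };	/* stands in for ac->entry[] */

	for (int i = 0; i < NBLOCKS; i++)
		blocks[i] = malloc(16);

	/* Block 0 was just handed out, but its pointer still sits in a
	 * free-list slot, so the scan never reports it even if it leaks. */
	roots[0] = blocks[0];
	scan(roots, blocks);

	/* The kmemleak_erase() step: clear the stale slot and a genuine
	 * leak of block 0 becomes visible to the scan. */
	roots[0] = NULL;
	scan(roots, blocks);

	for (int i = 0; i < NBLOCKS; i++)
		free(blocks[i]);
	return 0;
}
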
@@ -3210,7 +3188,7 @@ retry:
3210 if (local_flags & __GFP_WAIT) 3188 if (local_flags & __GFP_WAIT)
3211 local_irq_enable(); 3189 local_irq_enable();
3212 kmem_flagcheck(cache, flags); 3190 kmem_flagcheck(cache, flags);
3213 obj = kmem_getpages(cache, local_flags, -1); 3191 obj = kmem_getpages(cache, local_flags, numa_node_id());
3214 if (local_flags & __GFP_WAIT) 3192 if (local_flags & __GFP_WAIT)
3215 local_irq_disable(); 3193 local_irq_disable();
3216 if (obj) { 3194 if (obj) {
@@ -3318,6 +3296,8 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3318 unsigned long save_flags; 3296 unsigned long save_flags;
3319 void *ptr; 3297 void *ptr;
3320 3298
3299 flags &= gfp_allowed_mask;
3300
3321 lockdep_trace_alloc(flags); 3301 lockdep_trace_alloc(flags);
3322 3302
3323 if (slab_should_failslab(cachep, flags)) 3303 if (slab_should_failslab(cachep, flags))
@@ -3351,6 +3331,11 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3351 out: 3331 out:
3352 local_irq_restore(save_flags); 3332 local_irq_restore(save_flags);
3353 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); 3333 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
3334 kmemleak_alloc_recursive(ptr, obj_size(cachep), 1, cachep->flags,
3335 flags);
3336
3337 if (likely(ptr))
3338 kmemcheck_slab_alloc(cachep, flags, ptr, obj_size(cachep));
3354 3339
3355 if (unlikely((flags & __GFP_ZERO) && ptr)) 3340 if (unlikely((flags & __GFP_ZERO) && ptr))
3356 memset(ptr, 0, obj_size(cachep)); 3341 memset(ptr, 0, obj_size(cachep));
@@ -3396,6 +3381,8 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
3396 unsigned long save_flags; 3381 unsigned long save_flags;
3397 void *objp; 3382 void *objp;
3398 3383
3384 flags &= gfp_allowed_mask;
3385
3399 lockdep_trace_alloc(flags); 3386 lockdep_trace_alloc(flags);
3400 3387
3401 if (slab_should_failslab(cachep, flags)) 3388 if (slab_should_failslab(cachep, flags))
@@ -3406,8 +3393,13 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
3406 objp = __do_cache_alloc(cachep, flags); 3393 objp = __do_cache_alloc(cachep, flags);
3407 local_irq_restore(save_flags); 3394 local_irq_restore(save_flags);
3408 objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); 3395 objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
3396 kmemleak_alloc_recursive(objp, obj_size(cachep), 1, cachep->flags,
3397 flags);
3409 prefetchw(objp); 3398 prefetchw(objp);
3410 3399
3400 if (likely(objp))
3401 kmemcheck_slab_alloc(cachep, flags, objp, obj_size(cachep));
3402
3411 if (unlikely((flags & __GFP_ZERO) && objp)) 3403 if (unlikely((flags & __GFP_ZERO) && objp))
3412 memset(objp, 0, obj_size(cachep)); 3404 memset(objp, 0, obj_size(cachep));
3413 3405
@@ -3521,8 +3513,11 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp)
3521 struct array_cache *ac = cpu_cache_get(cachep); 3513 struct array_cache *ac = cpu_cache_get(cachep);
3522 3514
3523 check_irq_off(); 3515 check_irq_off();
3516 kmemleak_free_recursive(objp, cachep->flags);
3524 objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); 3517 objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
3525 3518
3519 kmemcheck_slab_free(cachep, objp, obj_size(cachep));
3520
3526 /* 3521 /*
3527 * Skip calling cache_free_alien() when the platform is not numa. 3522 * Skip calling cache_free_alien() when the platform is not numa.
3528 * This will avoid cache misses that happen while accessing slabp (which 3523 * This will avoid cache misses that happen while accessing slabp (which
@@ -3530,7 +3525,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp)
3530 * variable to skip the call, which is mostly likely to be present in 3525 * variable to skip the call, which is mostly likely to be present in
3531 * the cache. 3526 * the cache.
3532 */ 3527 */
3533 if (numa_platform && cache_free_alien(cachep, objp)) 3528 if (nr_online_nodes > 1 && cache_free_alien(cachep, objp))
3534 return; 3529 return;
3535 3530
3536 if (likely(ac->avail < ac->limit)) { 3531 if (likely(ac->avail < ac->limit)) {
@@ -3554,10 +3549,23 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp)
3554 */ 3549 */
3555void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) 3550void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3556{ 3551{
3557 return __cache_alloc(cachep, flags, __builtin_return_address(0)); 3552 void *ret = __cache_alloc(cachep, flags, __builtin_return_address(0));
3553
3554 trace_kmem_cache_alloc(_RET_IP_, ret,
3555 obj_size(cachep), cachep->buffer_size, flags);
3556
3557 return ret;
3558} 3558}
3559EXPORT_SYMBOL(kmem_cache_alloc); 3559EXPORT_SYMBOL(kmem_cache_alloc);
3560 3560
3561#ifdef CONFIG_KMEMTRACE
3562void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags)
3563{
3564 return __cache_alloc(cachep, flags, __builtin_return_address(0));
3565}
3566EXPORT_SYMBOL(kmem_cache_alloc_notrace);
3567#endif
3568
3561/** 3569/**
3562 * kmem_ptr_validate - check if an untrusted pointer might be a slab entry. 3570 * kmem_ptr_validate - check if an untrusted pointer might be a slab entry.
3563 * @cachep: the cache we're checking against 3571 * @cachep: the cache we're checking against
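
kmem_cache_alloc() above now captures the result so it can be reported to a tracepoint before returning, and an untraced _notrace variant is exported for the tracer's own use. A userspace sketch of that wrap-and-trace shape, with __builtin_return_address() standing in for _RET_IP_ (GCC/Clang only):

#include <stdio.h>
#include <stdlib.h>

/* Untraced worker, analogous to __cache_alloc()/kmem_cache_alloc_notrace(). */
static void *alloc_notrace(size_t size)
{
	return malloc(size);
}

/* Traced front end: do the allocation, emit the event, then return it. */
static void *alloc_traced(size_t size)
{
	void *ret = alloc_notrace(size);

	fprintf(stderr, "trace: alloc %zu bytes -> %p (caller %p)\n",
		size, ret, __builtin_return_address(0));
	return ret;
}

int main(void)
{
	void *p = alloc_traced(64);

	free(p);
	return 0;
}
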
@@ -3602,23 +3610,46 @@ out:
3602#ifdef CONFIG_NUMA 3610#ifdef CONFIG_NUMA
3603void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) 3611void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3604{ 3612{
3605 return __cache_alloc_node(cachep, flags, nodeid, 3613 void *ret = __cache_alloc_node(cachep, flags, nodeid,
3606 __builtin_return_address(0)); 3614 __builtin_return_address(0));
3615
3616 trace_kmem_cache_alloc_node(_RET_IP_, ret,
3617 obj_size(cachep), cachep->buffer_size,
3618 flags, nodeid);
3619
3620 return ret;
3607} 3621}
3608EXPORT_SYMBOL(kmem_cache_alloc_node); 3622EXPORT_SYMBOL(kmem_cache_alloc_node);
3609 3623
3624#ifdef CONFIG_KMEMTRACE
3625void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep,
3626 gfp_t flags,
3627 int nodeid)
3628{
3629 return __cache_alloc_node(cachep, flags, nodeid,
3630 __builtin_return_address(0));
3631}
3632EXPORT_SYMBOL(kmem_cache_alloc_node_notrace);
3633#endif
3634
3610static __always_inline void * 3635static __always_inline void *
3611__do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) 3636__do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller)
3612{ 3637{
3613 struct kmem_cache *cachep; 3638 struct kmem_cache *cachep;
3639 void *ret;
3614 3640
3615 cachep = kmem_find_general_cachep(size, flags); 3641 cachep = kmem_find_general_cachep(size, flags);
3616 if (unlikely(ZERO_OR_NULL_PTR(cachep))) 3642 if (unlikely(ZERO_OR_NULL_PTR(cachep)))
3617 return cachep; 3643 return cachep;
3618 return kmem_cache_alloc_node(cachep, flags, node); 3644 ret = kmem_cache_alloc_node_notrace(cachep, flags, node);
3645
3646 trace_kmalloc_node((unsigned long) caller, ret,
3647 size, cachep->buffer_size, flags, node);
3648
3649 return ret;
3619} 3650}
3620 3651
3621#ifdef CONFIG_DEBUG_SLAB 3652#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE)
3622void *__kmalloc_node(size_t size, gfp_t flags, int node) 3653void *__kmalloc_node(size_t size, gfp_t flags, int node)
3623{ 3654{
3624 return __do_kmalloc_node(size, flags, node, 3655 return __do_kmalloc_node(size, flags, node,
@@ -3651,6 +3682,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3651 void *caller) 3682 void *caller)
3652{ 3683{
3653 struct kmem_cache *cachep; 3684 struct kmem_cache *cachep;
3685 void *ret;
3654 3686
3655 /* If you want to save a few bytes .text space: replace 3687 /* If you want to save a few bytes .text space: replace
3656 * __ with kmem_. 3688 * __ with kmem_.
@@ -3660,11 +3692,16 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3660 cachep = __find_general_cachep(size, flags); 3692 cachep = __find_general_cachep(size, flags);
3661 if (unlikely(ZERO_OR_NULL_PTR(cachep))) 3693 if (unlikely(ZERO_OR_NULL_PTR(cachep)))
3662 return cachep; 3694 return cachep;
3663 return __cache_alloc(cachep, flags, caller); 3695 ret = __cache_alloc(cachep, flags, caller);
3696
3697 trace_kmalloc((unsigned long) caller, ret,
3698 size, cachep->buffer_size, flags);
3699
3700 return ret;
3664} 3701}
3665 3702
3666 3703
3667#ifdef CONFIG_DEBUG_SLAB 3704#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE)
3668void *__kmalloc(size_t size, gfp_t flags) 3705void *__kmalloc(size_t size, gfp_t flags)
3669{ 3706{
3670 return __do_kmalloc(size, flags, __builtin_return_address(0)); 3707 return __do_kmalloc(size, flags, __builtin_return_address(0));
@@ -3703,6 +3740,8 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp)
3703 debug_check_no_obj_freed(objp, obj_size(cachep)); 3740 debug_check_no_obj_freed(objp, obj_size(cachep));
3704 __cache_free(cachep, objp); 3741 __cache_free(cachep, objp);
3705 local_irq_restore(flags); 3742 local_irq_restore(flags);
3743
3744 trace_kmem_cache_free(_RET_IP_, objp);
3706} 3745}
3707EXPORT_SYMBOL(kmem_cache_free); 3746EXPORT_SYMBOL(kmem_cache_free);
3708 3747
@@ -3720,6 +3759,8 @@ void kfree(const void *objp)
3720 struct kmem_cache *c; 3759 struct kmem_cache *c;
3721 unsigned long flags; 3760 unsigned long flags;
3722 3761
3762 trace_kfree(_RET_IP_, objp);
3763
3723 if (unlikely(ZERO_OR_NULL_PTR(objp))) 3764 if (unlikely(ZERO_OR_NULL_PTR(objp)))
3724 return; 3765 return;
3725 local_irq_save(flags); 3766 local_irq_save(flags);
@@ -3747,7 +3788,7 @@ EXPORT_SYMBOL_GPL(kmem_cache_name);
3747/* 3788/*
3748 * This initializes kmem_list3 or resizes various caches for all nodes. 3789 * This initializes kmem_list3 or resizes various caches for all nodes.
3749 */ 3790 */
3750static int alloc_kmemlist(struct kmem_cache *cachep) 3791static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp)
3751{ 3792{
3752 int node; 3793 int node;
3753 struct kmem_list3 *l3; 3794 struct kmem_list3 *l3;
@@ -3757,7 +3798,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
3757 for_each_online_node(node) { 3798 for_each_online_node(node) {
3758 3799
3759 if (use_alien_caches) { 3800 if (use_alien_caches) {
3760 new_alien = alloc_alien_cache(node, cachep->limit); 3801 new_alien = alloc_alien_cache(node, cachep->limit, gfp);
3761 if (!new_alien) 3802 if (!new_alien)
3762 goto fail; 3803 goto fail;
3763 } 3804 }
@@ -3766,7 +3807,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
3766 if (cachep->shared) { 3807 if (cachep->shared) {
3767 new_shared = alloc_arraycache(node, 3808 new_shared = alloc_arraycache(node,
3768 cachep->shared*cachep->batchcount, 3809 cachep->shared*cachep->batchcount,
3769 0xbaadf00d); 3810 0xbaadf00d, gfp);
3770 if (!new_shared) { 3811 if (!new_shared) {
3771 free_alien_cache(new_alien); 3812 free_alien_cache(new_alien);
3772 goto fail; 3813 goto fail;
@@ -3795,7 +3836,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
3795 free_alien_cache(new_alien); 3836 free_alien_cache(new_alien);
3796 continue; 3837 continue;
3797 } 3838 }
3798 l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node); 3839 l3 = kmalloc_node(sizeof(struct kmem_list3), gfp, node);
3799 if (!l3) { 3840 if (!l3) {
3800 free_alien_cache(new_alien); 3841 free_alien_cache(new_alien);
3801 kfree(new_shared); 3842 kfree(new_shared);
@@ -3851,18 +3892,18 @@ static void do_ccupdate_local(void *info)
3851 3892
3852/* Always called with the cache_chain_mutex held */ 3893/* Always called with the cache_chain_mutex held */
3853static int do_tune_cpucache(struct kmem_cache *cachep, int limit, 3894static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3854 int batchcount, int shared) 3895 int batchcount, int shared, gfp_t gfp)
3855{ 3896{
3856 struct ccupdate_struct *new; 3897 struct ccupdate_struct *new;
3857 int i; 3898 int i;
3858 3899
3859 new = kzalloc(sizeof(*new), GFP_KERNEL); 3900 new = kzalloc(sizeof(*new), gfp);
3860 if (!new) 3901 if (!new)
3861 return -ENOMEM; 3902 return -ENOMEM;
3862 3903
3863 for_each_online_cpu(i) { 3904 for_each_online_cpu(i) {
3864 new->new[i] = alloc_arraycache(cpu_to_node(i), limit, 3905 new->new[i] = alloc_arraycache(cpu_to_node(i), limit,
3865 batchcount); 3906 batchcount, gfp);
3866 if (!new->new[i]) { 3907 if (!new->new[i]) {
3867 for (i--; i >= 0; i--) 3908 for (i--; i >= 0; i--)
3868 kfree(new->new[i]); 3909 kfree(new->new[i]);
@@ -3889,11 +3930,11 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3889 kfree(ccold); 3930 kfree(ccold);
3890 } 3931 }
3891 kfree(new); 3932 kfree(new);
3892 return alloc_kmemlist(cachep); 3933 return alloc_kmemlist(cachep, gfp);
3893} 3934}
3894 3935
3895/* Called with cache_chain_mutex held always */ 3936/* Called with cache_chain_mutex held always */
3896static int enable_cpucache(struct kmem_cache *cachep) 3937static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
3897{ 3938{
3898 int err; 3939 int err;
3899 int limit, shared; 3940 int limit, shared;
@@ -3939,7 +3980,7 @@ static int enable_cpucache(struct kmem_cache *cachep)
3939 if (limit > 32) 3980 if (limit > 32)
3940 limit = 32; 3981 limit = 32;
3941#endif 3982#endif
3942 err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared); 3983 err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared, gfp);
3943 if (err) 3984 if (err)
3944 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", 3985 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
3945 cachep->name, -err); 3986 cachep->name, -err);
@@ -3992,8 +4033,7 @@ static void cache_reap(struct work_struct *w)
3992 struct kmem_cache *searchp; 4033 struct kmem_cache *searchp;
3993 struct kmem_list3 *l3; 4034 struct kmem_list3 *l3;
3994 int node = numa_node_id(); 4035 int node = numa_node_id();
3995 struct delayed_work *work = 4036 struct delayed_work *work = to_delayed_work(w);
3996 container_of(w, struct delayed_work, work);
3997 4037
3998 if (!mutex_trylock(&cache_chain_mutex)) 4038 if (!mutex_trylock(&cache_chain_mutex))
3999 /* Give up. Setup the next iteration. */ 4039 /* Give up. Setup the next iteration. */
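
to_delayed_work() above simply names the open-coded container_of() conversion. A standalone sketch of the idiom, recovering the enclosing structure from a pointer to an embedded member (my_delayed_work and to_my_delayed_work are invented names):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct work {
	void (*fn)(struct work *w);
};

struct my_delayed_work {
	struct work work;	/* embedded member handed to callbacks */
	int delay;
};

/* The helper the patch switches to: name the conversion once. */
static struct my_delayed_work *to_my_delayed_work(struct work *w)
{
	return container_of(w, struct my_delayed_work, work);
}

static void reap(struct work *w)
{
	struct my_delayed_work *dw = to_my_delayed_work(w);

	printf("delay = %d\n", dw->delay);
}

int main(void)
{
	struct my_delayed_work dw = { .work = { .fn = reap }, .delay = 3 };

	dw.work.fn(&dw.work);	/* the callback only sees the embedded member */
	return 0;
}
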
@@ -4246,7 +4286,8 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
4246 res = 0; 4286 res = 0;
4247 } else { 4287 } else {
4248 res = do_tune_cpucache(cachep, limit, 4288 res = do_tune_cpucache(cachep, limit,
4249 batchcount, shared); 4289 batchcount, shared,
4290 GFP_KERNEL);
4250 } 4291 }
4251 break; 4292 break;
4252 } 4293 }
diff --git a/mm/slob.c b/mm/slob.c
index 7a3411524dac..837ebd64cc34 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -46,7 +46,7 @@
46 * NUMA support in SLOB is fairly simplistic, pushing most of the real 46 * NUMA support in SLOB is fairly simplistic, pushing most of the real
47 * logic down to the page allocator, and simply doing the node accounting 47 * logic down to the page allocator, and simply doing the node accounting
48 * on the upper levels. In the event that a node id is explicitly 48 * on the upper levels. In the event that a node id is explicitly
49 * provided, alloc_pages_node() with the specified node id is used 49 * provided, alloc_pages_exact_node() with the specified node id is used
50 * instead. The common case (or when the node id isn't explicitly provided) 50 * instead. The common case (or when the node id isn't explicitly provided)
51 * will default to the current node, as per numa_node_id(). 51 * will default to the current node, as per numa_node_id().
52 * 52 *
@@ -60,11 +60,14 @@
60#include <linux/kernel.h> 60#include <linux/kernel.h>
61#include <linux/slab.h> 61#include <linux/slab.h>
62#include <linux/mm.h> 62#include <linux/mm.h>
63#include <linux/swap.h> /* struct reclaim_state */
63#include <linux/cache.h> 64#include <linux/cache.h>
64#include <linux/init.h> 65#include <linux/init.h>
65#include <linux/module.h> 66#include <linux/module.h>
66#include <linux/rcupdate.h> 67#include <linux/rcupdate.h>
67#include <linux/list.h> 68#include <linux/list.h>
69#include <linux/kmemtrace.h>
70#include <linux/kmemleak.h>
68#include <asm/atomic.h> 71#include <asm/atomic.h>
69 72
70/* 73/*
@@ -130,17 +133,17 @@ static LIST_HEAD(free_slob_large);
130 */ 133 */
131static inline int is_slob_page(struct slob_page *sp) 134static inline int is_slob_page(struct slob_page *sp)
132{ 135{
133 return PageSlobPage((struct page *)sp); 136 return PageSlab((struct page *)sp);
134} 137}
135 138
136static inline void set_slob_page(struct slob_page *sp) 139static inline void set_slob_page(struct slob_page *sp)
137{ 140{
138 __SetPageSlobPage((struct page *)sp); 141 __SetPageSlab((struct page *)sp);
139} 142}
140 143
141static inline void clear_slob_page(struct slob_page *sp) 144static inline void clear_slob_page(struct slob_page *sp)
142{ 145{
143 __ClearPageSlobPage((struct page *)sp); 146 __ClearPageSlab((struct page *)sp);
144} 147}
145 148
146static inline struct slob_page *slob_page(const void *addr) 149static inline struct slob_page *slob_page(const void *addr)
@@ -241,7 +244,7 @@ static void *slob_new_pages(gfp_t gfp, int order, int node)
241 244
242#ifdef CONFIG_NUMA 245#ifdef CONFIG_NUMA
243 if (node != -1) 246 if (node != -1)
244 page = alloc_pages_node(node, gfp, order); 247 page = alloc_pages_exact_node(node, gfp, order);
245 else 248 else
246#endif 249#endif
247 page = alloc_pages(gfp, order); 250 page = alloc_pages(gfp, order);
@@ -254,6 +257,8 @@ static void *slob_new_pages(gfp_t gfp, int order, int node)
254 257
255static void slob_free_pages(void *b, int order) 258static void slob_free_pages(void *b, int order)
256{ 259{
260 if (current->reclaim_state)
261 current->reclaim_state->reclaimed_slab += 1 << order;
257 free_pages((unsigned long)b, order); 262 free_pages((unsigned long)b, order);
258} 263}
259 264
@@ -406,7 +411,7 @@ static void slob_free(void *block, int size)
406 spin_unlock_irqrestore(&slob_lock, flags); 411 spin_unlock_irqrestore(&slob_lock, flags);
407 clear_slob_page(sp); 412 clear_slob_page(sp);
408 free_slob_page(sp); 413 free_slob_page(sp);
409 free_page((unsigned long)b); 414 slob_free_pages(b, 0);
410 return; 415 return;
411 } 416 }
412 417
@@ -474,6 +479,7 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node)
474{ 479{
475 unsigned int *m; 480 unsigned int *m;
476 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); 481 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
482 void *ret;
477 483
478 lockdep_trace_alloc(gfp); 484 lockdep_trace_alloc(gfp);
479 485
@@ -482,12 +488,16 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node)
482 return ZERO_SIZE_PTR; 488 return ZERO_SIZE_PTR;
483 489
484 m = slob_alloc(size + align, gfp, align, node); 490 m = slob_alloc(size + align, gfp, align, node);
491
485 if (!m) 492 if (!m)
486 return NULL; 493 return NULL;
487 *m = size; 494 *m = size;
488 return (void *)m + align; 495 ret = (void *)m + align;
496
497 trace_kmalloc_node(_RET_IP_, ret,
498 size, size + align, gfp, node);
489 } else { 499 } else {
490 void *ret; 500 unsigned int order = get_order(size);
491 501
492 ret = slob_new_pages(gfp | __GFP_COMP, get_order(size), node); 502 ret = slob_new_pages(gfp | __GFP_COMP, get_order(size), node);
493 if (ret) { 503 if (ret) {
@@ -495,8 +505,13 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node)
495 page = virt_to_page(ret); 505 page = virt_to_page(ret);
496 page->private = size; 506 page->private = size;
497 } 507 }
498 return ret; 508
509 trace_kmalloc_node(_RET_IP_, ret,
510 size, PAGE_SIZE << order, gfp, node);
499 } 511 }
512
513 kmemleak_alloc(ret, size, 1, gfp);
514 return ret;
500} 515}
501EXPORT_SYMBOL(__kmalloc_node); 516EXPORT_SYMBOL(__kmalloc_node);
502 517
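For requests below a page, the code above over-allocates by one alignment unit, stores the requested size in that header, and returns the address just past it; kfree() later steps back by the same alignment to read the size. A userspace model of that layout, with malloc() standing in for slob_alloc() and the alignment value assumed:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Stand-in for max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN). */
#define TOY_ALIGN sizeof(unsigned long)

static void *toy_kmalloc(size_t size)
{
        unsigned int *m = malloc(size + TOY_ALIGN);

        if (!m)
                return NULL;
        *m = size;                      /* remember the request size */
        return (char *)m + TOY_ALIGN;   /* hand back the payload area */
}

static void toy_kfree(void *p)
{
        unsigned int *m = (unsigned int *)((char *)p - TOY_ALIGN);

        printf("freeing a %u-byte object\n", *m);
        free(m);
}

int main(void)
{
        char *buf = toy_kmalloc(100);

        if (!buf)
                return 1;
        strcpy(buf, "hello");
        toy_kfree(buf);
        return 0;
}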
@@ -504,8 +519,11 @@ void kfree(const void *block)
504{ 519{
505 struct slob_page *sp; 520 struct slob_page *sp;
506 521
522 trace_kfree(_RET_IP_, block);
523
507 if (unlikely(ZERO_OR_NULL_PTR(block))) 524 if (unlikely(ZERO_OR_NULL_PTR(block)))
508 return; 525 return;
526 kmemleak_free(block);
509 527
510 sp = slob_page(block); 528 sp = slob_page(block);
511 if (is_slob_page(sp)) { 529 if (is_slob_page(sp)) {
@@ -569,12 +587,16 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
569 } else if (flags & SLAB_PANIC) 587 } else if (flags & SLAB_PANIC)
570 panic("Cannot create slab cache %s\n", name); 588 panic("Cannot create slab cache %s\n", name);
571 589
590 kmemleak_alloc(c, sizeof(struct kmem_cache), 1, GFP_KERNEL);
572 return c; 591 return c;
573} 592}
574EXPORT_SYMBOL(kmem_cache_create); 593EXPORT_SYMBOL(kmem_cache_create);
575 594
576void kmem_cache_destroy(struct kmem_cache *c) 595void kmem_cache_destroy(struct kmem_cache *c)
577{ 596{
597 kmemleak_free(c);
598 if (c->flags & SLAB_DESTROY_BY_RCU)
599 rcu_barrier();
578 slob_free(c, sizeof(struct kmem_cache)); 600 slob_free(c, sizeof(struct kmem_cache));
579} 601}
580EXPORT_SYMBOL(kmem_cache_destroy); 602EXPORT_SYMBOL(kmem_cache_destroy);
@@ -583,14 +605,22 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
583{ 605{
584 void *b; 606 void *b;
585 607
586 if (c->size < PAGE_SIZE) 608 if (c->size < PAGE_SIZE) {
587 b = slob_alloc(c->size, flags, c->align, node); 609 b = slob_alloc(c->size, flags, c->align, node);
588 else 610 trace_kmem_cache_alloc_node(_RET_IP_, b, c->size,
611 SLOB_UNITS(c->size) * SLOB_UNIT,
612 flags, node);
613 } else {
589 b = slob_new_pages(flags, get_order(c->size), node); 614 b = slob_new_pages(flags, get_order(c->size), node);
615 trace_kmem_cache_alloc_node(_RET_IP_, b, c->size,
616 PAGE_SIZE << get_order(c->size),
617 flags, node);
618 }
590 619
591 if (c->ctor) 620 if (c->ctor)
592 c->ctor(b); 621 c->ctor(b);
593 622
623 kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags);
594 return b; 624 return b;
595} 625}
596EXPORT_SYMBOL(kmem_cache_alloc_node); 626EXPORT_SYMBOL(kmem_cache_alloc_node);
@@ -613,6 +643,7 @@ static void kmem_rcu_free(struct rcu_head *head)
613 643
614void kmem_cache_free(struct kmem_cache *c, void *b) 644void kmem_cache_free(struct kmem_cache *c, void *b)
615{ 645{
646 kmemleak_free_recursive(b, c->flags);
616 if (unlikely(c->flags & SLAB_DESTROY_BY_RCU)) { 647 if (unlikely(c->flags & SLAB_DESTROY_BY_RCU)) {
617 struct slob_rcu *slob_rcu; 648 struct slob_rcu *slob_rcu;
618 slob_rcu = b + (c->size - sizeof(struct slob_rcu)); 649 slob_rcu = b + (c->size - sizeof(struct slob_rcu));
@@ -622,6 +653,8 @@ void kmem_cache_free(struct kmem_cache *c, void *b)
622 } else { 653 } else {
623 __kmem_cache_free(b, c->size); 654 __kmem_cache_free(b, c->size);
624 } 655 }
656
657 trace_kmem_cache_free(_RET_IP_, b);
625} 658}
626EXPORT_SYMBOL(kmem_cache_free); 659EXPORT_SYMBOL(kmem_cache_free);
627 660
@@ -659,3 +692,8 @@ void __init kmem_cache_init(void)
659{ 692{
660 slob_ready = 1; 693 slob_ready = 1;
661} 694}
695
696void __init kmem_cache_init_late(void)
697{
698 /* Nothing to do */
699}
diff --git a/mm/slub.c b/mm/slub.c
index c4ea9158c9fb..4996fc719552 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -9,6 +9,7 @@
9 */ 9 */
10 10
11#include <linux/mm.h> 11#include <linux/mm.h>
12#include <linux/swap.h> /* struct reclaim_state */
12#include <linux/module.h> 13#include <linux/module.h>
13#include <linux/bit_spinlock.h> 14#include <linux/bit_spinlock.h>
14#include <linux/interrupt.h> 15#include <linux/interrupt.h>
@@ -16,6 +17,8 @@
16#include <linux/slab.h> 17#include <linux/slab.h>
17#include <linux/proc_fs.h> 18#include <linux/proc_fs.h>
18#include <linux/seq_file.h> 19#include <linux/seq_file.h>
20#include <linux/kmemtrace.h>
21#include <linux/kmemcheck.h>
19#include <linux/cpu.h> 22#include <linux/cpu.h>
20#include <linux/cpuset.h> 23#include <linux/cpuset.h>
21#include <linux/mempolicy.h> 24#include <linux/mempolicy.h>
@@ -138,13 +141,20 @@
138 SLAB_POISON | SLAB_STORE_USER) 141 SLAB_POISON | SLAB_STORE_USER)
139 142
140/* 143/*
144 * Debugging flags that require metadata to be stored in the slab. These get
145 * disabled when slub_debug=O is used and a cache's min order increases with
146 * metadata.
147 */
148#define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
149
150/*
141 * Set of flags that will prevent slab merging 151 * Set of flags that will prevent slab merging
142 */ 152 */
143#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ 153#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
144 SLAB_TRACE | SLAB_DESTROY_BY_RCU) 154 SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE)
145 155
146#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ 156#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
147 SLAB_CACHE_DMA) 157 SLAB_CACHE_DMA | SLAB_NOTRACK)
148 158
149#ifndef ARCH_KMALLOC_MINALIGN 159#ifndef ARCH_KMALLOC_MINALIGN
150#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) 160#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
@@ -322,6 +332,7 @@ static int slub_debug;
322#endif 332#endif
323 333
324static char *slub_debug_slabs; 334static char *slub_debug_slabs;
335static int disable_higher_order_debug;
325 336
326/* 337/*
327 * Object debugging 338 * Object debugging
@@ -643,7 +654,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
643 slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1); 654 slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
644 print_section("Padding", end - remainder, remainder); 655 print_section("Padding", end - remainder, remainder);
645 656
646 restore_bytes(s, "slab padding", POISON_INUSE, start, end); 657 restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end);
647 return 0; 658 return 0;
648} 659}
649 660
@@ -830,6 +841,11 @@ static inline unsigned long slabs_node(struct kmem_cache *s, int node)
830 return atomic_long_read(&n->nr_slabs); 841 return atomic_long_read(&n->nr_slabs);
831} 842}
832 843
844static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
845{
846 return atomic_long_read(&n->nr_slabs);
847}
848
833static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects) 849static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects)
834{ 850{
835 struct kmem_cache_node *n = get_node(s, node); 851 struct kmem_cache_node *n = get_node(s, node);
@@ -968,6 +984,15 @@ static int __init setup_slub_debug(char *str)
968 */ 984 */
969 goto check_slabs; 985 goto check_slabs;
970 986
987 if (tolower(*str) == 'o') {
988 /*
 989 * Avoid enabling debugging on caches if their minimum order
990 * would increase as a result.
991 */
992 disable_higher_order_debug = 1;
993 goto out;
994 }
995
971 slub_debug = 0; 996 slub_debug = 0;
972 if (*str == '-') 997 if (*str == '-')
973 /* 998 /*
@@ -1018,8 +1043,8 @@ static unsigned long kmem_cache_flags(unsigned long objsize,
1018 * Enable debugging if selected on the kernel commandline. 1043 * Enable debugging if selected on the kernel commandline.
1019 */ 1044 */
1020 if (slub_debug && (!slub_debug_slabs || 1045 if (slub_debug && (!slub_debug_slabs ||
1021 strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)) == 0)) 1046 !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs))))
1022 flags |= slub_debug; 1047 flags |= slub_debug;
1023 1048
1024 return flags; 1049 return flags;
1025} 1050}
@@ -1046,8 +1071,12 @@ static inline unsigned long kmem_cache_flags(unsigned long objsize,
1046} 1071}
1047#define slub_debug 0 1072#define slub_debug 0
1048 1073
1074#define disable_higher_order_debug 0
1075
1049static inline unsigned long slabs_node(struct kmem_cache *s, int node) 1076static inline unsigned long slabs_node(struct kmem_cache *s, int node)
1050 { return 0; } 1077 { return 0; }
1078static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
1079 { return 0; }
1051static inline void inc_slabs_node(struct kmem_cache *s, int node, 1080static inline void inc_slabs_node(struct kmem_cache *s, int node,
1052 int objects) {} 1081 int objects) {}
1053static inline void dec_slabs_node(struct kmem_cache *s, int node, 1082static inline void dec_slabs_node(struct kmem_cache *s, int node,
@@ -1062,6 +1091,8 @@ static inline struct page *alloc_slab_page(gfp_t flags, int node,
1062{ 1091{
1063 int order = oo_order(oo); 1092 int order = oo_order(oo);
1064 1093
1094 flags |= __GFP_NOTRACK;
1095
1065 if (node == -1) 1096 if (node == -1)
1066 return alloc_pages(flags, order); 1097 return alloc_pages(flags, order);
1067 else 1098 else
@@ -1072,11 +1103,17 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1072{ 1103{
1073 struct page *page; 1104 struct page *page;
1074 struct kmem_cache_order_objects oo = s->oo; 1105 struct kmem_cache_order_objects oo = s->oo;
1106 gfp_t alloc_gfp;
1075 1107
1076 flags |= s->allocflags; 1108 flags |= s->allocflags;
1077 1109
1078 page = alloc_slab_page(flags | __GFP_NOWARN | __GFP_NORETRY, node, 1110 /*
1079 oo); 1111 * Let the initial higher-order allocation fail under memory pressure
 1112 * so we fall back to the minimum order allocation.
1113 */
1114 alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
1115
1116 page = alloc_slab_page(alloc_gfp, node, oo);
1080 if (unlikely(!page)) { 1117 if (unlikely(!page)) {
1081 oo = s->min; 1118 oo = s->min;
1082 /* 1119 /*
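The rewritten allocate_slab() first tries the preferred order with __GFP_NOWARN | __GFP_NORETRY (and __GFP_NOFAIL masked off) so the attempt may fail quietly, then retries at the minimum order with the caller's original flags. A toy model of that control flow, with the page allocator faked:

#include <stdio.h>
#include <stdlib.h>

/* alloc_at_order() stands in for alloc_slab_page(); "quiet" mimics the
 * fail-fast NOWARN/NORETRY attempt that is allowed to come back empty. */
static void *alloc_at_order(int order, int quiet)
{
        if (order > 0 && quiet)
                return NULL;            /* pretend high orders are fragmented away */
        return malloc(4096UL << order);
}

int main(void)
{
        int pref_order = 3, min_order = 0;
        int order = pref_order;
        void *page = alloc_at_order(pref_order, 1);   /* fail-fast attempt */

        if (!page) {                                  /* fall back to the minimum */
                order = min_order;
                page = alloc_at_order(min_order, 0);
        }
        printf("got an order-%d allocation at %p\n", order, page);
        free(page);
        return 0;
}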
@@ -1089,6 +1126,23 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1089 1126
1090 stat(get_cpu_slab(s, raw_smp_processor_id()), ORDER_FALLBACK); 1127 stat(get_cpu_slab(s, raw_smp_processor_id()), ORDER_FALLBACK);
1091 } 1128 }
1129
1130 if (kmemcheck_enabled
1131 && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
1132 int pages = 1 << oo_order(oo);
1133
1134 kmemcheck_alloc_shadow(page, oo_order(oo), flags, node);
1135
1136 /*
1137 * Objects from caches that have a constructor don't get
1138 * cleared when they're allocated, so we need to do it here.
1139 */
1140 if (s->ctor)
1141 kmemcheck_mark_uninitialized_pages(page, pages);
1142 else
1143 kmemcheck_mark_unallocated_pages(page, pages);
1144 }
1145
1092 page->objects = oo_objects(oo); 1146 page->objects = oo_objects(oo);
1093 mod_zone_page_state(page_zone(page), 1147 mod_zone_page_state(page_zone(page),
1094 (s->flags & SLAB_RECLAIM_ACCOUNT) ? 1148 (s->flags & SLAB_RECLAIM_ACCOUNT) ?
@@ -1162,6 +1216,8 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1162 __ClearPageSlubDebug(page); 1216 __ClearPageSlubDebug(page);
1163 } 1217 }
1164 1218
1219 kmemcheck_free_shadow(page, compound_order(page));
1220
1165 mod_zone_page_state(page_zone(page), 1221 mod_zone_page_state(page_zone(page),
1166 (s->flags & SLAB_RECLAIM_ACCOUNT) ? 1222 (s->flags & SLAB_RECLAIM_ACCOUNT) ?
1167 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, 1223 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
@@ -1169,6 +1225,8 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1169 1225
1170 __ClearPageSlab(page); 1226 __ClearPageSlab(page);
1171 reset_page_mapcount(page); 1227 reset_page_mapcount(page);
1228 if (current->reclaim_state)
1229 current->reclaim_state->reclaimed_slab += pages;
1172 __free_pages(page, order); 1230 __free_pages(page, order);
1173} 1231}
1174 1232
@@ -1480,6 +1538,69 @@ static inline int node_match(struct kmem_cache_cpu *c, int node)
1480 return 1; 1538 return 1;
1481} 1539}
1482 1540
1541static int count_free(struct page *page)
1542{
1543 return page->objects - page->inuse;
1544}
1545
1546static unsigned long count_partial(struct kmem_cache_node *n,
1547 int (*get_count)(struct page *))
1548{
1549 unsigned long flags;
1550 unsigned long x = 0;
1551 struct page *page;
1552
1553 spin_lock_irqsave(&n->list_lock, flags);
1554 list_for_each_entry(page, &n->partial, lru)
1555 x += get_count(page);
1556 spin_unlock_irqrestore(&n->list_lock, flags);
1557 return x;
1558}
1559
1560static inline unsigned long node_nr_objs(struct kmem_cache_node *n)
1561{
1562#ifdef CONFIG_SLUB_DEBUG
1563 return atomic_long_read(&n->total_objects);
1564#else
1565 return 0;
1566#endif
1567}
1568
1569static noinline void
1570slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
1571{
1572 int node;
1573
1574 printk(KERN_WARNING
1575 "SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n",
1576 nid, gfpflags);
1577 printk(KERN_WARNING " cache: %s, object size: %d, buffer size: %d, "
1578 "default order: %d, min order: %d\n", s->name, s->objsize,
1579 s->size, oo_order(s->oo), oo_order(s->min));
1580
1581 if (oo_order(s->min) > get_order(s->objsize))
1582 printk(KERN_WARNING " %s debugging increased min order, use "
1583 "slub_debug=O to disable.\n", s->name);
1584
1585 for_each_online_node(node) {
1586 struct kmem_cache_node *n = get_node(s, node);
1587 unsigned long nr_slabs;
1588 unsigned long nr_objs;
1589 unsigned long nr_free;
1590
1591 if (!n)
1592 continue;
1593
1594 nr_free = count_partial(n, count_free);
1595 nr_slabs = node_nr_slabs(n);
1596 nr_objs = node_nr_objs(n);
1597
1598 printk(KERN_WARNING
1599 " node %d: slabs: %ld, objs: %ld, free: %ld\n",
1600 node, nr_slabs, nr_objs, nr_free);
1601 }
1602}
1603
1483/* 1604/*
1484 * Slow path. The lockless freelist is empty or we need to perform 1605 * Slow path. The lockless freelist is empty or we need to perform
1485 * debugging duties. 1606 * debugging duties.
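count_partial() above walks a node's partial list under list_lock and sums a per-page metric passed in as a function pointer, which lets the new OOM report reuse one walker for both free and total counts. A lock-free userspace sketch of the same shape (all names invented for the sketch):

#include <stdio.h>

struct toy_page { int objects; int inuse; };

static int count_free_objs(const struct toy_page *page)
{
        return page->objects - page->inuse;
}

static int count_inuse_objs(const struct toy_page *page)
{
        return page->inuse;
}

static long count_partial_list(const struct toy_page *pages, int n,
                               int (*get_count)(const struct toy_page *))
{
        long x = 0;

        for (int i = 0; i < n; i++)     /* list_for_each_entry() in the kernel */
                x += get_count(&pages[i]);
        return x;
}

int main(void)
{
        struct toy_page partial[] = { { 32, 30 }, { 32, 12 }, { 32, 31 } };
        int n = sizeof(partial) / sizeof(partial[0]);

        printf("free objects:   %ld\n",
               count_partial_list(partial, n, count_free_objs));
        printf("in-use objects: %ld\n",
               count_partial_list(partial, n, count_inuse_objs));
        return 0;
}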
@@ -1561,6 +1682,8 @@ new_slab:
1561 c->page = new; 1682 c->page = new;
1562 goto load_freelist; 1683 goto load_freelist;
1563 } 1684 }
1685 if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
1686 slab_out_of_memory(s, gfpflags, node);
1564 return NULL; 1687 return NULL;
1565debug: 1688debug:
1566 if (!alloc_debug_processing(s, c->page, object, addr)) 1689 if (!alloc_debug_processing(s, c->page, object, addr))
@@ -1590,6 +1713,8 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
1590 unsigned long flags; 1713 unsigned long flags;
1591 unsigned int objsize; 1714 unsigned int objsize;
1592 1715
1716 gfpflags &= gfp_allowed_mask;
1717
1593 lockdep_trace_alloc(gfpflags); 1718 lockdep_trace_alloc(gfpflags);
1594 might_sleep_if(gfpflags & __GFP_WAIT); 1719 might_sleep_if(gfpflags & __GFP_WAIT);
1595 1720
@@ -1613,23 +1738,53 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
1613 if (unlikely((gfpflags & __GFP_ZERO) && object)) 1738 if (unlikely((gfpflags & __GFP_ZERO) && object))
1614 memset(object, 0, objsize); 1739 memset(object, 0, objsize);
1615 1740
1741 kmemcheck_slab_alloc(s, gfpflags, object, c->objsize);
1742 kmemleak_alloc_recursive(object, objsize, 1, s->flags, gfpflags);
1743
1616 return object; 1744 return object;
1617} 1745}
1618 1746
1619void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) 1747void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
1620{ 1748{
1621 return slab_alloc(s, gfpflags, -1, _RET_IP_); 1749 void *ret = slab_alloc(s, gfpflags, -1, _RET_IP_);
1750
1751 trace_kmem_cache_alloc(_RET_IP_, ret, s->objsize, s->size, gfpflags);
1752
1753 return ret;
1622} 1754}
1623EXPORT_SYMBOL(kmem_cache_alloc); 1755EXPORT_SYMBOL(kmem_cache_alloc);
1624 1756
1757#ifdef CONFIG_KMEMTRACE
1758void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags)
1759{
1760 return slab_alloc(s, gfpflags, -1, _RET_IP_);
1761}
1762EXPORT_SYMBOL(kmem_cache_alloc_notrace);
1763#endif
1764
1625#ifdef CONFIG_NUMA 1765#ifdef CONFIG_NUMA
1626void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) 1766void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
1627{ 1767{
1628 return slab_alloc(s, gfpflags, node, _RET_IP_); 1768 void *ret = slab_alloc(s, gfpflags, node, _RET_IP_);
1769
1770 trace_kmem_cache_alloc_node(_RET_IP_, ret,
1771 s->objsize, s->size, gfpflags, node);
1772
1773 return ret;
1629} 1774}
1630EXPORT_SYMBOL(kmem_cache_alloc_node); 1775EXPORT_SYMBOL(kmem_cache_alloc_node);
1631#endif 1776#endif
1632 1777
1778#ifdef CONFIG_KMEMTRACE
1779void *kmem_cache_alloc_node_notrace(struct kmem_cache *s,
1780 gfp_t gfpflags,
1781 int node)
1782{
1783 return slab_alloc(s, gfpflags, node, _RET_IP_);
1784}
1785EXPORT_SYMBOL(kmem_cache_alloc_node_notrace);
1786#endif
1787
1633/* 1788/*
 1634 * Slow path handling. This may still be called frequently since objects 1789
1635 * have a longer lifetime than the cpu slabs in most processing loads. 1790 * have a longer lifetime than the cpu slabs in most processing loads.
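kmem_cache_alloc() and the other entry points now keep the returned pointer so a tracepoint can log it together with the caller's address (_RET_IP_), while the *_notrace variants skip the tracepoint. A GCC/Clang-specific userspace sketch of that wrapper pattern, built around a fake bump allocator:

#include <stdio.h>
#include <stddef.h>

static void trace_alloc(void *caller, const void *ret, size_t size)
{
        printf("alloc: caller=%p ret=%p size=%zu\n", caller, ret, size);
}

/* The real work lives in one always-inlined helper... */
static inline void *do_alloc(size_t size)
{
        static char pool[4096];
        static size_t used;
        void *p = &pool[used];

        used += (size + 7) & ~(size_t)7;        /* bump allocator, 8-byte aligned */
        return p;
}

/* ...the traced entry point records the caller's return address... */
void *traced_alloc(size_t size)
{
        void *ret = do_alloc(size);

        trace_alloc(__builtin_return_address(0), ret, size);
        return ret;
}

/* ...and the _notrace flavour skips the tracepoint. */
void *untraced_alloc(size_t size)
{
        return do_alloc(size);
}

int main(void)
{
        traced_alloc(100);
        untraced_alloc(32);
        return 0;
}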
@@ -1715,8 +1870,10 @@ static __always_inline void slab_free(struct kmem_cache *s,
1715 struct kmem_cache_cpu *c; 1870 struct kmem_cache_cpu *c;
1716 unsigned long flags; 1871 unsigned long flags;
1717 1872
1873 kmemleak_free_recursive(x, s->flags);
1718 local_irq_save(flags); 1874 local_irq_save(flags);
1719 c = get_cpu_slab(s, smp_processor_id()); 1875 c = get_cpu_slab(s, smp_processor_id());
1876 kmemcheck_slab_free(s, object, c->objsize);
1720 debug_check_no_locks_freed(object, c->objsize); 1877 debug_check_no_locks_freed(object, c->objsize);
1721 if (!(s->flags & SLAB_DEBUG_OBJECTS)) 1878 if (!(s->flags & SLAB_DEBUG_OBJECTS))
1722 debug_check_no_obj_freed(object, c->objsize); 1879 debug_check_no_obj_freed(object, c->objsize);
@@ -1737,6 +1894,8 @@ void kmem_cache_free(struct kmem_cache *s, void *x)
1737 page = virt_to_head_page(x); 1894 page = virt_to_head_page(x);
1738 1895
1739 slab_free(s, page, x, _RET_IP_); 1896 slab_free(s, page, x, _RET_IP_);
1897
1898 trace_kmem_cache_free(_RET_IP_, x);
1740} 1899}
1741EXPORT_SYMBOL(kmem_cache_free); 1900EXPORT_SYMBOL(kmem_cache_free);
1742 1901
@@ -1864,7 +2023,7 @@ static inline int calculate_order(int size)
1864 return order; 2023 return order;
1865 fraction /= 2; 2024 fraction /= 2;
1866 } 2025 }
1867 min_objects --; 2026 min_objects--;
1868 } 2027 }
1869 2028
1870 /* 2029 /*
@@ -1879,7 +2038,7 @@ static inline int calculate_order(int size)
1879 * Doh this slab cannot be placed using slub_max_order. 2038 * Doh this slab cannot be placed using slub_max_order.
1880 */ 2039 */
1881 order = slab_order(size, 1, MAX_ORDER, 1); 2040 order = slab_order(size, 1, MAX_ORDER, 1);
1882 if (order <= MAX_ORDER) 2041 if (order < MAX_ORDER)
1883 return order; 2042 return order;
1884 return -ENOSYS; 2043 return -ENOSYS;
1885} 2044}
@@ -1954,8 +2113,8 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
1954 */ 2113 */
1955#define NR_KMEM_CACHE_CPU 100 2114#define NR_KMEM_CACHE_CPU 100
1956 2115
1957static DEFINE_PER_CPU(struct kmem_cache_cpu, 2116static DEFINE_PER_CPU(struct kmem_cache_cpu [NR_KMEM_CACHE_CPU],
1958 kmem_cache_cpu)[NR_KMEM_CACHE_CPU]; 2117 kmem_cache_cpu);
1959 2118
1960static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free); 2119static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free);
1961static DECLARE_BITMAP(kmem_cach_cpu_free_init_once, CONFIG_NR_CPUS); 2120static DECLARE_BITMAP(kmem_cach_cpu_free_init_once, CONFIG_NR_CPUS);
@@ -2263,6 +2422,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
2263 * on bootup. 2422 * on bootup.
2264 */ 2423 */
2265 align = calculate_alignment(flags, align, s->objsize); 2424 align = calculate_alignment(flags, align, s->objsize);
2425 s->align = align;
2266 2426
2267 /* 2427 /*
2268 * SLUB stores one object immediately after another beginning from 2428 * SLUB stores one object immediately after another beginning from
@@ -2315,6 +2475,18 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
2315 2475
2316 if (!calculate_sizes(s, -1)) 2476 if (!calculate_sizes(s, -1))
2317 goto error; 2477 goto error;
2478 if (disable_higher_order_debug) {
2479 /*
2480 * Disable debugging flags that store metadata if the min slab
2481 * order increased.
2482 */
2483 if (get_order(s->size) > get_order(s->objsize)) {
2484 s->flags &= ~DEBUG_METADATA_FLAGS;
2485 s->offset = 0;
2486 if (!calculate_sizes(s, -1))
2487 goto error;
2488 }
2489 }
2318 2490
2319 /* 2491 /*
2320 * The larger the object size is, the more pages we want on the partial 2492 * The larger the object size is, the more pages we want on the partial
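The disable_higher_order_debug branch added to kmem_cache_open() only strips the metadata flags when storing red zones, poison and user tracking would push the slab to a higher page order than the bare object needs. A toy model of that decision, with flag values and get_order() simplified for the sketch:

#include <stdio.h>

/* Illustrative flag values; the real ones live in the slab headers. */
#define SLAB_RED_ZONE   0x1u
#define SLAB_POISON     0x2u
#define SLAB_STORE_USER 0x4u
#define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
#define TOY_PAGE_SIZE   4096u

static int toy_get_order(unsigned int size)
{
        int order = 0;

        while ((TOY_PAGE_SIZE << order) < size)
                order++;
        return order;
}

int main(void)
{
        unsigned int objsize = 4000;            /* what the caller asked for */
        unsigned int size = objsize + 1024;     /* object plus debug metadata */
        unsigned int flags = DEBUG_METADATA_FLAGS;

        if (toy_get_order(size) > toy_get_order(objsize)) {
                flags &= ~DEBUG_METADATA_FLAGS; /* metadata would raise the order */
                size = objsize;                 /* redo the layout without it */
        }
        printf("flags=%#x size=%u order=%d\n", flags, size, toy_get_order(size));
        return 0;
}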
@@ -2467,6 +2639,8 @@ void kmem_cache_destroy(struct kmem_cache *s)
2467 "still has objects.\n", s->name, __func__); 2639 "still has objects.\n", s->name, __func__);
2468 dump_stack(); 2640 dump_stack();
2469 } 2641 }
2642 if (s->flags & SLAB_DESTROY_BY_RCU)
2643 rcu_barrier();
2470 sysfs_slab_remove(s); 2644 sysfs_slab_remove(s);
2471 } else 2645 } else
2472 up_write(&slub_lock); 2646 up_write(&slub_lock);
@@ -2492,6 +2666,7 @@ __setup("slub_min_order=", setup_slub_min_order);
2492static int __init setup_slub_max_order(char *str) 2666static int __init setup_slub_max_order(char *str)
2493{ 2667{
2494 get_option(&str, &slub_max_order); 2668 get_option(&str, &slub_max_order);
2669 slub_max_order = min(slub_max_order, MAX_ORDER - 1);
2495 2670
2496 return 1; 2671 return 1;
2497} 2672}
@@ -2523,13 +2698,16 @@ static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s,
2523 if (gfp_flags & SLUB_DMA) 2698 if (gfp_flags & SLUB_DMA)
2524 flags = SLAB_CACHE_DMA; 2699 flags = SLAB_CACHE_DMA;
2525 2700
2526 down_write(&slub_lock); 2701 /*
2702 * This function is called with IRQs disabled during early-boot on
2703 * single CPU so there's no need to take slub_lock here.
2704 */
2527 if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN, 2705 if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN,
2528 flags, NULL)) 2706 flags, NULL))
2529 goto panic; 2707 goto panic;
2530 2708
2531 list_add(&s->list, &slab_caches); 2709 list_add(&s->list, &slab_caches);
2532 up_write(&slub_lock); 2710
2533 if (sysfs_slab_add(s)) 2711 if (sysfs_slab_add(s))
2534 goto panic; 2712 goto panic;
2535 return s; 2713 return s;
@@ -2562,6 +2740,7 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags)
2562 struct kmem_cache *s; 2740 struct kmem_cache *s;
2563 char *text; 2741 char *text;
2564 size_t realsize; 2742 size_t realsize;
2743 unsigned long slabflags;
2565 2744
2566 s = kmalloc_caches_dma[index]; 2745 s = kmalloc_caches_dma[index];
2567 if (s) 2746 if (s)
@@ -2583,9 +2762,18 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags)
2583 (unsigned int)realsize); 2762 (unsigned int)realsize);
2584 s = kmalloc(kmem_size, flags & ~SLUB_DMA); 2763 s = kmalloc(kmem_size, flags & ~SLUB_DMA);
2585 2764
2765 /*
2766 * Must defer sysfs creation to a workqueue because we don't know
2767 * what context we are called from. Before sysfs comes up, we don't
2768 * need to do anything because our sysfs initcall will start by
2769 * adding all existing slabs to sysfs.
2770 */
2771 slabflags = SLAB_CACHE_DMA|SLAB_NOTRACK;
2772 if (slab_state >= SYSFS)
2773 slabflags |= __SYSFS_ADD_DEFERRED;
2774
2586 if (!s || !text || !kmem_cache_open(s, flags, text, 2775 if (!s || !text || !kmem_cache_open(s, flags, text,
2587 realsize, ARCH_KMALLOC_MINALIGN, 2776 realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) {
2588 SLAB_CACHE_DMA|__SYSFS_ADD_DEFERRED, NULL)) {
2589 kfree(s); 2777 kfree(s);
2590 kfree(text); 2778 kfree(text);
2591 goto unlock_out; 2779 goto unlock_out;
@@ -2594,7 +2782,8 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags)
2594 list_add(&s->list, &slab_caches); 2782 list_add(&s->list, &slab_caches);
2595 kmalloc_caches_dma[index] = s; 2783 kmalloc_caches_dma[index] = s;
2596 2784
2597 schedule_work(&sysfs_add_work); 2785 if (slab_state >= SYSFS)
2786 schedule_work(&sysfs_add_work);
2598 2787
2599unlock_out: 2788unlock_out:
2600 up_write(&slub_lock); 2789 up_write(&slub_lock);
@@ -2636,6 +2825,11 @@ static s8 size_index[24] = {
2636 2 /* 192 */ 2825 2 /* 192 */
2637}; 2826};
2638 2827
2828static inline int size_index_elem(size_t bytes)
2829{
2830 return (bytes - 1) / 8;
2831}
2832
2639static struct kmem_cache *get_slab(size_t size, gfp_t flags) 2833static struct kmem_cache *get_slab(size_t size, gfp_t flags)
2640{ 2834{
2641 int index; 2835 int index;
@@ -2644,7 +2838,7 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags)
2644 if (!size) 2838 if (!size)
2645 return ZERO_SIZE_PTR; 2839 return ZERO_SIZE_PTR;
2646 2840
2647 index = size_index[(size - 1) / 8]; 2841 index = size_index[size_index_elem(size)];
2648 } else 2842 } else
2649 index = fls(size - 1); 2843 index = fls(size - 1);
2650 2844
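get_slab() now goes through size_index_elem() instead of open-coding (size - 1) / 8, so the same arithmetic is shared with the init-time table fixups further down in kmem_cache_init(). A small sketch of the two index paths; fls() is re-implemented here and the size_index[] contents are not reproduced:

#include <stdio.h>

static int size_index_elem(size_t bytes)
{
        return (bytes - 1) / 8;         /* 8-byte granular table for small sizes */
}

static int toy_fls(unsigned int x)      /* position of the highest set bit, 1-based */
{
        int r = 0;

        while (x) {
                r++;
                x >>= 1;
        }
        return r;
}

int main(void)
{
        printf("size  96 -> size_index[%d]\n", size_index_elem(96));      /* 11 */
        printf("size 200 -> power-of-two index %d\n", toy_fls(200 - 1));  /* 8 */
        return 0;
}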
@@ -2659,6 +2853,7 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags)
2659void *__kmalloc(size_t size, gfp_t flags) 2853void *__kmalloc(size_t size, gfp_t flags)
2660{ 2854{
2661 struct kmem_cache *s; 2855 struct kmem_cache *s;
2856 void *ret;
2662 2857
2663 if (unlikely(size > SLUB_MAX_SIZE)) 2858 if (unlikely(size > SLUB_MAX_SIZE))
2664 return kmalloc_large(size, flags); 2859 return kmalloc_large(size, flags);
@@ -2668,35 +2863,54 @@ void *__kmalloc(size_t size, gfp_t flags)
2668 if (unlikely(ZERO_OR_NULL_PTR(s))) 2863 if (unlikely(ZERO_OR_NULL_PTR(s)))
2669 return s; 2864 return s;
2670 2865
2671 return slab_alloc(s, flags, -1, _RET_IP_); 2866 ret = slab_alloc(s, flags, -1, _RET_IP_);
2867
2868 trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
2869
2870 return ret;
2672} 2871}
2673EXPORT_SYMBOL(__kmalloc); 2872EXPORT_SYMBOL(__kmalloc);
2674 2873
2675static void *kmalloc_large_node(size_t size, gfp_t flags, int node) 2874static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
2676{ 2875{
2677 struct page *page = alloc_pages_node(node, flags | __GFP_COMP, 2876 struct page *page;
2678 get_order(size)); 2877 void *ptr = NULL;
2679 2878
2879 flags |= __GFP_COMP | __GFP_NOTRACK;
2880 page = alloc_pages_node(node, flags, get_order(size));
2680 if (page) 2881 if (page)
2681 return page_address(page); 2882 ptr = page_address(page);
2682 else 2883
2683 return NULL; 2884 kmemleak_alloc(ptr, size, 1, flags);
2885 return ptr;
2684} 2886}
2685 2887
2686#ifdef CONFIG_NUMA 2888#ifdef CONFIG_NUMA
2687void *__kmalloc_node(size_t size, gfp_t flags, int node) 2889void *__kmalloc_node(size_t size, gfp_t flags, int node)
2688{ 2890{
2689 struct kmem_cache *s; 2891 struct kmem_cache *s;
2892 void *ret;
2690 2893
2691 if (unlikely(size > SLUB_MAX_SIZE)) 2894 if (unlikely(size > SLUB_MAX_SIZE)) {
2692 return kmalloc_large_node(size, flags, node); 2895 ret = kmalloc_large_node(size, flags, node);
2896
2897 trace_kmalloc_node(_RET_IP_, ret,
2898 size, PAGE_SIZE << get_order(size),
2899 flags, node);
2900
2901 return ret;
2902 }
2693 2903
2694 s = get_slab(size, flags); 2904 s = get_slab(size, flags);
2695 2905
2696 if (unlikely(ZERO_OR_NULL_PTR(s))) 2906 if (unlikely(ZERO_OR_NULL_PTR(s)))
2697 return s; 2907 return s;
2698 2908
2699 return slab_alloc(s, flags, node, _RET_IP_); 2909 ret = slab_alloc(s, flags, node, _RET_IP_);
2910
2911 trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node);
2912
2913 return ret;
2700} 2914}
2701EXPORT_SYMBOL(__kmalloc_node); 2915EXPORT_SYMBOL(__kmalloc_node);
2702#endif 2916#endif
@@ -2745,12 +2959,15 @@ void kfree(const void *x)
2745 struct page *page; 2959 struct page *page;
2746 void *object = (void *)x; 2960 void *object = (void *)x;
2747 2961
2962 trace_kfree(_RET_IP_, x);
2963
2748 if (unlikely(ZERO_OR_NULL_PTR(x))) 2964 if (unlikely(ZERO_OR_NULL_PTR(x)))
2749 return; 2965 return;
2750 2966
2751 page = virt_to_head_page(x); 2967 page = virt_to_head_page(x);
2752 if (unlikely(!PageSlab(page))) { 2968 if (unlikely(!PageSlab(page))) {
2753 BUG_ON(!PageCompound(page)); 2969 BUG_ON(!PageCompound(page));
2970 kmemleak_free(x);
2754 put_page(page); 2971 put_page(page);
2755 return; 2972 return;
2756 } 2973 }
@@ -2968,7 +3185,7 @@ void __init kmem_cache_init(void)
2968 * kmem_cache_open for slab_state == DOWN. 3185 * kmem_cache_open for slab_state == DOWN.
2969 */ 3186 */
2970 create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", 3187 create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node",
2971 sizeof(struct kmem_cache_node), GFP_KERNEL); 3188 sizeof(struct kmem_cache_node), GFP_NOWAIT);
2972 kmalloc_caches[0].refcount = -1; 3189 kmalloc_caches[0].refcount = -1;
2973 caches++; 3190 caches++;
2974 3191
@@ -2979,18 +3196,20 @@ void __init kmem_cache_init(void)
2979 slab_state = PARTIAL; 3196 slab_state = PARTIAL;
2980 3197
2981 /* Caches that are not of the two-to-the-power-of size */ 3198 /* Caches that are not of the two-to-the-power-of size */
2982 if (KMALLOC_MIN_SIZE <= 64) { 3199 if (KMALLOC_MIN_SIZE <= 32) {
2983 create_kmalloc_cache(&kmalloc_caches[1], 3200 create_kmalloc_cache(&kmalloc_caches[1],
2984 "kmalloc-96", 96, GFP_KERNEL); 3201 "kmalloc-96", 96, GFP_NOWAIT);
2985 caches++; 3202 caches++;
3203 }
3204 if (KMALLOC_MIN_SIZE <= 64) {
2986 create_kmalloc_cache(&kmalloc_caches[2], 3205 create_kmalloc_cache(&kmalloc_caches[2],
2987 "kmalloc-192", 192, GFP_KERNEL); 3206 "kmalloc-192", 192, GFP_NOWAIT);
2988 caches++; 3207 caches++;
2989 } 3208 }
2990 3209
2991 for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { 3210 for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
2992 create_kmalloc_cache(&kmalloc_caches[i], 3211 create_kmalloc_cache(&kmalloc_caches[i],
2993 "kmalloc", 1 << i, GFP_KERNEL); 3212 "kmalloc", 1 << i, GFP_NOWAIT);
2994 caches++; 3213 caches++;
2995 } 3214 }
2996 3215
@@ -3009,17 +3228,28 @@ void __init kmem_cache_init(void)
3009 BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 || 3228 BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
3010 (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1))); 3229 (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1)));
3011 3230
3012 for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) 3231 for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) {
3013 size_index[(i - 1) / 8] = KMALLOC_SHIFT_LOW; 3232 int elem = size_index_elem(i);
3233 if (elem >= ARRAY_SIZE(size_index))
3234 break;
3235 size_index[elem] = KMALLOC_SHIFT_LOW;
3236 }
3014 3237
3015 if (KMALLOC_MIN_SIZE == 128) { 3238 if (KMALLOC_MIN_SIZE == 64) {
3239 /*
3240 * The 96 byte size cache is not used if the alignment
3241 * is 64 byte.
3242 */
3243 for (i = 64 + 8; i <= 96; i += 8)
3244 size_index[size_index_elem(i)] = 7;
3245 } else if (KMALLOC_MIN_SIZE == 128) {
3016 /* 3246 /*
3017 * The 192 byte sized cache is not used if the alignment 3247 * The 192 byte sized cache is not used if the alignment
3018 * is 128 byte. Redirect kmalloc to use the 256 byte cache 3248 * is 128 byte. Redirect kmalloc to use the 256 byte cache
3019 * instead. 3249 * instead.
3020 */ 3250 */
3021 for (i = 128 + 8; i <= 192; i += 8) 3251 for (i = 128 + 8; i <= 192; i += 8)
3022 size_index[(i - 1) / 8] = 8; 3252 size_index[size_index_elem(i)] = 8;
3023 } 3253 }
3024 3254
3025 slab_state = UP; 3255 slab_state = UP;
@@ -3027,7 +3257,7 @@ void __init kmem_cache_init(void)
3027 /* Provide the correct kmalloc names now that the caches are up */ 3257 /* Provide the correct kmalloc names now that the caches are up */
3028 for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) 3258 for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++)
3029 kmalloc_caches[i]. name = 3259 kmalloc_caches[i]. name =
3030 kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i); 3260 kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i);
3031 3261
3032#ifdef CONFIG_SMP 3262#ifdef CONFIG_SMP
3033 register_cpu_notifier(&slab_notifier); 3263 register_cpu_notifier(&slab_notifier);
@@ -3045,6 +3275,10 @@ void __init kmem_cache_init(void)
3045 nr_cpu_ids, nr_node_ids); 3275 nr_cpu_ids, nr_node_ids);
3046} 3276}
3047 3277
3278void __init kmem_cache_init_late(void)
3279{
3280}
3281
3048/* 3282/*
3049 * Find a mergeable slab cache 3283 * Find a mergeable slab cache
3050 */ 3284 */
@@ -3111,6 +3345,9 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
3111{ 3345{
3112 struct kmem_cache *s; 3346 struct kmem_cache *s;
3113 3347
3348 if (WARN_ON(!name))
3349 return NULL;
3350
3114 down_write(&slub_lock); 3351 down_write(&slub_lock);
3115 s = find_mergeable(size, align, flags, name, ctor); 3352 s = find_mergeable(size, align, flags, name, ctor);
3116 if (s) { 3353 if (s) {
@@ -3224,6 +3461,7 @@ static struct notifier_block __cpuinitdata slab_notifier = {
3224void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) 3461void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
3225{ 3462{
3226 struct kmem_cache *s; 3463 struct kmem_cache *s;
3464 void *ret;
3227 3465
3228 if (unlikely(size > SLUB_MAX_SIZE)) 3466 if (unlikely(size > SLUB_MAX_SIZE))
3229 return kmalloc_large(size, gfpflags); 3467 return kmalloc_large(size, gfpflags);
@@ -3233,13 +3471,19 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
3233 if (unlikely(ZERO_OR_NULL_PTR(s))) 3471 if (unlikely(ZERO_OR_NULL_PTR(s)))
3234 return s; 3472 return s;
3235 3473
3236 return slab_alloc(s, gfpflags, -1, caller); 3474 ret = slab_alloc(s, gfpflags, -1, caller);
3475
 3476 /* Honor the call site pointer we received. */
3477 trace_kmalloc(caller, ret, size, s->size, gfpflags);
3478
3479 return ret;
3237} 3480}
3238 3481
3239void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, 3482void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
3240 int node, unsigned long caller) 3483 int node, unsigned long caller)
3241{ 3484{
3242 struct kmem_cache *s; 3485 struct kmem_cache *s;
3486 void *ret;
3243 3487
3244 if (unlikely(size > SLUB_MAX_SIZE)) 3488 if (unlikely(size > SLUB_MAX_SIZE))
3245 return kmalloc_large_node(size, gfpflags, node); 3489 return kmalloc_large_node(size, gfpflags, node);
@@ -3249,24 +3493,15 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
3249 if (unlikely(ZERO_OR_NULL_PTR(s))) 3493 if (unlikely(ZERO_OR_NULL_PTR(s)))
3250 return s; 3494 return s;
3251 3495
3252 return slab_alloc(s, gfpflags, node, caller); 3496 ret = slab_alloc(s, gfpflags, node, caller);
3253}
3254 3497
 3255#ifdef CONFIG_SLUB_DEBUG 3498 /* Honor the call site pointer we received. */
3256static unsigned long count_partial(struct kmem_cache_node *n, 3499 trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node);
3257 int (*get_count)(struct page *))
3258{
3259 unsigned long flags;
3260 unsigned long x = 0;
3261 struct page *page;
3262 3500
3263 spin_lock_irqsave(&n->list_lock, flags); 3501 return ret;
3264 list_for_each_entry(page, &n->partial, lru)
3265 x += get_count(page);
3266 spin_unlock_irqrestore(&n->list_lock, flags);
3267 return x;
3268} 3502}
3269 3503
3504#ifdef CONFIG_SLUB_DEBUG
3270static int count_inuse(struct page *page) 3505static int count_inuse(struct page *page)
3271{ 3506{
3272 return page->inuse; 3507 return page->inuse;
@@ -3277,11 +3512,6 @@ static int count_total(struct page *page)
3277 return page->objects; 3512 return page->objects;
3278} 3513}
3279 3514
3280static int count_free(struct page *page)
3281{
3282 return page->objects - page->inuse;
3283}
3284
3285static int validate_slab(struct kmem_cache *s, struct page *page, 3515static int validate_slab(struct kmem_cache *s, struct page *page,
3286 unsigned long *map) 3516 unsigned long *map)
3287{ 3517{
@@ -3650,7 +3880,7 @@ static int list_locations(struct kmem_cache *s, char *buf,
3650 to_cpumask(l->cpus)); 3880 to_cpumask(l->cpus));
3651 } 3881 }
3652 3882
3653 if (num_online_nodes() > 1 && !nodes_empty(l->nodes) && 3883 if (nr_online_nodes > 1 && !nodes_empty(l->nodes) &&
3654 len < PAGE_SIZE - 60) { 3884 len < PAGE_SIZE - 60) {
3655 len += sprintf(buf + len, " nodes="); 3885 len += sprintf(buf + len, " nodes=");
3656 len += nodelist_scnprintf(buf + len, PAGE_SIZE - len - 50, 3886 len += nodelist_scnprintf(buf + len, PAGE_SIZE - len - 50,
@@ -4325,6 +4555,8 @@ static char *create_unique_id(struct kmem_cache *s)
4325 *p++ = 'a'; 4555 *p++ = 'a';
4326 if (s->flags & SLAB_DEBUG_FREE) 4556 if (s->flags & SLAB_DEBUG_FREE)
4327 *p++ = 'F'; 4557 *p++ = 'F';
4558 if (!(s->flags & SLAB_NOTRACK))
4559 *p++ = 't';
4328 if (p != name + 1) 4560 if (p != name + 1)
4329 *p++ = '-'; 4561 *p++ = '-';
4330 p += sprintf(p, "%07d", s->size); 4562 p += sprintf(p, "%07d", s->size);
@@ -4367,8 +4599,11 @@ static int sysfs_slab_add(struct kmem_cache *s)
4367 } 4599 }
4368 4600
4369 err = sysfs_create_group(&s->kobj, &slab_attr_group); 4601 err = sysfs_create_group(&s->kobj, &slab_attr_group);
4370 if (err) 4602 if (err) {
4603 kobject_del(&s->kobj);
4604 kobject_put(&s->kobj);
4371 return err; 4605 return err;
4606 }
4372 kobject_uevent(&s->kobj, KOBJ_ADD); 4607 kobject_uevent(&s->kobj, KOBJ_ADD);
4373 if (!unmergeable) { 4608 if (!unmergeable) {
4374 /* Setup first alias */ 4609 /* Setup first alias */
@@ -4550,7 +4785,7 @@ static const struct file_operations proc_slabinfo_operations = {
4550 4785
4551static int __init slab_proc_init(void) 4786static int __init slab_proc_init(void)
4552{ 4787{
4553 proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations); 4788 proc_create("slabinfo", S_IRUGO, NULL, &proc_slabinfo_operations);
4554 return 0; 4789 return 0;
4555} 4790}
4556module_init(slab_proc_init); 4791module_init(slab_proc_init);
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index a13ea6401ae7..d9714bdcb4a3 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -48,8 +48,14 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node)
48{ 48{
49 /* If the main allocator is up use that, fallback to bootmem. */ 49 /* If the main allocator is up use that, fallback to bootmem. */
50 if (slab_is_available()) { 50 if (slab_is_available()) {
51 struct page *page = alloc_pages_node(node, 51 struct page *page;
52
53 if (node_state(node, N_HIGH_MEMORY))
54 page = alloc_pages_node(node,
52 GFP_KERNEL | __GFP_ZERO, get_order(size)); 55 GFP_KERNEL | __GFP_ZERO, get_order(size));
56 else
57 page = alloc_pages(GFP_KERNEL | __GFP_ZERO,
58 get_order(size));
53 if (page) 59 if (page)
54 return page_address(page); 60 return page_address(page);
55 return NULL; 61 return NULL;
diff --git a/mm/sparse.c b/mm/sparse.c
index da432d9f0ae8..6ce4aab69e99 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -62,9 +62,12 @@ static struct mem_section noinline __init_refok *sparse_index_alloc(int nid)
62 unsigned long array_size = SECTIONS_PER_ROOT * 62 unsigned long array_size = SECTIONS_PER_ROOT *
63 sizeof(struct mem_section); 63 sizeof(struct mem_section);
64 64
65 if (slab_is_available()) 65 if (slab_is_available()) {
66 section = kmalloc_node(array_size, GFP_KERNEL, nid); 66 if (node_state(nid, N_HIGH_MEMORY))
67 else 67 section = kmalloc_node(array_size, GFP_KERNEL, nid);
68 else
69 section = kmalloc(array_size, GFP_KERNEL);
70 } else
68 section = alloc_bootmem_node(NODE_DATA(nid), array_size); 71 section = alloc_bootmem_node(NODE_DATA(nid), array_size);
69 72
70 if (section) 73 if (section)
diff --git a/mm/swap.c b/mm/swap.c
index 6e83084c1f6c..308e57d8d7ed 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -118,7 +118,7 @@ static void pagevec_move_tail(struct pagevec *pvec)
118 spin_lock(&zone->lru_lock); 118 spin_lock(&zone->lru_lock);
119 } 119 }
120 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { 120 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
121 int lru = page_is_file_cache(page); 121 int lru = page_lru_base_type(page);
122 list_move_tail(&page->lru, &zone->lru[lru].list); 122 list_move_tail(&page->lru, &zone->lru[lru].list);
123 pgmoved++; 123 pgmoved++;
124 } 124 }
@@ -181,7 +181,7 @@ void activate_page(struct page *page)
181 spin_lock_irq(&zone->lru_lock); 181 spin_lock_irq(&zone->lru_lock);
182 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { 182 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
183 int file = page_is_file_cache(page); 183 int file = page_is_file_cache(page);
184 int lru = LRU_BASE + file; 184 int lru = page_lru_base_type(page);
185 del_page_from_lru_list(zone, page, lru); 185 del_page_from_lru_list(zone, page, lru);
186 186
187 SetPageActive(page); 187 SetPageActive(page);
@@ -189,7 +189,7 @@ void activate_page(struct page *page)
189 add_page_to_lru_list(zone, page, lru); 189 add_page_to_lru_list(zone, page, lru);
190 __count_vm_event(PGACTIVATE); 190 __count_vm_event(PGACTIVATE);
191 191
192 update_page_reclaim_stat(zone, page, !!file, 1); 192 update_page_reclaim_stat(zone, page, file, 1);
193 } 193 }
194 spin_unlock_irq(&zone->lru_lock); 194 spin_unlock_irq(&zone->lru_lock);
195} 195}
@@ -448,8 +448,8 @@ void pagevec_strip(struct pagevec *pvec)
448 for (i = 0; i < pagevec_count(pvec); i++) { 448 for (i = 0; i < pagevec_count(pvec); i++) {
449 struct page *page = pvec->pages[i]; 449 struct page *page = pvec->pages[i];
450 450
451 if (PagePrivate(page) && trylock_page(page)) { 451 if (page_has_private(page) && trylock_page(page)) {
452 if (PagePrivate(page)) 452 if (page_has_private(page))
453 try_to_release_page(page, 0); 453 try_to_release_page(page, 0);
454 unlock_page(page); 454 unlock_page(page);
455 } 455 }
@@ -491,55 +491,12 @@ unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
491 491
492EXPORT_SYMBOL(pagevec_lookup_tag); 492EXPORT_SYMBOL(pagevec_lookup_tag);
493 493
494#ifdef CONFIG_SMP
495/*
496 * We tolerate a little inaccuracy to avoid ping-ponging the counter between
497 * CPUs
498 */
499#define ACCT_THRESHOLD max(16, NR_CPUS * 2)
500
501static DEFINE_PER_CPU(long, committed_space);
502
503void vm_acct_memory(long pages)
504{
505 long *local;
506
507 preempt_disable();
508 local = &__get_cpu_var(committed_space);
509 *local += pages;
510 if (*local > ACCT_THRESHOLD || *local < -ACCT_THRESHOLD) {
511 atomic_long_add(*local, &vm_committed_space);
512 *local = 0;
513 }
514 preempt_enable();
515}
516
517#ifdef CONFIG_HOTPLUG_CPU
518
519/* Drop the CPU's cached committed space back into the central pool. */
520static int cpu_swap_callback(struct notifier_block *nfb,
521 unsigned long action,
522 void *hcpu)
523{
524 long *committed;
525
526 committed = &per_cpu(committed_space, (long)hcpu);
527 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
528 atomic_long_add(*committed, &vm_committed_space);
529 *committed = 0;
530 drain_cpu_pagevecs((long)hcpu);
531 }
532 return NOTIFY_OK;
533}
534#endif /* CONFIG_HOTPLUG_CPU */
535#endif /* CONFIG_SMP */
536
537/* 494/*
538 * Perform any setup for the swap system 495 * Perform any setup for the swap system
539 */ 496 */
540void __init swap_setup(void) 497void __init swap_setup(void)
541{ 498{
542 unsigned long megs = num_physpages >> (20 - PAGE_SHIFT); 499 unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT);
543 500
544#ifdef CONFIG_SWAP 501#ifdef CONFIG_SWAP
545 bdi_init(swapper_space.backing_dev_info); 502 bdi_init(swapper_space.backing_dev_info);
@@ -554,7 +511,4 @@ void __init swap_setup(void)
554 * Right now other parts of the system means that we 511 * Right now other parts of the system means that we
555 * _really_ don't want to cluster much more 512 * _really_ don't want to cluster much more
556 */ 513 */
557#ifdef CONFIG_HOTPLUG_CPU
558 hotcpu_notifier(cpu_swap_callback, 0);
559#endif
560} 514}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 3ecea98ecb45..6d1daeb1cb4a 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -34,6 +34,7 @@ static const struct address_space_operations swap_aops = {
34}; 34};
35 35
36static struct backing_dev_info swap_backing_dev_info = { 36static struct backing_dev_info swap_backing_dev_info = {
37 .name = "swap",
37 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, 38 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
38 .unplug_io_fn = swap_unplug_io_fn, 39 .unplug_io_fn = swap_unplug_io_fn,
39}; 40};
@@ -66,10 +67,10 @@ void show_swap_cache_info(void)
66} 67}
67 68
68/* 69/*
69 * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space, 70 * __add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
70 * but sets SwapCache flag and private instead of mapping and index. 71 * but sets SwapCache flag and private instead of mapping and index.
71 */ 72 */
72int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) 73static int __add_to_swap_cache(struct page *page, swp_entry_t entry)
73{ 74{
74 int error; 75 int error;
75 76
@@ -77,28 +78,43 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
77 VM_BUG_ON(PageSwapCache(page)); 78 VM_BUG_ON(PageSwapCache(page));
78 VM_BUG_ON(!PageSwapBacked(page)); 79 VM_BUG_ON(!PageSwapBacked(page));
79 80
81 page_cache_get(page);
82 SetPageSwapCache(page);
83 set_page_private(page, entry.val);
84
85 spin_lock_irq(&swapper_space.tree_lock);
86 error = radix_tree_insert(&swapper_space.page_tree, entry.val, page);
87 if (likely(!error)) {
88 total_swapcache_pages++;
89 __inc_zone_page_state(page, NR_FILE_PAGES);
90 INC_CACHE_INFO(add_total);
91 }
92 spin_unlock_irq(&swapper_space.tree_lock);
93
94 if (unlikely(error)) {
95 /*
 96 * Only the context which has set the SWAP_HAS_CACHE flag
 97 * would call add_to_swap_cache(),
 98 * so add_to_swap_cache() doesn't return -EEXIST.
99 */
100 VM_BUG_ON(error == -EEXIST);
101 set_page_private(page, 0UL);
102 ClearPageSwapCache(page);
103 page_cache_release(page);
104 }
105
106 return error;
107}
108
109
110int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
111{
112 int error;
113
80 error = radix_tree_preload(gfp_mask); 114 error = radix_tree_preload(gfp_mask);
81 if (!error) { 115 if (!error) {
82 page_cache_get(page); 116 error = __add_to_swap_cache(page, entry);
83 SetPageSwapCache(page);
84 set_page_private(page, entry.val);
85
86 spin_lock_irq(&swapper_space.tree_lock);
87 error = radix_tree_insert(&swapper_space.page_tree,
88 entry.val, page);
89 if (likely(!error)) {
90 total_swapcache_pages++;
91 __inc_zone_page_state(page, NR_FILE_PAGES);
92 INC_CACHE_INFO(add_total);
93 }
94 spin_unlock_irq(&swapper_space.tree_lock);
95 radix_tree_preload_end(); 117 radix_tree_preload_end();
96
97 if (unlikely(error)) {
98 set_page_private(page, 0UL);
99 ClearPageSwapCache(page);
100 page_cache_release(page);
101 }
102 } 118 }
103 return error; 119 return error;
104} 120}
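__add_to_swap_cache() above takes its page reference and sets the swap-cache state before the radix-tree insert, and on failure unwinds those steps in reverse order. A userspace model of that set-up/roll-back shape, with the radix tree shrunk to a tiny fixed table and struct page reduced to three fields:

#include <stdio.h>
#include <stdbool.h>

struct toy_page { int refcount; bool swapcache; unsigned long private; };

#define SLOTS 4
static struct toy_page *tree[SLOTS];

static int tree_insert(unsigned long key, struct toy_page *p)
{
        unsigned long slot = key % SLOTS;

        if (tree[slot])
                return -1;              /* stand-in for an insertion failure */
        tree[slot] = p;
        return 0;
}

static int toy_add_to_cache(struct toy_page *page, unsigned long entry)
{
        int error;

        page->refcount++;               /* page_cache_get() */
        page->swapcache = true;         /* SetPageSwapCache() */
        page->private = entry;          /* set_page_private() */

        error = tree_insert(entry, page);
        if (error) {                    /* roll back in reverse order */
                page->private = 0;
                page->swapcache = false;
                page->refcount--;       /* page_cache_release() */
        }
        return error;
}

int main(void)
{
        struct toy_page a = { 1, false, 0 }, b = { 1, false, 0 };

        printf("first insert:  %d\n", toy_add_to_cache(&a, 42));          /* 0 */
        printf("second insert: %d\n", toy_add_to_cache(&b, 42 + SLOTS));  /* fails */
        printf("b: refcount=%d swapcache=%d\n", b.refcount, b.swapcache);
        return 0;
}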
@@ -109,8 +125,6 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
109 */ 125 */
110void __delete_from_swap_cache(struct page *page) 126void __delete_from_swap_cache(struct page *page)
111{ 127{
112 swp_entry_t ent = {.val = page_private(page)};
113
114 VM_BUG_ON(!PageLocked(page)); 128 VM_BUG_ON(!PageLocked(page));
115 VM_BUG_ON(!PageSwapCache(page)); 129 VM_BUG_ON(!PageSwapCache(page));
116 VM_BUG_ON(PageWriteback(page)); 130 VM_BUG_ON(PageWriteback(page));
@@ -121,13 +135,11 @@ void __delete_from_swap_cache(struct page *page)
121 total_swapcache_pages--; 135 total_swapcache_pages--;
122 __dec_zone_page_state(page, NR_FILE_PAGES); 136 __dec_zone_page_state(page, NR_FILE_PAGES);
123 INC_CACHE_INFO(del_total); 137 INC_CACHE_INFO(del_total);
124 mem_cgroup_uncharge_swapcache(page, ent);
125} 138}
126 139
127/** 140/**
128 * add_to_swap - allocate swap space for a page 141 * add_to_swap - allocate swap space for a page
129 * @page: page we want to move to swap 142 * @page: page we want to move to swap
130 * @gfp_mask: memory allocation flags
131 * 143 *
132 * Allocate swap space for the page and add the page to the 144 * Allocate swap space for the page and add the page to the
133 * swap cache. Caller needs to hold the page lock. 145 * swap cache. Caller needs to hold the page lock.
@@ -140,38 +152,34 @@ int add_to_swap(struct page *page)
140 VM_BUG_ON(!PageLocked(page)); 152 VM_BUG_ON(!PageLocked(page));
141 VM_BUG_ON(!PageUptodate(page)); 153 VM_BUG_ON(!PageUptodate(page));
142 154
143 for (;;) { 155 entry = get_swap_page();
144 entry = get_swap_page(); 156 if (!entry.val)
145 if (!entry.val) 157 return 0;
146 return 0;
147 158
159 /*
160 * Radix-tree node allocations from PF_MEMALLOC contexts could
161 * completely exhaust the page allocator. __GFP_NOMEMALLOC
162 * stops emergency reserves from being allocated.
163 *
164 * TODO: this could cause a theoretical memory reclaim
165 * deadlock in the swap out path.
166 */
167 /*
168 * Add it to the swap cache and mark it dirty
169 */
170 err = add_to_swap_cache(page, entry,
171 __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN);
172
173 if (!err) { /* Success */
174 SetPageDirty(page);
175 return 1;
176 } else { /* -ENOMEM radix-tree allocation failure */
148 /* 177 /*
149 * Radix-tree node allocations from PF_MEMALLOC contexts could 178 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
150 * completely exhaust the page allocator. __GFP_NOMEMALLOC 179 * clear SWAP_HAS_CACHE flag.
151 * stops emergency reserves from being allocated.
152 *
153 * TODO: this could cause a theoretical memory reclaim
154 * deadlock in the swap out path.
155 */
156 /*
157 * Add it to the swap cache and mark it dirty
158 */ 180 */
159 err = add_to_swap_cache(page, entry, 181 swapcache_free(entry, NULL);
160 __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN); 182 return 0;
161
162 switch (err) {
163 case 0: /* Success */
164 SetPageDirty(page);
165 return 1;
166 case -EEXIST:
167 /* Raced with "speculative" read_swap_cache_async */
168 swap_free(entry);
169 continue;
170 default:
171 /* -ENOMEM radix-tree allocation failure */
172 swap_free(entry);
173 return 0;
174 }
175 } 183 }
176} 184}
177 185
@@ -191,7 +199,7 @@ void delete_from_swap_cache(struct page *page)
191 __delete_from_swap_cache(page); 199 __delete_from_swap_cache(page);
192 spin_unlock_irq(&swapper_space.tree_lock); 200 spin_unlock_irq(&swapper_space.tree_lock);
193 201
194 swap_free(entry); 202 swapcache_free(entry, page);
195 page_cache_release(page); 203 page_cache_release(page);
196} 204}
197 205
@@ -293,33 +301,46 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
293 } 301 }
294 302
295 /* 303 /*
296 * Swap entry may have been freed since our caller observed it. 304 * call radix_tree_preload() while we can wait.
297 */ 305 */
298 if (!swap_duplicate(entry)) 306 err = radix_tree_preload(gfp_mask & GFP_KERNEL);
307 if (err)
299 break; 308 break;
300 309
301 /* 310 /*
302 * Associate the page with swap entry in the swap cache. 311 * Swap entry may have been freed since our caller observed it.
303 * May fail (-EEXIST) if there is already a page associated
304 * with this entry in the swap cache: added by a racing
305 * read_swap_cache_async, or add_to_swap or shmem_writepage
306 * re-using the just freed swap entry for an existing page.
307 * May fail (-ENOMEM) if radix-tree node allocation failed.
308 */ 312 */
313 err = swapcache_prepare(entry);
314 if (err == -EEXIST) { /* seems racy */
315 radix_tree_preload_end();
316 continue;
317 }
318 if (err) { /* swp entry is obsolete ? */
319 radix_tree_preload_end();
320 break;
321 }
322
323 /* May fail (-ENOMEM) if radix-tree node allocation failed. */
309 __set_page_locked(new_page); 324 __set_page_locked(new_page);
310 SetPageSwapBacked(new_page); 325 SetPageSwapBacked(new_page);
311 err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL); 326 err = __add_to_swap_cache(new_page, entry);
312 if (likely(!err)) { 327 if (likely(!err)) {
328 radix_tree_preload_end();
313 /* 329 /*
314 * Initiate read into locked page and return. 330 * Initiate read into locked page and return.
315 */ 331 */
316 lru_cache_add_anon(new_page); 332 lru_cache_add_anon(new_page);
317 swap_readpage(NULL, new_page); 333 swap_readpage(new_page);
318 return new_page; 334 return new_page;
319 } 335 }
336 radix_tree_preload_end();
320 ClearPageSwapBacked(new_page); 337 ClearPageSwapBacked(new_page);
321 __clear_page_locked(new_page); 338 __clear_page_locked(new_page);
322 swap_free(entry); 339 /*
340 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
341 * clear SWAP_HAS_CACHE flag.
342 */
343 swapcache_free(entry, NULL);
323 } while (err != -ENOMEM); 344 } while (err != -ENOMEM);
324 345
325 if (new_page) 346 if (new_page)
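The reworked read_swap_cache_async() loop treats -EEXIST from swapcache_prepare() as a race with another task (retry), any other error from it as the entry having gone away (stop), and exits via the while condition when the cache insert itself fails with -ENOMEM. A toy reduction of that retry skeleton, with the prepare step faked to lose the race exactly once:

#include <stdio.h>
#include <errno.h>

static int fake_prepare(int attempt)
{
        return attempt == 0 ? -EEXIST : 0;      /* lose the race on the first try */
}

int main(void)
{
        int attempt = 0, err;

        do {
                err = fake_prepare(attempt++);
                if (err == -EEXIST)
                        continue;               /* seems racy, try again */
                if (err)
                        break;                  /* entry is gone, give up */
                printf("added to swap cache on attempt %d\n", attempt);
                return 0;
        } while (err != -ENOMEM);               /* -ENOMEM also ends the loop */

        printf("gave up, err=%d\n", err);
        return 0;
}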
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 312fafe0ab6e..9c590eef7912 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -53,6 +53,59 @@ static struct swap_info_struct swap_info[MAX_SWAPFILES];
53 53
54static DEFINE_MUTEX(swapon_mutex); 54static DEFINE_MUTEX(swapon_mutex);
55 55
56/* For reference count accounting in swap_map */
57/* enum for swap_map[] handling. internal use only */
58enum {
59 SWAP_MAP = 0, /* ops for reference from swap users */
60 SWAP_CACHE, /* ops for reference from swap cache */
61};
62
63static inline int swap_count(unsigned short ent)
64{
65 return ent & SWAP_COUNT_MASK;
66}
67
68static inline bool swap_has_cache(unsigned short ent)
69{
70 return !!(ent & SWAP_HAS_CACHE);
71}
72
73static inline unsigned short encode_swapmap(int count, bool has_cache)
74{
75 unsigned short ret = count;
76
77 if (has_cache)
78 return SWAP_HAS_CACHE | ret;
79 return ret;
80}
81
 82/* returns 1 if the swap entry is freed */
83static int
84__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
85{
86 int type = si - swap_info;
87 swp_entry_t entry = swp_entry(type, offset);
88 struct page *page;
89 int ret = 0;
90
91 page = find_get_page(&swapper_space, entry.val);
92 if (!page)
93 return 0;
94 /*
 95 * This function is called from scan_swap_map(), which vmscan.c calls
 96 * while reclaiming pages, so a page lock may already be held here.
 97 * We have to use trylock to avoid deadlock. This is a special
 98 * case; in normal operation use try_to_free_swap() with an explicit
 99 * lock_page().
100 */
101 if (trylock_page(page)) {
102 ret = try_to_free_swap(page);
103 unlock_page(page);
104 }
105 page_cache_release(page);
106 return ret;
107}
108
56/* 109/*
57 * We need this because the bdev->unplug_fn can sleep and we cannot 110 * We need this because the bdev->unplug_fn can sleep and we cannot
58 * hold swap_lock while calling the unplug_fn. And swap_lock 111 * hold swap_lock while calling the unplug_fn. And swap_lock
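The helpers added above pack two facts into each unsigned short swap_map slot: a reference count and a SWAP_HAS_CACHE bit saying the swap cache holds the page. A standalone sketch of that encoding; the bit values below are assumptions, since the real constants are defined in the swap headers rather than in this hunk:

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

#define SWAP_HAS_CACHE  0x8000          /* assumed flag bit */
#define SWAP_COUNT_MASK 0x7fff          /* assumed count mask */

static unsigned short encode_swapmap(int count, bool has_cache)
{
        unsigned short ret = count;

        return has_cache ? (SWAP_HAS_CACHE | ret) : ret;
}

static int swap_count(unsigned short ent)       { return ent & SWAP_COUNT_MASK; }
static bool swap_has_cache(unsigned short ent)  { return ent & SWAP_HAS_CACHE; }

int main(void)
{
        unsigned short e = encode_swapmap(0, true);     /* swap cache only */
        assert(swap_count(e) == 0 && swap_has_cache(e));

        e = encode_swapmap(1, false);                   /* one user, no cache */
        assert(swap_count(e) == 1 && !swap_has_cache(e));

        printf("encoding ok\n");
        return 0;
}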
@@ -108,7 +161,8 @@ static int discard_swap(struct swap_info_struct *si)
108 } 161 }
109 162
110 err = blkdev_issue_discard(si->bdev, start_block, 163 err = blkdev_issue_discard(si->bdev, start_block,
111 nr_blocks, GFP_KERNEL); 164 nr_blocks, GFP_KERNEL,
165 DISCARD_FL_BARRIER);
112 if (err) 166 if (err)
113 break; 167 break;
114 168
@@ -147,7 +201,8 @@ static void discard_swap_cluster(struct swap_info_struct *si,
147 start_block <<= PAGE_SHIFT - 9; 201 start_block <<= PAGE_SHIFT - 9;
148 nr_blocks <<= PAGE_SHIFT - 9; 202 nr_blocks <<= PAGE_SHIFT - 9;
149 if (blkdev_issue_discard(si->bdev, start_block, 203 if (blkdev_issue_discard(si->bdev, start_block,
150 nr_blocks, GFP_NOIO)) 204 nr_blocks, GFP_NOIO,
205 DISCARD_FL_BARRIER))
151 break; 206 break;
152 } 207 }
153 208
@@ -167,7 +222,8 @@ static int wait_for_discard(void *word)
167#define SWAPFILE_CLUSTER 256 222#define SWAPFILE_CLUSTER 256
168#define LATENCY_LIMIT 256 223#define LATENCY_LIMIT 256
169 224
170static inline unsigned long scan_swap_map(struct swap_info_struct *si) 225static inline unsigned long scan_swap_map(struct swap_info_struct *si,
226 int cache)
171{ 227{
172 unsigned long offset; 228 unsigned long offset;
173 unsigned long scan_base; 229 unsigned long scan_base;
@@ -273,6 +329,19 @@ checks:
273 goto no_page; 329 goto no_page;
274 if (offset > si->highest_bit) 330 if (offset > si->highest_bit)
275 scan_base = offset = si->lowest_bit; 331 scan_base = offset = si->lowest_bit;
332
333 /* reuse swap entry of cache-only swap if not busy. */
334 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
335 int swap_was_freed;
336 spin_unlock(&swap_lock);
337 swap_was_freed = __try_to_reclaim_swap(si, offset);
338 spin_lock(&swap_lock);
339 /* entry was freed successfully, try to use this again */
340 if (swap_was_freed)
341 goto checks;
342 goto scan; /* check next one */
343 }
344
276 if (si->swap_map[offset]) 345 if (si->swap_map[offset])
277 goto scan; 346 goto scan;
278 347
@@ -285,7 +354,10 @@ checks:
285 si->lowest_bit = si->max; 354 si->lowest_bit = si->max;
286 si->highest_bit = 0; 355 si->highest_bit = 0;
287 } 356 }
288 si->swap_map[offset] = 1; 357 if (cache == SWAP_CACHE) /* at usual swap-out via vmscan.c */
358 si->swap_map[offset] = encode_swapmap(0, true);
359 else /* at suspend */
360 si->swap_map[offset] = encode_swapmap(1, false);
289 si->cluster_next = offset + 1; 361 si->cluster_next = offset + 1;
290 si->flags -= SWP_SCANNING; 362 si->flags -= SWP_SCANNING;
291 363
@@ -351,6 +423,10 @@ scan:
351 spin_lock(&swap_lock); 423 spin_lock(&swap_lock);
352 goto checks; 424 goto checks;
353 } 425 }
426 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
427 spin_lock(&swap_lock);
428 goto checks;
429 }
354 if (unlikely(--latency_ration < 0)) { 430 if (unlikely(--latency_ration < 0)) {
355 cond_resched(); 431 cond_resched();
356 latency_ration = LATENCY_LIMIT; 432 latency_ration = LATENCY_LIMIT;
@@ -362,6 +438,10 @@ scan:
362 spin_lock(&swap_lock); 438 spin_lock(&swap_lock);
363 goto checks; 439 goto checks;
364 } 440 }
441 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
442 spin_lock(&swap_lock);
443 goto checks;
444 }
365 if (unlikely(--latency_ration < 0)) { 445 if (unlikely(--latency_ration < 0)) {
366 cond_resched(); 446 cond_resched();
367 latency_ration = LATENCY_LIMIT; 447 latency_ration = LATENCY_LIMIT;
@@ -401,7 +481,8 @@ swp_entry_t get_swap_page(void)
401 continue; 481 continue;
402 482
403 swap_list.next = next; 483 swap_list.next = next;
404 offset = scan_swap_map(si); 484 /* This is called for allocating swap entry for cache */
485 offset = scan_swap_map(si, SWAP_CACHE);
405 if (offset) { 486 if (offset) {
406 spin_unlock(&swap_lock); 487 spin_unlock(&swap_lock);
407 return swp_entry(type, offset); 488 return swp_entry(type, offset);
@@ -415,6 +496,7 @@ noswap:
415 return (swp_entry_t) {0}; 496 return (swp_entry_t) {0};
416} 497}
417 498
499/* The only caller of this function is now the suspend routine */
418swp_entry_t get_swap_page_of_type(int type) 500swp_entry_t get_swap_page_of_type(int type)
419{ 501{
420 struct swap_info_struct *si; 502 struct swap_info_struct *si;
@@ -424,7 +506,8 @@ swp_entry_t get_swap_page_of_type(int type)
424 si = swap_info + type; 506 si = swap_info + type;
425 if (si->flags & SWP_WRITEOK) { 507 if (si->flags & SWP_WRITEOK) {
426 nr_swap_pages--; 508 nr_swap_pages--;
427 offset = scan_swap_map(si); 509 /* This is called for allocating swap entry, not cache */
510 offset = scan_swap_map(si, SWAP_MAP);
428 if (offset) { 511 if (offset) {
429 spin_unlock(&swap_lock); 512 spin_unlock(&swap_lock);
430 return swp_entry(type, offset); 513 return swp_entry(type, offset);
@@ -471,26 +554,40 @@ out:
471 return NULL; 554 return NULL;
472} 555}
473 556
474static int swap_entry_free(struct swap_info_struct *p, swp_entry_t ent) 557static int swap_entry_free(struct swap_info_struct *p,
558 swp_entry_t ent, int cache)
475{ 559{
476 unsigned long offset = swp_offset(ent); 560 unsigned long offset = swp_offset(ent);
477 int count = p->swap_map[offset]; 561 int count = swap_count(p->swap_map[offset]);
478 562 bool has_cache;
479 if (count < SWAP_MAP_MAX) { 563
480 count--; 564 has_cache = swap_has_cache(p->swap_map[offset]);
481 p->swap_map[offset] = count; 565
482 if (!count) { 566 if (cache == SWAP_MAP) { /* dropping usage count of swap */
483 if (offset < p->lowest_bit) 567 if (count < SWAP_MAP_MAX) {
484 p->lowest_bit = offset; 568 count--;
485 if (offset > p->highest_bit) 569 p->swap_map[offset] = encode_swapmap(count, has_cache);
486 p->highest_bit = offset;
487 if (p->prio > swap_info[swap_list.next].prio)
488 swap_list.next = p - swap_info;
489 nr_swap_pages++;
490 p->inuse_pages--;
491 mem_cgroup_uncharge_swap(ent);
492 } 570 }
571 } else { /* dropping swap cache flag */
572 VM_BUG_ON(!has_cache);
573 p->swap_map[offset] = encode_swapmap(count, false);
574
575 }
576 /* return code. */
577 count = p->swap_map[offset];
578 /* free if no reference */
579 if (!count) {
580 if (offset < p->lowest_bit)
581 p->lowest_bit = offset;
582 if (offset > p->highest_bit)
583 p->highest_bit = offset;
584 if (p->prio > swap_info[swap_list.next].prio)
585 swap_list.next = p - swap_info;
586 nr_swap_pages++;
587 p->inuse_pages--;
493 } 588 }
589 if (!swap_count(count))
590 mem_cgroup_uncharge_swap(ent);
494 return count; 591 return count;
495} 592}
496 593
@@ -504,9 +601,33 @@ void swap_free(swp_entry_t entry)
504 601
505 p = swap_info_get(entry); 602 p = swap_info_get(entry);
506 if (p) { 603 if (p) {
507 swap_entry_free(p, entry); 604 swap_entry_free(p, entry, SWAP_MAP);
605 spin_unlock(&swap_lock);
606 }
607}
608
609/*
610 * Called after dropping swapcache to decrease refcnt to swap entries.
611 */
612void swapcache_free(swp_entry_t entry, struct page *page)
613{
614 struct swap_info_struct *p;
615 int ret;
616
617 p = swap_info_get(entry);
618 if (p) {
619 ret = swap_entry_free(p, entry, SWAP_CACHE);
620 if (page) {
621 bool swapout;
622 if (ret)
623 swapout = true; /* the end of swap out */
624 else
625 swapout = false; /* no more swap users! */
626 mem_cgroup_uncharge_swapcache(page, entry, swapout);
627 }
508 spin_unlock(&swap_lock); 628 spin_unlock(&swap_lock);
509 } 629 }
630 return;
510} 631}
511 632
512/* 633/*
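With the split above, the two kinds of references have separate release paths: swap_free() drops a page-table (usage) reference, while swapcache_free() drops the SWAP_HAS_CACHE reference. A hedged sketch of how a caller pairs them; both helper names below are made up, and the real call sites (PTE teardown in mm/memory.c, swap-cache removal in mm/swap_state.c) may differ in detail.

/* Illustrative pairing only; not actual call sites. */
#include <linux/swap.h>
#include <linux/mm.h>

/* A user mapping of the entry goes away: drop one usage reference. */
static void example_zap_swap_pte(swp_entry_t entry)
{
	swap_free(entry);		/* decrements the count half of swap_map[] */
}

/* The page leaves the swap cache: drop the SWAP_HAS_CACHE reference. */
static void example_drop_swap_cache(struct page *page, swp_entry_t entry)
{
	/* passing the page lets memcg uncharge the swapcache as well */
	swapcache_free(entry, page);
}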
@@ -521,8 +642,7 @@ static inline int page_swapcount(struct page *page)
521 entry.val = page_private(page); 642 entry.val = page_private(page);
522 p = swap_info_get(entry); 643 p = swap_info_get(entry);
523 if (p) { 644 if (p) {
524 /* Subtract the 1 for the swap cache itself */ 645 count = swap_count(p->swap_map[swp_offset(entry)]);
525 count = p->swap_map[swp_offset(entry)] - 1;
526 spin_unlock(&swap_lock); 646 spin_unlock(&swap_lock);
527 } 647 }
528 return count; 648 return count;
@@ -579,12 +699,12 @@ int free_swap_and_cache(swp_entry_t entry)
579 struct swap_info_struct *p; 699 struct swap_info_struct *p;
580 struct page *page = NULL; 700 struct page *page = NULL;
581 701
582 if (is_migration_entry(entry)) 702 if (non_swap_entry(entry))
583 return 1; 703 return 1;
584 704
585 p = swap_info_get(entry); 705 p = swap_info_get(entry);
586 if (p) { 706 if (p) {
587 if (swap_entry_free(p, entry) == 1) { 707 if (swap_entry_free(p, entry, SWAP_MAP) == SWAP_HAS_CACHE) {
588 page = find_get_page(&swapper_space, entry.val); 708 page = find_get_page(&swapper_space, entry.val);
589 if (page && !trylock_page(page)) { 709 if (page && !trylock_page(page)) {
590 page_cache_release(page); 710 page_cache_release(page);
@@ -635,7 +755,7 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
635 755
636 if (!bdev) { 756 if (!bdev) {
637 if (bdev_p) 757 if (bdev_p)
638 *bdev_p = bdget(sis->bdev->bd_dev); 758 *bdev_p = bdgrab(sis->bdev);
639 759
640 spin_unlock(&swap_lock); 760 spin_unlock(&swap_lock);
641 return i; 761 return i;
@@ -647,7 +767,7 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
647 struct swap_extent, list); 767 struct swap_extent, list);
648 if (se->start_block == offset) { 768 if (se->start_block == offset) {
649 if (bdev_p) 769 if (bdev_p)
650 *bdev_p = bdget(sis->bdev->bd_dev); 770 *bdev_p = bdgrab(sis->bdev);
651 771
652 spin_unlock(&swap_lock); 772 spin_unlock(&swap_lock);
653 bdput(bdev); 773 bdput(bdev);
@@ -891,7 +1011,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
891 i = 1; 1011 i = 1;
892 } 1012 }
893 count = si->swap_map[i]; 1013 count = si->swap_map[i];
894 if (count && count != SWAP_MAP_BAD) 1014 if (count && swap_count(count) != SWAP_MAP_BAD)
895 break; 1015 break;
896 } 1016 }
897 return i; 1017 return i;
@@ -995,13 +1115,13 @@ static int try_to_unuse(unsigned int type)
995 */ 1115 */
996 shmem = 0; 1116 shmem = 0;
997 swcount = *swap_map; 1117 swcount = *swap_map;
998 if (swcount > 1) { 1118 if (swap_count(swcount)) {
999 if (start_mm == &init_mm) 1119 if (start_mm == &init_mm)
1000 shmem = shmem_unuse(entry, page); 1120 shmem = shmem_unuse(entry, page);
1001 else 1121 else
1002 retval = unuse_mm(start_mm, entry, page); 1122 retval = unuse_mm(start_mm, entry, page);
1003 } 1123 }
1004 if (*swap_map > 1) { 1124 if (swap_count(*swap_map)) {
1005 int set_start_mm = (*swap_map >= swcount); 1125 int set_start_mm = (*swap_map >= swcount);
1006 struct list_head *p = &start_mm->mmlist; 1126 struct list_head *p = &start_mm->mmlist;
1007 struct mm_struct *new_start_mm = start_mm; 1127 struct mm_struct *new_start_mm = start_mm;
@@ -1011,7 +1131,7 @@ static int try_to_unuse(unsigned int type)
1011 atomic_inc(&new_start_mm->mm_users); 1131 atomic_inc(&new_start_mm->mm_users);
1012 atomic_inc(&prev_mm->mm_users); 1132 atomic_inc(&prev_mm->mm_users);
1013 spin_lock(&mmlist_lock); 1133 spin_lock(&mmlist_lock);
1014 while (*swap_map > 1 && !retval && !shmem && 1134 while (swap_count(*swap_map) && !retval && !shmem &&
1015 (p = p->next) != &start_mm->mmlist) { 1135 (p = p->next) != &start_mm->mmlist) {
1016 mm = list_entry(p, struct mm_struct, mmlist); 1136 mm = list_entry(p, struct mm_struct, mmlist);
1017 if (!atomic_inc_not_zero(&mm->mm_users)) 1137 if (!atomic_inc_not_zero(&mm->mm_users))
@@ -1023,13 +1143,14 @@ static int try_to_unuse(unsigned int type)
1023 cond_resched(); 1143 cond_resched();
1024 1144
1025 swcount = *swap_map; 1145 swcount = *swap_map;
1026 if (swcount <= 1) 1146 if (!swap_count(swcount)) /* any usage ? */
1027 ; 1147 ;
1028 else if (mm == &init_mm) { 1148 else if (mm == &init_mm) {
1029 set_start_mm = 1; 1149 set_start_mm = 1;
1030 shmem = shmem_unuse(entry, page); 1150 shmem = shmem_unuse(entry, page);
1031 } else 1151 } else
1032 retval = unuse_mm(mm, entry, page); 1152 retval = unuse_mm(mm, entry, page);
1153
1033 if (set_start_mm && *swap_map < swcount) { 1154 if (set_start_mm && *swap_map < swcount) {
1034 mmput(new_start_mm); 1155 mmput(new_start_mm);
1035 atomic_inc(&mm->mm_users); 1156 atomic_inc(&mm->mm_users);
@@ -1057,21 +1178,25 @@ static int try_to_unuse(unsigned int type)
1057 } 1178 }
1058 1179
1059 /* 1180 /*
1060 * How could swap count reach 0x7fff when the maximum 1181 * How could the swap count reach 0x7ffe?
1061 * pid is 0x7fff, and there's no way to repeat a swap 1182 * There's no way to repeat a swap page within an mm
1062 * page within an mm (except in shmem, where it's the 1183 * (except in shmem, where it's the shared object which takes
1063 * shared object which takes the reference count)? 1184 * the reference count)?
1064 * We believe SWAP_MAP_MAX cannot occur in Linux 2.4. 1185 * We believe SWAP_MAP_MAX cannot occur (if it did, an unsigned
1065 * 1186 * short would be too small to hold the count anyway).
1066 * If that's wrong, then we should worry more about 1187 * If that's wrong, then we should worry more about
1067 * exit_mmap() and do_munmap() cases described above: 1188 * exit_mmap() and do_munmap() cases described above:
1068 * we might be resetting SWAP_MAP_MAX too early here. 1189 * we might be resetting SWAP_MAP_MAX too early here.
1069 * We know "Undead"s can happen, they're okay, so don't 1190 * We know "Undead"s can happen, they're okay, so don't
1070 * report them; but do report if we reset SWAP_MAP_MAX. 1191 * report them; but do report if we reset SWAP_MAP_MAX.
1071 */ 1192 */
1072 if (*swap_map == SWAP_MAP_MAX) { 1193 /* We might release the lock_page() in unuse_mm(). */
1194 if (!PageSwapCache(page) || page_private(page) != entry.val)
1195 goto retry;
1196
1197 if (swap_count(*swap_map) == SWAP_MAP_MAX) {
1073 spin_lock(&swap_lock); 1198 spin_lock(&swap_lock);
1074 *swap_map = 1; 1199 *swap_map = encode_swapmap(0, true);
1075 spin_unlock(&swap_lock); 1200 spin_unlock(&swap_lock);
1076 reset_overflow = 1; 1201 reset_overflow = 1;
1077 } 1202 }
@@ -1089,7 +1214,8 @@ static int try_to_unuse(unsigned int type)
1089 * pages would be incorrect if swap supported "shared 1214 * pages would be incorrect if swap supported "shared
1090 * private" pages, but they are handled by tmpfs files. 1215 * private" pages, but they are handled by tmpfs files.
1091 */ 1216 */
1092 if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) { 1217 if (swap_count(*swap_map) &&
1218 PageDirty(page) && PageSwapCache(page)) {
1093 struct writeback_control wbc = { 1219 struct writeback_control wbc = {
1094 .sync_mode = WB_SYNC_NONE, 1220 .sync_mode = WB_SYNC_NONE,
1095 }; 1221 };
@@ -1116,6 +1242,7 @@ static int try_to_unuse(unsigned int type)
1116 * mark page dirty so shrink_page_list will preserve it. 1242 * mark page dirty so shrink_page_list will preserve it.
1117 */ 1243 */
1118 SetPageDirty(page); 1244 SetPageDirty(page);
1245retry:
1119 unlock_page(page); 1246 unlock_page(page);
1120 page_cache_release(page); 1247 page_cache_release(page);
1121 1248
@@ -1447,9 +1574,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1447 p->flags &= ~SWP_WRITEOK; 1574 p->flags &= ~SWP_WRITEOK;
1448 spin_unlock(&swap_lock); 1575 spin_unlock(&swap_lock);
1449 1576
1450 current->flags |= PF_SWAPOFF; 1577 current->flags |= PF_OOM_ORIGIN;
1451 err = try_to_unuse(type); 1578 err = try_to_unuse(type);
1452 current->flags &= ~PF_SWAPOFF; 1579 current->flags &= ~PF_OOM_ORIGIN;
1453 1580
1454 if (err) { 1581 if (err) {
1455 /* re-insert swap space back into swap_list */ 1582 /* re-insert swap space back into swap_list */
@@ -1846,12 +1973,14 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1846 goto bad_swap; 1973 goto bad_swap;
1847 } 1974 }
1848 1975
1849 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { 1976 if (p->bdev) {
1850 p->flags |= SWP_SOLIDSTATE; 1977 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
1851 p->cluster_next = 1 + (random32() % p->highest_bit); 1978 p->flags |= SWP_SOLIDSTATE;
1979 p->cluster_next = 1 + (random32() % p->highest_bit);
1980 }
1981 if (discard_swap(p) == 0)
1982 p->flags |= SWP_DISCARDABLE;
1852 } 1983 }
1853 if (discard_swap(p) == 0)
1854 p->flags |= SWP_DISCARDABLE;
1855 1984
1856 mutex_lock(&swapon_mutex); 1985 mutex_lock(&swapon_mutex);
1857 spin_lock(&swap_lock); 1986 spin_lock(&swap_lock);
@@ -1942,15 +2071,23 @@ void si_swapinfo(struct sysinfo *val)
1942 * 2071 *
1943 * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as 2072 * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as
1944 * "permanent", but will be reclaimed by the next swapoff. 2073 * "permanent", but will be reclaimed by the next swapoff.
2074 * Returns an error code in the following cases:
2075 * - success -> 0
2076 * - swp_entry is invalid -> EINVAL
2077 * - swp_entry is migration entry -> EINVAL
2078 * - swap-cache reference is requested but there is already one. -> EEXIST
2079 * - swap-cache reference is requested but the entry is not used. -> ENOENT
1945 */ 2080 */
1946int swap_duplicate(swp_entry_t entry) 2081static int __swap_duplicate(swp_entry_t entry, bool cache)
1947{ 2082{
1948 struct swap_info_struct * p; 2083 struct swap_info_struct * p;
1949 unsigned long offset, type; 2084 unsigned long offset, type;
1950 int result = 0; 2085 int result = -EINVAL;
2086 int count;
2087 bool has_cache;
1951 2088
1952 if (is_migration_entry(entry)) 2089 if (non_swap_entry(entry))
1953 return 1; 2090 return -EINVAL;
1954 2091
1955 type = swp_type(entry); 2092 type = swp_type(entry);
1956 if (type >= nr_swapfiles) 2093 if (type >= nr_swapfiles)
@@ -1959,17 +2096,40 @@ int swap_duplicate(swp_entry_t entry)
1959 offset = swp_offset(entry); 2096 offset = swp_offset(entry);
1960 2097
1961 spin_lock(&swap_lock); 2098 spin_lock(&swap_lock);
1962 if (offset < p->max && p->swap_map[offset]) { 2099
1963 if (p->swap_map[offset] < SWAP_MAP_MAX - 1) { 2100 if (unlikely(offset >= p->max))
1964 p->swap_map[offset]++; 2101 goto unlock_out;
1965 result = 1; 2102
1966 } else if (p->swap_map[offset] <= SWAP_MAP_MAX) { 2103 count = swap_count(p->swap_map[offset]);
2104 has_cache = swap_has_cache(p->swap_map[offset]);
2105
2106 if (cache == SWAP_CACHE) { /* called for swapcache/swapin-readahead */
2107
2108 /* set SWAP_HAS_CACHE if there is no cache and entry is used */
2109 if (!has_cache && count) {
2110 p->swap_map[offset] = encode_swapmap(count, true);
2111 result = 0;
2112 } else if (has_cache) /* someone added cache */
2113 result = -EEXIST;
2114 else if (!count) /* no users */
2115 result = -ENOENT;
2116
2117 } else if (count || has_cache) {
2118 if (count < SWAP_MAP_MAX - 1) {
2119 p->swap_map[offset] = encode_swapmap(count + 1,
2120 has_cache);
2121 result = 0;
2122 } else if (count <= SWAP_MAP_MAX) {
1967 if (swap_overflow++ < 5) 2123 if (swap_overflow++ < 5)
1968 printk(KERN_WARNING "swap_dup: swap entry overflow\n"); 2124 printk(KERN_WARNING
1969 p->swap_map[offset] = SWAP_MAP_MAX; 2125 "swap_dup: swap entry overflow\n");
1970 result = 1; 2126 p->swap_map[offset] = encode_swapmap(SWAP_MAP_MAX,
2127 has_cache);
2128 result = 0;
1971 } 2129 }
1972 } 2130 } else
2131 result = -ENOENT; /* unused swap entry */
2132unlock_out:
1973 spin_unlock(&swap_lock); 2133 spin_unlock(&swap_lock);
1974out: 2134out:
1975 return result; 2135 return result;
@@ -1978,6 +2138,27 @@ bad_file:
1978 printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); 2138 printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
1979 goto out; 2139 goto out;
1980} 2140}
2141/*
2142 * increase reference count of swap entry by 1.
2143 */
2144void swap_duplicate(swp_entry_t entry)
2145{
2146 __swap_duplicate(entry, SWAP_MAP);
2147}
2148
2149/*
2150 * @entry: swap entry for which we allocate swap cache.
2151 *
2152 * Called when allocating swap cache for an existing swap entry.
2153 * This can return error codes. Returns 0 on success.
2154 * -EEXIST means there is already a swap cache.
2155 * Note: return code is different from swap_duplicate().
2156 */
2157int swapcache_prepare(swp_entry_t entry)
2158{
2159 return __swap_duplicate(entry, SWAP_CACHE);
2160}
2161
1981 2162
1982struct swap_info_struct * 2163struct swap_info_struct *
1983get_swap_info_struct(unsigned type) 2164get_swap_info_struct(unsigned type)
@@ -2016,7 +2197,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
2016 /* Don't read in free or bad pages */ 2197 /* Don't read in free or bad pages */
2017 if (!si->swap_map[toff]) 2198 if (!si->swap_map[toff])
2018 break; 2199 break;
2019 if (si->swap_map[toff] == SWAP_MAP_BAD) 2200 if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
2020 break; 2201 break;
2021 } 2202 }
2022 /* Count contiguous allocated slots below our target */ 2203 /* Count contiguous allocated slots below our target */
@@ -2024,7 +2205,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
2024 /* Don't read in free or bad pages */ 2205 /* Don't read in free or bad pages */
2025 if (!si->swap_map[toff]) 2206 if (!si->swap_map[toff])
2026 break; 2207 break;
2027 if (si->swap_map[toff] == SWAP_MAP_BAD) 2208 if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
2028 break; 2209 break;
2029 } 2210 }
2030 spin_unlock(&swap_lock); 2211 spin_unlock(&swap_lock);
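Taken together, the reworked swapfile.c interface gives callers two entry points with distinct semantics: swap_duplicate() bumps the usage count (and now returns void), while swapcache_prepare() claims the SWAP_HAS_CACHE bit and reports races through its return value. A hedged sketch of the expected calling patterns; the function names are illustrative, and the real logic in mm/swap_state.c and mm/memory.c may differ.

/* Illustrative sketch based on the return-code contract above. */
#include <linux/swap.h>
#include <linux/errno.h>

/* Swapin path: try to claim the swap-cache slot for this entry. */
static int example_claim_swapcache(swp_entry_t entry)
{
	int err = swapcache_prepare(entry);

	if (!err)
		return 0;		/* SWAP_HAS_CACHE is now ours */
	if (err == -EEXIST)
		return -EEXIST;		/* raced: someone else is adding the page */
	return err;			/* -ENOENT/-EINVAL: entry freed or invalid */
}

/* Fork/copy of a swap pte: just bump the usage count. */
static void example_copy_swap_pte(swp_entry_t entry)
{
	swap_duplicate(entry);		/* void now; overflow is capped internally */
}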
diff --git a/mm/thrash.c b/mm/thrash.c
index c4c5205a9c35..2372d4ed5dd8 100644
--- a/mm/thrash.c
+++ b/mm/thrash.c
@@ -26,47 +26,45 @@ static DEFINE_SPINLOCK(swap_token_lock);
26struct mm_struct *swap_token_mm; 26struct mm_struct *swap_token_mm;
27static unsigned int global_faults; 27static unsigned int global_faults;
28 28
29void grab_swap_token(void) 29void grab_swap_token(struct mm_struct *mm)
30{ 30{
31 int current_interval; 31 int current_interval;
32 32
33 global_faults++; 33 global_faults++;
34 34
35 current_interval = global_faults - current->mm->faultstamp; 35 current_interval = global_faults - mm->faultstamp;
36 36
37 if (!spin_trylock(&swap_token_lock)) 37 if (!spin_trylock(&swap_token_lock))
38 return; 38 return;
39 39
40 /* First come first served */ 40 /* First come first served */
41 if (swap_token_mm == NULL) { 41 if (swap_token_mm == NULL) {
42 current->mm->token_priority = current->mm->token_priority + 2; 42 mm->token_priority = mm->token_priority + 2;
43 swap_token_mm = current->mm; 43 swap_token_mm = mm;
44 goto out; 44 goto out;
45 } 45 }
46 46
47 if (current->mm != swap_token_mm) { 47 if (mm != swap_token_mm) {
48 if (current_interval < current->mm->last_interval) 48 if (current_interval < mm->last_interval)
49 current->mm->token_priority++; 49 mm->token_priority++;
50 else { 50 else {
51 if (likely(current->mm->token_priority > 0)) 51 if (likely(mm->token_priority > 0))
52 current->mm->token_priority--; 52 mm->token_priority--;
53 } 53 }
54 /* Check if we deserve the token */ 54 /* Check if we deserve the token */
55 if (current->mm->token_priority > 55 if (mm->token_priority > swap_token_mm->token_priority) {
56 swap_token_mm->token_priority) { 56 mm->token_priority += 2;
57 current->mm->token_priority += 2; 57 swap_token_mm = mm;
58 swap_token_mm = current->mm;
59 } 58 }
60 } else { 59 } else {
61 /* Token holder came in again! */ 60 /* Token holder came in again! */
62 current->mm->token_priority += 2; 61 mm->token_priority += 2;
63 } 62 }
64 63
65out: 64out:
66 current->mm->faultstamp = global_faults; 65 mm->faultstamp = global_faults;
67 current->mm->last_interval = current_interval; 66 mm->last_interval = current_interval;
68 spin_unlock(&swap_token_lock); 67 spin_unlock(&swap_token_lock);
69return;
70} 68}
71 69
72/* Called on process exit. */ 70/* Called on process exit. */
diff --git a/mm/truncate.c b/mm/truncate.c
index 1229211104f8..450cebdabfc0 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -50,7 +50,7 @@ void do_invalidatepage(struct page *page, unsigned long offset)
50static inline void truncate_partial_page(struct page *page, unsigned partial) 50static inline void truncate_partial_page(struct page *page, unsigned partial)
51{ 51{
52 zero_user_segment(page, partial, PAGE_CACHE_SIZE); 52 zero_user_segment(page, partial, PAGE_CACHE_SIZE);
53 if (PagePrivate(page)) 53 if (page_has_private(page))
54 do_invalidatepage(page, partial); 54 do_invalidatepage(page, partial);
55} 55}
56 56
@@ -93,13 +93,13 @@ EXPORT_SYMBOL(cancel_dirty_page);
93 * its lock, b) when a concurrent invalidate_mapping_pages got there first and 93 * its lock, b) when a concurrent invalidate_mapping_pages got there first and
94 * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space. 94 * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
95 */ 95 */
96static void 96static int
97truncate_complete_page(struct address_space *mapping, struct page *page) 97truncate_complete_page(struct address_space *mapping, struct page *page)
98{ 98{
99 if (page->mapping != mapping) 99 if (page->mapping != mapping)
100 return; 100 return -EIO;
101 101
102 if (PagePrivate(page)) 102 if (page_has_private(page))
103 do_invalidatepage(page, 0); 103 do_invalidatepage(page, 0);
104 104
105 cancel_dirty_page(page, PAGE_CACHE_SIZE); 105 cancel_dirty_page(page, PAGE_CACHE_SIZE);
@@ -108,6 +108,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
108 remove_from_page_cache(page); 108 remove_from_page_cache(page);
109 ClearPageMappedToDisk(page); 109 ClearPageMappedToDisk(page);
110 page_cache_release(page); /* pagecache ref */ 110 page_cache_release(page); /* pagecache ref */
111 return 0;
111} 112}
112 113
113/* 114/*
@@ -126,7 +127,7 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
126 if (page->mapping != mapping) 127 if (page->mapping != mapping)
127 return 0; 128 return 0;
128 129
129 if (PagePrivate(page) && !try_to_release_page(page, 0)) 130 if (page_has_private(page) && !try_to_release_page(page, 0))
130 return 0; 131 return 0;
131 132
132 clear_page_mlock(page); 133 clear_page_mlock(page);
@@ -135,6 +136,51 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
135 return ret; 136 return ret;
136} 137}
137 138
139int truncate_inode_page(struct address_space *mapping, struct page *page)
140{
141 if (page_mapped(page)) {
142 unmap_mapping_range(mapping,
143 (loff_t)page->index << PAGE_CACHE_SHIFT,
144 PAGE_CACHE_SIZE, 0);
145 }
146 return truncate_complete_page(mapping, page);
147}
148
149/*
150 * Used to get rid of pages on hardware memory corruption.
151 */
152int generic_error_remove_page(struct address_space *mapping, struct page *page)
153{
154 if (!mapping)
155 return -EINVAL;
156 /*
157 * Only punch for normal data pages for now.
158 * Handling other types like directories would need more auditing.
159 */
160 if (!S_ISREG(mapping->host->i_mode))
161 return -EIO;
162 return truncate_inode_page(mapping, page);
163}
164EXPORT_SYMBOL(generic_error_remove_page);
165
166/*
167 * Safely invalidate one page from its pagecache mapping.
168 * It only drops clean, unused pages. The page must be locked.
169 *
170 * Returns 1 if the page is successfully invalidated, otherwise 0.
171 */
172int invalidate_inode_page(struct page *page)
173{
174 struct address_space *mapping = page_mapping(page);
175 if (!mapping)
176 return 0;
177 if (PageDirty(page) || PageWriteback(page))
178 return 0;
179 if (page_mapped(page))
180 return 0;
181 return invalidate_complete_page(mapping, page);
182}
183
138/** 184/**
139 * truncate_inode_pages - truncate range of pages specified by start & end byte offsets 185 * truncate_inode_pages - truncate range of pages specified by start & end byte offsets
140 * @mapping: mapping to truncate 186 * @mapping: mapping to truncate
@@ -196,12 +242,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
196 unlock_page(page); 242 unlock_page(page);
197 continue; 243 continue;
198 } 244 }
199 if (page_mapped(page)) { 245 truncate_inode_page(mapping, page);
200 unmap_mapping_range(mapping,
201 (loff_t)page_index<<PAGE_CACHE_SHIFT,
202 PAGE_CACHE_SIZE, 0);
203 }
204 truncate_complete_page(mapping, page);
205 unlock_page(page); 246 unlock_page(page);
206 } 247 }
207 pagevec_release(&pvec); 248 pagevec_release(&pvec);
@@ -238,15 +279,10 @@ void truncate_inode_pages_range(struct address_space *mapping,
238 break; 279 break;
239 lock_page(page); 280 lock_page(page);
240 wait_on_page_writeback(page); 281 wait_on_page_writeback(page);
241 if (page_mapped(page)) { 282 truncate_inode_page(mapping, page);
242 unmap_mapping_range(mapping,
243 (loff_t)page->index<<PAGE_CACHE_SHIFT,
244 PAGE_CACHE_SIZE, 0);
245 }
246 if (page->index > next) 283 if (page->index > next)
247 next = page->index; 284 next = page->index;
248 next++; 285 next++;
249 truncate_complete_page(mapping, page);
250 unlock_page(page); 286 unlock_page(page);
251 } 287 }
252 pagevec_release(&pvec); 288 pagevec_release(&pvec);
@@ -267,8 +303,21 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
267} 303}
268EXPORT_SYMBOL(truncate_inode_pages); 304EXPORT_SYMBOL(truncate_inode_pages);
269 305
270unsigned long __invalidate_mapping_pages(struct address_space *mapping, 306/**
271 pgoff_t start, pgoff_t end, bool be_atomic) 307 * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
308 * @mapping: the address_space which holds the pages to invalidate
309 * @start: the offset 'from' which to invalidate
310 * @end: the offset 'to' which to invalidate (inclusive)
311 *
312 * This function only removes the unlocked pages; if you want to
313 * remove all the pages of one inode, you must call truncate_inode_pages.
314 *
315 * invalidate_mapping_pages() will not block on IO activity. It will not
316 * invalidate pages which are dirty, locked, under writeback or mapped into
317 * pagetables.
318 */
319unsigned long invalidate_mapping_pages(struct address_space *mapping,
320 pgoff_t start, pgoff_t end)
272{ 321{
273 struct pagevec pvec; 322 struct pagevec pvec;
274 pgoff_t next = start; 323 pgoff_t next = start;
@@ -298,41 +347,17 @@ unsigned long __invalidate_mapping_pages(struct address_space *mapping,
298 if (lock_failed) 347 if (lock_failed)
299 continue; 348 continue;
300 349
301 if (PageDirty(page) || PageWriteback(page)) 350 ret += invalidate_inode_page(page);
302 goto unlock; 351
303 if (page_mapped(page))
304 goto unlock;
305 ret += invalidate_complete_page(mapping, page);
306unlock:
307 unlock_page(page); 352 unlock_page(page);
308 if (next > end) 353 if (next > end)
309 break; 354 break;
310 } 355 }
311 pagevec_release(&pvec); 356 pagevec_release(&pvec);
312 if (likely(!be_atomic)) 357 cond_resched();
313 cond_resched();
314 } 358 }
315 return ret; 359 return ret;
316} 360}
317
318/**
319 * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
320 * @mapping: the address_space which holds the pages to invalidate
321 * @start: the offset 'from' which to invalidate
322 * @end: the offset 'to' which to invalidate (inclusive)
323 *
324 * This function only removes the unlocked pages, if you want to
325 * remove all the pages of one inode, you must call truncate_inode_pages.
326 *
327 * invalidate_mapping_pages() will not block on IO activity. It will not
328 * invalidate pages which are dirty, locked, under writeback or mapped into
329 * pagetables.
330 */
331unsigned long invalidate_mapping_pages(struct address_space *mapping,
332 pgoff_t start, pgoff_t end)
333{
334 return __invalidate_mapping_pages(mapping, start, end, false);
335}
336EXPORT_SYMBOL(invalidate_mapping_pages); 361EXPORT_SYMBOL(invalidate_mapping_pages);
337 362
338/* 363/*
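With invalidate_inode_page() factored out above, invalidate_mapping_pages() is now a thin loop around it that skips dirty, locked, writeback and mapped pages. A hedged usage sketch; drop_clean_cache() is a made-up caller in the spirit of fs/drop_caches.c.

/* Illustrative: drop every clean, unmapped, unlocked page of an inode. */
#include <linux/fs.h>
#include <linux/pagemap.h>

static unsigned long drop_clean_cache(struct inode *inode)
{
	/* 0 .. -1 covers the whole file; busy pages simply survive */
	return invalidate_mapping_pages(inode->i_mapping, 0, (pgoff_t)-1);
}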
@@ -348,7 +373,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
348 if (page->mapping != mapping) 373 if (page->mapping != mapping)
349 return 0; 374 return 0;
350 375
351 if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL)) 376 if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
352 return 0; 377 return 0;
353 378
354 spin_lock_irq(&mapping->tree_lock); 379 spin_lock_irq(&mapping->tree_lock);
@@ -356,9 +381,10 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
356 goto failed; 381 goto failed;
357 382
358 clear_page_mlock(page); 383 clear_page_mlock(page);
359 BUG_ON(PagePrivate(page)); 384 BUG_ON(page_has_private(page));
360 __remove_from_page_cache(page); 385 __remove_from_page_cache(page);
361 spin_unlock_irq(&mapping->tree_lock); 386 spin_unlock_irq(&mapping->tree_lock);
387 mem_cgroup_uncharge_cache_page(page);
362 page_cache_release(page); /* pagecache ref */ 388 page_cache_release(page); /* pagecache ref */
363 return 1; 389 return 1;
364failed: 390failed:
@@ -471,3 +497,67 @@ int invalidate_inode_pages2(struct address_space *mapping)
471 return invalidate_inode_pages2_range(mapping, 0, -1); 497 return invalidate_inode_pages2_range(mapping, 0, -1);
472} 498}
473EXPORT_SYMBOL_GPL(invalidate_inode_pages2); 499EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
500
501/**
502 * truncate_pagecache - unmap and remove pagecache that has been truncated
503 * @inode: inode
504 * @old: old file offset
505 * @new: new file offset
506 *
507 * inode's new i_size must already be written before truncate_pagecache
508 * is called.
509 *
510 * This function should typically be called before the filesystem
511 * releases resources associated with the freed range (eg. deallocates
512 * blocks). This way, pagecache will always stay logically coherent
513 * with on-disk format, and the filesystem would not have to deal with
514 * situations such as writepage being called for a page that has already
515 * had its underlying blocks deallocated.
516 */
517void truncate_pagecache(struct inode *inode, loff_t old, loff_t new)
518{
519 if (new < old) {
520 struct address_space *mapping = inode->i_mapping;
521
522 /*
523 * unmap_mapping_range is called twice, first simply for
524 * efficiency so that truncate_inode_pages does fewer
525 * single-page unmaps. However after this first call, and
526 * before truncate_inode_pages finishes, it is possible for
527 * private pages to be COWed, which remain after
528 * truncate_inode_pages finishes, hence the second
529 * unmap_mapping_range call must be made for correctness.
530 */
531 unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
532 truncate_inode_pages(mapping, new);
533 unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
534 }
535}
536EXPORT_SYMBOL(truncate_pagecache);
537
538/**
539 * vmtruncate - unmap mappings "freed" by truncate() syscall
540 * @inode: inode of the file used
541 * @offset: file offset to start truncating
542 *
543 * NOTE! We have to be ready to update the memory sharing
544 * between the file and the memory map for a potential last
545 * incomplete page. Ugly, but necessary.
546 */
547int vmtruncate(struct inode *inode, loff_t offset)
548{
549 loff_t oldsize;
550 int error;
551
552 error = inode_newsize_ok(inode, offset);
553 if (error)
554 return error;
555 oldsize = inode->i_size;
556 i_size_write(inode, offset);
557 truncate_pagecache(inode, oldsize, offset);
558 if (inode->i_op->truncate)
559 inode->i_op->truncate(inode);
560
561 return error;
562}
563EXPORT_SYMBOL(vmtruncate);
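The truncate_pagecache()/vmtruncate() pair above nails down the ordering: publish the new i_size, then unmap and drop the pagecache, and only then free on-disk blocks. A hedged sketch of a filesystem-side truncate built on the same helpers; myfs_setsize() and myfs_free_blocks() are made-up names standing in for filesystem-specific code.

/* Illustrative only; a real filesystem's truncate path will differ. */
#include <linux/fs.h>
#include <linux/mm.h>

/* made-up: free on-disk extents beyond newsize (stubbed for the sketch) */
static void myfs_free_blocks(struct inode *inode, loff_t newsize)
{
}

static int myfs_setsize(struct inode *inode, loff_t newsize)
{
	loff_t oldsize;
	int error;

	error = inode_newsize_ok(inode, newsize);	/* rlimit, IS_SWAPFILE, ... */
	if (error)
		return error;

	oldsize = inode->i_size;
	i_size_write(inode, newsize);			/* 1. publish the new size */
	truncate_pagecache(inode, oldsize, newsize);	/* 2. unmap + drop pagecache */
	myfs_free_blocks(inode, newsize);		/* 3. now free on-disk blocks */
	return 0;
}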
diff --git a/mm/util.c b/mm/util.c
index 7c122e49f769..7c35ad95f927 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -6,6 +6,9 @@
6#include <linux/sched.h> 6#include <linux/sched.h>
7#include <asm/uaccess.h> 7#include <asm/uaccess.h>
8 8
9#define CREATE_TRACE_POINTS
10#include <trace/events/kmem.h>
11
9/** 12/**
10 * kstrdup - allocate space for and copy an existing string 13 * kstrdup - allocate space for and copy an existing string
11 * @s: the string to duplicate 14 * @s: the string to duplicate
@@ -165,6 +168,10 @@ EXPORT_SYMBOL(krealloc);
165 * 168 *
166 * The memory of the object @p points to is zeroed before freed. 169 * The memory of the object @p points to is zeroed before freed.
167 * If @p is %NULL, kzfree() does nothing. 170 * If @p is %NULL, kzfree() does nothing.
171 *
172 * Note: this function zeroes the whole allocated buffer which can be a good
173 * deal bigger than the requested buffer size passed to kmalloc(). So be
174 * careful when using this function in performance sensitive code.
168 */ 175 */
169void kzfree(const void *p) 176void kzfree(const void *p)
170{ 177{
@@ -222,6 +229,30 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
222} 229}
223#endif 230#endif
224 231
232/**
233 * get_user_pages_fast() - pin user pages in memory
234 * @start: starting user address
235 * @nr_pages: number of pages from start to pin
236 * @write: whether pages will be written to
237 * @pages: array that receives pointers to the pages pinned.
238 * Should be at least nr_pages long.
239 *
240 * Returns number of pages pinned. This may be fewer than the number
241 * requested. If nr_pages is 0 or negative, returns 0. If no pages
242 * were pinned, returns -errno.
243 *
244 * get_user_pages_fast provides equivalent functionality to get_user_pages,
245 * operating on current and current->mm, with force=0 and vma=NULL. However
246 * unlike get_user_pages, it must be called without mmap_sem held.
247 *
248 * get_user_pages_fast may take mmap_sem and page table locks, so no
249 * assumptions can be made about lack of locking. get_user_pages_fast is to be
250 * implemented in a way that is advantageous (vs get_user_pages()) when the
251 * user memory area is already faulted in and present in ptes. However if the
252 * pages have to be faulted in, it may turn out to be slightly slower so
253 * callers need to carefully consider what to use. On many architectures,
254 * get_user_pages_fast simply falls back to get_user_pages.
255 */
225int __attribute__((weak)) get_user_pages_fast(unsigned long start, 256int __attribute__((weak)) get_user_pages_fast(unsigned long start,
226 int nr_pages, int write, struct page **pages) 257 int nr_pages, int write, struct page **pages)
227{ 258{
@@ -236,3 +267,11 @@ int __attribute__((weak)) get_user_pages_fast(unsigned long start,
236 return ret; 267 return ret;
237} 268}
238EXPORT_SYMBOL_GPL(get_user_pages_fast); 269EXPORT_SYMBOL_GPL(get_user_pages_fast);
270
271/* Tracepoints definitions. */
272EXPORT_TRACEPOINT_SYMBOL(kmalloc);
273EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
274EXPORT_TRACEPOINT_SYMBOL(kmalloc_node);
275EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc_node);
276EXPORT_TRACEPOINT_SYMBOL(kfree);
277EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free);
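As the get_user_pages_fast() documentation above notes, the call pins user pages without the caller holding mmap_sem and may return fewer pages than requested. A hedged usage sketch; pin_user_buffer() and the fixed page count are illustrative.

/* Illustrative: pin a few user pages for writing, use them, unpin them. */
#include <linux/mm.h>
#include <linux/errno.h>

#define EXAMPLE_NR_PAGES 4	/* arbitrary count for the sketch */

static int pin_user_buffer(unsigned long uaddr, struct page **pages)
{
	int i, got;

	/* write=1: we intend to modify the pinned pages */
	got = get_user_pages_fast(uaddr, EXAMPLE_NR_PAGES, 1, pages);
	if (got <= 0)
		return got ? got : -EFAULT;

	/* ... access pages[0..got-1] here; got may be < EXAMPLE_NR_PAGES ... */

	for (i = 0; i < got; i++)
		put_page(pages[i]);	/* drop the pin */
	return got;
}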
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index fab19876b4d1..0f551a4a44cd 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -12,6 +12,7 @@
12#include <linux/mm.h> 12#include <linux/mm.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/highmem.h> 14#include <linux/highmem.h>
15#include <linux/sched.h>
15#include <linux/slab.h> 16#include <linux/slab.h>
16#include <linux/spinlock.h> 17#include <linux/spinlock.h>
17#include <linux/interrupt.h> 18#include <linux/interrupt.h>
@@ -23,12 +24,12 @@
23#include <linux/rbtree.h> 24#include <linux/rbtree.h>
24#include <linux/radix-tree.h> 25#include <linux/radix-tree.h>
25#include <linux/rcupdate.h> 26#include <linux/rcupdate.h>
26#include <linux/bootmem.h>
27#include <linux/pfn.h> 27#include <linux/pfn.h>
28 28#include <linux/kmemleak.h>
29#include <asm/atomic.h> 29#include <asm/atomic.h>
30#include <asm/uaccess.h> 30#include <asm/uaccess.h>
31#include <asm/tlbflush.h> 31#include <asm/tlbflush.h>
32#include <asm/shmparam.h>
32 33
33 34
34/*** Page table manipulation functions ***/ 35/*** Page table manipulation functions ***/
@@ -168,11 +169,9 @@ static int vmap_page_range_noflush(unsigned long start, unsigned long end,
168 next = pgd_addr_end(addr, end); 169 next = pgd_addr_end(addr, end);
169 err = vmap_pud_range(pgd, addr, next, prot, pages, &nr); 170 err = vmap_pud_range(pgd, addr, next, prot, pages, &nr);
170 if (err) 171 if (err)
171 break; 172 return err;
172 } while (pgd++, addr = next, addr != end); 173 } while (pgd++, addr = next, addr != end);
173 174
174 if (unlikely(err))
175 return err;
176 return nr; 175 return nr;
177} 176}
178 177
@@ -186,7 +185,7 @@ static int vmap_page_range(unsigned long start, unsigned long end,
186 return ret; 185 return ret;
187} 186}
188 187
189static inline int is_vmalloc_or_module_addr(const void *x) 188int is_vmalloc_or_module_addr(const void *x)
190{ 189{
191 /* 190 /*
192 * ARM, x86-64 and sparc64 put modules in a special place, 191 * ARM, x86-64 and sparc64 put modules in a special place,
@@ -265,6 +264,7 @@ struct vmap_area {
265static DEFINE_SPINLOCK(vmap_area_lock); 264static DEFINE_SPINLOCK(vmap_area_lock);
266static struct rb_root vmap_area_root = RB_ROOT; 265static struct rb_root vmap_area_root = RB_ROOT;
267static LIST_HEAD(vmap_area_list); 266static LIST_HEAD(vmap_area_list);
267static unsigned long vmap_area_pcpu_hole;
268 268
269static struct vmap_area *__find_vmap_area(unsigned long addr) 269static struct vmap_area *__find_vmap_area(unsigned long addr)
270{ 270{
@@ -402,6 +402,7 @@ overflow:
402 printk(KERN_WARNING 402 printk(KERN_WARNING
403 "vmap allocation for size %lu failed: " 403 "vmap allocation for size %lu failed: "
404 "use vmalloc=<size> to increase size.\n", size); 404 "use vmalloc=<size> to increase size.\n", size);
405 kfree(va);
405 return ERR_PTR(-EBUSY); 406 return ERR_PTR(-EBUSY);
406 } 407 }
407 408
@@ -430,6 +431,15 @@ static void __free_vmap_area(struct vmap_area *va)
430 RB_CLEAR_NODE(&va->rb_node); 431 RB_CLEAR_NODE(&va->rb_node);
431 list_del_rcu(&va->list); 432 list_del_rcu(&va->list);
432 433
434 /*
435 * Track the highest possible candidate for pcpu area
436 * allocation. Areas outside of vmalloc area can be returned
437 * here too; consider only end addresses which fall inside
438 * vmalloc area proper.
439 */
440 if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END)
441 vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end);
442
433 call_rcu(&va->rcu_head, rcu_free_va); 443 call_rcu(&va->rcu_head, rcu_free_va);
434} 444}
435 445
@@ -1031,12 +1041,15 @@ void __init vmalloc_init(void)
1031 1041
1032 /* Import existing vmlist entries. */ 1042 /* Import existing vmlist entries. */
1033 for (tmp = vmlist; tmp; tmp = tmp->next) { 1043 for (tmp = vmlist; tmp; tmp = tmp->next) {
1034 va = alloc_bootmem(sizeof(struct vmap_area)); 1044 va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT);
1035 va->flags = tmp->flags | VM_VM_AREA; 1045 va->flags = tmp->flags | VM_VM_AREA;
1036 va->va_start = (unsigned long)tmp->addr; 1046 va->va_start = (unsigned long)tmp->addr;
1037 va->va_end = va->va_start + tmp->size; 1047 va->va_end = va->va_start + tmp->size;
1038 __insert_vmap_area(va); 1048 __insert_vmap_area(va);
1039 } 1049 }
1050
1051 vmap_area_pcpu_hole = VMALLOC_END;
1052
1040 vmap_initialized = true; 1053 vmap_initialized = true;
1041} 1054}
1042 1055
@@ -1121,14 +1134,34 @@ EXPORT_SYMBOL_GPL(map_vm_area);
1121DEFINE_RWLOCK(vmlist_lock); 1134DEFINE_RWLOCK(vmlist_lock);
1122struct vm_struct *vmlist; 1135struct vm_struct *vmlist;
1123 1136
1137static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1138 unsigned long flags, void *caller)
1139{
1140 struct vm_struct *tmp, **p;
1141
1142 vm->flags = flags;
1143 vm->addr = (void *)va->va_start;
1144 vm->size = va->va_end - va->va_start;
1145 vm->caller = caller;
1146 va->private = vm;
1147 va->flags |= VM_VM_AREA;
1148
1149 write_lock(&vmlist_lock);
1150 for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
1151 if (tmp->addr >= vm->addr)
1152 break;
1153 }
1154 vm->next = *p;
1155 *p = vm;
1156 write_unlock(&vmlist_lock);
1157}
1158
1124static struct vm_struct *__get_vm_area_node(unsigned long size, 1159static struct vm_struct *__get_vm_area_node(unsigned long size,
1125 unsigned long flags, unsigned long start, unsigned long end, 1160 unsigned long align, unsigned long flags, unsigned long start,
1126 int node, gfp_t gfp_mask, void *caller) 1161 unsigned long end, int node, gfp_t gfp_mask, void *caller)
1127{ 1162{
1128 static struct vmap_area *va; 1163 static struct vmap_area *va;
1129 struct vm_struct *area; 1164 struct vm_struct *area;
1130 struct vm_struct *tmp, **p;
1131 unsigned long align = 1;
1132 1165
1133 BUG_ON(in_interrupt()); 1166 BUG_ON(in_interrupt());
1134 if (flags & VM_IOREMAP) { 1167 if (flags & VM_IOREMAP) {
@@ -1146,7 +1179,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
1146 if (unlikely(!size)) 1179 if (unlikely(!size))
1147 return NULL; 1180 return NULL;
1148 1181
1149 area = kmalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); 1182 area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
1150 if (unlikely(!area)) 1183 if (unlikely(!area))
1151 return NULL; 1184 return NULL;
1152 1185
@@ -1161,32 +1194,14 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
1161 return NULL; 1194 return NULL;
1162 } 1195 }
1163 1196
1164 area->flags = flags; 1197 insert_vmalloc_vm(area, va, flags, caller);
1165 area->addr = (void *)va->va_start;
1166 area->size = size;
1167 area->pages = NULL;
1168 area->nr_pages = 0;
1169 area->phys_addr = 0;
1170 area->caller = caller;
1171 va->private = area;
1172 va->flags |= VM_VM_AREA;
1173
1174 write_lock(&vmlist_lock);
1175 for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
1176 if (tmp->addr >= area->addr)
1177 break;
1178 }
1179 area->next = *p;
1180 *p = area;
1181 write_unlock(&vmlist_lock);
1182
1183 return area; 1198 return area;
1184} 1199}
1185 1200
1186struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, 1201struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
1187 unsigned long start, unsigned long end) 1202 unsigned long start, unsigned long end)
1188{ 1203{
1189 return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL, 1204 return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL,
1190 __builtin_return_address(0)); 1205 __builtin_return_address(0));
1191} 1206}
1192EXPORT_SYMBOL_GPL(__get_vm_area); 1207EXPORT_SYMBOL_GPL(__get_vm_area);
@@ -1195,7 +1210,7 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
1195 unsigned long start, unsigned long end, 1210 unsigned long start, unsigned long end,
1196 void *caller) 1211 void *caller)
1197{ 1212{
1198 return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL, 1213 return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL,
1199 caller); 1214 caller);
1200} 1215}
1201 1216
@@ -1210,22 +1225,22 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
1210 */ 1225 */
1211struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) 1226struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
1212{ 1227{
1213 return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, 1228 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
1214 -1, GFP_KERNEL, __builtin_return_address(0)); 1229 -1, GFP_KERNEL, __builtin_return_address(0));
1215} 1230}
1216 1231
1217struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, 1232struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
1218 void *caller) 1233 void *caller)
1219{ 1234{
1220 return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, 1235 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
1221 -1, GFP_KERNEL, caller); 1236 -1, GFP_KERNEL, caller);
1222} 1237}
1223 1238
1224struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, 1239struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags,
1225 int node, gfp_t gfp_mask) 1240 int node, gfp_t gfp_mask)
1226{ 1241{
1227 return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node, 1242 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
1228 gfp_mask, __builtin_return_address(0)); 1243 node, gfp_mask, __builtin_return_address(0));
1229} 1244}
1230 1245
1231static struct vm_struct *find_vm_area(const void *addr) 1246static struct vm_struct *find_vm_area(const void *addr)
@@ -1255,17 +1270,21 @@ struct vm_struct *remove_vm_area(const void *addr)
1255 if (va && va->flags & VM_VM_AREA) { 1270 if (va && va->flags & VM_VM_AREA) {
1256 struct vm_struct *vm = va->private; 1271 struct vm_struct *vm = va->private;
1257 struct vm_struct *tmp, **p; 1272 struct vm_struct *tmp, **p;
1258 1273 /*
1259 vmap_debug_free_range(va->va_start, va->va_end); 1274 * remove from list and disallow access to this vm_struct
1260 free_unmap_vmap_area(va); 1275 * before unmap. (address range conflict detection is still
1261 vm->size -= PAGE_SIZE; 1276 * handled by vmap.)
1262 1277 */
1263 write_lock(&vmlist_lock); 1278 write_lock(&vmlist_lock);
1264 for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next) 1279 for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next)
1265 ; 1280 ;
1266 *p = tmp->next; 1281 *p = tmp->next;
1267 write_unlock(&vmlist_lock); 1282 write_unlock(&vmlist_lock);
1268 1283
1284 vmap_debug_free_range(va->va_start, va->va_end);
1285 free_unmap_vmap_area(va);
1286 vm->size -= PAGE_SIZE;
1287
1269 return vm; 1288 return vm;
1270 } 1289 }
1271 return NULL; 1290 return NULL;
@@ -1326,6 +1345,9 @@ static void __vunmap(const void *addr, int deallocate_pages)
1326void vfree(const void *addr) 1345void vfree(const void *addr)
1327{ 1346{
1328 BUG_ON(in_interrupt()); 1347 BUG_ON(in_interrupt());
1348
1349 kmemleak_free(addr);
1350
1329 __vunmap(addr, 1); 1351 __vunmap(addr, 1);
1330} 1352}
1331EXPORT_SYMBOL(vfree); 1353EXPORT_SYMBOL(vfree);
@@ -1364,7 +1386,7 @@ void *vmap(struct page **pages, unsigned int count,
1364 1386
1365 might_sleep(); 1387 might_sleep();
1366 1388
1367 if (count > num_physpages) 1389 if (count > totalram_pages)
1368 return NULL; 1390 return NULL;
1369 1391
1370 area = get_vm_area_caller((count << PAGE_SHIFT), flags, 1392 area = get_vm_area_caller((count << PAGE_SHIFT), flags,
@@ -1381,7 +1403,8 @@ void *vmap(struct page **pages, unsigned int count,
1381} 1403}
1382EXPORT_SYMBOL(vmap); 1404EXPORT_SYMBOL(vmap);
1383 1405
1384static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, 1406static void *__vmalloc_node(unsigned long size, unsigned long align,
1407 gfp_t gfp_mask, pgprot_t prot,
1385 int node, void *caller); 1408 int node, void *caller);
1386static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, 1409static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1387 pgprot_t prot, int node, void *caller) 1410 pgprot_t prot, int node, void *caller)
@@ -1395,7 +1418,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1395 area->nr_pages = nr_pages; 1418 area->nr_pages = nr_pages;
1396 /* Please note that the recursion is strictly bounded. */ 1419 /* Please note that the recursion is strictly bounded. */
1397 if (array_size > PAGE_SIZE) { 1420 if (array_size > PAGE_SIZE) {
1398 pages = __vmalloc_node(array_size, gfp_mask | __GFP_ZERO, 1421 pages = __vmalloc_node(array_size, 1, gfp_mask | __GFP_ZERO,
1399 PAGE_KERNEL, node, caller); 1422 PAGE_KERNEL, node, caller);
1400 area->flags |= VM_VPAGES; 1423 area->flags |= VM_VPAGES;
1401 } else { 1424 } else {
@@ -1438,13 +1461,23 @@ fail:
1438 1461
1439void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) 1462void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
1440{ 1463{
1441 return __vmalloc_area_node(area, gfp_mask, prot, -1, 1464 void *addr = __vmalloc_area_node(area, gfp_mask, prot, -1,
1442 __builtin_return_address(0)); 1465 __builtin_return_address(0));
1466
1467 /*
1468 * A ref_count = 3 is needed because the vm_struct and vmap_area
1469 * structures allocated in the __get_vm_area_node() function contain
1470 * references to the virtual address of the vmalloc'ed block.
1471 */
1472 kmemleak_alloc(addr, area->size - PAGE_SIZE, 3, gfp_mask);
1473
1474 return addr;
1443} 1475}
1444 1476
1445/** 1477/**
1446 * __vmalloc_node - allocate virtually contiguous memory 1478 * __vmalloc_node - allocate virtually contiguous memory
1447 * @size: allocation size 1479 * @size: allocation size
1480 * @align: desired alignment
1448 * @gfp_mask: flags for the page level allocator 1481 * @gfp_mask: flags for the page level allocator
1449 * @prot: protection mask for the allocated pages 1482 * @prot: protection mask for the allocated pages
1450 * @node: node to use for allocation or -1 1483 * @node: node to use for allocation or -1
@@ -1454,27 +1487,39 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
1454 * allocator with @gfp_mask flags. Map them into contiguous 1487 * allocator with @gfp_mask flags. Map them into contiguous
1455 * kernel virtual space, using a pagetable protection of @prot. 1488 * kernel virtual space, using a pagetable protection of @prot.
1456 */ 1489 */
1457static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, 1490static void *__vmalloc_node(unsigned long size, unsigned long align,
1458 int node, void *caller) 1491 gfp_t gfp_mask, pgprot_t prot,
1492 int node, void *caller)
1459{ 1493{
1460 struct vm_struct *area; 1494 struct vm_struct *area;
1495 void *addr;
1496 unsigned long real_size = size;
1461 1497
1462 size = PAGE_ALIGN(size); 1498 size = PAGE_ALIGN(size);
1463 if (!size || (size >> PAGE_SHIFT) > num_physpages) 1499 if (!size || (size >> PAGE_SHIFT) > totalram_pages)
1464 return NULL; 1500 return NULL;
1465 1501
1466 area = __get_vm_area_node(size, VM_ALLOC, VMALLOC_START, VMALLOC_END, 1502 area = __get_vm_area_node(size, align, VM_ALLOC, VMALLOC_START,
1467 node, gfp_mask, caller); 1503 VMALLOC_END, node, gfp_mask, caller);
1468 1504
1469 if (!area) 1505 if (!area)
1470 return NULL; 1506 return NULL;
1471 1507
1472 return __vmalloc_area_node(area, gfp_mask, prot, node, caller); 1508 addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller);
1509
1510 /*
1511 * A ref_count = 3 is needed because the vm_struct and vmap_area
1512 * structures allocated in the __get_vm_area_node() function contain
1513 * references to the virtual address of the vmalloc'ed block.
1514 */
1515 kmemleak_alloc(addr, real_size, 3, gfp_mask);
1516
1517 return addr;
1473} 1518}
1474 1519
1475void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) 1520void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
1476{ 1521{
1477 return __vmalloc_node(size, gfp_mask, prot, -1, 1522 return __vmalloc_node(size, 1, gfp_mask, prot, -1,
1478 __builtin_return_address(0)); 1523 __builtin_return_address(0));
1479} 1524}
1480EXPORT_SYMBOL(__vmalloc); 1525EXPORT_SYMBOL(__vmalloc);
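The new align argument threads from __vmalloc_node() down to __get_vm_area_node(); most callers pass 1, and vmalloc_user() below passes SHMLBA. A hedged sketch of an aligned allocation in the same style; the wrapper name is made up, and since __vmalloc_node() is static it would have to live inside mm/vmalloc.c.

/* Illustrative wrapper only; not part of the patch. */
static void *example_vmalloc_aligned(unsigned long size, unsigned long align)
{
	/* align should be a power of two; 1 means "no special alignment" */
	return __vmalloc_node(size, align,
			      GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
			      PAGE_KERNEL, -1, __builtin_return_address(0));
}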
@@ -1490,7 +1535,7 @@ EXPORT_SYMBOL(__vmalloc);
1490 */ 1535 */
1491void *vmalloc(unsigned long size) 1536void *vmalloc(unsigned long size)
1492{ 1537{
1493 return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, 1538 return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL,
1494 -1, __builtin_return_address(0)); 1539 -1, __builtin_return_address(0));
1495} 1540}
1496EXPORT_SYMBOL(vmalloc); 1541EXPORT_SYMBOL(vmalloc);
@@ -1507,7 +1552,8 @@ void *vmalloc_user(unsigned long size)
1507 struct vm_struct *area; 1552 struct vm_struct *area;
1508 void *ret; 1553 void *ret;
1509 1554
1510 ret = __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, 1555 ret = __vmalloc_node(size, SHMLBA,
1556 GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
1511 PAGE_KERNEL, -1, __builtin_return_address(0)); 1557 PAGE_KERNEL, -1, __builtin_return_address(0));
1512 if (ret) { 1558 if (ret) {
1513 area = find_vm_area(ret); 1559 area = find_vm_area(ret);
@@ -1530,7 +1576,7 @@ EXPORT_SYMBOL(vmalloc_user);
1530 */ 1576 */
1531void *vmalloc_node(unsigned long size, int node) 1577void *vmalloc_node(unsigned long size, int node)
1532{ 1578{
1533 return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, 1579 return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL,
1534 node, __builtin_return_address(0)); 1580 node, __builtin_return_address(0));
1535} 1581}
1536EXPORT_SYMBOL(vmalloc_node); 1582EXPORT_SYMBOL(vmalloc_node);
@@ -1553,7 +1599,7 @@ EXPORT_SYMBOL(vmalloc_node);
1553 1599
1554void *vmalloc_exec(unsigned long size) 1600void *vmalloc_exec(unsigned long size)
1555{ 1601{
1556 return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, 1602 return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC,
1557 -1, __builtin_return_address(0)); 1603 -1, __builtin_return_address(0));
1558} 1604}
1559 1605
@@ -1574,7 +1620,7 @@ void *vmalloc_exec(unsigned long size)
1574 */ 1620 */
1575void *vmalloc_32(unsigned long size) 1621void *vmalloc_32(unsigned long size)
1576{ 1622{
1577 return __vmalloc_node(size, GFP_VMALLOC32, PAGE_KERNEL, 1623 return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL,
1578 -1, __builtin_return_address(0)); 1624 -1, __builtin_return_address(0));
1579} 1625}
1580EXPORT_SYMBOL(vmalloc_32); 1626EXPORT_SYMBOL(vmalloc_32);
@@ -1591,7 +1637,7 @@ void *vmalloc_32_user(unsigned long size)
1591 struct vm_struct *area; 1637 struct vm_struct *area;
1592 void *ret; 1638 void *ret;
1593 1639
1594 ret = __vmalloc_node(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, 1640 ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
1595 -1, __builtin_return_address(0)); 1641 -1, __builtin_return_address(0));
1596 if (ret) { 1642 if (ret) {
1597 area = find_vm_area(ret); 1643 area = find_vm_area(ret);
@@ -1601,10 +1647,120 @@ void *vmalloc_32_user(unsigned long size)
1601} 1647}
1602EXPORT_SYMBOL(vmalloc_32_user); 1648EXPORT_SYMBOL(vmalloc_32_user);
1603 1649
1650/*
1651 * Small helper routine: copy contents from addr into buf.
1652 * If the page is not present, fill with zeroes.
1653 */
1654
1655static int aligned_vread(char *buf, char *addr, unsigned long count)
1656{
1657 struct page *p;
1658 int copied = 0;
1659
1660 while (count) {
1661 unsigned long offset, length;
1662
1663 offset = (unsigned long)addr & ~PAGE_MASK;
1664 length = PAGE_SIZE - offset;
1665 if (length > count)
1666 length = count;
1667 p = vmalloc_to_page(addr);
1668 /*
1669 * To do safe access to this _mapped_ area, we need a
1670 * lock. But taking a lock here would add overhead to the
1671 * vmalloc()/vfree() paths just for this _debug_ interface,
1672 * which is rarely used. Instead, we'll use kmap() and
1673 * accept a small overhead in this access function.
1674 */
1675 if (p) {
1676 /*
1677 * we can expect USER0 is not used (see vread/vwrite's
1678 * function description)
1679 */
1680 void *map = kmap_atomic(p, KM_USER0);
1681 memcpy(buf, map + offset, length);
1682 kunmap_atomic(map, KM_USER0);
1683 } else
1684 memset(buf, 0, length);
1685
1686 addr += length;
1687 buf += length;
1688 copied += length;
1689 count -= length;
1690 }
1691 return copied;
1692}
1693
1694static int aligned_vwrite(char *buf, char *addr, unsigned long count)
1695{
1696 struct page *p;
1697 int copied = 0;
1698
1699 while (count) {
1700 unsigned long offset, length;
1701
1702 offset = (unsigned long)addr & ~PAGE_MASK;
1703 length = PAGE_SIZE - offset;
1704 if (length > count)
1705 length = count;
1706 p = vmalloc_to_page(addr);
1707 /*
1708 * To do safe access to this _mapped_ area, we need a
1709 * lock. But adding a lock here means adding the overhead of
1710 * vmalloc()/vfree() calls to this rarely used _debug_
1711 * interface. Instead, we'll use kmap() and accept a small
1712 * overhead in this access function.
1713 */
1714 if (p) {
1715 /*
1716 * we can expect USER0 is not used (see vread/vwrite's
1717 * function description)
1718 */
1719 void *map = kmap_atomic(p, KM_USER0);
1720 memcpy(map + offset, buf, length);
1721 kunmap_atomic(map, KM_USER0);
1722 }
1723 addr += length;
1724 buf += length;
1725 copied += length;
1726 count -= length;
1727 }
1728 return copied;
1729}
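
Editorial aside: the per-page chunking that aligned_vread()/aligned_vwrite() share can be seen in isolation. Below is a minimal, hypothetical user-space sketch (PAGE_SIZE, the start address and the byte count are invented, not taken from the patch) that walks an address range with the same offset/length arithmetic; the kernel code kmaps the page and memcpy()s where the printf sits.

/* Hypothetical user-space sketch of the per-page chunking loop used by
 * aligned_vread()/aligned_vwrite(); PAGE_SIZE is assumed to be 4096. */
#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

int main(void)
{
	unsigned long addr = 0x10000ff0UL;	/* arbitrary start address */
	unsigned long count = 10000;		/* bytes to copy */

	while (count) {
		unsigned long offset = addr & ~PAGE_MASK; /* offset within page */
		unsigned long length = PAGE_SIZE - offset;

		if (length > count)
			length = count;
		/* the kernel code would kmap_atomic() the page and memcpy() here */
		printf("copy %5lu bytes at 0x%lx (page offset %lu)\n",
		       length, addr, offset);
		addr += length;
		count -= length;
	}
	return 0;
}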
1730
1731/**
1732 * vread() - read vmalloc area in a safe way.
1733 * @buf: buffer for reading data
1734 * @addr: vm address.
1735 * @count: number of bytes to be read.
1736 *
1737 * Returns the number of bytes by which addr and buf should be
1738 * advanced (same as @count). Returns 0 if [addr...addr+count)
1739 * does not intersect any live vmalloc area.
1740 *
1741 * This function checks that addr is a valid vmalloc'ed area and
1742 * copies data from that area to the given buffer. If the given memory
1743 * range [addr...addr+count) includes some valid address, data is
1744 * copied to the proper area of @buf. Memory holes are zero-filled,
1745 * and an IOREMAP area is treated as a memory hole: no copy is done.
1746 *
1747 * If [addr...addr+count) does not intersect any live vm_struct
1748 * area, 0 is returned.
1749 * @buf should be a kernel buffer. Because this function uses KM_USER0,
1750 * the caller should guarantee KM_USER0 is not used.
1751 *
1752 * Note: In usual operation, vread() is never necessary because the
1753 * caller should know the vmalloc() area is valid and can use memcpy().
1754 * This is for routines which have to access the vmalloc area without
1755 * any prior information, such as /dev/kmem.
1756 *
1757 */
1758
1604long vread(char *buf, char *addr, unsigned long count) 1759long vread(char *buf, char *addr, unsigned long count)
1605{ 1760{
1606 struct vm_struct *tmp; 1761 struct vm_struct *tmp;
1607 char *vaddr, *buf_start = buf; 1762 char *vaddr, *buf_start = buf;
1763 unsigned long buflen = count;
1608 unsigned long n; 1764 unsigned long n;
1609 1765
1610 /* Don't allow overflow */ 1766 /* Don't allow overflow */
@@ -1612,7 +1768,7 @@ long vread(char *buf, char *addr, unsigned long count)
1612 count = -(unsigned long) addr; 1768 count = -(unsigned long) addr;
1613 1769
1614 read_lock(&vmlist_lock); 1770 read_lock(&vmlist_lock);
1615 for (tmp = vmlist; tmp; tmp = tmp->next) { 1771 for (tmp = vmlist; count && tmp; tmp = tmp->next) {
1616 vaddr = (char *) tmp->addr; 1772 vaddr = (char *) tmp->addr;
1617 if (addr >= vaddr + tmp->size - PAGE_SIZE) 1773 if (addr >= vaddr + tmp->size - PAGE_SIZE)
1618 continue; 1774 continue;
@@ -1625,32 +1781,72 @@ long vread(char *buf, char *addr, unsigned long count)
1625 count--; 1781 count--;
1626 } 1782 }
1627 n = vaddr + tmp->size - PAGE_SIZE - addr; 1783 n = vaddr + tmp->size - PAGE_SIZE - addr;
1628 do { 1784 if (n > count)
1629 if (count == 0) 1785 n = count;
1630 goto finished; 1786 if (!(tmp->flags & VM_IOREMAP))
1631 *buf = *addr; 1787 aligned_vread(buf, addr, n);
1632 buf++; 1788 else /* IOREMAP area is treated as memory hole */
1633 addr++; 1789 memset(buf, 0, n);
1634 count--; 1790 buf += n;
1635 } while (--n > 0); 1791 addr += n;
1792 count -= n;
1636 } 1793 }
1637finished: 1794finished:
1638 read_unlock(&vmlist_lock); 1795 read_unlock(&vmlist_lock);
1639 return buf - buf_start; 1796
1797 if (buf == buf_start)
1798 return 0;
1799 /* zero-fill memory holes */
1800 if (buf != buf_start + buflen)
1801 memset(buf, 0, buflen - (buf - buf_start));
1802
1803 return buflen;
1640} 1804}
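
Editorial aside: the return convention above is easy to misread, so here is a simplified, hypothetical user-space model of it (the helper name and the byte counts are invented): once any live area has been touched, vread() zero-fills the remaining holes, including the tail, and reports the full request size; only a range with no intersection at all yields 0.

/* User-space model of vread()'s return/zero-fill convention. 'copied'
 * stands for the bytes that intersected live vmalloc areas. */
#include <stdio.h>
#include <string.h>

static long model_vread(char *buf, unsigned long buflen, unsigned long copied)
{
	if (copied == 0)
		return 0;		/* nothing intersected: caller sees 0 */
	if (copied < buflen)		/* zero-fill the trailing hole */
		memset(buf + copied, 0, buflen - copied);
	return buflen;			/* always the full request size */
}

int main(void)
{
	char buf[16];

	memset(buf, 'x', sizeof(buf));
	printf("ret=%ld tail=%d\n", model_vread(buf, sizeof(buf), 10), buf[15]);
	printf("ret=%ld\n", model_vread(buf, sizeof(buf), 0));
	return 0;
}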
1641 1805
1806/**
1807 * vwrite() - write vmalloc area in a safe way.
1808 * @buf: buffer for source data
1809 * @addr: vm address.
1810 * @count: number of bytes to be written.
1811 *
1812 * Returns the number of bytes by which addr and buf should be
1813 * advanced (same as @count).
1814 * If [addr...addr+count) does not intersect any valid vmalloc
1815 * area, 0 is returned.
1816 *
1817 * This function checks that addr is a valid vmalloc'ed area and
1818 * copies data from a buffer to the given addr. If the specified range
1819 * [addr...addr+count) includes some valid address, data is copied from
1820 * the proper area of @buf. Memory holes are skipped, and an IOREMAP
1821 * area is treated as a memory hole: no copy is done there.
1822 *
1823 * If [addr...addr+count) does not intersect any live vm_struct
1824 * area, 0 is returned.
1825 * @buf should be a kernel buffer. Because this function uses KM_USER0,
1826 * the caller should guarantee KM_USER0 is not used.
1827 *
1828 * Note: In usual operation, vwrite() is never necessary because the
1829 * caller should know the vmalloc() area is valid and can use memcpy().
1830 * This is for routines which have to access the vmalloc area without
1831 * any prior information, such as /dev/kmem.
1832 *
1833 * The caller should guarantee KM_USER1 is not used.
1834 */
1835
1642long vwrite(char *buf, char *addr, unsigned long count) 1836long vwrite(char *buf, char *addr, unsigned long count)
1643{ 1837{
1644 struct vm_struct *tmp; 1838 struct vm_struct *tmp;
1645 char *vaddr, *buf_start = buf; 1839 char *vaddr;
1646 unsigned long n; 1840 unsigned long n, buflen;
1841 int copied = 0;
1647 1842
1648 /* Don't allow overflow */ 1843 /* Don't allow overflow */
1649 if ((unsigned long) addr + count < count) 1844 if ((unsigned long) addr + count < count)
1650 count = -(unsigned long) addr; 1845 count = -(unsigned long) addr;
1846 buflen = count;
1651 1847
1652 read_lock(&vmlist_lock); 1848 read_lock(&vmlist_lock);
1653 for (tmp = vmlist; tmp; tmp = tmp->next) { 1849 for (tmp = vmlist; count && tmp; tmp = tmp->next) {
1654 vaddr = (char *) tmp->addr; 1850 vaddr = (char *) tmp->addr;
1655 if (addr >= vaddr + tmp->size - PAGE_SIZE) 1851 if (addr >= vaddr + tmp->size - PAGE_SIZE)
1656 continue; 1852 continue;
@@ -1662,18 +1858,21 @@ long vwrite(char *buf, char *addr, unsigned long count)
1662 count--; 1858 count--;
1663 } 1859 }
1664 n = vaddr + tmp->size - PAGE_SIZE - addr; 1860 n = vaddr + tmp->size - PAGE_SIZE - addr;
1665 do { 1861 if (n > count)
1666 if (count == 0) 1862 n = count;
1667 goto finished; 1863 if (!(tmp->flags & VM_IOREMAP)) {
1668 *addr = *buf; 1864 aligned_vwrite(buf, addr, n);
1669 buf++; 1865 copied++;
1670 addr++; 1866 }
1671 count--; 1867 buf += n;
1672 } while (--n > 0); 1868 addr += n;
1869 count -= n;
1673 } 1870 }
1674finished: 1871finished:
1675 read_unlock(&vmlist_lock); 1872 read_unlock(&vmlist_lock);
1676 return buf - buf_start; 1873 if (!copied)
1874 return 0;
1875 return buflen;
1677} 1876}
1678 1877
1679/** 1878/**
@@ -1794,6 +1993,286 @@ void free_vm_area(struct vm_struct *area)
1794} 1993}
1795EXPORT_SYMBOL_GPL(free_vm_area); 1994EXPORT_SYMBOL_GPL(free_vm_area);
1796 1995
1996static struct vmap_area *node_to_va(struct rb_node *n)
1997{
1998 return n ? rb_entry(n, struct vmap_area, rb_node) : NULL;
1999}
2000
2001/**
2002 * pvm_find_next_prev - find the next and prev vmap_area surrounding @end
2003 * @end: target address
2004 * @pnext: out arg for the next vmap_area
2005 * @pprev: out arg for the previous vmap_area
2006 *
2007 * Returns: %true if either or both of next and prev are found,
2008 * %false if no vmap_area exists
2009 *
2010 * Find the vmap_areas whose end addresses enclose @end, i.e. if not
2011 * NULL, *pnext->va_end > @end and *pprev->va_end <= @end.
2012 */
2013static bool pvm_find_next_prev(unsigned long end,
2014 struct vmap_area **pnext,
2015 struct vmap_area **pprev)
2016{
2017 struct rb_node *n = vmap_area_root.rb_node;
2018 struct vmap_area *va = NULL;
2019
2020 while (n) {
2021 va = rb_entry(n, struct vmap_area, rb_node);
2022 if (end < va->va_end)
2023 n = n->rb_left;
2024 else if (end > va->va_end)
2025 n = n->rb_right;
2026 else
2027 break;
2028 }
2029
2030 if (!va)
2031 return false;
2032
2033 if (va->va_end > end) {
2034 *pnext = va;
2035 *pprev = node_to_va(rb_prev(&(*pnext)->rb_node));
2036 } else {
2037 *pprev = va;
2038 *pnext = node_to_va(rb_next(&(*pprev)->rb_node));
2039 }
2040 return true;
2041}
2042
2043/**
2044 * pvm_determine_end - find the highest aligned address between two vmap_areas
2045 * @pnext: in/out arg for the next vmap_area
2046 * @pprev: in/out arg for the previous vmap_area
2047 * @align: alignment
2048 *
2049 * Returns: determined end address
2050 *
2051 * Find the highest aligned address between *@pnext and *@pprev below
2052 * VMALLOC_END. *@pnext and *@pprev are adjusted so that the aligned
2053 * down address is between the end addresses of the two vmap_areas.
2054 *
2055 * Please note that the address returned by this function may fall
2056 * inside *@pnext vmap_area. The caller is responsible for checking
2057 * that.
2058 */
2059static unsigned long pvm_determine_end(struct vmap_area **pnext,
2060 struct vmap_area **pprev,
2061 unsigned long align)
2062{
2063 const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
2064 unsigned long addr;
2065
2066 if (*pnext)
2067 addr = min((*pnext)->va_start & ~(align - 1), vmalloc_end);
2068 else
2069 addr = vmalloc_end;
2070
2071 while (*pprev && (*pprev)->va_end > addr) {
2072 *pnext = *pprev;
2073 *pprev = node_to_va(rb_prev(&(*pnext)->rb_node));
2074 }
2075
2076 return addr;
2077}
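
Editorial aside: the core of pvm_determine_end() is a single align-down step: take the next area's start (or VMALLOC_END) and round it down to the requested alignment. A minimal sketch with made-up addresses follows.

/* Hypothetical demo of the align-down step in pvm_determine_end().
 * All values are invented; vmalloc_end is assumed to be pre-aligned. */
#include <stdio.h>

static unsigned long determine_end(unsigned long next_start,
				   unsigned long vmalloc_end,
				   unsigned long align)
{
	unsigned long addr = next_start & ~(align - 1);

	return addr < vmalloc_end ? addr : vmalloc_end;
}

int main(void)
{
	/* next area starts at 0x12345678, 64KiB alignment requested */
	printf("end = 0x%lx\n",
	       determine_end(0x12345678UL, 0xffff0000UL, 0x10000UL));
	return 0;
}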
2078
2079/**
2080 * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator
2081 * @offsets: array containing offset of each area
2082 * @sizes: array containing size of each area
2083 * @nr_vms: the number of areas to allocate
2084 * @align: alignment, all entries in @offsets and @sizes must be aligned to this
2085 * @gfp_mask: allocation mask
2086 *
2087 * Returns: kmalloc'd vm_struct pointer array pointing to allocated
2088 * vm_structs on success, %NULL on failure
2089 *
2090 * The percpu allocator wants to use congruent vm areas so that it can
2091 * maintain the offsets among percpu areas. This function allocates
2092 * congruent vmalloc areas for it. These areas tend to be scattered
2093 * pretty far apart, with the distance between two areas easily going
2094 * up to gigabytes. To avoid interacting with regular vmallocs, these
2095 * areas are allocated from the top.
2096 *
2097 * Despite its complicated look, this allocator is rather simple. It
2098 * does everything top-down and scans areas from the end looking for a
2099 * matching slot. While scanning, if any of the areas overlaps with an
2100 * existing vmap_area, the base address is pulled down to fit the
2101 * area. Scanning is repeated until all the areas fit, and then all
2102 * necessary data structures are inserted and the result is returned.
2103 */
2104struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
2105 const size_t *sizes, int nr_vms,
2106 size_t align, gfp_t gfp_mask)
2107{
2108 const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
2109 const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
2110 struct vmap_area **vas, *prev, *next;
2111 struct vm_struct **vms;
2112 int area, area2, last_area, term_area;
2113 unsigned long base, start, end, last_end;
2114 bool purged = false;
2115
2116 gfp_mask &= GFP_RECLAIM_MASK;
2117
2118 /* verify parameters and allocate data structures */
2119 BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align));
2120 for (last_area = 0, area = 0; area < nr_vms; area++) {
2121 start = offsets[area];
2122 end = start + sizes[area];
2123
2124 /* is everything aligned properly? */
2125 BUG_ON(!IS_ALIGNED(offsets[area], align));
2126 BUG_ON(!IS_ALIGNED(sizes[area], align));
2127
2128 /* detect the area with the highest address */
2129 if (start > offsets[last_area])
2130 last_area = area;
2131
2132 for (area2 = 0; area2 < nr_vms; area2++) {
2133 unsigned long start2 = offsets[area2];
2134 unsigned long end2 = start2 + sizes[area2];
2135
2136 if (area2 == area)
2137 continue;
2138
2139 BUG_ON(start2 >= start && start2 < end);
2140 BUG_ON(end2 <= end && end2 > start);
2141 }
2142 }
2143 last_end = offsets[last_area] + sizes[last_area];
2144
2145 if (vmalloc_end - vmalloc_start < last_end) {
2146 WARN_ON(true);
2147 return NULL;
2148 }
2149
2150 vms = kzalloc(sizeof(vms[0]) * nr_vms, gfp_mask);
2151 vas = kzalloc(sizeof(vas[0]) * nr_vms, gfp_mask);
2152 if (!vas || !vms)
2153 goto err_free;
2154
2155 for (area = 0; area < nr_vms; area++) {
2156 vas[area] = kzalloc(sizeof(struct vmap_area), gfp_mask);
2157 vms[area] = kzalloc(sizeof(struct vm_struct), gfp_mask);
2158 if (!vas[area] || !vms[area])
2159 goto err_free;
2160 }
2161retry:
2162 spin_lock(&vmap_area_lock);
2163
2164 /* start scanning - we scan from the top, begin with the last area */
2165 area = term_area = last_area;
2166 start = offsets[area];
2167 end = start + sizes[area];
2168
2169 if (!pvm_find_next_prev(vmap_area_pcpu_hole, &next, &prev)) {
2170 base = vmalloc_end - last_end;
2171 goto found;
2172 }
2173 base = pvm_determine_end(&next, &prev, align) - end;
2174
2175 while (true) {
2176 BUG_ON(next && next->va_end <= base + end);
2177 BUG_ON(prev && prev->va_end > base + end);
2178
2179 /*
2180 * base might have underflowed, add last_end before
2181 * comparing.
2182 */
2183 if (base + last_end < vmalloc_start + last_end) {
2184 spin_unlock(&vmap_area_lock);
2185 if (!purged) {
2186 purge_vmap_area_lazy();
2187 purged = true;
2188 goto retry;
2189 }
2190 goto err_free;
2191 }
2192
2193 /*
2194 * If next overlaps, move base downwards so that it's
2195 * right below next and then recheck.
2196 */
2197 if (next && next->va_start < base + end) {
2198 base = pvm_determine_end(&next, &prev, align) - end;
2199 term_area = area;
2200 continue;
2201 }
2202
2203 /*
2204 * If prev overlaps, shift down next and prev and move
2205 * base so that it's right below new next and then
2206 * recheck.
2207 */
2208 if (prev && prev->va_end > base + start) {
2209 next = prev;
2210 prev = node_to_va(rb_prev(&next->rb_node));
2211 base = pvm_determine_end(&next, &prev, align) - end;
2212 term_area = area;
2213 continue;
2214 }
2215
2216 /*
2217 * This area fits, move on to the previous one. If
2218 * the previous one is the terminal one, we're done.
2219 */
2220 area = (area + nr_vms - 1) % nr_vms;
2221 if (area == term_area)
2222 break;
2223 start = offsets[area];
2224 end = start + sizes[area];
2225 pvm_find_next_prev(base + end, &next, &prev);
2226 }
2227found:
2228 /* we've found a fitting base, insert all va's */
2229 for (area = 0; area < nr_vms; area++) {
2230 struct vmap_area *va = vas[area];
2231
2232 va->va_start = base + offsets[area];
2233 va->va_end = va->va_start + sizes[area];
2234 __insert_vmap_area(va);
2235 }
2236
2237 vmap_area_pcpu_hole = base + offsets[last_area];
2238
2239 spin_unlock(&vmap_area_lock);
2240
2241 /* insert all vm's */
2242 for (area = 0; area < nr_vms; area++)
2243 insert_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
2244 pcpu_get_vm_areas);
2245
2246 kfree(vas);
2247 return vms;
2248
2249err_free:
2250 for (area = 0; area < nr_vms; area++) {
2251 if (vas)
2252 kfree(vas[area]);
2253 if (vms)
2254 kfree(vms[area]);
2255 }
2256 kfree(vas);
2257 kfree(vms);
2258 return NULL;
2259}
2260
2261/**
2262 * pcpu_free_vm_areas - free vmalloc areas for percpu allocator
2263 * @vms: vm_struct pointer array returned by pcpu_get_vm_areas()
2264 * @nr_vms: the number of allocated areas
2265 *
2266 * Free vm_structs and the array allocated by pcpu_get_vm_areas().
2267 */
2268void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
2269{
2270 int i;
2271
2272 for (i = 0; i < nr_vms; i++)
2273 free_vm_area(vms[i]);
2274 kfree(vms);
2275}
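
Editorial aside: the reason the percpu allocator insists on congruent areas is that every unit then sits at the same offset from a per-chunk base, so a per-cpu pointer can be formed by adding one constant delta. A hypothetical illustration (the base and offsets below are made up, not taken from the patch):

/* Illustration of congruent areas: each unit lives at base + offsets[i],
 * so the distance between any two units is the same for every chunk. */
#include <stdio.h>

int main(void)
{
	unsigned long base = 0xf0000000UL;	/* picked by the top-down scan */
	unsigned long offsets[] = { 0x000000UL, 0x200000UL, 0x400000UL };
	int nr_vms = 3, i;

	for (i = 0; i < nr_vms; i++)
		printf("unit %d at 0x%lx (base + 0x%lx)\n",
		       i, base + offsets[i], offsets[i]);
	return 0;
}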
1797 2276
1798#ifdef CONFIG_PROC_FS 2277#ifdef CONFIG_PROC_FS
1799static void *s_start(struct seq_file *m, loff_t *pos) 2278static void *s_start(struct seq_file *m, loff_t *pos)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 06e72693b458..777af57fd8c8 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -63,6 +63,9 @@ struct scan_control {
63 /* Can mapped pages be reclaimed? */ 63 /* Can mapped pages be reclaimed? */
64 int may_unmap; 64 int may_unmap;
65 65
66 /* Can pages be swapped as part of reclaim? */
67 int may_swap;
68
66 /* This context's SWAP_CLUSTER_MAX. If freeing memory for 69 /* This context's SWAP_CLUSTER_MAX. If freeing memory for
67 * suspend, we effectively ignore SWAP_CLUSTER_MAX. 70 * suspend, we effectively ignore SWAP_CLUSTER_MAX.
68 * In this context, it doesn't matter that we scan the 71 * In this context, it doesn't matter that we scan the
@@ -145,8 +148,8 @@ static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone,
145 return &zone->reclaim_stat; 148 return &zone->reclaim_stat;
146} 149}
147 150
148static unsigned long zone_nr_pages(struct zone *zone, struct scan_control *sc, 151static unsigned long zone_nr_lru_pages(struct zone *zone,
149 enum lru_list lru) 152 struct scan_control *sc, enum lru_list lru)
150{ 153{
151 if (!scanning_global_lru(sc)) 154 if (!scanning_global_lru(sc))
152 return mem_cgroup_zone_nr_pages(sc->mem_cgroup, zone, lru); 155 return mem_cgroup_zone_nr_pages(sc->mem_cgroup, zone, lru);
@@ -283,7 +286,12 @@ static inline int page_mapping_inuse(struct page *page)
283 286
284static inline int is_page_cache_freeable(struct page *page) 287static inline int is_page_cache_freeable(struct page *page)
285{ 288{
286 return page_count(page) - !!PagePrivate(page) == 2; 289 /*
290 * A freeable page cache page is referenced only by the caller
291 * that isolated the page, the page cache radix tree and
292 * optional buffer heads at page->private.
293 */
294 return page_count(page) - page_has_private(page) == 2;
287} 295}
288 296
289static int may_write_to_queue(struct backing_dev_info *bdi) 297static int may_write_to_queue(struct backing_dev_info *bdi)
@@ -358,7 +366,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
358 * block, for some throttling. This happens by accident, because 366 * block, for some throttling. This happens by accident, because
359 * swap_backing_dev_info is bust: it doesn't reflect the 367 * swap_backing_dev_info is bust: it doesn't reflect the
360 * congestion state of the swapdevs. Easy to fix, if needed. 368 * congestion state of the swapdevs. Easy to fix, if needed.
361 * See swapfile.c:page_queue_congested().
362 */ 369 */
363 if (!is_page_cache_freeable(page)) 370 if (!is_page_cache_freeable(page))
364 return PAGE_KEEP; 371 return PAGE_KEEP;
@@ -367,7 +374,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
367 * Some data journaling orphaned pages can have 374 * Some data journaling orphaned pages can have
368 * page->mapping == NULL while being dirty with clean buffers. 375 * page->mapping == NULL while being dirty with clean buffers.
369 */ 376 */
370 if (PagePrivate(page)) { 377 if (page_has_private(page)) {
371 if (try_to_free_buffers(page)) { 378 if (try_to_free_buffers(page)) {
372 ClearPageDirty(page); 379 ClearPageDirty(page);
373 printk("%s: orphaned page\n", __func__); 380 printk("%s: orphaned page\n", __func__);
@@ -467,10 +474,11 @@ static int __remove_mapping(struct address_space *mapping, struct page *page)
467 swp_entry_t swap = { .val = page_private(page) }; 474 swp_entry_t swap = { .val = page_private(page) };
468 __delete_from_swap_cache(page); 475 __delete_from_swap_cache(page);
469 spin_unlock_irq(&mapping->tree_lock); 476 spin_unlock_irq(&mapping->tree_lock);
470 swap_free(swap); 477 swapcache_free(swap, page);
471 } else { 478 } else {
472 __remove_from_page_cache(page); 479 __remove_from_page_cache(page);
473 spin_unlock_irq(&mapping->tree_lock); 480 spin_unlock_irq(&mapping->tree_lock);
481 mem_cgroup_uncharge_cache_page(page);
474 } 482 }
475 483
476 return 1; 484 return 1;
@@ -509,7 +517,6 @@ int remove_mapping(struct address_space *mapping, struct page *page)
509 * 517 *
510 * lru_lock must not be held, interrupts must be enabled. 518 * lru_lock must not be held, interrupts must be enabled.
511 */ 519 */
512#ifdef CONFIG_UNEVICTABLE_LRU
513void putback_lru_page(struct page *page) 520void putback_lru_page(struct page *page)
514{ 521{
515 int lru; 522 int lru;
@@ -528,7 +535,7 @@ redo:
528 * unevictable page on [in]active list. 535 * unevictable page on [in]active list.
529 * We know how to handle that. 536 * We know how to handle that.
530 */ 537 */
531 lru = active + page_is_file_cache(page); 538 lru = active + page_lru_base_type(page);
532 lru_cache_add_lru(page, lru); 539 lru_cache_add_lru(page, lru);
533 } else { 540 } else {
534 /* 541 /*
@@ -537,6 +544,16 @@ redo:
537 */ 544 */
538 lru = LRU_UNEVICTABLE; 545 lru = LRU_UNEVICTABLE;
539 add_page_to_unevictable_list(page); 546 add_page_to_unevictable_list(page);
547 /*
548 * When racing with an mlock clearing (page is
549 * unlocked), make sure that if the other thread does
550 * not observe our setting of PG_lru and fails
551 * isolation, we see PG_mlocked cleared below and move
552 * the page back to the evictable list.
553 *
554 * The other side is TestClearPageMlocked().
555 */
556 smp_mb();
540 } 557 }
541 558
542 /* 559 /*
@@ -563,20 +580,6 @@ redo:
563 put_page(page); /* drop ref from isolate */ 580 put_page(page); /* drop ref from isolate */
564} 581}
565 582
566#else /* CONFIG_UNEVICTABLE_LRU */
567
568void putback_lru_page(struct page *page)
569{
570 int lru;
571 VM_BUG_ON(PageLRU(page));
572
573 lru = !!TestClearPageActive(page) + page_is_file_cache(page);
574 lru_cache_add_lru(page, lru);
575 put_page(page);
576}
577#endif /* CONFIG_UNEVICTABLE_LRU */
578
579
580/* 583/*
581 * shrink_page_list() returns the number of reclaimed pages 584 * shrink_page_list() returns the number of reclaimed pages
582 */ 585 */
@@ -588,6 +591,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
588 struct pagevec freed_pvec; 591 struct pagevec freed_pvec;
589 int pgactivate = 0; 592 int pgactivate = 0;
590 unsigned long nr_reclaimed = 0; 593 unsigned long nr_reclaimed = 0;
594 unsigned long vm_flags;
591 595
592 cond_resched(); 596 cond_resched();
593 597
@@ -638,10 +642,16 @@ static unsigned long shrink_page_list(struct list_head *page_list,
638 goto keep_locked; 642 goto keep_locked;
639 } 643 }
640 644
641 referenced = page_referenced(page, 1, sc->mem_cgroup); 645 referenced = page_referenced(page, 1,
642 /* In active use or really unfreeable? Activate it. */ 646 sc->mem_cgroup, &vm_flags);
647 /*
648 * In active use or really unfreeable? Activate it.
649 * If page which have PG_mlocked lost isoltation race,
650 * try_to_unmap moves it to unevictable list
651 */
643 if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && 652 if (sc->order <= PAGE_ALLOC_COSTLY_ORDER &&
644 referenced && page_mapping_inuse(page)) 653 referenced && page_mapping_inuse(page)
654 && !(vm_flags & VM_LOCKED))
645 goto activate_locked; 655 goto activate_locked;
646 656
647 /* 657 /*
@@ -663,7 +673,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
663 * processes. Try to unmap it here. 673 * processes. Try to unmap it here.
664 */ 674 */
665 if (page_mapped(page) && mapping) { 675 if (page_mapped(page) && mapping) {
666 switch (try_to_unmap(page, 0)) { 676 switch (try_to_unmap(page, TTU_UNMAP)) {
667 case SWAP_FAIL: 677 case SWAP_FAIL:
668 goto activate_locked; 678 goto activate_locked;
669 case SWAP_AGAIN: 679 case SWAP_AGAIN:
@@ -727,7 +737,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
727 * process address space (page_count == 1) it can be freed. 737 * process address space (page_count == 1) it can be freed.
728 * Otherwise, leave the page on the LRU so it is swappable. 738 * Otherwise, leave the page on the LRU so it is swappable.
729 */ 739 */
730 if (PagePrivate(page)) { 740 if (page_has_private(page)) {
731 if (!try_to_release_page(page, sc->gfp_mask)) 741 if (!try_to_release_page(page, sc->gfp_mask))
732 goto activate_locked; 742 goto activate_locked;
733 if (!mapping && page_count(page) == 1) { 743 if (!mapping && page_count(page) == 1) {
@@ -825,7 +835,7 @@ int __isolate_lru_page(struct page *page, int mode, int file)
825 if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode)) 835 if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode))
826 return ret; 836 return ret;
827 837
828 if (mode != ISOLATE_BOTH && (!page_is_file_cache(page) != !file)) 838 if (mode != ISOLATE_BOTH && page_is_file_cache(page) != file)
829 return ret; 839 return ret;
830 840
831 /* 841 /*
@@ -846,7 +856,6 @@ int __isolate_lru_page(struct page *page, int mode, int file)
846 */ 856 */
847 ClearPageLRU(page); 857 ClearPageLRU(page);
848 ret = 0; 858 ret = 0;
849 mem_cgroup_del_lru(page);
850 } 859 }
851 860
852 return ret; 861 return ret;
@@ -894,12 +903,14 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
894 switch (__isolate_lru_page(page, mode, file)) { 903 switch (__isolate_lru_page(page, mode, file)) {
895 case 0: 904 case 0:
896 list_move(&page->lru, dst); 905 list_move(&page->lru, dst);
906 mem_cgroup_del_lru(page);
897 nr_taken++; 907 nr_taken++;
898 break; 908 break;
899 909
900 case -EBUSY: 910 case -EBUSY:
901 /* else it is being freed elsewhere */ 911 /* else it is being freed elsewhere */
902 list_move(&page->lru, src); 912 list_move(&page->lru, src);
913 mem_cgroup_rotate_lru_list(page, page_lru(page));
903 continue; 914 continue;
904 915
905 default: 916 default:
@@ -938,18 +949,21 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
938 /* Check that we have not crossed a zone boundary. */ 949 /* Check that we have not crossed a zone boundary. */
939 if (unlikely(page_zone_id(cursor_page) != zone_id)) 950 if (unlikely(page_zone_id(cursor_page) != zone_id))
940 continue; 951 continue;
941 switch (__isolate_lru_page(cursor_page, mode, file)) { 952
942 case 0: 953 /*
954 * If we don't have enough swap space, reclaiming an
955 * anon page which doesn't already have a swap slot is
956 * pointless.
957 */
958 if (nr_swap_pages <= 0 && PageAnon(cursor_page) &&
959 !PageSwapCache(cursor_page))
960 continue;
961
962 if (__isolate_lru_page(cursor_page, mode, file) == 0) {
943 list_move(&cursor_page->lru, dst); 963 list_move(&cursor_page->lru, dst);
964 mem_cgroup_del_lru(cursor_page);
944 nr_taken++; 965 nr_taken++;
945 scan++; 966 scan++;
946 break;
947
948 case -EBUSY:
949 /* else it is being freed elsewhere */
950 list_move(&cursor_page->lru, src);
951 default:
952 break; /* ! on LRU or wrong list */
953 } 967 }
954 } 968 }
955 } 969 }
@@ -971,7 +985,7 @@ static unsigned long isolate_pages_global(unsigned long nr,
971 if (file) 985 if (file)
972 lru += LRU_FILE; 986 lru += LRU_FILE;
973 return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order, 987 return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
974 mode, !!file); 988 mode, file);
975} 989}
976 990
977/* 991/*
@@ -986,7 +1000,7 @@ static unsigned long clear_active_flags(struct list_head *page_list,
986 struct page *page; 1000 struct page *page;
987 1001
988 list_for_each_entry(page, page_list, lru) { 1002 list_for_each_entry(page, page_list, lru) {
989 lru = page_is_file_cache(page); 1003 lru = page_lru_base_type(page);
990 if (PageActive(page)) { 1004 if (PageActive(page)) {
991 lru += LRU_ACTIVE; 1005 lru += LRU_ACTIVE;
992 ClearPageActive(page); 1006 ClearPageActive(page);
@@ -1044,6 +1058,31 @@ int isolate_lru_page(struct page *page)
1044} 1058}
1045 1059
1046/* 1060/*
1061 * Are there way too many processes in the direct reclaim path already?
1062 */
1063static int too_many_isolated(struct zone *zone, int file,
1064 struct scan_control *sc)
1065{
1066 unsigned long inactive, isolated;
1067
1068 if (current_is_kswapd())
1069 return 0;
1070
1071 if (!scanning_global_lru(sc))
1072 return 0;
1073
1074 if (file) {
1075 inactive = zone_page_state(zone, NR_INACTIVE_FILE);
1076 isolated = zone_page_state(zone, NR_ISOLATED_FILE);
1077 } else {
1078 inactive = zone_page_state(zone, NR_INACTIVE_ANON);
1079 isolated = zone_page_state(zone, NR_ISOLATED_ANON);
1080 }
1081
1082 return isolated > inactive;
1083}
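
Editorial aside: the throttle above boils down to one comparison per LRU type: a direct reclaimer backs off while more pages sit isolated than remain on the inactive list, and kswapd is never throttled. A minimal model with invented numbers:

/* Minimal user-space model of the too_many_isolated() test. */
#include <stdio.h>
#include <stdbool.h>

static bool too_many_isolated(unsigned long inactive, unsigned long isolated,
			      bool is_kswapd)
{
	if (is_kswapd)			/* kswapd is never throttled here */
		return false;
	return isolated > inactive;
}

int main(void)
{
	printf("%d\n", too_many_isolated(1000, 200, false));	/* 0: proceed */
	printf("%d\n", too_many_isolated(100, 300, false));	/* 1: wait */
	printf("%d\n", too_many_isolated(100, 300, true));	/* 0: kswapd */
	return 0;
}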
1084
1085/*
1047 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number 1086 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
1048 * of reclaimed pages 1087 * of reclaimed pages
1049 */ 1088 */
@@ -1056,6 +1095,27 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1056 unsigned long nr_scanned = 0; 1095 unsigned long nr_scanned = 0;
1057 unsigned long nr_reclaimed = 0; 1096 unsigned long nr_reclaimed = 0;
1058 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1097 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1098 int lumpy_reclaim = 0;
1099
1100 while (unlikely(too_many_isolated(zone, file, sc))) {
1101 congestion_wait(BLK_RW_ASYNC, HZ/10);
1102
1103 /* We are about to die and free our memory. Return now. */
1104 if (fatal_signal_pending(current))
1105 return SWAP_CLUSTER_MAX;
1106 }
1107
1108 /*
1109 * If we need a large contiguous chunk of memory, or have
1110 * trouble getting a small set of contiguous pages, we
1111 * will reclaim both active and inactive pages.
1112 *
1113 * We use the same threshold as pageout congestion_wait below.
1114 */
1115 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
1116 lumpy_reclaim = 1;
1117 else if (sc->order && priority < DEF_PRIORITY - 2)
1118 lumpy_reclaim = 1;
1059 1119
1060 pagevec_init(&pvec, 1); 1120 pagevec_init(&pvec, 1);
1061 1121
@@ -1068,23 +1128,27 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1068 unsigned long nr_freed; 1128 unsigned long nr_freed;
1069 unsigned long nr_active; 1129 unsigned long nr_active;
1070 unsigned int count[NR_LRU_LISTS] = { 0, }; 1130 unsigned int count[NR_LRU_LISTS] = { 0, };
1071 int mode = ISOLATE_INACTIVE; 1131 int mode = lumpy_reclaim ? ISOLATE_BOTH : ISOLATE_INACTIVE;
1072 1132 unsigned long nr_anon;
1073 /* 1133 unsigned long nr_file;
1074 * If we need a large contiguous chunk of memory, or have
1075 * trouble getting a small set of contiguous pages, we
1076 * will reclaim both active and inactive pages.
1077 *
1078 * We use the same threshold as pageout congestion_wait below.
1079 */
1080 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
1081 mode = ISOLATE_BOTH;
1082 else if (sc->order && priority < DEF_PRIORITY - 2)
1083 mode = ISOLATE_BOTH;
1084 1134
1085 nr_taken = sc->isolate_pages(sc->swap_cluster_max, 1135 nr_taken = sc->isolate_pages(sc->swap_cluster_max,
1086 &page_list, &nr_scan, sc->order, mode, 1136 &page_list, &nr_scan, sc->order, mode,
1087 zone, sc->mem_cgroup, 0, file); 1137 zone, sc->mem_cgroup, 0, file);
1138
1139 if (scanning_global_lru(sc)) {
1140 zone->pages_scanned += nr_scan;
1141 if (current_is_kswapd())
1142 __count_zone_vm_events(PGSCAN_KSWAPD, zone,
1143 nr_scan);
1144 else
1145 __count_zone_vm_events(PGSCAN_DIRECT, zone,
1146 nr_scan);
1147 }
1148
1149 if (nr_taken == 0)
1150 goto done;
1151
1088 nr_active = clear_active_flags(&page_list, count); 1152 nr_active = clear_active_flags(&page_list, count);
1089 __count_vm_events(PGDEACTIVATE, nr_active); 1153 __count_vm_events(PGDEACTIVATE, nr_active);
1090 1154
@@ -1097,8 +1161,10 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1097 __mod_zone_page_state(zone, NR_INACTIVE_ANON, 1161 __mod_zone_page_state(zone, NR_INACTIVE_ANON,
1098 -count[LRU_INACTIVE_ANON]); 1162 -count[LRU_INACTIVE_ANON]);
1099 1163
1100 if (scanning_global_lru(sc)) 1164 nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
1101 zone->pages_scanned += nr_scan; 1165 nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
1166 __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon);
1167 __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file);
1102 1168
1103 reclaim_stat->recent_scanned[0] += count[LRU_INACTIVE_ANON]; 1169 reclaim_stat->recent_scanned[0] += count[LRU_INACTIVE_ANON];
1104 reclaim_stat->recent_scanned[0] += count[LRU_ACTIVE_ANON]; 1170 reclaim_stat->recent_scanned[0] += count[LRU_ACTIVE_ANON];
@@ -1117,8 +1183,8 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1117 * but that should be acceptable to the caller 1183 * but that should be acceptable to the caller
1118 */ 1184 */
1119 if (nr_freed < nr_taken && !current_is_kswapd() && 1185 if (nr_freed < nr_taken && !current_is_kswapd() &&
1120 sc->order > PAGE_ALLOC_COSTLY_ORDER) { 1186 lumpy_reclaim) {
1121 congestion_wait(WRITE, HZ/10); 1187 congestion_wait(BLK_RW_ASYNC, HZ/10);
1122 1188
1123 /* 1189 /*
1124 * The attempt at page out may have made some 1190 * The attempt at page out may have made some
@@ -1132,18 +1198,12 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1132 } 1198 }
1133 1199
1134 nr_reclaimed += nr_freed; 1200 nr_reclaimed += nr_freed;
1201
1135 local_irq_disable(); 1202 local_irq_disable();
1136 if (current_is_kswapd()) { 1203 if (current_is_kswapd())
1137 __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan);
1138 __count_vm_events(KSWAPD_STEAL, nr_freed); 1204 __count_vm_events(KSWAPD_STEAL, nr_freed);
1139 } else if (scanning_global_lru(sc))
1140 __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
1141
1142 __count_zone_vm_events(PGSTEAL, zone, nr_freed); 1205 __count_zone_vm_events(PGSTEAL, zone, nr_freed);
1143 1206
1144 if (nr_taken == 0)
1145 goto done;
1146
1147 spin_lock(&zone->lru_lock); 1207 spin_lock(&zone->lru_lock);
1148 /* 1208 /*
1149 * Put back any unfreeable pages. 1209 * Put back any unfreeable pages.
@@ -1162,8 +1222,8 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1162 SetPageLRU(page); 1222 SetPageLRU(page);
1163 lru = page_lru(page); 1223 lru = page_lru(page);
1164 add_page_to_lru_list(zone, page, lru); 1224 add_page_to_lru_list(zone, page, lru);
1165 if (PageActive(page)) { 1225 if (is_active_lru(lru)) {
1166 int file = !!page_is_file_cache(page); 1226 int file = is_file_lru(lru);
1167 reclaim_stat->recent_rotated[file]++; 1227 reclaim_stat->recent_rotated[file]++;
1168 } 1228 }
1169 if (!pagevec_add(&pvec, page)) { 1229 if (!pagevec_add(&pvec, page)) {
@@ -1172,10 +1232,13 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1172 spin_lock_irq(&zone->lru_lock); 1232 spin_lock_irq(&zone->lru_lock);
1173 } 1233 }
1174 } 1234 }
1235 __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
1236 __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
1237
1175 } while (nr_scanned < max_scan); 1238 } while (nr_scanned < max_scan);
1176 spin_unlock(&zone->lru_lock); 1239
1177done: 1240done:
1178 local_irq_enable(); 1241 spin_unlock_irq(&zone->lru_lock);
1179 pagevec_release(&pvec); 1242 pagevec_release(&pvec);
1180 return nr_reclaimed; 1243 return nr_reclaimed;
1181} 1244}
@@ -1212,23 +1275,55 @@ static inline void note_zone_scanning_priority(struct zone *zone, int priority)
1212 * But we had to alter page->flags anyway. 1275 * But we had to alter page->flags anyway.
1213 */ 1276 */
1214 1277
1278static void move_active_pages_to_lru(struct zone *zone,
1279 struct list_head *list,
1280 enum lru_list lru)
1281{
1282 unsigned long pgmoved = 0;
1283 struct pagevec pvec;
1284 struct page *page;
1285
1286 pagevec_init(&pvec, 1);
1287
1288 while (!list_empty(list)) {
1289 page = lru_to_page(list);
1290
1291 VM_BUG_ON(PageLRU(page));
1292 SetPageLRU(page);
1293
1294 list_move(&page->lru, &zone->lru[lru].list);
1295 mem_cgroup_add_lru_list(page, lru);
1296 pgmoved++;
1297
1298 if (!pagevec_add(&pvec, page) || list_empty(list)) {
1299 spin_unlock_irq(&zone->lru_lock);
1300 if (buffer_heads_over_limit)
1301 pagevec_strip(&pvec);
1302 __pagevec_release(&pvec);
1303 spin_lock_irq(&zone->lru_lock);
1304 }
1305 }
1306 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
1307 if (!is_active_lru(lru))
1308 __count_vm_events(PGDEACTIVATE, pgmoved);
1309}
1215 1310
1216static void shrink_active_list(unsigned long nr_pages, struct zone *zone, 1311static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1217 struct scan_control *sc, int priority, int file) 1312 struct scan_control *sc, int priority, int file)
1218{ 1313{
1219 unsigned long pgmoved; 1314 unsigned long nr_taken;
1220 int pgdeactivate = 0;
1221 unsigned long pgscanned; 1315 unsigned long pgscanned;
1316 unsigned long vm_flags;
1222 LIST_HEAD(l_hold); /* The pages which were snipped off */ 1317 LIST_HEAD(l_hold); /* The pages which were snipped off */
1318 LIST_HEAD(l_active);
1223 LIST_HEAD(l_inactive); 1319 LIST_HEAD(l_inactive);
1224 struct page *page; 1320 struct page *page;
1225 struct pagevec pvec;
1226 enum lru_list lru;
1227 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1321 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1322 unsigned long nr_rotated = 0;
1228 1323
1229 lru_add_drain(); 1324 lru_add_drain();
1230 spin_lock_irq(&zone->lru_lock); 1325 spin_lock_irq(&zone->lru_lock);
1231 pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order, 1326 nr_taken = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order,
1232 ISOLATE_ACTIVE, zone, 1327 ISOLATE_ACTIVE, zone,
1233 sc->mem_cgroup, 1, file); 1328 sc->mem_cgroup, 1, file);
1234 /* 1329 /*
@@ -1238,15 +1333,16 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1238 if (scanning_global_lru(sc)) { 1333 if (scanning_global_lru(sc)) {
1239 zone->pages_scanned += pgscanned; 1334 zone->pages_scanned += pgscanned;
1240 } 1335 }
1241 reclaim_stat->recent_scanned[!!file] += pgmoved; 1336 reclaim_stat->recent_scanned[file] += nr_taken;
1242 1337
1338 __count_zone_vm_events(PGREFILL, zone, pgscanned);
1243 if (file) 1339 if (file)
1244 __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved); 1340 __mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken);
1245 else 1341 else
1246 __mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved); 1342 __mod_zone_page_state(zone, NR_ACTIVE_ANON, -nr_taken);
1343 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
1247 spin_unlock_irq(&zone->lru_lock); 1344 spin_unlock_irq(&zone->lru_lock);
1248 1345
1249 pgmoved = 0;
1250 while (!list_empty(&l_hold)) { 1346 while (!list_empty(&l_hold)) {
1251 cond_resched(); 1347 cond_resched();
1252 page = lru_to_page(&l_hold); 1348 page = lru_to_page(&l_hold);
@@ -1259,58 +1355,45 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1259 1355
1260 /* page_referenced clears PageReferenced */ 1356 /* page_referenced clears PageReferenced */
1261 if (page_mapping_inuse(page) && 1357 if (page_mapping_inuse(page) &&
1262 page_referenced(page, 0, sc->mem_cgroup)) 1358 page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
1263 pgmoved++; 1359 nr_rotated++;
1360 /*
1361 * Identify referenced, file-backed active pages and
1362 * give them one more trip around the active list, so
1363 * that executable code gets a better chance to stay in
1364 * memory under moderate memory pressure. Anon pages
1365 * are not likely to be evicted by use-once streaming
1366 * IO, plus JVM can create lots of anon VM_EXEC pages,
1367 * so we ignore them here.
1368 */
1369 if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) {
1370 list_add(&page->lru, &l_active);
1371 continue;
1372 }
1373 }
1264 1374
1375 ClearPageActive(page); /* we are de-activating */
1265 list_add(&page->lru, &l_inactive); 1376 list_add(&page->lru, &l_inactive);
1266 } 1377 }
1267 1378
1268 /* 1379 /*
1269 * Move the pages to the [file or anon] inactive list. 1380 * Move pages back to the lru list.
1270 */ 1381 */
1271 pagevec_init(&pvec, 1);
1272 lru = LRU_BASE + file * LRU_FILE;
1273
1274 spin_lock_irq(&zone->lru_lock); 1382 spin_lock_irq(&zone->lru_lock);
1275 /* 1383 /*
1276 * Count referenced pages from currently used mappings as 1384 * Count referenced pages from currently used mappings as rotated,
1277 * rotated, even though they are moved to the inactive list. 1385 * even though only some of them are actually re-activated. This
1278 * This helps balance scan pressure between file and anonymous 1386 * helps balance scan pressure between file and anonymous pages in
1279 * pages in get_scan_ratio. 1387 * get_scan_ratio.
1280 */ 1388 */
1281 reclaim_stat->recent_rotated[!!file] += pgmoved; 1389 reclaim_stat->recent_rotated[file] += nr_rotated;
1282 1390
1283 pgmoved = 0; 1391 move_active_pages_to_lru(zone, &l_active,
1284 while (!list_empty(&l_inactive)) { 1392 LRU_ACTIVE + file * LRU_FILE);
1285 page = lru_to_page(&l_inactive); 1393 move_active_pages_to_lru(zone, &l_inactive,
1286 prefetchw_prev_lru_page(page, &l_inactive, flags); 1394 LRU_BASE + file * LRU_FILE);
1287 VM_BUG_ON(PageLRU(page)); 1395 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
1288 SetPageLRU(page);
1289 VM_BUG_ON(!PageActive(page));
1290 ClearPageActive(page);
1291
1292 list_move(&page->lru, &zone->lru[lru].list);
1293 mem_cgroup_add_lru_list(page, lru);
1294 pgmoved++;
1295 if (!pagevec_add(&pvec, page)) {
1296 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
1297 spin_unlock_irq(&zone->lru_lock);
1298 pgdeactivate += pgmoved;
1299 pgmoved = 0;
1300 if (buffer_heads_over_limit)
1301 pagevec_strip(&pvec);
1302 __pagevec_release(&pvec);
1303 spin_lock_irq(&zone->lru_lock);
1304 }
1305 }
1306 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
1307 pgdeactivate += pgmoved;
1308 __count_zone_vm_events(PGREFILL, zone, pgscanned);
1309 __count_vm_events(PGDEACTIVATE, pgdeactivate);
1310 spin_unlock_irq(&zone->lru_lock); 1396 spin_unlock_irq(&zone->lru_lock);
1311 if (buffer_heads_over_limit)
1312 pagevec_strip(&pvec);
1313 pagevec_release(&pvec);
1314} 1397}
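
Editorial aside: the new per-page decision in shrink_active_list() can be summarised as a three-way test. The sketch below is a hypothetical user-space model (the VM_EXEC value is illustrative, not taken from the kernel headers): only referenced, executable, file-backed pages are kept on the active list; everything else is deactivated.

/* Model of the activation decision added to shrink_active_list(). */
#include <stdio.h>
#include <stdbool.h>

#define VM_EXEC 0x4UL	/* illustrative flag value */

static const char *classify(bool referenced, unsigned long vm_flags,
			    bool file_backed)
{
	if (referenced && (vm_flags & VM_EXEC) && file_backed)
		return "keep active";	/* protect executable text */
	return "deactivate";
}

int main(void)
{
	printf("%s\n", classify(true, VM_EXEC, true));	/* keep active */
	printf("%s\n", classify(true, 0, true));	/* deactivate */
	printf("%s\n", classify(true, VM_EXEC, false));	/* anon: deactivate */
	return 0;
}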
1315 1398
1316static int inactive_anon_is_low_global(struct zone *zone) 1399static int inactive_anon_is_low_global(struct zone *zone)
@@ -1345,12 +1428,48 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
1345 return low; 1428 return low;
1346} 1429}
1347 1430
1431static int inactive_file_is_low_global(struct zone *zone)
1432{
1433 unsigned long active, inactive;
1434
1435 active = zone_page_state(zone, NR_ACTIVE_FILE);
1436 inactive = zone_page_state(zone, NR_INACTIVE_FILE);
1437
1438 return (active > inactive);
1439}
1440
1441/**
1442 * inactive_file_is_low - check if file pages need to be deactivated
1443 * @zone: zone to check
1444 * @sc: scan control of this context
1445 *
1446 * When the system is doing streaming IO, memory pressure here
1447 * ensures that active file pages get deactivated, until more
1448 * than half of the file pages are on the inactive list.
1449 *
1450 * Once we get to that situation, protect the system's working
1451 * set from being evicted by disabling active file page aging.
1452 *
1453 * This uses a different ratio than the anonymous pages, because
1454 * the page cache uses a use-once replacement algorithm.
1455 */
1456static int inactive_file_is_low(struct zone *zone, struct scan_control *sc)
1457{
1458 int low;
1459
1460 if (scanning_global_lru(sc))
1461 low = inactive_file_is_low_global(zone);
1462 else
1463 low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup);
1464 return low;
1465}
1466
1348static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, 1467static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1349 struct zone *zone, struct scan_control *sc, int priority) 1468 struct zone *zone, struct scan_control *sc, int priority)
1350{ 1469{
1351 int file = is_file_lru(lru); 1470 int file = is_file_lru(lru);
1352 1471
1353 if (lru == LRU_ACTIVE_FILE) { 1472 if (lru == LRU_ACTIVE_FILE && inactive_file_is_low(zone, sc)) {
1354 shrink_active_list(nr_to_scan, zone, sc, priority, file); 1473 shrink_active_list(nr_to_scan, zone, sc, priority, file);
1355 return 0; 1474 return 0;
1356 } 1475 }
@@ -1379,23 +1498,16 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1379 unsigned long ap, fp; 1498 unsigned long ap, fp;
1380 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1499 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1381 1500
1382 /* If we have no swap space, do not bother scanning anon pages. */ 1501 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
1383 if (nr_swap_pages <= 0) { 1502 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
1384 percent[0] = 0; 1503 file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
1385 percent[1] = 100; 1504 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
1386 return;
1387 }
1388
1389 anon = zone_nr_pages(zone, sc, LRU_ACTIVE_ANON) +
1390 zone_nr_pages(zone, sc, LRU_INACTIVE_ANON);
1391 file = zone_nr_pages(zone, sc, LRU_ACTIVE_FILE) +
1392 zone_nr_pages(zone, sc, LRU_INACTIVE_FILE);
1393 1505
1394 if (scanning_global_lru(sc)) { 1506 if (scanning_global_lru(sc)) {
1395 free = zone_page_state(zone, NR_FREE_PAGES); 1507 free = zone_page_state(zone, NR_FREE_PAGES);
1396 /* If we have very few page cache pages, 1508 /* If we have very few page cache pages,
1397 force-scan anon pages. */ 1509 force-scan anon pages. */
1398 if (unlikely(file + free <= zone->pages_high)) { 1510 if (unlikely(file + free <= high_wmark_pages(zone))) {
1399 percent[0] = 100; 1511 percent[0] = 100;
1400 percent[1] = 0; 1512 percent[1] = 0;
1401 return; 1513 return;
@@ -1450,6 +1562,26 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1450 percent[1] = 100 - percent[0]; 1562 percent[1] = 100 - percent[0];
1451} 1563}
1452 1564
1565/*
1566 * Smallish @nr_to_scan's are deposited in @nr_saved_scan,
1567 * until we have collected @swap_cluster_max pages to scan.
1568 */
1569static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
1570 unsigned long *nr_saved_scan,
1571 unsigned long swap_cluster_max)
1572{
1573 unsigned long nr;
1574
1575 *nr_saved_scan += nr_to_scan;
1576 nr = *nr_saved_scan;
1577
1578 if (nr >= swap_cluster_max)
1579 *nr_saved_scan = 0;
1580 else
1581 nr = 0;
1582
1583 return nr;
1584}
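
Editorial aside: nr_scan_try_batch() is small enough to exercise directly. The sketch below mirrors its logic in user space (the batch size of 32 stands in for SWAP_CLUSTER_MAX): requests accumulate silently until the saved counter reaches the batch size, at which point the whole accumulated amount is released.

/* User-space copy of the nr_scan_try_batch() batching logic. */
#include <stdio.h>

static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
				       unsigned long *nr_saved_scan,
				       unsigned long batch)
{
	unsigned long nr;

	*nr_saved_scan += nr_to_scan;
	nr = *nr_saved_scan;
	if (nr >= batch)
		*nr_saved_scan = 0;
	else
		nr = 0;
	return nr;
}

int main(void)
{
	unsigned long saved = 0, i;

	for (i = 0; i < 5; i++)		/* five requests of 10 pages each */
		printf("request 10 -> scan %lu\n",
		       nr_scan_try_batch(10, &saved, 32));
	return 0;
}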
1453 1585
1454/* 1586/*
1455 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 1587 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
@@ -1463,27 +1595,29 @@ static void shrink_zone(int priority, struct zone *zone,
1463 enum lru_list l; 1595 enum lru_list l;
1464 unsigned long nr_reclaimed = sc->nr_reclaimed; 1596 unsigned long nr_reclaimed = sc->nr_reclaimed;
1465 unsigned long swap_cluster_max = sc->swap_cluster_max; 1597 unsigned long swap_cluster_max = sc->swap_cluster_max;
1598 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1599 int noswap = 0;
1466 1600
1467 get_scan_ratio(zone, sc, percent); 1601 /* If we have no swap space, do not bother scanning anon pages. */
1602 if (!sc->may_swap || (nr_swap_pages <= 0)) {
1603 noswap = 1;
1604 percent[0] = 0;
1605 percent[1] = 100;
1606 } else
1607 get_scan_ratio(zone, sc, percent);
1468 1608
1469 for_each_evictable_lru(l) { 1609 for_each_evictable_lru(l) {
1470 int file = is_file_lru(l); 1610 int file = is_file_lru(l);
1471 int scan; 1611 unsigned long scan;
1472 1612
1473 scan = zone_nr_pages(zone, sc, l); 1613 scan = zone_nr_lru_pages(zone, sc, l);
1474 if (priority) { 1614 if (priority || noswap) {
1475 scan >>= priority; 1615 scan >>= priority;
1476 scan = (scan * percent[file]) / 100; 1616 scan = (scan * percent[file]) / 100;
1477 } 1617 }
1478 if (scanning_global_lru(sc)) { 1618 nr[l] = nr_scan_try_batch(scan,
1479 zone->lru[l].nr_scan += scan; 1619 &reclaim_stat->nr_saved_scan[l],
1480 nr[l] = zone->lru[l].nr_scan; 1620 swap_cluster_max);
1481 if (nr[l] >= swap_cluster_max)
1482 zone->lru[l].nr_scan = 0;
1483 else
1484 nr[l] = 0;
1485 } else
1486 nr[l] = scan;
1487 } 1621 }
1488 1622
1489 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 1623 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
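
Editorial aside: the scan-target arithmetic that feeds nr_scan_try_batch() above is worth a worked example with assumed numbers: the LRU size is shifted right by the reclaim priority and then weighted by the anon/file percentage from get_scan_ratio().

/* Worked example of shrink_zone()'s scan sizing; all numbers assumed. */
#include <stdio.h>

int main(void)
{
	unsigned long lru_pages = 100000;	/* pages on this LRU list */
	int priority = 6;			/* an early reclaim pass */
	unsigned long percent_file = 40;	/* 40% of pressure on file LRUs */
	unsigned long scan = lru_pages;

	scan >>= priority;			/* 100000 >> 6 = 1562 */
	scan = scan * percent_file / 100;	/* 1562 * 40 / 100 = 624 */
	printf("scan target: %lu pages\n", scan);
	return 0;
}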
@@ -1516,7 +1650,7 @@ static void shrink_zone(int priority, struct zone *zone,
1516 * Even if we did not try to evict anon pages at all, we want to 1650 * Even if we did not try to evict anon pages at all, we want to
1517 * rebalance the anon lru active/inactive ratio. 1651 * rebalance the anon lru active/inactive ratio.
1518 */ 1652 */
1519 if (inactive_anon_is_low(zone, sc)) 1653 if (inactive_anon_is_low(zone, sc) && nr_swap_pages > 0)
1520 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); 1654 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
1521 1655
1522 throttle_vm_writeout(sc->gfp_mask); 1656 throttle_vm_writeout(sc->gfp_mask);
@@ -1527,11 +1661,13 @@ static void shrink_zone(int priority, struct zone *zone,
1527 * try to reclaim pages from zones which will satisfy the caller's allocation 1661 * try to reclaim pages from zones which will satisfy the caller's allocation
1528 * request. 1662 * request.
1529 * 1663 *
1530 * We reclaim from a zone even if that zone is over pages_high. Because: 1664 * We reclaim from a zone even if that zone is over high_wmark_pages(zone).
1665 * Because:
1531 * a) The caller may be trying to free *extra* pages to satisfy a higher-order 1666 * a) The caller may be trying to free *extra* pages to satisfy a higher-order
1532 * allocation or 1667 * allocation or
1533 * b) The zones may be over pages_high but they must go *over* pages_high to 1668 * b) The target zone may be at high_wmark_pages(zone) but the lower zones
1534 * satisfy the `incremental min' zone defense algorithm. 1669 * must go *over* high_wmark_pages(zone) to satisfy the `incremental min'
1670 * zone defense algorithm.
1535 * 1671 *
1536 * If a zone is deemed to be full of pinned pages then just give it a light 1672 * If a zone is deemed to be full of pinned pages then just give it a light
1537 * scan then give up on it. 1673 * scan then give up on it.
@@ -1583,10 +1719,10 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
1583 * 1719 *
1584 * If the caller is !__GFP_FS then the probability of a failure is reasonably 1720 * If the caller is !__GFP_FS then the probability of a failure is reasonably
1585 * high - the zone may be full of dirty or under-writeback pages, which this 1721 * high - the zone may be full of dirty or under-writeback pages, which this
1586 * caller can't do much about. We kick pdflush and take explicit naps in the 1722 * caller can't do much about. We kick the writeback threads and take explicit
1587 * hope that some of these pages can be written. But if the allocating task 1723 * naps in the hope that some of these pages can be written. But if the
1588 * holds filesystem locks which prevent writeout this might not work, and the 1724 * allocating task holds filesystem locks which prevent writeout this might not
1589 * allocation attempt will fail. 1725 * work, and the allocation attempt will fail.
1590 * 1726 *
1591 * returns: 0, if no pages reclaimed 1727 * returns: 0, if no pages reclaimed
1592 * else, the number of pages reclaimed 1728 * else, the number of pages reclaimed
@@ -1616,7 +1752,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1616 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 1752 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1617 continue; 1753 continue;
1618 1754
1619 lru_pages += zone_lru_pages(zone); 1755 lru_pages += zone_reclaimable_pages(zone);
1620 } 1756 }
1621 } 1757 }
1622 1758
@@ -1651,13 +1787,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1651 */ 1787 */
1652 if (total_scanned > sc->swap_cluster_max + 1788 if (total_scanned > sc->swap_cluster_max +
1653 sc->swap_cluster_max / 2) { 1789 sc->swap_cluster_max / 2) {
1654 wakeup_pdflush(laptop_mode ? 0 : total_scanned); 1790 wakeup_flusher_threads(laptop_mode ? 0 : total_scanned);
1655 sc->may_writepage = 1; 1791 sc->may_writepage = 1;
1656 } 1792 }
1657 1793
1658 /* Take a nap, wait for some writeback to complete */ 1794 /* Take a nap, wait for some writeback to complete */
1659 if (sc->nr_scanned && priority < DEF_PRIORITY - 2) 1795 if (sc->nr_scanned && priority < DEF_PRIORITY - 2)
1660 congestion_wait(WRITE, HZ/10); 1796 congestion_wait(BLK_RW_ASYNC, HZ/10);
1661 } 1797 }
1662 /* top priority shrink_zones still had more to do? don't OOM, then */ 1798 /* top priority shrink_zones still had more to do? don't OOM, then */
1663 if (!sc->all_unreclaimable && scanning_global_lru(sc)) 1799 if (!sc->all_unreclaimable && scanning_global_lru(sc))
@@ -1697,6 +1833,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
1697 .may_writepage = !laptop_mode, 1833 .may_writepage = !laptop_mode,
1698 .swap_cluster_max = SWAP_CLUSTER_MAX, 1834 .swap_cluster_max = SWAP_CLUSTER_MAX,
1699 .may_unmap = 1, 1835 .may_unmap = 1,
1836 .may_swap = 1,
1700 .swappiness = vm_swappiness, 1837 .swappiness = vm_swappiness,
1701 .order = order, 1838 .order = order,
1702 .mem_cgroup = NULL, 1839 .mem_cgroup = NULL,
@@ -1709,14 +1846,49 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
1709 1846
1710#ifdef CONFIG_CGROUP_MEM_RES_CTLR 1847#ifdef CONFIG_CGROUP_MEM_RES_CTLR
1711 1848
1849unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
1850 gfp_t gfp_mask, bool noswap,
1851 unsigned int swappiness,
1852 struct zone *zone, int nid)
1853{
1854 struct scan_control sc = {
1855 .may_writepage = !laptop_mode,
1856 .may_unmap = 1,
1857 .may_swap = !noswap,
1858 .swap_cluster_max = SWAP_CLUSTER_MAX,
1859 .swappiness = swappiness,
1860 .order = 0,
1861 .mem_cgroup = mem,
1862 .isolate_pages = mem_cgroup_isolate_pages,
1863 };
1864 nodemask_t nm = nodemask_of_node(nid);
1865
1866 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
1867 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
1868 sc.nodemask = &nm;
1869 sc.nr_reclaimed = 0;
1870 sc.nr_scanned = 0;
1871 /*
1872 * NOTE: Although we can get the priority field, using it
1873 * here is not a good idea, since it limits the pages we can scan.
1874 * if we don't reclaim here, the shrink_zone from balance_pgdat
1875 * will pick up pages from other mem cgroup's as well. We hack
1876 * the priority and make it zero.
1877 */
1878 shrink_zone(0, zone, &sc);
1879 return sc.nr_reclaimed;
1880}
1881
1712unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, 1882unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
1713 gfp_t gfp_mask, 1883 gfp_t gfp_mask,
1714 bool noswap, 1884 bool noswap,
1715 unsigned int swappiness) 1885 unsigned int swappiness)
1716{ 1886{
1887 struct zonelist *zonelist;
1717 struct scan_control sc = { 1888 struct scan_control sc = {
1718 .may_writepage = !laptop_mode, 1889 .may_writepage = !laptop_mode,
1719 .may_unmap = 1, 1890 .may_unmap = 1,
1891 .may_swap = !noswap,
1720 .swap_cluster_max = SWAP_CLUSTER_MAX, 1892 .swap_cluster_max = SWAP_CLUSTER_MAX,
1721 .swappiness = swappiness, 1893 .swappiness = swappiness,
1722 .order = 0, 1894 .order = 0,
@@ -1724,10 +1896,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
1724 .isolate_pages = mem_cgroup_isolate_pages, 1896 .isolate_pages = mem_cgroup_isolate_pages,
1725 .nodemask = NULL, /* we don't care the placement */ 1897 .nodemask = NULL, /* we don't care the placement */
1726 }; 1898 };
1727 struct zonelist *zonelist;
1728
1729 if (noswap)
1730 sc.may_unmap = 0;
1731 1899
1732 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 1900 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
1733 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 1901 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
@@ -1738,7 +1906,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
1738 1906
1739/* 1907/*
1740 * For kswapd, balance_pgdat() will work across all this node's zones until 1908 * For kswapd, balance_pgdat() will work across all this node's zones until
1741 * they are all at pages_high. 1909 * they are all at high_wmark_pages(zone).
1742 * 1910 *
1743 * Returns the number of pages which were actually freed. 1911 * Returns the number of pages which were actually freed.
1744 * 1912 *
@@ -1751,11 +1919,11 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
1751 * the zone for when the problem goes away. 1919 * the zone for when the problem goes away.
1752 * 1920 *
1753 * kswapd scans the zones in the highmem->normal->dma direction. It skips 1921 * kswapd scans the zones in the highmem->normal->dma direction. It skips
1754 * zones which have free_pages > pages_high, but once a zone is found to have 1922 * zones which have free_pages > high_wmark_pages(zone), but once a zone is
1755 * free_pages <= pages_high, we scan that zone and the lower zones regardless 1923 * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the
1756 * of the number of free pages in the lower zones. This interoperates with 1924 * lower zones regardless of the number of free pages in the lower zones. This
1757 * the page allocator fallback scheme to ensure that aging of pages is balanced 1925 * interoperates with the page allocator fallback scheme to ensure that aging
1758 * across the zones. 1926 * of pages is balanced across the zones.
1759 */ 1927 */
1760static unsigned long balance_pgdat(pg_data_t *pgdat, int order) 1928static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
1761{ 1929{
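
The comment above describes how kswapd walks the zones from highmem down to dma, skipping zones that are still above their high watermark and then reclaiming from the first failing zone and every lower zone. The following is a minimal userspace sketch of that selection logic, not part of the diff; the zone names, free-page counts and watermark values are invented for illustration.

/*
 * Illustrative sketch only: scan zones from the highest index down,
 * stop at the first one below its high watermark, then reclaim that
 * zone and all lower zones regardless of their free pages.
 */
#include <stdio.h>

struct zone_sample {
        const char *name;
        unsigned long free_pages;
        unsigned long high_wmark;       /* stands in for high_wmark_pages(zone) */
};

int main(void)
{
        /* index 0 is the lowest zone, as in the kernel's node_zones[] */
        struct zone_sample zones[] = {
                { "DMA",     800, 128 },
                { "Normal",  300, 512 },
                { "HighMem", 900, 512 },
        };
        int end_zone = -1;

        /* highmem -> normal -> dma, as the comment above describes */
        for (int i = 2; i >= 0; i--) {
                if (zones[i].free_pages <= zones[i].high_wmark) {
                        end_zone = i;   /* first zone that needs help */
                        break;
                }
        }

        /* reclaim that zone and every lower zone (DMA here is already fine) */
        for (int i = 0; i <= end_zone; i++)
                printf("would reclaim in %s\n", zones[i].name);
        return 0;
}
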
@@ -1767,6 +1935,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
1767 struct scan_control sc = { 1935 struct scan_control sc = {
1768 .gfp_mask = GFP_KERNEL, 1936 .gfp_mask = GFP_KERNEL,
1769 .may_unmap = 1, 1937 .may_unmap = 1,
1938 .may_swap = 1,
1770 .swap_cluster_max = SWAP_CLUSTER_MAX, 1939 .swap_cluster_max = SWAP_CLUSTER_MAX,
1771 .swappiness = vm_swappiness, 1940 .swappiness = vm_swappiness,
1772 .order = order, 1941 .order = order,
@@ -1775,7 +1944,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
1775 }; 1944 };
1776 /* 1945 /*
1777 * temp_priority is used to remember the scanning priority at which 1946 * temp_priority is used to remember the scanning priority at which
1778 * this zone was successfully refilled to free_pages == pages_high. 1947 * this zone was successfully refilled to
1948 * free_pages == high_wmark_pages(zone).
1779 */ 1949 */
1780 int temp_priority[MAX_NR_ZONES]; 1950 int temp_priority[MAX_NR_ZONES];
1781 1951
@@ -1820,8 +1990,8 @@ loop_again:
1820 shrink_active_list(SWAP_CLUSTER_MAX, zone, 1990 shrink_active_list(SWAP_CLUSTER_MAX, zone,
1821 &sc, priority, 0); 1991 &sc, priority, 0);
1822 1992
1823 if (!zone_watermark_ok(zone, order, zone->pages_high, 1993 if (!zone_watermark_ok(zone, order,
1824 0, 0)) { 1994 high_wmark_pages(zone), 0, 0)) {
1825 end_zone = i; 1995 end_zone = i;
1826 break; 1996 break;
1827 } 1997 }
@@ -1832,7 +2002,7 @@ loop_again:
1832 for (i = 0; i <= end_zone; i++) { 2002 for (i = 0; i <= end_zone; i++) {
1833 struct zone *zone = pgdat->node_zones + i; 2003 struct zone *zone = pgdat->node_zones + i;
1834 2004
1835 lru_pages += zone_lru_pages(zone); 2005 lru_pages += zone_reclaimable_pages(zone);
1836 } 2006 }
1837 2007
1838 /* 2008 /*
@@ -1847,6 +2017,7 @@ loop_again:
1847 for (i = 0; i <= end_zone; i++) { 2017 for (i = 0; i <= end_zone; i++) {
1848 struct zone *zone = pgdat->node_zones + i; 2018 struct zone *zone = pgdat->node_zones + i;
1849 int nr_slab; 2019 int nr_slab;
2020 int nid, zid;
1850 2021
1851 if (!populated_zone(zone)) 2022 if (!populated_zone(zone))
1852 continue; 2023 continue;
@@ -1855,18 +2026,27 @@ loop_again:
1855 priority != DEF_PRIORITY) 2026 priority != DEF_PRIORITY)
1856 continue; 2027 continue;
1857 2028
1858 if (!zone_watermark_ok(zone, order, zone->pages_high, 2029 if (!zone_watermark_ok(zone, order,
1859 end_zone, 0)) 2030 high_wmark_pages(zone), end_zone, 0))
1860 all_zones_ok = 0; 2031 all_zones_ok = 0;
1861 temp_priority[i] = priority; 2032 temp_priority[i] = priority;
1862 sc.nr_scanned = 0; 2033 sc.nr_scanned = 0;
1863 note_zone_scanning_priority(zone, priority); 2034 note_zone_scanning_priority(zone, priority);
2035
2036 nid = pgdat->node_id;
2037 zid = zone_idx(zone);
2038 /*
2039 * Call soft limit reclaim before calling shrink_zone.
2040 * For now we ignore the return value
2041 */
2042 mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask,
2043 nid, zid);
1864 /* 2044 /*
1865 * We put equal pressure on every zone, unless one 2045 * We put equal pressure on every zone, unless one
1866 * zone has way too many pages free already. 2046 * zone has way too many pages free already.
1867 */ 2047 */
1868 if (!zone_watermark_ok(zone, order, 8*zone->pages_high, 2048 if (!zone_watermark_ok(zone, order,
1869 end_zone, 0)) 2049 8*high_wmark_pages(zone), end_zone, 0))
1870 shrink_zone(priority, zone, &sc); 2050 shrink_zone(priority, zone, &sc);
1871 reclaim_state->reclaimed_slab = 0; 2051 reclaim_state->reclaimed_slab = 0;
1872 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, 2052 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
@@ -1876,7 +2056,7 @@ loop_again:
1876 if (zone_is_all_unreclaimable(zone)) 2056 if (zone_is_all_unreclaimable(zone))
1877 continue; 2057 continue;
1878 if (nr_slab == 0 && zone->pages_scanned >= 2058 if (nr_slab == 0 && zone->pages_scanned >=
1879 (zone_lru_pages(zone) * 6)) 2059 (zone_reclaimable_pages(zone) * 6))
1880 zone_set_flag(zone, 2060 zone_set_flag(zone,
1881 ZONE_ALL_UNRECLAIMABLE); 2061 ZONE_ALL_UNRECLAIMABLE);
1882 /* 2062 /*
@@ -1895,7 +2075,7 @@ loop_again:
1895 * another pass across the zones. 2075 * another pass across the zones.
1896 */ 2076 */
1897 if (total_scanned && priority < DEF_PRIORITY - 2) 2077 if (total_scanned && priority < DEF_PRIORITY - 2)
1898 congestion_wait(WRITE, HZ/10); 2078 congestion_wait(BLK_RW_ASYNC, HZ/10);
1899 2079
1900 /* 2080 /*
1901 * We do this so kswapd doesn't build up large priorities for 2081 * We do this so kswapd doesn't build up large priorities for
@@ -1967,7 +2147,7 @@ static int kswapd(void *p)
1967 struct reclaim_state reclaim_state = { 2147 struct reclaim_state reclaim_state = {
1968 .reclaimed_slab = 0, 2148 .reclaimed_slab = 0,
1969 }; 2149 };
1970 node_to_cpumask_ptr(cpumask, pgdat->node_id); 2150 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
1971 2151
1972 lockdep_set_current_reclaim_state(GFP_KERNEL); 2152 lockdep_set_current_reclaim_state(GFP_KERNEL);
1973 2153
@@ -2032,7 +2212,7 @@ void wakeup_kswapd(struct zone *zone, int order)
2032 return; 2212 return;
2033 2213
2034 pgdat = zone->zone_pgdat; 2214 pgdat = zone->zone_pgdat;
2035 if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0)) 2215 if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
2036 return; 2216 return;
2037 if (pgdat->kswapd_max_order < order) 2217 if (pgdat->kswapd_max_order < order)
2038 pgdat->kswapd_max_order = order; 2218 pgdat->kswapd_max_order = order;
@@ -2043,15 +2223,42 @@ void wakeup_kswapd(struct zone *zone, int order)
2043 wake_up_interruptible(&pgdat->kswapd_wait); 2223 wake_up_interruptible(&pgdat->kswapd_wait);
2044} 2224}
2045 2225
2046unsigned long global_lru_pages(void) 2226/*
2227 * The reclaimable count should be mostly accurate.  The pages that
2228 * are harder to reclaim include:
2229 * - mlocked pages, which will be moved to the unevictable list when encountered
2230 * - mapped pages, which may require several passes to be reclaimed
2231 * - dirty pages, which are not "instantly" reclaimable
2232 */
2233unsigned long global_reclaimable_pages(void)
2047{ 2234{
2048 return global_page_state(NR_ACTIVE_ANON) 2235 int nr;
2049 + global_page_state(NR_ACTIVE_FILE) 2236
2050 + global_page_state(NR_INACTIVE_ANON) 2237 nr = global_page_state(NR_ACTIVE_FILE) +
2051 + global_page_state(NR_INACTIVE_FILE); 2238 global_page_state(NR_INACTIVE_FILE);
2239
2240 if (nr_swap_pages > 0)
2241 nr += global_page_state(NR_ACTIVE_ANON) +
2242 global_page_state(NR_INACTIVE_ANON);
2243
2244 return nr;
2245}
2246
2247unsigned long zone_reclaimable_pages(struct zone *zone)
2248{
2249 int nr;
2250
2251 nr = zone_page_state(zone, NR_ACTIVE_FILE) +
2252 zone_page_state(zone, NR_INACTIVE_FILE);
2253
2254 if (nr_swap_pages > 0)
2255 nr += zone_page_state(zone, NR_ACTIVE_ANON) +
2256 zone_page_state(zone, NR_INACTIVE_ANON);
2257
2258 return nr;
2052} 2259}
2053 2260
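
The two helpers above estimate how much of the LRU is reclaimable: file-backed pages always count, anonymous pages only while swap space is available. Below is a small standalone sketch of the same arithmetic, not part of the diff, with invented counter values.

/*
 * Illustrative sketch only: file LRU pages are always counted,
 * anonymous LRU pages only when swap is available.
 */
#include <stdio.h>

int main(void)
{
        unsigned long active_file = 12000, inactive_file = 34000;
        unsigned long active_anon = 50000, inactive_anon = 8000;
        long nr_swap_pages = 0;         /* pretend swap is full or absent */

        unsigned long reclaimable = active_file + inactive_file;
        if (nr_swap_pages > 0)
                reclaimable += active_anon + inactive_anon;

        printf("reclaimable estimate: %lu pages\n", reclaimable);
        return 0;
}
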
2054#ifdef CONFIG_PM 2261#ifdef CONFIG_HIBERNATION
2055/* 2262/*
2056 * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages 2263 * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages
2057 * from LRU lists system-wide, for given pass and priority. 2264 * from LRU lists system-wide, for given pass and priority.
@@ -2063,6 +2270,7 @@ static void shrink_all_zones(unsigned long nr_pages, int prio,
2063{ 2270{
2064 struct zone *zone; 2271 struct zone *zone;
2065 unsigned long nr_reclaimed = 0; 2272 unsigned long nr_reclaimed = 0;
2273 struct zone_reclaim_stat *reclaim_stat;
2066 2274
2067 for_each_populated_zone(zone) { 2275 for_each_populated_zone(zone) {
2068 enum lru_list l; 2276 enum lru_list l;
@@ -2079,22 +2287,25 @@ static void shrink_all_zones(unsigned long nr_pages, int prio,
2079 l == LRU_ACTIVE_FILE)) 2287 l == LRU_ACTIVE_FILE))
2080 continue; 2288 continue;
2081 2289
2082 zone->lru[l].nr_scan += (lru_pages >> prio) + 1; 2290 reclaim_stat = get_reclaim_stat(zone, sc);
2083 if (zone->lru[l].nr_scan >= nr_pages || pass > 3) { 2291 reclaim_stat->nr_saved_scan[l] +=
2292 (lru_pages >> prio) + 1;
2293 if (reclaim_stat->nr_saved_scan[l]
2294 >= nr_pages || pass > 3) {
2084 unsigned long nr_to_scan; 2295 unsigned long nr_to_scan;
2085 2296
2086 zone->lru[l].nr_scan = 0; 2297 reclaim_stat->nr_saved_scan[l] = 0;
2087 nr_to_scan = min(nr_pages, lru_pages); 2298 nr_to_scan = min(nr_pages, lru_pages);
2088 nr_reclaimed += shrink_list(l, nr_to_scan, zone, 2299 nr_reclaimed += shrink_list(l, nr_to_scan, zone,
2089 sc, prio); 2300 sc, prio);
2090 if (nr_reclaimed >= nr_pages) { 2301 if (nr_reclaimed >= nr_pages) {
2091 sc->nr_reclaimed = nr_reclaimed; 2302 sc->nr_reclaimed += nr_reclaimed;
2092 return; 2303 return;
2093 } 2304 }
2094 } 2305 }
2095 } 2306 }
2096 } 2307 }
2097 sc->nr_reclaimed = nr_reclaimed; 2308 sc->nr_reclaimed += nr_reclaimed;
2098} 2309}
2099 2310
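
shrink_all_zones() above accumulates (lru_pages >> prio) + 1 into nr_saved_scan on every call and only issues a scan batch once the saved amount reaches the request size, so low-priority passes defer work instead of scanning tiny amounts each time. A standalone sketch of that accumulation, not part of the diff, with invented numbers:

/*
 * Illustrative sketch only: small per-call increments are saved up
 * until they amount to a full batch, then the batch is scanned.
 */
#include <stdio.h>

int main(void)
{
        unsigned long lru_pages = 100000, nr_pages = 1024;
        unsigned long saved_scan = 0;
        int prio = 10;                  /* low pressure: tiny increments */

        for (int call = 1; call <= 12; call++) {
                saved_scan += (lru_pages >> prio) + 1;
                if (saved_scan >= nr_pages) {
                        printf("call %d: scan a batch (saved=%lu)\n",
                               call, saved_scan);
                        saved_scan = 0;
                }
        }
        return 0;
}
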
2100/* 2311/*
@@ -2115,11 +2326,12 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
2115 .may_unmap = 0, 2326 .may_unmap = 0,
2116 .may_writepage = 1, 2327 .may_writepage = 1,
2117 .isolate_pages = isolate_pages_global, 2328 .isolate_pages = isolate_pages_global,
2329 .nr_reclaimed = 0,
2118 }; 2330 };
2119 2331
2120 current->reclaim_state = &reclaim_state; 2332 current->reclaim_state = &reclaim_state;
2121 2333
2122 lru_pages = global_lru_pages(); 2334 lru_pages = global_reclaimable_pages();
2123 nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); 2335 nr_slab = global_page_state(NR_SLAB_RECLAIMABLE);
2124 /* If slab caches are huge, it's better to hit them first */ 2336 /* If slab caches are huge, it's better to hit them first */
2125 while (nr_slab >= lru_pages) { 2337 while (nr_slab >= lru_pages) {
@@ -2161,13 +2373,13 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
2161 2373
2162 reclaim_state.reclaimed_slab = 0; 2374 reclaim_state.reclaimed_slab = 0;
2163 shrink_slab(sc.nr_scanned, sc.gfp_mask, 2375 shrink_slab(sc.nr_scanned, sc.gfp_mask,
2164 global_lru_pages()); 2376 global_reclaimable_pages());
2165 sc.nr_reclaimed += reclaim_state.reclaimed_slab; 2377 sc.nr_reclaimed += reclaim_state.reclaimed_slab;
2166 if (sc.nr_reclaimed >= nr_pages) 2378 if (sc.nr_reclaimed >= nr_pages)
2167 goto out; 2379 goto out;
2168 2380
2169 if (sc.nr_scanned && prio < DEF_PRIORITY - 2) 2381 if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
2170 congestion_wait(WRITE, HZ / 10); 2382 congestion_wait(BLK_RW_ASYNC, HZ / 10);
2171 } 2383 }
2172 } 2384 }
2173 2385
@@ -2178,7 +2390,8 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
2178 if (!sc.nr_reclaimed) { 2390 if (!sc.nr_reclaimed) {
2179 do { 2391 do {
2180 reclaim_state.reclaimed_slab = 0; 2392 reclaim_state.reclaimed_slab = 0;
2181 shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages()); 2393 shrink_slab(nr_pages, sc.gfp_mask,
2394 global_reclaimable_pages());
2182 sc.nr_reclaimed += reclaim_state.reclaimed_slab; 2395 sc.nr_reclaimed += reclaim_state.reclaimed_slab;
2183 } while (sc.nr_reclaimed < nr_pages && 2396 } while (sc.nr_reclaimed < nr_pages &&
2184 reclaim_state.reclaimed_slab > 0); 2397 reclaim_state.reclaimed_slab > 0);
@@ -2190,7 +2403,7 @@ out:
2190 2403
2191 return sc.nr_reclaimed; 2404 return sc.nr_reclaimed;
2192} 2405}
2193#endif 2406#endif /* CONFIG_HIBERNATION */
2194 2407
2195/* It's optimal to keep kswapds on the same CPUs as their memory, but 2408/* It's optimal to keep kswapds on the same CPUs as their memory, but
2196 not required for correctness. So if the last cpu in a node goes 2409 not required for correctness. So if the last cpu in a node goes
@@ -2204,7 +2417,9 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
2204 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { 2417 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
2205 for_each_node_state(nid, N_HIGH_MEMORY) { 2418 for_each_node_state(nid, N_HIGH_MEMORY) {
2206 pg_data_t *pgdat = NODE_DATA(nid); 2419 pg_data_t *pgdat = NODE_DATA(nid);
2207 node_to_cpumask_ptr(mask, pgdat->node_id); 2420 const struct cpumask *mask;
2421
2422 mask = cpumask_of_node(pgdat->node_id);
2208 2423
2209 if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) 2424 if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
2210 /* One of our CPUs online: restore mask */ 2425 /* One of our CPUs online: restore mask */
@@ -2282,6 +2497,48 @@ int sysctl_min_unmapped_ratio = 1;
2282 */ 2497 */
2283int sysctl_min_slab_ratio = 5; 2498int sysctl_min_slab_ratio = 5;
2284 2499
2500static inline unsigned long zone_unmapped_file_pages(struct zone *zone)
2501{
2502 unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED);
2503 unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) +
2504 zone_page_state(zone, NR_ACTIVE_FILE);
2505
2506 /*
2507 * It's possible for there to be more file mapped pages than
2508 * accounted for by the pages on the file LRU lists because
2509 * tmpfs pages accounted for as ANON can also be FILE_MAPPED
2510 */
2511 return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
2512}
2513
2514/* Work out how many page cache pages we can reclaim in this reclaim_mode */
2515static long zone_pagecache_reclaimable(struct zone *zone)
2516{
2517 long nr_pagecache_reclaimable;
2518 long delta = 0;
2519
2520 /*
2521 * If RECLAIM_SWAP is set, then all file pages are considered
2522 * potentially reclaimable. Otherwise, we have to worry about
2523 * pages like swapcache, and zone_unmapped_file_pages() provides
2524 * a better estimate.
2525 */
2526 if (zone_reclaim_mode & RECLAIM_SWAP)
2527 nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES);
2528 else
2529 nr_pagecache_reclaimable = zone_unmapped_file_pages(zone);
2530
2531 /* If we can't clean pages, remove dirty pages from consideration */
2532 if (!(zone_reclaim_mode & RECLAIM_WRITE))
2533 delta += zone_page_state(zone, NR_FILE_DIRTY);
2534
2535 /* Watch for any possible underflows due to delta */
2536 if (unlikely(delta > nr_pagecache_reclaimable))
2537 delta = nr_pagecache_reclaimable;
2538
2539 return nr_pagecache_reclaimable - delta;
2540}
2541
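
zone_pagecache_reclaimable() above derives its estimate from the unmapped file-LRU pages (or from all file pages when RECLAIM_SWAP is set) and then discounts dirty pages when writeback is not allowed. The following standalone sketch, not part of the diff, redoes that arithmetic with made-up counters and stand-in flag values.

/*
 * Illustrative sketch only: estimate reclaimable page cache under a
 * given reclaim mode.  The RECLAIM_* values and all counters below
 * are invented for the example.
 */
#include <stdio.h>

#define RECLAIM_WRITE (1 << 1)  /* stand-ins for zone_reclaim_mode bits */
#define RECLAIM_SWAP  (1 << 2)

int main(void)
{
        unsigned long file_pages = 90000, file_mapped = 25000;
        unsigned long file_lru = 70000, file_dirty = 5000;
        int zone_reclaim_mode = 0;      /* neither writeback nor swap allowed */

        long reclaimable;
        long delta = 0;

        if (zone_reclaim_mode & RECLAIM_SWAP)
                reclaimable = file_pages;
        else    /* only unmapped file-LRU pages count */
                reclaimable = (file_lru > file_mapped) ? file_lru - file_mapped : 0;

        if (!(zone_reclaim_mode & RECLAIM_WRITE))
                delta += file_dirty;    /* dirty pages cannot be cleaned here */
        if (delta > reclaimable)        /* guard against underflow */
                delta = reclaimable;

        printf("reclaimable page cache: %ld pages\n", reclaimable - delta);
        return 0;
}
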
2285/* 2542/*
2286 * Try to free up some pages from this zone through reclaim. 2543 * Try to free up some pages from this zone through reclaim.
2287 */ 2544 */
@@ -2295,6 +2552,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2295 struct scan_control sc = { 2552 struct scan_control sc = {
2296 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), 2553 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
2297 .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), 2554 .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
2555 .may_swap = 1,
2298 .swap_cluster_max = max_t(unsigned long, nr_pages, 2556 .swap_cluster_max = max_t(unsigned long, nr_pages,
2299 SWAP_CLUSTER_MAX), 2557 SWAP_CLUSTER_MAX),
2300 .gfp_mask = gfp_mask, 2558 .gfp_mask = gfp_mask,
@@ -2315,9 +2573,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2315 reclaim_state.reclaimed_slab = 0; 2573 reclaim_state.reclaimed_slab = 0;
2316 p->reclaim_state = &reclaim_state; 2574 p->reclaim_state = &reclaim_state;
2317 2575
2318 if (zone_page_state(zone, NR_FILE_PAGES) - 2576 if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
2319 zone_page_state(zone, NR_FILE_MAPPED) >
2320 zone->min_unmapped_pages) {
2321 /* 2577 /*
2322 * Free memory by calling shrink zone with increasing 2578 * Free memory by calling shrink zone with increasing
2323 * priorities until we have enough memory freed. 2579 * priorities until we have enough memory freed.
@@ -2375,20 +2631,18 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2375 * if less than a specified percentage of the zone is used by 2631 * if less than a specified percentage of the zone is used by
2376 * unmapped file backed pages. 2632 * unmapped file backed pages.
2377 */ 2633 */
2378 if (zone_page_state(zone, NR_FILE_PAGES) - 2634 if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages &&
2379 zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages 2635 zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
2380 && zone_page_state(zone, NR_SLAB_RECLAIMABLE) 2636 return ZONE_RECLAIM_FULL;
2381 <= zone->min_slab_pages)
2382 return 0;
2383 2637
2384 if (zone_is_all_unreclaimable(zone)) 2638 if (zone_is_all_unreclaimable(zone))
2385 return 0; 2639 return ZONE_RECLAIM_FULL;
2386 2640
2387 /* 2641 /*
2388 * Do not scan if the allocation should not be delayed. 2642 * Do not scan if the allocation should not be delayed.
2389 */ 2643 */
2390 if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC)) 2644 if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))
2391 return 0; 2645 return ZONE_RECLAIM_NOSCAN;
2392 2646
2393 /* 2647 /*
2394 * Only run zone reclaim on the local zone or on zones that do not 2648 * Only run zone reclaim on the local zone or on zones that do not
@@ -2398,18 +2652,21 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2398 */ 2652 */
2399 node_id = zone_to_nid(zone); 2653 node_id = zone_to_nid(zone);
2400 if (node_state(node_id, N_CPU) && node_id != numa_node_id()) 2654 if (node_state(node_id, N_CPU) && node_id != numa_node_id())
2401 return 0; 2655 return ZONE_RECLAIM_NOSCAN;
2402 2656
2403 if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED)) 2657 if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED))
2404 return 0; 2658 return ZONE_RECLAIM_NOSCAN;
2659
2405 ret = __zone_reclaim(zone, gfp_mask, order); 2660 ret = __zone_reclaim(zone, gfp_mask, order);
2406 zone_clear_flag(zone, ZONE_RECLAIM_LOCKED); 2661 zone_clear_flag(zone, ZONE_RECLAIM_LOCKED);
2407 2662
2663 if (!ret)
2664 count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
2665
2408 return ret; 2666 return ret;
2409} 2667}
2410#endif 2668#endif
2411 2669
2412#ifdef CONFIG_UNEVICTABLE_LRU
2413/* 2670/*
2414 * page_evictable - test whether a page is evictable 2671 * page_evictable - test whether a page is evictable
2415 * @page: the page to test 2672 * @page: the page to test
@@ -2454,7 +2711,7 @@ static void check_move_unevictable_page(struct page *page, struct zone *zone)
2454retry: 2711retry:
2455 ClearPageUnevictable(page); 2712 ClearPageUnevictable(page);
2456 if (page_evictable(page, NULL)) { 2713 if (page_evictable(page, NULL)) {
2457 enum lru_list l = LRU_INACTIVE_ANON + page_is_file_cache(page); 2714 enum lru_list l = page_lru_base_type(page);
2458 2715
2459 __dec_zone_state(zone, NR_UNEVICTABLE); 2716 __dec_zone_state(zone, NR_UNEVICTABLE);
2460 list_move(&page->lru, &zone->lru[l].list); 2717 list_move(&page->lru, &zone->lru[l].list);
@@ -2597,10 +2854,10 @@ static void scan_all_zones_unevictable_pages(void)
2597unsigned long scan_unevictable_pages; 2854unsigned long scan_unevictable_pages;
2598 2855
2599int scan_unevictable_handler(struct ctl_table *table, int write, 2856int scan_unevictable_handler(struct ctl_table *table, int write,
2600 struct file *file, void __user *buffer, 2857 void __user *buffer,
2601 size_t *length, loff_t *ppos) 2858 size_t *length, loff_t *ppos)
2602{ 2859{
2603 proc_doulongvec_minmax(table, write, file, buffer, length, ppos); 2860 proc_doulongvec_minmax(table, write, buffer, length, ppos);
2604 2861
2605 if (write && *(unsigned long *)table->data) 2862 if (write && *(unsigned long *)table->data)
2606 scan_all_zones_unevictable_pages(); 2863 scan_all_zones_unevictable_pages();
@@ -2656,4 +2913,3 @@ void scan_unevictable_unregister_node(struct node *node)
2656 sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages); 2913 sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages);
2657} 2914}
2658 2915
2659#endif
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 9826766f1274..c81321f9feec 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -509,22 +509,11 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m,
509 continue; 509 continue;
510 510
511 page = pfn_to_page(pfn); 511 page = pfn_to_page(pfn);
512#ifdef CONFIG_ARCH_FLATMEM_HAS_HOLES 512
513 /* 513 /* Watch for unexpected holes punched in the memmap */
514 * Ordinarily, memory holes in flatmem still have a valid 514 if (!memmap_valid_within(pfn, page, zone))
515 * memmap for the PFN range. However, an architecture for
516 * embedded systems (e.g. ARM) can free up the memmap backing
517 * holes to save memory on the assumption the memmap is
518 * never used. The page_zone linkages are then broken even
519 * though pfn_valid() returns true. Skip the page if the
520 * linkages are broken. Even if this test passed, the impact
521 * is that the counters for the movable type are off but
522 * fragmentation monitoring is likely meaningless on small
523 * systems.
524 */
525 if (page_zone(page) != zone)
526 continue; 515 continue;
527#endif 516
528 mtype = get_pageblock_migratetype(page); 517 mtype = get_pageblock_migratetype(page);
529 518
530 if (mtype < MIGRATE_TYPES) 519 if (mtype < MIGRATE_TYPES)
@@ -640,10 +629,8 @@ static const char * const vmstat_text[] = {
640 "nr_active_anon", 629 "nr_active_anon",
641 "nr_inactive_file", 630 "nr_inactive_file",
642 "nr_active_file", 631 "nr_active_file",
643#ifdef CONFIG_UNEVICTABLE_LRU
644 "nr_unevictable", 632 "nr_unevictable",
645 "nr_mlock", 633 "nr_mlock",
646#endif
647 "nr_anon_pages", 634 "nr_anon_pages",
648 "nr_mapped", 635 "nr_mapped",
649 "nr_file_pages", 636 "nr_file_pages",
@@ -652,11 +639,14 @@ static const char * const vmstat_text[] = {
652 "nr_slab_reclaimable", 639 "nr_slab_reclaimable",
653 "nr_slab_unreclaimable", 640 "nr_slab_unreclaimable",
654 "nr_page_table_pages", 641 "nr_page_table_pages",
642 "nr_kernel_stack",
655 "nr_unstable", 643 "nr_unstable",
656 "nr_bounce", 644 "nr_bounce",
657 "nr_vmscan_write", 645 "nr_vmscan_write",
658 "nr_writeback_temp", 646 "nr_writeback_temp",
659 647 "nr_isolated_anon",
648 "nr_isolated_file",
649 "nr_shmem",
660#ifdef CONFIG_NUMA 650#ifdef CONFIG_NUMA
661 "numa_hit", 651 "numa_hit",
662 "numa_miss", 652 "numa_miss",
@@ -686,6 +676,9 @@ static const char * const vmstat_text[] = {
686 TEXTS_FOR_ZONES("pgscan_kswapd") 676 TEXTS_FOR_ZONES("pgscan_kswapd")
687 TEXTS_FOR_ZONES("pgscan_direct") 677 TEXTS_FOR_ZONES("pgscan_direct")
688 678
679#ifdef CONFIG_NUMA
680 "zone_reclaim_failed",
681#endif
689 "pginodesteal", 682 "pginodesteal",
690 "slabs_scanned", 683 "slabs_scanned",
691 "kswapd_steal", 684 "kswapd_steal",
@@ -698,7 +691,6 @@ static const char * const vmstat_text[] = {
698 "htlb_buddy_alloc_success", 691 "htlb_buddy_alloc_success",
699 "htlb_buddy_alloc_fail", 692 "htlb_buddy_alloc_fail",
700#endif 693#endif
701#ifdef CONFIG_UNEVICTABLE_LRU
702 "unevictable_pgs_culled", 694 "unevictable_pgs_culled",
703 "unevictable_pgs_scanned", 695 "unevictable_pgs_scanned",
704 "unevictable_pgs_rescued", 696 "unevictable_pgs_rescued",
@@ -708,7 +700,6 @@ static const char * const vmstat_text[] = {
708 "unevictable_pgs_stranded", 700 "unevictable_pgs_stranded",
709 "unevictable_pgs_mlockfreed", 701 "unevictable_pgs_mlockfreed",
710#endif 702#endif
711#endif
712}; 703};
713 704
714static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, 705static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
@@ -721,18 +712,14 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
721 "\n min %lu" 712 "\n min %lu"
722 "\n low %lu" 713 "\n low %lu"
723 "\n high %lu" 714 "\n high %lu"
724 "\n scanned %lu (aa: %lu ia: %lu af: %lu if: %lu)" 715 "\n scanned %lu"
725 "\n spanned %lu" 716 "\n spanned %lu"
726 "\n present %lu", 717 "\n present %lu",
727 zone_page_state(zone, NR_FREE_PAGES), 718 zone_page_state(zone, NR_FREE_PAGES),
728 zone->pages_min, 719 min_wmark_pages(zone),
729 zone->pages_low, 720 low_wmark_pages(zone),
730 zone->pages_high, 721 high_wmark_pages(zone),
731 zone->pages_scanned, 722 zone->pages_scanned,
732 zone->lru[LRU_ACTIVE_ANON].nr_scan,
733 zone->lru[LRU_INACTIVE_ANON].nr_scan,
734 zone->lru[LRU_ACTIVE_FILE].nr_scan,
735 zone->lru[LRU_INACTIVE_FILE].nr_scan,
736 zone->spanned_pages, 723 zone->spanned_pages,
737 zone->present_pages); 724 zone->present_pages);
738 725
@@ -891,7 +878,7 @@ static void vmstat_update(struct work_struct *w)
891{ 878{
892 refresh_cpu_vm_stats(smp_processor_id()); 879 refresh_cpu_vm_stats(smp_processor_id());
893 schedule_delayed_work(&__get_cpu_var(vmstat_work), 880 schedule_delayed_work(&__get_cpu_var(vmstat_work),
894 sysctl_stat_interval); 881 round_jiffies_relative(sysctl_stat_interval));
895} 882}
896 883
897static void __cpuinit start_cpu_timer(int cpu) 884static void __cpuinit start_cpu_timer(int cpu)
@@ -899,7 +886,8 @@ static void __cpuinit start_cpu_timer(int cpu)
899 struct delayed_work *vmstat_work = &per_cpu(vmstat_work, cpu); 886 struct delayed_work *vmstat_work = &per_cpu(vmstat_work, cpu);
900 887
901 INIT_DELAYED_WORK_DEFERRABLE(vmstat_work, vmstat_update); 888 INIT_DELAYED_WORK_DEFERRABLE(vmstat_work, vmstat_update);
902 schedule_delayed_work_on(cpu, vmstat_work, HZ + cpu); 889 schedule_delayed_work_on(cpu, vmstat_work,
890 __round_jiffies_relative(HZ, cpu));
903} 891}
904 892
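
The switch to round_jiffies_relative()/__round_jiffies_relative() above aligns the periodic vmstat timers to whole-second boundaries, with a per-CPU skew, so wakeups tend to batch together instead of firing at scattered times. The sketch below is not part of the diff and is not the kernel's exact algorithm; it only illustrates the rounding-plus-skew idea with an invented tick rate and skew factor.

/*
 * Illustrative sketch only: round a relative delay to a whole-second
 * boundary, then offset each CPU slightly so their timers do not all
 * expire at the same instant.  TICKS_PER_SEC and the skew are made up.
 */
#include <stdio.h>

#define TICKS_PER_SEC 100               /* stand-in for HZ */

static unsigned long round_delay(unsigned long delay, int cpu)
{
        unsigned long rem = delay % TICKS_PER_SEC;

        /* round to the nearest whole-second boundary ... */
        if (rem < TICKS_PER_SEC / 4)
                delay -= rem;
        else
                delay += TICKS_PER_SEC - rem;

        /* ... then skew each CPU a little */
        return delay + cpu * 3;
}

int main(void)
{
        for (int cpu = 0; cpu < 4; cpu++)
                printf("cpu %d: delay %lu -> %lu ticks\n",
                       cpu, 97UL + cpu, round_delay(97UL + cpu, cpu));
        return 0;
}
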
905/* 893/*