author     Tejun Heo <tj@kernel.org>    2010-01-04 19:17:33 -0500
committer  Tejun Heo <tj@kernel.org>    2010-01-04 19:17:33 -0500
commit     32032df6c2f6c9c6b2ada2ce42322231824f70c2 (patch)
tree       b1ce838a37044bb38dfc128e2116ca35630e629a /mm
parent     22b737f4c75197372d64afc6ed1bccd58c00e549 (diff)
parent     c5974b835a909ff15c3b7e6cf6789b5eb919f419 (diff)
Merge branch 'master' into percpu
Conflicts:
	arch/powerpc/platforms/pseries/hvCall.S
	include/linux/percpu.h
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            35
-rw-r--r--  mm/Makefile            1
-rw-r--r--  mm/backing-dev.c      27
-rw-r--r--  mm/bootmem.c          32
-rw-r--r--  mm/filemap.c          66
-rw-r--r--  mm/highmem.c          17
-rw-r--r--  mm/hugetlb.c         551
-rw-r--r--  mm/hwpoison-inject.c 113
-rw-r--r--  mm/internal.h         35
-rw-r--r--  mm/kmemleak.c        197
-rw-r--r--  mm/ksm.c             962
-rw-r--r--  mm/madvise.c          21
-rw-r--r--  mm/memcontrol.c      448
-rw-r--r--  mm/memory-failure.c  598
-rw-r--r--  mm/memory.c           49
-rw-r--r--  mm/memory_hotplug.c   40
-rw-r--r--  mm/mempolicy.c        82
-rw-r--r--  mm/migrate.c         135
-rw-r--r--  mm/mincore.c          37
-rw-r--r--  mm/mlock.c            45
-rw-r--r--  mm/mmap.c            136
-rw-r--r--  mm/mremap.c          241
-rw-r--r--  mm/nommu.c            39
-rw-r--r--  mm/oom_kill.c        103
-rw-r--r--  mm/page-writeback.c   15
-rw-r--r--  mm/page_alloc.c      126
-rw-r--r--  mm/page_io.c          17
-rw-r--r--  mm/pagewalk.c         32
-rw-r--r--  mm/percpu.c          162
-rw-r--r--  mm/readahead.c        12
-rw-r--r--  mm/rmap.c            354
-rw-r--r--  mm/shmem.c            84
-rw-r--r--  mm/shmem_acl.c       171
-rw-r--r--  mm/slab.c            142
-rw-r--r--  mm/slub.c             24
-rw-r--r--  mm/swapfile.c        862
-rw-r--r--  mm/truncate.c          8
-rw-r--r--  mm/vmalloc.c          55
-rw-r--r--  mm/vmscan.c          335
-rw-r--r--  mm/vmstat.c            3
40 files changed, 4220 insertions, 2192 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index edd300aca173..17b8947aa7da 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -67,7 +67,7 @@ config DISCONTIGMEM | |||
67 | 67 | ||
68 | config SPARSEMEM | 68 | config SPARSEMEM |
69 | def_bool y | 69 | def_bool y |
70 | depends on SPARSEMEM_MANUAL | 70 | depends on (!SELECT_MEMORY_MODEL && ARCH_SPARSEMEM_ENABLE) || SPARSEMEM_MANUAL |
71 | 71 | ||
72 | config FLATMEM | 72 | config FLATMEM |
73 | def_bool y | 73 | def_bool y |
@@ -128,11 +128,8 @@ config SPARSEMEM_VMEMMAP | |||
128 | config MEMORY_HOTPLUG | 128 | config MEMORY_HOTPLUG |
129 | bool "Allow for memory hot-add" | 129 | bool "Allow for memory hot-add" |
130 | depends on SPARSEMEM || X86_64_ACPI_NUMA | 130 | depends on SPARSEMEM || X86_64_ACPI_NUMA |
131 | depends on HOTPLUG && !(HIBERNATION && !S390) && ARCH_ENABLE_MEMORY_HOTPLUG | 131 | depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG |
132 | depends on (IA64 || X86 || PPC64 || SUPERH || S390) | 132 | depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390) |
133 | |||
134 | comment "Memory hotplug is currently incompatible with Software Suspend" | ||
135 | depends on SPARSEMEM && HOTPLUG && HIBERNATION && !S390 | ||
136 | 133 | ||
137 | config MEMORY_HOTPLUG_SPARSE | 134 | config MEMORY_HOTPLUG_SPARSE |
138 | def_bool y | 135 | def_bool y |
@@ -161,11 +158,13 @@ config PAGEFLAGS_EXTENDED | |||
161 | # Default to 4 for wider testing, though 8 might be more appropriate. | 158 | # Default to 4 for wider testing, though 8 might be more appropriate. |
162 | # ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock. | 159 | # ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock. |
163 | # PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes. | 160 | # PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes. |
161 | # DEBUG_SPINLOCK and DEBUG_LOCK_ALLOC spinlock_t also enlarge struct page. | ||
164 | # | 162 | # |
165 | config SPLIT_PTLOCK_CPUS | 163 | config SPLIT_PTLOCK_CPUS |
166 | int | 164 | int |
167 | default "4096" if ARM && !CPU_CACHE_VIPT | 165 | default "999999" if ARM && !CPU_CACHE_VIPT |
168 | default "4096" if PARISC && !PA20 | 166 | default "999999" if PARISC && !PA20 |
167 | default "999999" if DEBUG_SPINLOCK || DEBUG_LOCK_ALLOC | ||
169 | default "4" | 168 | default "4" |
170 | 169 | ||
171 | # | 170 | # |
@@ -203,14 +202,6 @@ config VIRT_TO_BUS | |||
203 | def_bool y | 202 | def_bool y |
204 | depends on !ARCH_NO_VIRT_TO_BUS | 203 | depends on !ARCH_NO_VIRT_TO_BUS |
205 | 204 | ||
206 | config HAVE_MLOCK | ||
207 | bool | ||
208 | default y if MMU=y | ||
209 | |||
210 | config HAVE_MLOCKED_PAGE_BIT | ||
211 | bool | ||
212 | default y if HAVE_MLOCK=y | ||
213 | |||
214 | config MMU_NOTIFIER | 205 | config MMU_NOTIFIER |
215 | bool | 206 | bool |
216 | 207 | ||
@@ -221,13 +212,16 @@ config KSM | |||
221 | Enable Kernel Samepage Merging: KSM periodically scans those areas | 212 | Enable Kernel Samepage Merging: KSM periodically scans those areas |
222 | of an application's address space that an app has advised may be | 213 | of an application's address space that an app has advised may be |
223 | mergeable. When it finds pages of identical content, it replaces | 214 | mergeable. When it finds pages of identical content, it replaces |
224 | the many instances by a single resident page with that content, so | 215 | the many instances by a single page with that content, so |
225 | saving memory until one or another app needs to modify the content. | 216 | saving memory until one or another app needs to modify the content. |
226 | Recommended for use with KVM, or with other duplicative applications. | 217 | Recommended for use with KVM, or with other duplicative applications. |
227 | See Documentation/vm/ksm.txt for more information. | 218 | See Documentation/vm/ksm.txt for more information: KSM is inactive |
219 | until a program has madvised that an area is MADV_MERGEABLE, and | ||
220 | root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set). | ||
228 | 221 | ||
229 | config DEFAULT_MMAP_MIN_ADDR | 222 | config DEFAULT_MMAP_MIN_ADDR |
230 | int "Low address space to protect from user allocation" | 223 | int "Low address space to protect from user allocation" |
224 | depends on MMU | ||
231 | default 4096 | 225 | default 4096 |
232 | help | 226 | help |
233 | This is the portion of low virtual memory which should be protected | 227 | This is the portion of low virtual memory which should be protected |
@@ -258,8 +252,9 @@ config MEMORY_FAILURE | |||
258 | special hardware support and typically ECC memory. | 252 | special hardware support and typically ECC memory. |
259 | 253 | ||
260 | config HWPOISON_INJECT | 254 | config HWPOISON_INJECT |
261 | tristate "Poison pages injector" | 255 | tristate "HWPoison pages injector" |
262 | depends on MEMORY_FAILURE && DEBUG_KERNEL | 256 | depends on MEMORY_FAILURE && DEBUG_KERNEL && PROC_FS |
257 | select PROC_PAGE_MONITOR | ||
263 | 258 | ||
264 | config NOMMU_INITIAL_TRIM_EXCESS | 259 | config NOMMU_INITIAL_TRIM_EXCESS |
265 | int "Turn on mmap() excess space trimming before booting" | 260 | int "Turn on mmap() excess space trimming before booting" |
diff --git a/mm/Makefile b/mm/Makefile
index 82131d0f8d85..7a68d2ab5560 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -22,7 +22,6 @@ obj-$(CONFIG_HUGETLBFS) += hugetlb.o | |||
22 | obj-$(CONFIG_NUMA) += mempolicy.o | 22 | obj-$(CONFIG_NUMA) += mempolicy.o |
23 | obj-$(CONFIG_SPARSEMEM) += sparse.o | 23 | obj-$(CONFIG_SPARSEMEM) += sparse.o |
24 | obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o | 24 | obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o |
25 | obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o | ||
26 | obj-$(CONFIG_SLOB) += slob.o | 25 | obj-$(CONFIG_SLOB) += slob.o |
27 | obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o | 26 | obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o |
28 | obj-$(CONFIG_KSM) += ksm.o | 27 | obj-$(CONFIG_KSM) += ksm.o |
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 3d3accb1f800..0e8ca0347707 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -92,7 +92,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) | |||
92 | "BdiDirtyThresh: %8lu kB\n" | 92 | "BdiDirtyThresh: %8lu kB\n" |
93 | "DirtyThresh: %8lu kB\n" | 93 | "DirtyThresh: %8lu kB\n" |
94 | "BackgroundThresh: %8lu kB\n" | 94 | "BackgroundThresh: %8lu kB\n" |
95 | "WriteBack threads:%8lu\n" | 95 | "WritebackThreads: %8lu\n" |
96 | "b_dirty: %8lu\n" | 96 | "b_dirty: %8lu\n" |
97 | "b_io: %8lu\n" | 97 | "b_io: %8lu\n" |
98 | "b_more_io: %8lu\n" | 98 | "b_more_io: %8lu\n" |
@@ -604,15 +604,36 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi) | |||
604 | 604 | ||
605 | /* | 605 | /* |
606 | * Finally, kill the kernel threads. We don't need to be RCU | 606 | * Finally, kill the kernel threads. We don't need to be RCU |
607 | * safe anymore, since the bdi is gone from visibility. | 607 | * safe anymore, since the bdi is gone from visibility. Force |
608 | * unfreeze of the thread before calling kthread_stop(), otherwise | ||
609 | * it would never exet if it is currently stuck in the refrigerator. | ||
608 | */ | 610 | */ |
609 | list_for_each_entry(wb, &bdi->wb_list, list) | 611 | list_for_each_entry(wb, &bdi->wb_list, list) { |
612 | thaw_process(wb->task); | ||
610 | kthread_stop(wb->task); | 613 | kthread_stop(wb->task); |
614 | } | ||
615 | } | ||
616 | |||
617 | /* | ||
618 | * This bdi is going away now, make sure that no super_blocks point to it | ||
619 | */ | ||
620 | static void bdi_prune_sb(struct backing_dev_info *bdi) | ||
621 | { | ||
622 | struct super_block *sb; | ||
623 | |||
624 | spin_lock(&sb_lock); | ||
625 | list_for_each_entry(sb, &super_blocks, s_list) { | ||
626 | if (sb->s_bdi == bdi) | ||
627 | sb->s_bdi = NULL; | ||
628 | } | ||
629 | spin_unlock(&sb_lock); | ||
611 | } | 630 | } |
612 | 631 | ||
613 | void bdi_unregister(struct backing_dev_info *bdi) | 632 | void bdi_unregister(struct backing_dev_info *bdi) |
614 | { | 633 | { |
615 | if (bdi->dev) { | 634 | if (bdi->dev) { |
635 | bdi_prune_sb(bdi); | ||
636 | |||
616 | if (!bdi_cap_flush_forker(bdi)) | 637 | if (!bdi_cap_flush_forker(bdi)) |
617 | bdi_wb_shutdown(bdi); | 638 | bdi_wb_shutdown(bdi); |
618 | bdi_debug_unregister(bdi); | 639 | bdi_debug_unregister(bdi); |
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 555d5d2731c6..7d1486875e1c 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -143,6 +143,30 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages) | |||
143 | return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages); | 143 | return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages); |
144 | } | 144 | } |
145 | 145 | ||
146 | /* | ||
147 | * free_bootmem_late - free bootmem pages directly to page allocator | ||
148 | * @addr: starting address of the range | ||
149 | * @size: size of the range in bytes | ||
150 | * | ||
151 | * This is only useful when the bootmem allocator has already been torn | ||
152 | * down, but we are still initializing the system. Pages are given directly | ||
153 | * to the page allocator, no bootmem metadata is updated because it is gone. | ||
154 | */ | ||
155 | void __init free_bootmem_late(unsigned long addr, unsigned long size) | ||
156 | { | ||
157 | unsigned long cursor, end; | ||
158 | |||
159 | kmemleak_free_part(__va(addr), size); | ||
160 | |||
161 | cursor = PFN_UP(addr); | ||
162 | end = PFN_DOWN(addr + size); | ||
163 | |||
164 | for (; cursor < end; cursor++) { | ||
165 | __free_pages_bootmem(pfn_to_page(cursor), 0); | ||
166 | totalram_pages++; | ||
167 | } | ||
168 | } | ||
169 | |||
146 | static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) | 170 | static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) |
147 | { | 171 | { |
148 | int aligned; | 172 | int aligned; |
@@ -408,8 +432,8 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size, | |||
408 | return mark_bootmem(start, end, 1, flags); | 432 | return mark_bootmem(start, end, 1, flags); |
409 | } | 433 | } |
410 | 434 | ||
411 | static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx, | 435 | static unsigned long __init align_idx(struct bootmem_data *bdata, |
412 | unsigned long step) | 436 | unsigned long idx, unsigned long step) |
413 | { | 437 | { |
414 | unsigned long base = bdata->node_min_pfn; | 438 | unsigned long base = bdata->node_min_pfn; |
415 | 439 | ||
@@ -421,8 +445,8 @@ static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx, | |||
421 | return ALIGN(base + idx, step) - base; | 445 | return ALIGN(base + idx, step) - base; |
422 | } | 446 | } |
423 | 447 | ||
424 | static unsigned long align_off(struct bootmem_data *bdata, unsigned long off, | 448 | static unsigned long __init align_off(struct bootmem_data *bdata, |
425 | unsigned long align) | 449 | unsigned long off, unsigned long align) |
426 | { | 450 | { |
427 | unsigned long base = PFN_PHYS(bdata->node_min_pfn); | 451 | unsigned long base = PFN_PHYS(bdata->node_min_pfn); |
428 | 452 | ||
diff --git a/mm/filemap.c b/mm/filemap.c
index ef169f37156d..96ac6b0eb6cb 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -260,27 +260,27 @@ int filemap_flush(struct address_space *mapping) | |||
260 | EXPORT_SYMBOL(filemap_flush); | 260 | EXPORT_SYMBOL(filemap_flush); |
261 | 261 | ||
262 | /** | 262 | /** |
263 | * wait_on_page_writeback_range - wait for writeback to complete | 263 | * filemap_fdatawait_range - wait for writeback to complete |
264 | * @mapping: target address_space | 264 | * @mapping: address space structure to wait for |
265 | * @start: beginning page index | 265 | * @start_byte: offset in bytes where the range starts |
266 | * @end: ending page index | 266 | * @end_byte: offset in bytes where the range ends (inclusive) |
267 | * | 267 | * |
268 | * Wait for writeback to complete against pages indexed by start->end | 268 | * Walk the list of under-writeback pages of the given address space |
269 | * inclusive | 269 | * in the given range and wait for all of them. |
270 | */ | 270 | */ |
271 | int wait_on_page_writeback_range(struct address_space *mapping, | 271 | int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, |
272 | pgoff_t start, pgoff_t end) | 272 | loff_t end_byte) |
273 | { | 273 | { |
274 | pgoff_t index = start_byte >> PAGE_CACHE_SHIFT; | ||
275 | pgoff_t end = end_byte >> PAGE_CACHE_SHIFT; | ||
274 | struct pagevec pvec; | 276 | struct pagevec pvec; |
275 | int nr_pages; | 277 | int nr_pages; |
276 | int ret = 0; | 278 | int ret = 0; |
277 | pgoff_t index; | ||
278 | 279 | ||
279 | if (end < start) | 280 | if (end_byte < start_byte) |
280 | return 0; | 281 | return 0; |
281 | 282 | ||
282 | pagevec_init(&pvec, 0); | 283 | pagevec_init(&pvec, 0); |
283 | index = start; | ||
284 | while ((index <= end) && | 284 | while ((index <= end) && |
285 | (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, | 285 | (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, |
286 | PAGECACHE_TAG_WRITEBACK, | 286 | PAGECACHE_TAG_WRITEBACK, |
@@ -310,25 +310,6 @@ int wait_on_page_writeback_range(struct address_space *mapping, | |||
310 | 310 | ||
311 | return ret; | 311 | return ret; |
312 | } | 312 | } |
313 | |||
314 | /** | ||
315 | * filemap_fdatawait_range - wait for all under-writeback pages to complete in a given range | ||
316 | * @mapping: address space structure to wait for | ||
317 | * @start: offset in bytes where the range starts | ||
318 | * @end: offset in bytes where the range ends (inclusive) | ||
319 | * | ||
320 | * Walk the list of under-writeback pages of the given address space | ||
321 | * in the given range and wait for all of them. | ||
322 | * | ||
323 | * This is just a simple wrapper so that callers don't have to convert offsets | ||
324 | * to page indexes themselves | ||
325 | */ | ||
326 | int filemap_fdatawait_range(struct address_space *mapping, loff_t start, | ||
327 | loff_t end) | ||
328 | { | ||
329 | return wait_on_page_writeback_range(mapping, start >> PAGE_CACHE_SHIFT, | ||
330 | end >> PAGE_CACHE_SHIFT); | ||
331 | } | ||
332 | EXPORT_SYMBOL(filemap_fdatawait_range); | 313 | EXPORT_SYMBOL(filemap_fdatawait_range); |
333 | 314 | ||
334 | /** | 315 | /** |
@@ -345,8 +326,7 @@ int filemap_fdatawait(struct address_space *mapping) | |||
345 | if (i_size == 0) | 326 | if (i_size == 0) |
346 | return 0; | 327 | return 0; |
347 | 328 | ||
348 | return wait_on_page_writeback_range(mapping, 0, | 329 | return filemap_fdatawait_range(mapping, 0, i_size - 1); |
349 | (i_size - 1) >> PAGE_CACHE_SHIFT); | ||
350 | } | 330 | } |
351 | EXPORT_SYMBOL(filemap_fdatawait); | 331 | EXPORT_SYMBOL(filemap_fdatawait); |
352 | 332 | ||
@@ -393,9 +373,8 @@ int filemap_write_and_wait_range(struct address_space *mapping, | |||
393 | WB_SYNC_ALL); | 373 | WB_SYNC_ALL); |
394 | /* See comment of filemap_write_and_wait() */ | 374 | /* See comment of filemap_write_and_wait() */ |
395 | if (err != -EIO) { | 375 | if (err != -EIO) { |
396 | int err2 = wait_on_page_writeback_range(mapping, | 376 | int err2 = filemap_fdatawait_range(mapping, |
397 | lstart >> PAGE_CACHE_SHIFT, | 377 | lstart, lend); |
398 | lend >> PAGE_CACHE_SHIFT); | ||
399 | if (!err) | 378 | if (!err) |
400 | err = err2; | 379 | err = err2; |
401 | } | 380 | } |
@@ -1844,7 +1823,7 @@ static size_t __iovec_copy_from_user_inatomic(char *vaddr, | |||
1844 | 1823 | ||
1845 | /* | 1824 | /* |
1846 | * Copy as much as we can into the page and return the number of bytes which | 1825 | * Copy as much as we can into the page and return the number of bytes which |
1847 | * were sucessfully copied. If a fault is encountered then return the number of | 1826 | * were successfully copied. If a fault is encountered then return the number of |
1848 | * bytes which were copied. | 1827 | * bytes which were copied. |
1849 | */ | 1828 | */ |
1850 | size_t iov_iter_copy_from_user_atomic(struct page *page, | 1829 | size_t iov_iter_copy_from_user_atomic(struct page *page, |
@@ -2261,7 +2240,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | |||
2261 | size_t count, ssize_t written) | 2240 | size_t count, ssize_t written) |
2262 | { | 2241 | { |
2263 | struct file *file = iocb->ki_filp; | 2242 | struct file *file = iocb->ki_filp; |
2264 | struct address_space *mapping = file->f_mapping; | ||
2265 | ssize_t status; | 2243 | ssize_t status; |
2266 | struct iov_iter i; | 2244 | struct iov_iter i; |
2267 | 2245 | ||
@@ -2273,15 +2251,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | |||
2273 | *ppos = pos + status; | 2251 | *ppos = pos + status; |
2274 | } | 2252 | } |
2275 | 2253 | ||
2276 | /* | ||
2277 | * If we get here for O_DIRECT writes then we must have fallen through | ||
2278 | * to buffered writes (block instantiation inside i_size). So we sync | ||
2279 | * the file data here, to try to honour O_DIRECT expectations. | ||
2280 | */ | ||
2281 | if (unlikely(file->f_flags & O_DIRECT) && written) | ||
2282 | status = filemap_write_and_wait_range(mapping, | ||
2283 | pos, pos + written - 1); | ||
2284 | |||
2285 | return written ? written : status; | 2254 | return written ? written : status; |
2286 | } | 2255 | } |
2287 | EXPORT_SYMBOL(generic_file_buffered_write); | 2256 | EXPORT_SYMBOL(generic_file_buffered_write); |
@@ -2380,10 +2349,7 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, | |||
2380 | * semantics. | 2349 | * semantics. |
2381 | */ | 2350 | */ |
2382 | endbyte = pos + written_buffered - written - 1; | 2351 | endbyte = pos + written_buffered - written - 1; |
2383 | err = do_sync_mapping_range(file->f_mapping, pos, endbyte, | 2352 | err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte); |
2384 | SYNC_FILE_RANGE_WAIT_BEFORE| | ||
2385 | SYNC_FILE_RANGE_WRITE| | ||
2386 | SYNC_FILE_RANGE_WAIT_AFTER); | ||
2387 | if (err == 0) { | 2353 | if (err == 0) { |
2388 | written = written_buffered; | 2354 | written = written_buffered; |
2389 | invalidate_mapping_pages(mapping, | 2355 | invalidate_mapping_pages(mapping, |
diff --git a/mm/highmem.c b/mm/highmem.c
index 25878cc49daa..9c1e627f282e 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -426,16 +426,21 @@ void __init page_address_init(void) | |||
426 | 426 | ||
427 | void debug_kmap_atomic(enum km_type type) | 427 | void debug_kmap_atomic(enum km_type type) |
428 | { | 428 | { |
429 | static unsigned warn_count = 10; | 429 | static int warn_count = 10; |
430 | 430 | ||
431 | if (unlikely(warn_count == 0)) | 431 | if (unlikely(warn_count < 0)) |
432 | return; | 432 | return; |
433 | 433 | ||
434 | if (unlikely(in_interrupt())) { | 434 | if (unlikely(in_interrupt())) { |
435 | if (in_irq()) { | 435 | if (in_nmi()) { |
436 | if (type != KM_NMI && type != KM_NMI_PTE) { | ||
437 | WARN_ON(1); | ||
438 | warn_count--; | ||
439 | } | ||
440 | } else if (in_irq()) { | ||
436 | if (type != KM_IRQ0 && type != KM_IRQ1 && | 441 | if (type != KM_IRQ0 && type != KM_IRQ1 && |
437 | type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ && | 442 | type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ && |
438 | type != KM_BOUNCE_READ) { | 443 | type != KM_BOUNCE_READ && type != KM_IRQ_PTE) { |
439 | WARN_ON(1); | 444 | WARN_ON(1); |
440 | warn_count--; | 445 | warn_count--; |
441 | } | 446 | } |
@@ -452,7 +457,9 @@ void debug_kmap_atomic(enum km_type type) | |||
452 | } | 457 | } |
453 | 458 | ||
454 | if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ || | 459 | if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ || |
455 | type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ) { | 460 | type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ || |
461 | type == KM_IRQ_PTE || type == KM_NMI || | ||
462 | type == KM_NMI_PTE ) { | ||
456 | if (!irqs_disabled()) { | 463 | if (!irqs_disabled()) { |
457 | WARN_ON(1); | 464 | WARN_ON(1); |
458 | warn_count--; | 465 | warn_count--; |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 5d7601b02874..65f38c218207 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -24,6 +24,7 @@ | |||
24 | #include <asm/io.h> | 24 | #include <asm/io.h> |
25 | 25 | ||
26 | #include <linux/hugetlb.h> | 26 | #include <linux/hugetlb.h> |
27 | #include <linux/node.h> | ||
27 | #include "internal.h" | 28 | #include "internal.h" |
28 | 29 | ||
29 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; | 30 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; |
@@ -622,42 +623,66 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) | |||
622 | } | 623 | } |
623 | 624 | ||
624 | /* | 625 | /* |
625 | * Use a helper variable to find the next node and then | 626 | * common helper functions for hstate_next_node_to_{alloc|free}. |
626 | * copy it back to next_nid_to_alloc afterwards: | 627 | * We may have allocated or freed a huge page based on a different |
627 | * otherwise there's a window in which a racer might | 628 | * nodes_allowed previously, so h->next_node_to_{alloc|free} might |
628 | * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node. | 629 | * be outside of *nodes_allowed. Ensure that we use an allowed |
629 | * But we don't need to use a spin_lock here: it really | 630 | * node for alloc or free. |
630 | * doesn't matter if occasionally a racer chooses the | ||
631 | * same nid as we do. Move nid forward in the mask even | ||
632 | * if we just successfully allocated a hugepage so that | ||
633 | * the next caller gets hugepages on the next node. | ||
634 | */ | 631 | */ |
635 | static int hstate_next_node_to_alloc(struct hstate *h) | 632 | static int next_node_allowed(int nid, nodemask_t *nodes_allowed) |
636 | { | 633 | { |
637 | int next_nid; | 634 | nid = next_node(nid, *nodes_allowed); |
638 | next_nid = next_node(h->next_nid_to_alloc, node_online_map); | 635 | if (nid == MAX_NUMNODES) |
639 | if (next_nid == MAX_NUMNODES) | 636 | nid = first_node(*nodes_allowed); |
640 | next_nid = first_node(node_online_map); | 637 | VM_BUG_ON(nid >= MAX_NUMNODES); |
641 | h->next_nid_to_alloc = next_nid; | 638 | |
642 | return next_nid; | 639 | return nid; |
640 | } | ||
641 | |||
642 | static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed) | ||
643 | { | ||
644 | if (!node_isset(nid, *nodes_allowed)) | ||
645 | nid = next_node_allowed(nid, nodes_allowed); | ||
646 | return nid; | ||
647 | } | ||
648 | |||
649 | /* | ||
650 | * returns the previously saved node ["this node"] from which to | ||
651 | * allocate a persistent huge page for the pool and advance the | ||
652 | * next node from which to allocate, handling wrap at end of node | ||
653 | * mask. | ||
654 | */ | ||
655 | static int hstate_next_node_to_alloc(struct hstate *h, | ||
656 | nodemask_t *nodes_allowed) | ||
657 | { | ||
658 | int nid; | ||
659 | |||
660 | VM_BUG_ON(!nodes_allowed); | ||
661 | |||
662 | nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed); | ||
663 | h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed); | ||
664 | |||
665 | return nid; | ||
643 | } | 666 | } |
644 | 667 | ||
645 | static int alloc_fresh_huge_page(struct hstate *h) | 668 | static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed) |
646 | { | 669 | { |
647 | struct page *page; | 670 | struct page *page; |
648 | int start_nid; | 671 | int start_nid; |
649 | int next_nid; | 672 | int next_nid; |
650 | int ret = 0; | 673 | int ret = 0; |
651 | 674 | ||
652 | start_nid = h->next_nid_to_alloc; | 675 | start_nid = hstate_next_node_to_alloc(h, nodes_allowed); |
653 | next_nid = start_nid; | 676 | next_nid = start_nid; |
654 | 677 | ||
655 | do { | 678 | do { |
656 | page = alloc_fresh_huge_page_node(h, next_nid); | 679 | page = alloc_fresh_huge_page_node(h, next_nid); |
657 | if (page) | 680 | if (page) { |
658 | ret = 1; | 681 | ret = 1; |
659 | next_nid = hstate_next_node_to_alloc(h); | 682 | break; |
660 | } while (!page && next_nid != start_nid); | 683 | } |
684 | next_nid = hstate_next_node_to_alloc(h, nodes_allowed); | ||
685 | } while (next_nid != start_nid); | ||
661 | 686 | ||
662 | if (ret) | 687 | if (ret) |
663 | count_vm_event(HTLB_BUDDY_PGALLOC); | 688 | count_vm_event(HTLB_BUDDY_PGALLOC); |
@@ -668,17 +693,21 @@ static int alloc_fresh_huge_page(struct hstate *h) | |||
668 | } | 693 | } |
669 | 694 | ||
670 | /* | 695 | /* |
671 | * helper for free_pool_huge_page() - find next node | 696 | * helper for free_pool_huge_page() - return the previously saved |
672 | * from which to free a huge page | 697 | * node ["this node"] from which to free a huge page. Advance the |
698 | * next node id whether or not we find a free huge page to free so | ||
699 | * that the next attempt to free addresses the next node. | ||
673 | */ | 700 | */ |
674 | static int hstate_next_node_to_free(struct hstate *h) | 701 | static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) |
675 | { | 702 | { |
676 | int next_nid; | 703 | int nid; |
677 | next_nid = next_node(h->next_nid_to_free, node_online_map); | 704 | |
678 | if (next_nid == MAX_NUMNODES) | 705 | VM_BUG_ON(!nodes_allowed); |
679 | next_nid = first_node(node_online_map); | 706 | |
680 | h->next_nid_to_free = next_nid; | 707 | nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed); |
681 | return next_nid; | 708 | h->next_nid_to_free = next_node_allowed(nid, nodes_allowed); |
709 | |||
710 | return nid; | ||
682 | } | 711 | } |
683 | 712 | ||
684 | /* | 713 | /* |
@@ -687,13 +716,14 @@ static int hstate_next_node_to_free(struct hstate *h) | |||
687 | * balanced over allowed nodes. | 716 | * balanced over allowed nodes. |
688 | * Called with hugetlb_lock locked. | 717 | * Called with hugetlb_lock locked. |
689 | */ | 718 | */ |
690 | static int free_pool_huge_page(struct hstate *h, bool acct_surplus) | 719 | static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, |
720 | bool acct_surplus) | ||
691 | { | 721 | { |
692 | int start_nid; | 722 | int start_nid; |
693 | int next_nid; | 723 | int next_nid; |
694 | int ret = 0; | 724 | int ret = 0; |
695 | 725 | ||
696 | start_nid = h->next_nid_to_free; | 726 | start_nid = hstate_next_node_to_free(h, nodes_allowed); |
697 | next_nid = start_nid; | 727 | next_nid = start_nid; |
698 | 728 | ||
699 | do { | 729 | do { |
@@ -715,9 +745,10 @@ static int free_pool_huge_page(struct hstate *h, bool acct_surplus) | |||
715 | } | 745 | } |
716 | update_and_free_page(h, page); | 746 | update_and_free_page(h, page); |
717 | ret = 1; | 747 | ret = 1; |
748 | break; | ||
718 | } | 749 | } |
719 | next_nid = hstate_next_node_to_free(h); | 750 | next_nid = hstate_next_node_to_free(h, nodes_allowed); |
720 | } while (!ret && next_nid != start_nid); | 751 | } while (next_nid != start_nid); |
721 | 752 | ||
722 | return ret; | 753 | return ret; |
723 | } | 754 | } |
@@ -911,14 +942,14 @@ static void return_unused_surplus_pages(struct hstate *h, | |||
911 | 942 | ||
912 | /* | 943 | /* |
913 | * We want to release as many surplus pages as possible, spread | 944 | * We want to release as many surplus pages as possible, spread |
914 | * evenly across all nodes. Iterate across all nodes until we | 945 | * evenly across all nodes with memory. Iterate across these nodes |
915 | * can no longer free unreserved surplus pages. This occurs when | 946 | * until we can no longer free unreserved surplus pages. This occurs |
916 | * the nodes with surplus pages have no free pages. | 947 | * when the nodes with surplus pages have no free pages. |
917 | * free_pool_huge_page() will balance the the frees across the | 948 | * free_pool_huge_page() will balance the the freed pages across the |
918 | * on-line nodes for us and will handle the hstate accounting. | 949 | * on-line nodes with memory and will handle the hstate accounting. |
919 | */ | 950 | */ |
920 | while (nr_pages--) { | 951 | while (nr_pages--) { |
921 | if (!free_pool_huge_page(h, 1)) | 952 | if (!free_pool_huge_page(h, &node_states[N_HIGH_MEMORY], 1)) |
922 | break; | 953 | break; |
923 | } | 954 | } |
924 | } | 955 | } |
@@ -1022,16 +1053,16 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
1022 | int __weak alloc_bootmem_huge_page(struct hstate *h) | 1053 | int __weak alloc_bootmem_huge_page(struct hstate *h) |
1023 | { | 1054 | { |
1024 | struct huge_bootmem_page *m; | 1055 | struct huge_bootmem_page *m; |
1025 | int nr_nodes = nodes_weight(node_online_map); | 1056 | int nr_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); |
1026 | 1057 | ||
1027 | while (nr_nodes) { | 1058 | while (nr_nodes) { |
1028 | void *addr; | 1059 | void *addr; |
1029 | 1060 | ||
1030 | addr = __alloc_bootmem_node_nopanic( | 1061 | addr = __alloc_bootmem_node_nopanic( |
1031 | NODE_DATA(h->next_nid_to_alloc), | 1062 | NODE_DATA(hstate_next_node_to_alloc(h, |
1063 | &node_states[N_HIGH_MEMORY])), | ||
1032 | huge_page_size(h), huge_page_size(h), 0); | 1064 | huge_page_size(h), huge_page_size(h), 0); |
1033 | 1065 | ||
1034 | hstate_next_node_to_alloc(h); | ||
1035 | if (addr) { | 1066 | if (addr) { |
1036 | /* | 1067 | /* |
1037 | * Use the beginning of the huge page to store the | 1068 | * Use the beginning of the huge page to store the |
@@ -1084,7 +1115,8 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) | |||
1084 | if (h->order >= MAX_ORDER) { | 1115 | if (h->order >= MAX_ORDER) { |
1085 | if (!alloc_bootmem_huge_page(h)) | 1116 | if (!alloc_bootmem_huge_page(h)) |
1086 | break; | 1117 | break; |
1087 | } else if (!alloc_fresh_huge_page(h)) | 1118 | } else if (!alloc_fresh_huge_page(h, |
1119 | &node_states[N_HIGH_MEMORY])) | ||
1088 | break; | 1120 | break; |
1089 | } | 1121 | } |
1090 | h->max_huge_pages = i; | 1122 | h->max_huge_pages = i; |
@@ -1126,14 +1158,15 @@ static void __init report_hugepages(void) | |||
1126 | } | 1158 | } |
1127 | 1159 | ||
1128 | #ifdef CONFIG_HIGHMEM | 1160 | #ifdef CONFIG_HIGHMEM |
1129 | static void try_to_free_low(struct hstate *h, unsigned long count) | 1161 | static void try_to_free_low(struct hstate *h, unsigned long count, |
1162 | nodemask_t *nodes_allowed) | ||
1130 | { | 1163 | { |
1131 | int i; | 1164 | int i; |
1132 | 1165 | ||
1133 | if (h->order >= MAX_ORDER) | 1166 | if (h->order >= MAX_ORDER) |
1134 | return; | 1167 | return; |
1135 | 1168 | ||
1136 | for (i = 0; i < MAX_NUMNODES; ++i) { | 1169 | for_each_node_mask(i, *nodes_allowed) { |
1137 | struct page *page, *next; | 1170 | struct page *page, *next; |
1138 | struct list_head *freel = &h->hugepage_freelists[i]; | 1171 | struct list_head *freel = &h->hugepage_freelists[i]; |
1139 | list_for_each_entry_safe(page, next, freel, lru) { | 1172 | list_for_each_entry_safe(page, next, freel, lru) { |
@@ -1149,7 +1182,8 @@ static void try_to_free_low(struct hstate *h, unsigned long count) | |||
1149 | } | 1182 | } |
1150 | } | 1183 | } |
1151 | #else | 1184 | #else |
1152 | static inline void try_to_free_low(struct hstate *h, unsigned long count) | 1185 | static inline void try_to_free_low(struct hstate *h, unsigned long count, |
1186 | nodemask_t *nodes_allowed) | ||
1153 | { | 1187 | { |
1154 | } | 1188 | } |
1155 | #endif | 1189 | #endif |
@@ -1159,7 +1193,8 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count) | |||
1159 | * balanced by operating on them in a round-robin fashion. | 1193 | * balanced by operating on them in a round-robin fashion. |
1160 | * Returns 1 if an adjustment was made. | 1194 | * Returns 1 if an adjustment was made. |
1161 | */ | 1195 | */ |
1162 | static int adjust_pool_surplus(struct hstate *h, int delta) | 1196 | static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, |
1197 | int delta) | ||
1163 | { | 1198 | { |
1164 | int start_nid, next_nid; | 1199 | int start_nid, next_nid; |
1165 | int ret = 0; | 1200 | int ret = 0; |
@@ -1167,29 +1202,33 @@ static int adjust_pool_surplus(struct hstate *h, int delta) | |||
1167 | VM_BUG_ON(delta != -1 && delta != 1); | 1202 | VM_BUG_ON(delta != -1 && delta != 1); |
1168 | 1203 | ||
1169 | if (delta < 0) | 1204 | if (delta < 0) |
1170 | start_nid = h->next_nid_to_alloc; | 1205 | start_nid = hstate_next_node_to_alloc(h, nodes_allowed); |
1171 | else | 1206 | else |
1172 | start_nid = h->next_nid_to_free; | 1207 | start_nid = hstate_next_node_to_free(h, nodes_allowed); |
1173 | next_nid = start_nid; | 1208 | next_nid = start_nid; |
1174 | 1209 | ||
1175 | do { | 1210 | do { |
1176 | int nid = next_nid; | 1211 | int nid = next_nid; |
1177 | if (delta < 0) { | 1212 | if (delta < 0) { |
1178 | next_nid = hstate_next_node_to_alloc(h); | ||
1179 | /* | 1213 | /* |
1180 | * To shrink on this node, there must be a surplus page | 1214 | * To shrink on this node, there must be a surplus page |
1181 | */ | 1215 | */ |
1182 | if (!h->surplus_huge_pages_node[nid]) | 1216 | if (!h->surplus_huge_pages_node[nid]) { |
1217 | next_nid = hstate_next_node_to_alloc(h, | ||
1218 | nodes_allowed); | ||
1183 | continue; | 1219 | continue; |
1220 | } | ||
1184 | } | 1221 | } |
1185 | if (delta > 0) { | 1222 | if (delta > 0) { |
1186 | next_nid = hstate_next_node_to_free(h); | ||
1187 | /* | 1223 | /* |
1188 | * Surplus cannot exceed the total number of pages | 1224 | * Surplus cannot exceed the total number of pages |
1189 | */ | 1225 | */ |
1190 | if (h->surplus_huge_pages_node[nid] >= | 1226 | if (h->surplus_huge_pages_node[nid] >= |
1191 | h->nr_huge_pages_node[nid]) | 1227 | h->nr_huge_pages_node[nid]) { |
1228 | next_nid = hstate_next_node_to_free(h, | ||
1229 | nodes_allowed); | ||
1192 | continue; | 1230 | continue; |
1231 | } | ||
1193 | } | 1232 | } |
1194 | 1233 | ||
1195 | h->surplus_huge_pages += delta; | 1234 | h->surplus_huge_pages += delta; |
@@ -1202,7 +1241,8 @@ static int adjust_pool_surplus(struct hstate *h, int delta) | |||
1202 | } | 1241 | } |
1203 | 1242 | ||
1204 | #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) | 1243 | #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) |
1205 | static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) | 1244 | static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, |
1245 | nodemask_t *nodes_allowed) | ||
1206 | { | 1246 | { |
1207 | unsigned long min_count, ret; | 1247 | unsigned long min_count, ret; |
1208 | 1248 | ||
@@ -1222,7 +1262,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) | |||
1222 | */ | 1262 | */ |
1223 | spin_lock(&hugetlb_lock); | 1263 | spin_lock(&hugetlb_lock); |
1224 | while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { | 1264 | while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { |
1225 | if (!adjust_pool_surplus(h, -1)) | 1265 | if (!adjust_pool_surplus(h, nodes_allowed, -1)) |
1226 | break; | 1266 | break; |
1227 | } | 1267 | } |
1228 | 1268 | ||
@@ -1233,11 +1273,14 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) | |||
1233 | * and reducing the surplus. | 1273 | * and reducing the surplus. |
1234 | */ | 1274 | */ |
1235 | spin_unlock(&hugetlb_lock); | 1275 | spin_unlock(&hugetlb_lock); |
1236 | ret = alloc_fresh_huge_page(h); | 1276 | ret = alloc_fresh_huge_page(h, nodes_allowed); |
1237 | spin_lock(&hugetlb_lock); | 1277 | spin_lock(&hugetlb_lock); |
1238 | if (!ret) | 1278 | if (!ret) |
1239 | goto out; | 1279 | goto out; |
1240 | 1280 | ||
1281 | /* Bail for signals. Probably ctrl-c from user */ | ||
1282 | if (signal_pending(current)) | ||
1283 | goto out; | ||
1241 | } | 1284 | } |
1242 | 1285 | ||
1243 | /* | 1286 | /* |
@@ -1257,13 +1300,13 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) | |||
1257 | */ | 1300 | */ |
1258 | min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; | 1301 | min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; |
1259 | min_count = max(count, min_count); | 1302 | min_count = max(count, min_count); |
1260 | try_to_free_low(h, min_count); | 1303 | try_to_free_low(h, min_count, nodes_allowed); |
1261 | while (min_count < persistent_huge_pages(h)) { | 1304 | while (min_count < persistent_huge_pages(h)) { |
1262 | if (!free_pool_huge_page(h, 0)) | 1305 | if (!free_pool_huge_page(h, nodes_allowed, 0)) |
1263 | break; | 1306 | break; |
1264 | } | 1307 | } |
1265 | while (count < persistent_huge_pages(h)) { | 1308 | while (count < persistent_huge_pages(h)) { |
1266 | if (!adjust_pool_surplus(h, 1)) | 1309 | if (!adjust_pool_surplus(h, nodes_allowed, 1)) |
1267 | break; | 1310 | break; |
1268 | } | 1311 | } |
1269 | out: | 1312 | out: |
@@ -1282,43 +1325,117 @@ out: | |||
1282 | static struct kobject *hugepages_kobj; | 1325 | static struct kobject *hugepages_kobj; |
1283 | static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; | 1326 | static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; |
1284 | 1327 | ||
1285 | static struct hstate *kobj_to_hstate(struct kobject *kobj) | 1328 | static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp); |
1329 | |||
1330 | static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp) | ||
1286 | { | 1331 | { |
1287 | int i; | 1332 | int i; |
1333 | |||
1288 | for (i = 0; i < HUGE_MAX_HSTATE; i++) | 1334 | for (i = 0; i < HUGE_MAX_HSTATE; i++) |
1289 | if (hstate_kobjs[i] == kobj) | 1335 | if (hstate_kobjs[i] == kobj) { |
1336 | if (nidp) | ||
1337 | *nidp = NUMA_NO_NODE; | ||
1290 | return &hstates[i]; | 1338 | return &hstates[i]; |
1291 | BUG(); | 1339 | } |
1292 | return NULL; | 1340 | |
1341 | return kobj_to_node_hstate(kobj, nidp); | ||
1293 | } | 1342 | } |
1294 | 1343 | ||
1295 | static ssize_t nr_hugepages_show(struct kobject *kobj, | 1344 | static ssize_t nr_hugepages_show_common(struct kobject *kobj, |
1296 | struct kobj_attribute *attr, char *buf) | 1345 | struct kobj_attribute *attr, char *buf) |
1297 | { | 1346 | { |
1298 | struct hstate *h = kobj_to_hstate(kobj); | 1347 | struct hstate *h; |
1299 | return sprintf(buf, "%lu\n", h->nr_huge_pages); | 1348 | unsigned long nr_huge_pages; |
1349 | int nid; | ||
1350 | |||
1351 | h = kobj_to_hstate(kobj, &nid); | ||
1352 | if (nid == NUMA_NO_NODE) | ||
1353 | nr_huge_pages = h->nr_huge_pages; | ||
1354 | else | ||
1355 | nr_huge_pages = h->nr_huge_pages_node[nid]; | ||
1356 | |||
1357 | return sprintf(buf, "%lu\n", nr_huge_pages); | ||
1300 | } | 1358 | } |
1301 | static ssize_t nr_hugepages_store(struct kobject *kobj, | 1359 | static ssize_t nr_hugepages_store_common(bool obey_mempolicy, |
1302 | struct kobj_attribute *attr, const char *buf, size_t count) | 1360 | struct kobject *kobj, struct kobj_attribute *attr, |
1361 | const char *buf, size_t len) | ||
1303 | { | 1362 | { |
1304 | int err; | 1363 | int err; |
1305 | unsigned long input; | 1364 | int nid; |
1306 | struct hstate *h = kobj_to_hstate(kobj); | 1365 | unsigned long count; |
1366 | struct hstate *h; | ||
1367 | NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY); | ||
1307 | 1368 | ||
1308 | err = strict_strtoul(buf, 10, &input); | 1369 | err = strict_strtoul(buf, 10, &count); |
1309 | if (err) | 1370 | if (err) |
1310 | return 0; | 1371 | return 0; |
1311 | 1372 | ||
1312 | h->max_huge_pages = set_max_huge_pages(h, input); | 1373 | h = kobj_to_hstate(kobj, &nid); |
1374 | if (nid == NUMA_NO_NODE) { | ||
1375 | /* | ||
1376 | * global hstate attribute | ||
1377 | */ | ||
1378 | if (!(obey_mempolicy && | ||
1379 | init_nodemask_of_mempolicy(nodes_allowed))) { | ||
1380 | NODEMASK_FREE(nodes_allowed); | ||
1381 | nodes_allowed = &node_states[N_HIGH_MEMORY]; | ||
1382 | } | ||
1383 | } else if (nodes_allowed) { | ||
1384 | /* | ||
1385 | * per node hstate attribute: adjust count to global, | ||
1386 | * but restrict alloc/free to the specified node. | ||
1387 | */ | ||
1388 | count += h->nr_huge_pages - h->nr_huge_pages_node[nid]; | ||
1389 | init_nodemask_of_node(nodes_allowed, nid); | ||
1390 | } else | ||
1391 | nodes_allowed = &node_states[N_HIGH_MEMORY]; | ||
1392 | |||
1393 | h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed); | ||
1313 | 1394 | ||
1314 | return count; | 1395 | if (nodes_allowed != &node_states[N_HIGH_MEMORY]) |
1396 | NODEMASK_FREE(nodes_allowed); | ||
1397 | |||
1398 | return len; | ||
1399 | } | ||
1400 | |||
1401 | static ssize_t nr_hugepages_show(struct kobject *kobj, | ||
1402 | struct kobj_attribute *attr, char *buf) | ||
1403 | { | ||
1404 | return nr_hugepages_show_common(kobj, attr, buf); | ||
1405 | } | ||
1406 | |||
1407 | static ssize_t nr_hugepages_store(struct kobject *kobj, | ||
1408 | struct kobj_attribute *attr, const char *buf, size_t len) | ||
1409 | { | ||
1410 | return nr_hugepages_store_common(false, kobj, attr, buf, len); | ||
1315 | } | 1411 | } |
1316 | HSTATE_ATTR(nr_hugepages); | 1412 | HSTATE_ATTR(nr_hugepages); |
1317 | 1413 | ||
1414 | #ifdef CONFIG_NUMA | ||
1415 | |||
1416 | /* | ||
1417 | * hstate attribute for optionally mempolicy-based constraint on persistent | ||
1418 | * huge page alloc/free. | ||
1419 | */ | ||
1420 | static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj, | ||
1421 | struct kobj_attribute *attr, char *buf) | ||
1422 | { | ||
1423 | return nr_hugepages_show_common(kobj, attr, buf); | ||
1424 | } | ||
1425 | |||
1426 | static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj, | ||
1427 | struct kobj_attribute *attr, const char *buf, size_t len) | ||
1428 | { | ||
1429 | return nr_hugepages_store_common(true, kobj, attr, buf, len); | ||
1430 | } | ||
1431 | HSTATE_ATTR(nr_hugepages_mempolicy); | ||
1432 | #endif | ||
1433 | |||
1434 | |||
1318 | static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, | 1435 | static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, |
1319 | struct kobj_attribute *attr, char *buf) | 1436 | struct kobj_attribute *attr, char *buf) |
1320 | { | 1437 | { |
1321 | struct hstate *h = kobj_to_hstate(kobj); | 1438 | struct hstate *h = kobj_to_hstate(kobj, NULL); |
1322 | return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); | 1439 | return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); |
1323 | } | 1440 | } |
1324 | static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, | 1441 | static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, |
@@ -1326,7 +1443,7 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, | |||
1326 | { | 1443 | { |
1327 | int err; | 1444 | int err; |
1328 | unsigned long input; | 1445 | unsigned long input; |
1329 | struct hstate *h = kobj_to_hstate(kobj); | 1446 | struct hstate *h = kobj_to_hstate(kobj, NULL); |
1330 | 1447 | ||
1331 | err = strict_strtoul(buf, 10, &input); | 1448 | err = strict_strtoul(buf, 10, &input); |
1332 | if (err) | 1449 | if (err) |
@@ -1343,15 +1460,24 @@ HSTATE_ATTR(nr_overcommit_hugepages); | |||
1343 | static ssize_t free_hugepages_show(struct kobject *kobj, | 1460 | static ssize_t free_hugepages_show(struct kobject *kobj, |
1344 | struct kobj_attribute *attr, char *buf) | 1461 | struct kobj_attribute *attr, char *buf) |
1345 | { | 1462 | { |
1346 | struct hstate *h = kobj_to_hstate(kobj); | 1463 | struct hstate *h; |
1347 | return sprintf(buf, "%lu\n", h->free_huge_pages); | 1464 | unsigned long free_huge_pages; |
1465 | int nid; | ||
1466 | |||
1467 | h = kobj_to_hstate(kobj, &nid); | ||
1468 | if (nid == NUMA_NO_NODE) | ||
1469 | free_huge_pages = h->free_huge_pages; | ||
1470 | else | ||
1471 | free_huge_pages = h->free_huge_pages_node[nid]; | ||
1472 | |||
1473 | return sprintf(buf, "%lu\n", free_huge_pages); | ||
1348 | } | 1474 | } |
1349 | HSTATE_ATTR_RO(free_hugepages); | 1475 | HSTATE_ATTR_RO(free_hugepages); |
1350 | 1476 | ||
1351 | static ssize_t resv_hugepages_show(struct kobject *kobj, | 1477 | static ssize_t resv_hugepages_show(struct kobject *kobj, |
1352 | struct kobj_attribute *attr, char *buf) | 1478 | struct kobj_attribute *attr, char *buf) |
1353 | { | 1479 | { |
1354 | struct hstate *h = kobj_to_hstate(kobj); | 1480 | struct hstate *h = kobj_to_hstate(kobj, NULL); |
1355 | return sprintf(buf, "%lu\n", h->resv_huge_pages); | 1481 | return sprintf(buf, "%lu\n", h->resv_huge_pages); |
1356 | } | 1482 | } |
1357 | HSTATE_ATTR_RO(resv_hugepages); | 1483 | HSTATE_ATTR_RO(resv_hugepages); |
@@ -1359,8 +1485,17 @@ HSTATE_ATTR_RO(resv_hugepages); | |||
1359 | static ssize_t surplus_hugepages_show(struct kobject *kobj, | 1485 | static ssize_t surplus_hugepages_show(struct kobject *kobj, |
1360 | struct kobj_attribute *attr, char *buf) | 1486 | struct kobj_attribute *attr, char *buf) |
1361 | { | 1487 | { |
1362 | struct hstate *h = kobj_to_hstate(kobj); | 1488 | struct hstate *h; |
1363 | return sprintf(buf, "%lu\n", h->surplus_huge_pages); | 1489 | unsigned long surplus_huge_pages; |
1490 | int nid; | ||
1491 | |||
1492 | h = kobj_to_hstate(kobj, &nid); | ||
1493 | if (nid == NUMA_NO_NODE) | ||
1494 | surplus_huge_pages = h->surplus_huge_pages; | ||
1495 | else | ||
1496 | surplus_huge_pages = h->surplus_huge_pages_node[nid]; | ||
1497 | |||
1498 | return sprintf(buf, "%lu\n", surplus_huge_pages); | ||
1364 | } | 1499 | } |
1365 | HSTATE_ATTR_RO(surplus_hugepages); | 1500 | HSTATE_ATTR_RO(surplus_hugepages); |
1366 | 1501 | ||
@@ -1370,6 +1505,9 @@ static struct attribute *hstate_attrs[] = { | |||
1370 | &free_hugepages_attr.attr, | 1505 | &free_hugepages_attr.attr, |
1371 | &resv_hugepages_attr.attr, | 1506 | &resv_hugepages_attr.attr, |
1372 | &surplus_hugepages_attr.attr, | 1507 | &surplus_hugepages_attr.attr, |
1508 | #ifdef CONFIG_NUMA | ||
1509 | &nr_hugepages_mempolicy_attr.attr, | ||
1510 | #endif | ||
1373 | NULL, | 1511 | NULL, |
1374 | }; | 1512 | }; |
1375 | 1513 | ||
@@ -1377,19 +1515,21 @@ static struct attribute_group hstate_attr_group = { | |||
1377 | .attrs = hstate_attrs, | 1515 | .attrs = hstate_attrs, |
1378 | }; | 1516 | }; |
1379 | 1517 | ||
1380 | static int __init hugetlb_sysfs_add_hstate(struct hstate *h) | 1518 | static int __init hugetlb_sysfs_add_hstate(struct hstate *h, |
1519 | struct kobject *parent, | ||
1520 | struct kobject **hstate_kobjs, | ||
1521 | struct attribute_group *hstate_attr_group) | ||
1381 | { | 1522 | { |
1382 | int retval; | 1523 | int retval; |
1524 | int hi = h - hstates; | ||
1383 | 1525 | ||
1384 | hstate_kobjs[h - hstates] = kobject_create_and_add(h->name, | 1526 | hstate_kobjs[hi] = kobject_create_and_add(h->name, parent); |
1385 | hugepages_kobj); | 1527 | if (!hstate_kobjs[hi]) |
1386 | if (!hstate_kobjs[h - hstates]) | ||
1387 | return -ENOMEM; | 1528 | return -ENOMEM; |
1388 | 1529 | ||
1389 | retval = sysfs_create_group(hstate_kobjs[h - hstates], | 1530 | retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group); |
1390 | &hstate_attr_group); | ||
1391 | if (retval) | 1531 | if (retval) |
1392 | kobject_put(hstate_kobjs[h - hstates]); | 1532 | kobject_put(hstate_kobjs[hi]); |
1393 | 1533 | ||
1394 | return retval; | 1534 | return retval; |
1395 | } | 1535 | } |
@@ -1404,17 +1544,184 @@ static void __init hugetlb_sysfs_init(void) | |||
1404 | return; | 1544 | return; |
1405 | 1545 | ||
1406 | for_each_hstate(h) { | 1546 | for_each_hstate(h) { |
1407 | err = hugetlb_sysfs_add_hstate(h); | 1547 | err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, |
1548 | hstate_kobjs, &hstate_attr_group); | ||
1408 | if (err) | 1549 | if (err) |
1409 | printk(KERN_ERR "Hugetlb: Unable to add hstate %s", | 1550 | printk(KERN_ERR "Hugetlb: Unable to add hstate %s", |
1410 | h->name); | 1551 | h->name); |
1411 | } | 1552 | } |
1412 | } | 1553 | } |
1413 | 1554 | ||
1555 | #ifdef CONFIG_NUMA | ||
1556 | |||
1557 | /* | ||
1558 | * node_hstate/s - associate per node hstate attributes, via their kobjects, | ||
1559 | * with node sysdevs in node_devices[] using a parallel array. The array | ||
1560 | * index of a node sysdev or _hstate == node id. | ||
1561 | * This is here to avoid any static dependency of the node sysdev driver, in | ||
1562 | * the base kernel, on the hugetlb module. | ||
1563 | */ | ||
1564 | struct node_hstate { | ||
1565 | struct kobject *hugepages_kobj; | ||
1566 | struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; | ||
1567 | }; | ||
1568 | struct node_hstate node_hstates[MAX_NUMNODES]; | ||
1569 | |||
1570 | /* | ||
1571 | * A subset of global hstate attributes for node sysdevs | ||
1572 | */ | ||
1573 | static struct attribute *per_node_hstate_attrs[] = { | ||
1574 | &nr_hugepages_attr.attr, | ||
1575 | &free_hugepages_attr.attr, | ||
1576 | &surplus_hugepages_attr.attr, | ||
1577 | NULL, | ||
1578 | }; | ||
1579 | |||
1580 | static struct attribute_group per_node_hstate_attr_group = { | ||
1581 | .attrs = per_node_hstate_attrs, | ||
1582 | }; | ||
1583 | |||
1584 | /* | ||
1585 | * kobj_to_node_hstate - lookup global hstate for node sysdev hstate attr kobj. | ||
1586 | * Returns node id via non-NULL nidp. | ||
1587 | */ | ||
1588 | static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) | ||
1589 | { | ||
1590 | int nid; | ||
1591 | |||
1592 | for (nid = 0; nid < nr_node_ids; nid++) { | ||
1593 | struct node_hstate *nhs = &node_hstates[nid]; | ||
1594 | int i; | ||
1595 | for (i = 0; i < HUGE_MAX_HSTATE; i++) | ||
1596 | if (nhs->hstate_kobjs[i] == kobj) { | ||
1597 | if (nidp) | ||
1598 | *nidp = nid; | ||
1599 | return &hstates[i]; | ||
1600 | } | ||
1601 | } | ||
1602 | |||
1603 | BUG(); | ||
1604 | return NULL; | ||
1605 | } | ||
1606 | |||
1607 | /* | ||
1608 | * Unregister hstate attributes from a single node sysdev. | ||
1609 | * No-op if no hstate attributes attached. | ||
1610 | */ | ||
1611 | void hugetlb_unregister_node(struct node *node) | ||
1612 | { | ||
1613 | struct hstate *h; | ||
1614 | struct node_hstate *nhs = &node_hstates[node->sysdev.id]; | ||
1615 | |||
1616 | if (!nhs->hugepages_kobj) | ||
1617 | return; /* no hstate attributes */ | ||
1618 | |||
1619 | for_each_hstate(h) | ||
1620 | if (nhs->hstate_kobjs[h - hstates]) { | ||
1621 | kobject_put(nhs->hstate_kobjs[h - hstates]); | ||
1622 | nhs->hstate_kobjs[h - hstates] = NULL; | ||
1623 | } | ||
1624 | |||
1625 | kobject_put(nhs->hugepages_kobj); | ||
1626 | nhs->hugepages_kobj = NULL; | ||
1627 | } | ||
1628 | |||
1629 | /* | ||
1630 | * hugetlb module exit: unregister hstate attributes from node sysdevs | ||
1631 | * that have them. | ||
1632 | */ | ||
1633 | static void hugetlb_unregister_all_nodes(void) | ||
1634 | { | ||
1635 | int nid; | ||
1636 | |||
1637 | /* | ||
1638 | * disable node sysdev registrations. | ||
1639 | */ | ||
1640 | register_hugetlbfs_with_node(NULL, NULL); | ||
1641 | |||
1642 | /* | ||
1643 | * remove hstate attributes from any nodes that have them. | ||
1644 | */ | ||
1645 | for (nid = 0; nid < nr_node_ids; nid++) | ||
1646 | hugetlb_unregister_node(&node_devices[nid]); | ||
1647 | } | ||
1648 | |||
1649 | /* | ||
1650 | * Register hstate attributes for a single node sysdev. | ||
1651 | * No-op if attributes already registered. | ||
1652 | */ | ||
1653 | void hugetlb_register_node(struct node *node) | ||
1654 | { | ||
1655 | struct hstate *h; | ||
1656 | struct node_hstate *nhs = &node_hstates[node->sysdev.id]; | ||
1657 | int err; | ||
1658 | |||
1659 | if (nhs->hugepages_kobj) | ||
1660 | return; /* already allocated */ | ||
1661 | |||
1662 | nhs->hugepages_kobj = kobject_create_and_add("hugepages", | ||
1663 | &node->sysdev.kobj); | ||
1664 | if (!nhs->hugepages_kobj) | ||
1665 | return; | ||
1666 | |||
1667 | for_each_hstate(h) { | ||
1668 | err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj, | ||
1669 | nhs->hstate_kobjs, | ||
1670 | &per_node_hstate_attr_group); | ||
1671 | if (err) { | ||
1672 | printk(KERN_ERR "Hugetlb: Unable to add hstate %s" | ||
1673 | " for node %d\n", | ||
1674 | h->name, node->sysdev.id); | ||
1675 | hugetlb_unregister_node(node); | ||
1676 | break; | ||
1677 | } | ||
1678 | } | ||
1679 | } | ||
1680 | |||
1681 | /* | ||
1682 | * hugetlb init time: register hstate attributes for all registered node | ||
1683 | * sysdevs of nodes that have memory. All on-line nodes should have | ||
1684 | * registered their associated sysdev by this time. | ||
1685 | */ | ||
1686 | static void hugetlb_register_all_nodes(void) | ||
1687 | { | ||
1688 | int nid; | ||
1689 | |||
1690 | for_each_node_state(nid, N_HIGH_MEMORY) { | ||
1691 | struct node *node = &node_devices[nid]; | ||
1692 | if (node->sysdev.id == nid) | ||
1693 | hugetlb_register_node(node); | ||
1694 | } | ||
1695 | |||
1696 | /* | ||
1697 | * Let the node sysdev driver know we're here so it can | ||
1698 | * [un]register hstate attributes on node hotplug. | ||
1699 | */ | ||
1700 | register_hugetlbfs_with_node(hugetlb_register_node, | ||
1701 | hugetlb_unregister_node); | ||
1702 | } | ||
1703 | #else /* !CONFIG_NUMA */ | ||
1704 | |||
1705 | static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) | ||
1706 | { | ||
1707 | BUG(); | ||
1708 | if (nidp) | ||
1709 | *nidp = -1; | ||
1710 | return NULL; | ||
1711 | } | ||
1712 | |||
1713 | static void hugetlb_unregister_all_nodes(void) { } | ||
1714 | |||
1715 | static void hugetlb_register_all_nodes(void) { } | ||
1716 | |||
1717 | #endif | ||
1718 | |||
1414 | static void __exit hugetlb_exit(void) | 1719 | static void __exit hugetlb_exit(void) |
1415 | { | 1720 | { |
1416 | struct hstate *h; | 1721 | struct hstate *h; |
1417 | 1722 | ||
1723 | hugetlb_unregister_all_nodes(); | ||
1724 | |||
1418 | for_each_hstate(h) { | 1725 | for_each_hstate(h) { |
1419 | kobject_put(hstate_kobjs[h - hstates]); | 1726 | kobject_put(hstate_kobjs[h - hstates]); |
1420 | } | 1727 | } |
@@ -1449,6 +1756,8 @@ static int __init hugetlb_init(void) | |||
1449 | 1756 | ||
1450 | hugetlb_sysfs_init(); | 1757 | hugetlb_sysfs_init(); |
1451 | 1758 | ||
1759 | hugetlb_register_all_nodes(); | ||
1760 | |||
1452 | return 0; | 1761 | return 0; |
1453 | } | 1762 | } |
1454 | module_init(hugetlb_init); | 1763 | module_init(hugetlb_init); |
@@ -1472,8 +1781,8 @@ void __init hugetlb_add_hstate(unsigned order) | |||
1472 | h->free_huge_pages = 0; | 1781 | h->free_huge_pages = 0; |
1473 | for (i = 0; i < MAX_NUMNODES; ++i) | 1782 | for (i = 0; i < MAX_NUMNODES; ++i) |
1474 | INIT_LIST_HEAD(&h->hugepage_freelists[i]); | 1783 | INIT_LIST_HEAD(&h->hugepage_freelists[i]); |
1475 | h->next_nid_to_alloc = first_node(node_online_map); | 1784 | h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]); |
1476 | h->next_nid_to_free = first_node(node_online_map); | 1785 | h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]); |
1477 | snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", | 1786 | snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", |
1478 | huge_page_size(h)/1024); | 1787 | huge_page_size(h)/1024); |
1479 | 1788 | ||
@@ -1536,9 +1845,9 @@ static unsigned int cpuset_mems_nr(unsigned int *array) | |||
1536 | } | 1845 | } |
1537 | 1846 | ||
1538 | #ifdef CONFIG_SYSCTL | 1847 | #ifdef CONFIG_SYSCTL |
1539 | int hugetlb_sysctl_handler(struct ctl_table *table, int write, | 1848 | static int hugetlb_sysctl_handler_common(bool obey_mempolicy, |
1540 | void __user *buffer, | 1849 | struct ctl_table *table, int write, |
1541 | size_t *length, loff_t *ppos) | 1850 | void __user *buffer, size_t *length, loff_t *ppos) |
1542 | { | 1851 | { |
1543 | struct hstate *h = &default_hstate; | 1852 | struct hstate *h = &default_hstate; |
1544 | unsigned long tmp; | 1853 | unsigned long tmp; |
@@ -1550,12 +1859,40 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write, | |||
1550 | table->maxlen = sizeof(unsigned long); | 1859 | table->maxlen = sizeof(unsigned long); |
1551 | proc_doulongvec_minmax(table, write, buffer, length, ppos); | 1860 | proc_doulongvec_minmax(table, write, buffer, length, ppos); |
1552 | 1861 | ||
1553 | if (write) | 1862 | if (write) { |
1554 | h->max_huge_pages = set_max_huge_pages(h, tmp); | 1863 | NODEMASK_ALLOC(nodemask_t, nodes_allowed, |
1864 | GFP_KERNEL | __GFP_NORETRY); | ||
1865 | if (!(obey_mempolicy && | ||
1866 | init_nodemask_of_mempolicy(nodes_allowed))) { | ||
1867 | NODEMASK_FREE(nodes_allowed); | ||
1868 | nodes_allowed = &node_states[N_HIGH_MEMORY]; | ||
1869 | } | ||
1870 | h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed); | ||
1871 | |||
1872 | if (nodes_allowed != &node_states[N_HIGH_MEMORY]) | ||
1873 | NODEMASK_FREE(nodes_allowed); | ||
1874 | } | ||
1555 | 1875 | ||
1556 | return 0; | 1876 | return 0; |
1557 | } | 1877 | } |
1558 | 1878 | ||
1879 | int hugetlb_sysctl_handler(struct ctl_table *table, int write, | ||
1880 | void __user *buffer, size_t *length, loff_t *ppos) | ||
1881 | { | ||
1882 | |||
1883 | return hugetlb_sysctl_handler_common(false, table, write, | ||
1884 | buffer, length, ppos); | ||
1885 | } | ||
1886 | |||
1887 | #ifdef CONFIG_NUMA | ||
1888 | int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, | ||
1889 | void __user *buffer, size_t *length, loff_t *ppos) | ||
1890 | { | ||
1891 | return hugetlb_sysctl_handler_common(true, table, write, | ||
1892 | buffer, length, ppos); | ||
1893 | } | ||
1894 | #endif /* CONFIG_NUMA */ | ||
1895 | |||
1559 | int hugetlb_treat_movable_handler(struct ctl_table *table, int write, | 1896 | int hugetlb_treat_movable_handler(struct ctl_table *table, int write, |
1560 | void __user *buffer, | 1897 | void __user *buffer, |
1561 | size_t *length, loff_t *ppos) | 1898 | size_t *length, loff_t *ppos) |
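
The refactoring above routes both sysctl entry points through hugetlb_sysctl_handler_common(), which tries to build a private nodemask from the task's mempolicy and otherwise falls back to the shared node_states[N_HIGH_MEMORY] mask, freeing only what it privately allocated. Below is a minimal user-space sketch of that allocate-or-fall-back shape; the demo_* names and the fixed policy mask are hypothetical, not part of the patch.

    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Stand-in for the shared node_states[N_HIGH_MEMORY] mask: global, never freed. */
    static unsigned long default_mask = ~0UL;

    /* Hypothetical stand-in for init_nodemask_of_mempolicy(). */
    static bool demo_mask_from_policy(unsigned long *mask)
    {
        if (!mask)
            return false;
        *mask = 0x5UL;          /* pretend the policy allows nodes 0 and 2 */
        return true;
    }

    /*
     * Same allocate-or-fall-back shape as hugetlb_sysctl_handler_common():
     * try to build a private mask, otherwise fall back to the shared default,
     * and free only what was privately allocated.
     */
    static void demo_set_max_pages(bool obey_mempolicy, unsigned long pages)
    {
        unsigned long *mask = malloc(sizeof(*mask));

        if (!(obey_mempolicy && demo_mask_from_policy(mask))) {
            free(mask);
            mask = &default_mask;
        }

        printf("setting %lu pages over mask 0x%lx\n", pages, *mask);

        if (mask != &default_mask)
            free(mask);
    }

    int main(void)
    {
        demo_set_max_pages(false, 16);  /* hugetlb_sysctl_handler() analogue */
        demo_set_max_pages(true, 16);   /* hugetlb_mempolicy_sysctl_handler() analogue */
        return 0;
    }

Keeping all of the real work in one helper and letting the two exported handlers differ only in a flag is the design choice the patch makes; the sketch preserves that split.
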
@@ -1903,6 +2240,12 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1903 | + (vma->vm_pgoff >> PAGE_SHIFT); | 2240 | + (vma->vm_pgoff >> PAGE_SHIFT); |
1904 | mapping = (struct address_space *)page_private(page); | 2241 | mapping = (struct address_space *)page_private(page); |
1905 | 2242 | ||
2243 | /* | ||
2244 | * Take the mapping lock for the duration of the table walk. As | ||
2245 | * this mapping should be shared between all the VMAs, | ||
2246 | * __unmap_hugepage_range() is called as the lock is already held | ||
2247 | */ | ||
2248 | spin_lock(&mapping->i_mmap_lock); | ||
1906 | vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 2249 | vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) { |
1907 | /* Do not unmap the current VMA */ | 2250 | /* Do not unmap the current VMA */ |
1908 | if (iter_vma == vma) | 2251 | if (iter_vma == vma) |
@@ -1916,10 +2259,11 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1916 | * from the time of fork. This would look like data corruption | 2259 | * from the time of fork. This would look like data corruption |
1917 | */ | 2260 | */ |
1918 | if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) | 2261 | if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) |
1919 | unmap_hugepage_range(iter_vma, | 2262 | __unmap_hugepage_range(iter_vma, |
1920 | address, address + huge_page_size(h), | 2263 | address, address + huge_page_size(h), |
1921 | page); | 2264 | page); |
1922 | } | 2265 | } |
2266 | spin_unlock(&mapping->i_mmap_lock); | ||
1923 | 2267 | ||
1924 | return 1; | 2268 | return 1; |
1925 | } | 2269 | } |
@@ -1959,6 +2303,9 @@ retry_avoidcopy: | |||
1959 | outside_reserve = 1; | 2303 | outside_reserve = 1; |
1960 | 2304 | ||
1961 | page_cache_get(old_page); | 2305 | page_cache_get(old_page); |
2306 | |||
2307 | /* Drop page_table_lock as buddy allocator may be called */ | ||
2308 | spin_unlock(&mm->page_table_lock); | ||
1962 | new_page = alloc_huge_page(vma, address, outside_reserve); | 2309 | new_page = alloc_huge_page(vma, address, outside_reserve); |
1963 | 2310 | ||
1964 | if (IS_ERR(new_page)) { | 2311 | if (IS_ERR(new_page)) { |
@@ -1976,19 +2323,25 @@ retry_avoidcopy: | |||
1976 | if (unmap_ref_private(mm, vma, old_page, address)) { | 2323 | if (unmap_ref_private(mm, vma, old_page, address)) { |
1977 | BUG_ON(page_count(old_page) != 1); | 2324 | BUG_ON(page_count(old_page) != 1); |
1978 | BUG_ON(huge_pte_none(pte)); | 2325 | BUG_ON(huge_pte_none(pte)); |
2326 | spin_lock(&mm->page_table_lock); | ||
1979 | goto retry_avoidcopy; | 2327 | goto retry_avoidcopy; |
1980 | } | 2328 | } |
1981 | WARN_ON_ONCE(1); | 2329 | WARN_ON_ONCE(1); |
1982 | } | 2330 | } |
1983 | 2331 | ||
2332 | /* Caller expects lock to be held */ | ||
2333 | spin_lock(&mm->page_table_lock); | ||
1984 | return -PTR_ERR(new_page); | 2334 | return -PTR_ERR(new_page); |
1985 | } | 2335 | } |
1986 | 2336 | ||
1987 | spin_unlock(&mm->page_table_lock); | ||
1988 | copy_huge_page(new_page, old_page, address, vma); | 2337 | copy_huge_page(new_page, old_page, address, vma); |
1989 | __SetPageUptodate(new_page); | 2338 | __SetPageUptodate(new_page); |
1990 | spin_lock(&mm->page_table_lock); | ||
1991 | 2339 | ||
2340 | /* | ||
2341 | * Retake the page_table_lock to check for racing updates | ||
2342 | * before the page tables are altered | ||
2343 | */ | ||
2344 | spin_lock(&mm->page_table_lock); | ||
1992 | ptep = huge_pte_offset(mm, address & huge_page_mask(h)); | 2345 | ptep = huge_pte_offset(mm, address & huge_page_mask(h)); |
1993 | if (likely(pte_same(huge_ptep_get(ptep), pte))) { | 2346 | if (likely(pte_same(huge_ptep_get(ptep), pte))) { |
1994 | /* Break COW */ | 2347 | /* Break COW */ |
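
In the copy-on-write path above, mm->page_table_lock is dropped before alloc_huge_page() (which may sleep in the buddy allocator), retaken afterwards, and the PTE is re-checked for racing updates before the page tables are altered. A small pthread-based sketch of that drop, allocate, retake-and-revalidate shape follows; the demo_* names are hypothetical stand-ins for the kernel primitives.

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    static pthread_mutex_t demo_lock = PTHREAD_MUTEX_INITIALIZER;
    static int demo_pte = 42;               /* stand-in for the huge PTE */

    /* Stand-in for alloc_huge_page(): may block, so must not hold demo_lock. */
    static int *demo_alloc_page(void)
    {
        return malloc(sizeof(int));
    }

    /* Caller holds demo_lock on entry and expects it held on return, as in the patch. */
    static int demo_cow(void)
    {
        int old_pte = demo_pte;
        int *new_page;

        pthread_mutex_unlock(&demo_lock);   /* drop before a blocking allocation */
        new_page = demo_alloc_page();
        pthread_mutex_lock(&demo_lock);     /* caller expects the lock held */

        if (!new_page)
            return -1;

        /* Re-check for racing updates made while the lock was dropped. */
        if (demo_pte != old_pte) {
            free(new_page);
            return 0;                       /* lost the race; nothing to do */
        }

        demo_pte = old_pte + 1;             /* "break COW" */
        free(new_page);
        return 1;
    }

    int main(void)
    {
        int ret;

        pthread_mutex_lock(&demo_lock);
        ret = demo_cow();
        printf("demo_cow() -> %d, pte now %d\n", ret, demo_pte);
        pthread_mutex_unlock(&demo_lock);
        return 0;
    }
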
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index e1d85137f086..10ea71905c1f 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c | |||
@@ -3,18 +3,68 @@ | |||
3 | #include <linux/debugfs.h> | 3 | #include <linux/debugfs.h> |
4 | #include <linux/kernel.h> | 4 | #include <linux/kernel.h> |
5 | #include <linux/mm.h> | 5 | #include <linux/mm.h> |
6 | #include <linux/swap.h> | ||
7 | #include <linux/pagemap.h> | ||
8 | #include "internal.h" | ||
6 | 9 | ||
7 | static struct dentry *hwpoison_dir, *corrupt_pfn; | 10 | static struct dentry *hwpoison_dir; |
8 | 11 | ||
9 | static int hwpoison_inject(void *data, u64 val) | 12 | static int hwpoison_inject(void *data, u64 val) |
10 | { | 13 | { |
14 | unsigned long pfn = val; | ||
15 | struct page *p; | ||
16 | int err; | ||
17 | |||
18 | if (!capable(CAP_SYS_ADMIN)) | ||
19 | return -EPERM; | ||
20 | |||
21 | if (!hwpoison_filter_enable) | ||
22 | goto inject; | ||
23 | if (!pfn_valid(pfn)) | ||
24 | return -ENXIO; | ||
25 | |||
26 | p = pfn_to_page(pfn); | ||
27 | /* | ||
28 | * This implies we cannot support free buddy pages. | ||
29 | */ | ||
30 | if (!get_page_unless_zero(p)) | ||
31 | return 0; | ||
32 | |||
33 | if (!PageLRU(p)) | ||
34 | shake_page(p, 0); | ||
35 | /* | ||
36 | * This implies we cannot support non-LRU pages. | ||
37 | */ | ||
38 | if (!PageLRU(p)) | ||
39 | return 0; | ||
40 | |||
41 | /* | ||
42 | * do a racy check with elevated page count, to make sure PG_hwpoison | ||
43 | * will only be set for the targeted owner (or on a free page). | ||
44 | * We temporarily take page lock for try_get_mem_cgroup_from_page(). | ||
45 | * __memory_failure() will redo the check reliably inside page lock. | ||
46 | */ | ||
47 | lock_page(p); | ||
48 | err = hwpoison_filter(p); | ||
49 | unlock_page(p); | ||
50 | if (err) | ||
51 | return 0; | ||
52 | |||
53 | inject: | ||
54 | printk(KERN_INFO "Injecting memory failure at pfn %lx\n", pfn); | ||
55 | return __memory_failure(pfn, 18, MF_COUNT_INCREASED); | ||
56 | } | ||
57 | |||
58 | static int hwpoison_unpoison(void *data, u64 val) | ||
59 | { | ||
11 | if (!capable(CAP_SYS_ADMIN)) | 60 | if (!capable(CAP_SYS_ADMIN)) |
12 | return -EPERM; | 61 | return -EPERM; |
13 | printk(KERN_INFO "Injecting memory failure at pfn %Lx\n", val); | 62 | |
14 | return __memory_failure(val, 18, 0); | 63 | return unpoison_memory(val); |
15 | } | 64 | } |
16 | 65 | ||
17 | DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n"); | 66 | DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n"); |
67 | DEFINE_SIMPLE_ATTRIBUTE(unpoison_fops, NULL, hwpoison_unpoison, "%lli\n"); | ||
18 | 68 | ||
19 | static void pfn_inject_exit(void) | 69 | static void pfn_inject_exit(void) |
20 | { | 70 | { |
@@ -24,16 +74,63 @@ static void pfn_inject_exit(void) | |||
24 | 74 | ||
25 | static int pfn_inject_init(void) | 75 | static int pfn_inject_init(void) |
26 | { | 76 | { |
77 | struct dentry *dentry; | ||
78 | |||
27 | hwpoison_dir = debugfs_create_dir("hwpoison", NULL); | 79 | hwpoison_dir = debugfs_create_dir("hwpoison", NULL); |
28 | if (hwpoison_dir == NULL) | 80 | if (hwpoison_dir == NULL) |
29 | return -ENOMEM; | 81 | return -ENOMEM; |
30 | corrupt_pfn = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir, | 82 | |
83 | /* | ||
84 | * Note that the below poison/unpoison interfaces do not involve | ||
85 | * hardware status change, hence do not require hardware support. | ||
86 | * They are mainly for testing hwpoison at the software level. | ||
87 | */ | ||
88 | dentry = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir, | ||
31 | NULL, &hwpoison_fops); | 89 | NULL, &hwpoison_fops); |
32 | if (corrupt_pfn == NULL) { | 90 | if (!dentry) |
33 | pfn_inject_exit(); | 91 | goto fail; |
34 | return -ENOMEM; | 92 | |
35 | } | 93 | dentry = debugfs_create_file("unpoison-pfn", 0600, hwpoison_dir, |
94 | NULL, &unpoison_fops); | ||
95 | if (!dentry) | ||
96 | goto fail; | ||
97 | |||
98 | dentry = debugfs_create_u32("corrupt-filter-enable", 0600, | ||
99 | hwpoison_dir, &hwpoison_filter_enable); | ||
100 | if (!dentry) | ||
101 | goto fail; | ||
102 | |||
103 | dentry = debugfs_create_u32("corrupt-filter-dev-major", 0600, | ||
104 | hwpoison_dir, &hwpoison_filter_dev_major); | ||
105 | if (!dentry) | ||
106 | goto fail; | ||
107 | |||
108 | dentry = debugfs_create_u32("corrupt-filter-dev-minor", 0600, | ||
109 | hwpoison_dir, &hwpoison_filter_dev_minor); | ||
110 | if (!dentry) | ||
111 | goto fail; | ||
112 | |||
113 | dentry = debugfs_create_u64("corrupt-filter-flags-mask", 0600, | ||
114 | hwpoison_dir, &hwpoison_filter_flags_mask); | ||
115 | if (!dentry) | ||
116 | goto fail; | ||
117 | |||
118 | dentry = debugfs_create_u64("corrupt-filter-flags-value", 0600, | ||
119 | hwpoison_dir, &hwpoison_filter_flags_value); | ||
120 | if (!dentry) | ||
121 | goto fail; | ||
122 | |||
123 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | ||
124 | dentry = debugfs_create_u64("corrupt-filter-memcg", 0600, | ||
125 | hwpoison_dir, &hwpoison_filter_memcg); | ||
126 | if (!dentry) | ||
127 | goto fail; | ||
128 | #endif | ||
129 | |||
36 | return 0; | 130 | return 0; |
131 | fail: | ||
132 | pfn_inject_exit(); | ||
133 | return -ENOMEM; | ||
37 | } | 134 | } |
38 | 135 | ||
39 | module_init(pfn_inject_init); | 136 | module_init(pfn_inject_init); |
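
pfn_inject_init() above creates its debugfs entries one after another and funnels every failure to a single fail: label, relying on the recursive removal done in pfn_inject_exit() to tear down whatever was already created. A standalone sketch of the same create-many, single-cleanup-label shape; the demo_* names are made up for illustration.

    #include <stdio.h>
    #include <stdlib.h>

    #define DEMO_MAX_FILES 8

    static void *demo_files[DEMO_MAX_FILES];
    static int demo_nr_files;

    /* Stand-in for debugfs_create_file()/debugfs_create_u32(): NULL on failure. */
    static void *demo_create(const char *name)
    {
        void *f = malloc(1);

        if (f && demo_nr_files < DEMO_MAX_FILES)
            demo_files[demo_nr_files++] = f;
        printf("creating %s -> %s\n", name, f ? "ok" : "failed");
        return f;
    }

    /* Stand-in for debugfs_remove_recursive(): tear down everything created so far. */
    static void demo_remove_all(void)
    {
        while (demo_nr_files > 0)
            free(demo_files[--demo_nr_files]);
    }

    /*
     * Same shape as pfn_inject_init(): every creation funnels any failure to
     * one label, and the cleanup helper removes whatever did get created, so
     * no per-file unwinding code is needed.
     */
    static int demo_init(void)
    {
        if (!demo_create("corrupt-pfn"))
            goto fail;
        if (!demo_create("unpoison-pfn"))
            goto fail;
        if (!demo_create("corrupt-filter-enable"))
            goto fail;
        return 0;
    fail:
        demo_remove_all();
        return -1;
    }

    int main(void)
    {
        int err = demo_init();

        if (!err)
            demo_remove_all();      /* normal teardown, like pfn_inject_exit() */
        return err ? EXIT_FAILURE : EXIT_SUCCESS;
    }
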
diff --git a/mm/internal.h b/mm/internal.h index 22ec8d2b0fb8..6a697bb97fc5 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -50,6 +50,9 @@ extern void putback_lru_page(struct page *page); | |||
50 | */ | 50 | */ |
51 | extern void __free_pages_bootmem(struct page *page, unsigned int order); | 51 | extern void __free_pages_bootmem(struct page *page, unsigned int order); |
52 | extern void prep_compound_page(struct page *page, unsigned long order); | 52 | extern void prep_compound_page(struct page *page, unsigned long order); |
53 | #ifdef CONFIG_MEMORY_FAILURE | ||
54 | extern bool is_free_buddy_page(struct page *page); | ||
55 | #endif | ||
53 | 56 | ||
54 | 57 | ||
55 | /* | 58 | /* |
@@ -63,7 +66,7 @@ static inline unsigned long page_order(struct page *page) | |||
63 | return page_private(page); | 66 | return page_private(page); |
64 | } | 67 | } |
65 | 68 | ||
66 | #ifdef CONFIG_HAVE_MLOCK | 69 | #ifdef CONFIG_MMU |
67 | extern long mlock_vma_pages_range(struct vm_area_struct *vma, | 70 | extern long mlock_vma_pages_range(struct vm_area_struct *vma, |
68 | unsigned long start, unsigned long end); | 71 | unsigned long start, unsigned long end); |
69 | extern void munlock_vma_pages_range(struct vm_area_struct *vma, | 72 | extern void munlock_vma_pages_range(struct vm_area_struct *vma, |
@@ -72,22 +75,8 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) | |||
72 | { | 75 | { |
73 | munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end); | 76 | munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end); |
74 | } | 77 | } |
75 | #endif | ||
76 | 78 | ||
77 | /* | 79 | /* |
78 | * unevictable_migrate_page() called only from migrate_page_copy() to | ||
79 | * migrate unevictable flag to new page. | ||
80 | * Note that the old page has been isolated from the LRU lists at this | ||
81 | * point so we don't need to worry about LRU statistics. | ||
82 | */ | ||
83 | static inline void unevictable_migrate_page(struct page *new, struct page *old) | ||
84 | { | ||
85 | if (TestClearPageUnevictable(old)) | ||
86 | SetPageUnevictable(new); | ||
87 | } | ||
88 | |||
89 | #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT | ||
90 | /* | ||
91 | * Called only in fault path via page_evictable() for a new page | 80 | * Called only in fault path via page_evictable() for a new page |
92 | * to determine if it's being mapped into a LOCKED vma. | 81 | * to determine if it's being mapped into a LOCKED vma. |
93 | * If so, mark page as mlocked. | 82 | * If so, mark page as mlocked. |
@@ -107,9 +96,10 @@ static inline int is_mlocked_vma(struct vm_area_struct *vma, struct page *page) | |||
107 | } | 96 | } |
108 | 97 | ||
109 | /* | 98 | /* |
110 | * must be called with vma's mmap_sem held for read, and page locked. | 99 | * must be called with vma's mmap_sem held for read or write, and page locked. |
111 | */ | 100 | */ |
112 | extern void mlock_vma_page(struct page *page); | 101 | extern void mlock_vma_page(struct page *page); |
102 | extern void munlock_vma_page(struct page *page); | ||
113 | 103 | ||
114 | /* | 104 | /* |
115 | * Clear the page's PageMlocked(). This can be useful in a situation where | 105 | * Clear the page's PageMlocked(). This can be useful in a situation where |
@@ -144,7 +134,7 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page) | |||
144 | } | 134 | } |
145 | } | 135 | } |
146 | 136 | ||
147 | #else /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ | 137 | #else /* !CONFIG_MMU */ |
148 | static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) | 138 | static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) |
149 | { | 139 | { |
150 | return 0; | 140 | return 0; |
@@ -153,7 +143,7 @@ static inline void clear_page_mlock(struct page *page) { } | |||
153 | static inline void mlock_vma_page(struct page *page) { } | 143 | static inline void mlock_vma_page(struct page *page) { } |
154 | static inline void mlock_migrate_page(struct page *new, struct page *old) { } | 144 | static inline void mlock_migrate_page(struct page *new, struct page *old) { } |
155 | 145 | ||
156 | #endif /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ | 146 | #endif /* !CONFIG_MMU */ |
157 | 147 | ||
158 | /* | 148 | /* |
159 | * Return the mem_map entry representing the 'offset' subpage within | 149 | * Return the mem_map entry representing the 'offset' subpage within |
@@ -260,3 +250,12 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
260 | #define ZONE_RECLAIM_SOME 0 | 250 | #define ZONE_RECLAIM_SOME 0 |
261 | #define ZONE_RECLAIM_SUCCESS 1 | 251 | #define ZONE_RECLAIM_SUCCESS 1 |
262 | #endif | 252 | #endif |
253 | |||
254 | extern int hwpoison_filter(struct page *p); | ||
255 | |||
256 | extern u32 hwpoison_filter_dev_major; | ||
257 | extern u32 hwpoison_filter_dev_minor; | ||
258 | extern u64 hwpoison_filter_flags_mask; | ||
259 | extern u64 hwpoison_filter_flags_value; | ||
260 | extern u64 hwpoison_filter_memcg; | ||
261 | extern u32 hwpoison_filter_enable; | ||
diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 4ea4510e2996..5b069e4f5e48 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c | |||
@@ -93,6 +93,7 @@ | |||
93 | #include <linux/nodemask.h> | 93 | #include <linux/nodemask.h> |
94 | #include <linux/mm.h> | 94 | #include <linux/mm.h> |
95 | #include <linux/workqueue.h> | 95 | #include <linux/workqueue.h> |
96 | #include <linux/crc32.h> | ||
96 | 97 | ||
97 | #include <asm/sections.h> | 98 | #include <asm/sections.h> |
98 | #include <asm/processor.h> | 99 | #include <asm/processor.h> |
@@ -108,7 +109,6 @@ | |||
108 | #define MSECS_MIN_AGE 5000 /* minimum object age for reporting */ | 109 | #define MSECS_MIN_AGE 5000 /* minimum object age for reporting */ |
109 | #define SECS_FIRST_SCAN 60 /* delay before the first scan */ | 110 | #define SECS_FIRST_SCAN 60 /* delay before the first scan */ |
110 | #define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */ | 111 | #define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */ |
111 | #define GRAY_LIST_PASSES 25 /* maximum number of gray list scans */ | ||
112 | #define MAX_SCAN_SIZE 4096 /* maximum size of a scanned block */ | 112 | #define MAX_SCAN_SIZE 4096 /* maximum size of a scanned block */ |
113 | 113 | ||
114 | #define BYTES_PER_POINTER sizeof(void *) | 114 | #define BYTES_PER_POINTER sizeof(void *) |
@@ -119,8 +119,8 @@ | |||
119 | /* scanning area inside a memory block */ | 119 | /* scanning area inside a memory block */ |
120 | struct kmemleak_scan_area { | 120 | struct kmemleak_scan_area { |
121 | struct hlist_node node; | 121 | struct hlist_node node; |
122 | unsigned long offset; | 122 | unsigned long start; |
123 | size_t length; | 123 | size_t size; |
124 | }; | 124 | }; |
125 | 125 | ||
126 | #define KMEMLEAK_GREY 0 | 126 | #define KMEMLEAK_GREY 0 |
@@ -149,6 +149,8 @@ struct kmemleak_object { | |||
149 | int min_count; | 149 | int min_count; |
150 | /* the total number of pointers found pointing to this object */ | 150 | /* the total number of pointers found pointing to this object */ |
151 | int count; | 151 | int count; |
152 | /* checksum for detecting modified objects */ | ||
153 | u32 checksum; | ||
152 | /* memory ranges to be scanned inside an object (empty for all) */ | 154 | /* memory ranges to be scanned inside an object (empty for all) */ |
153 | struct hlist_head area_list; | 155 | struct hlist_head area_list; |
154 | unsigned long trace[MAX_TRACE]; | 156 | unsigned long trace[MAX_TRACE]; |
@@ -164,8 +166,6 @@ struct kmemleak_object { | |||
164 | #define OBJECT_REPORTED (1 << 1) | 166 | #define OBJECT_REPORTED (1 << 1) |
165 | /* flag set to not scan the object */ | 167 | /* flag set to not scan the object */ |
166 | #define OBJECT_NO_SCAN (1 << 2) | 168 | #define OBJECT_NO_SCAN (1 << 2) |
167 | /* flag set on newly allocated objects */ | ||
168 | #define OBJECT_NEW (1 << 3) | ||
169 | 169 | ||
170 | /* number of bytes to print per line; must be 16 or 32 */ | 170 | /* number of bytes to print per line; must be 16 or 32 */ |
171 | #define HEX_ROW_SIZE 16 | 171 | #define HEX_ROW_SIZE 16 |
@@ -241,8 +241,6 @@ struct early_log { | |||
241 | const void *ptr; /* allocated/freed memory block */ | 241 | const void *ptr; /* allocated/freed memory block */ |
242 | size_t size; /* memory block size */ | 242 | size_t size; /* memory block size */ |
243 | int min_count; /* minimum reference count */ | 243 | int min_count; /* minimum reference count */ |
244 | unsigned long offset; /* scan area offset */ | ||
245 | size_t length; /* scan area length */ | ||
246 | unsigned long trace[MAX_TRACE]; /* stack trace */ | 244 | unsigned long trace[MAX_TRACE]; /* stack trace */ |
247 | unsigned int trace_len; /* stack trace length */ | 245 | unsigned int trace_len; /* stack trace length */ |
248 | }; | 246 | }; |
@@ -323,11 +321,6 @@ static bool color_gray(const struct kmemleak_object *object) | |||
323 | object->count >= object->min_count; | 321 | object->count >= object->min_count; |
324 | } | 322 | } |
325 | 323 | ||
326 | static bool color_black(const struct kmemleak_object *object) | ||
327 | { | ||
328 | return object->min_count == KMEMLEAK_BLACK; | ||
329 | } | ||
330 | |||
331 | /* | 324 | /* |
332 | * Objects are considered unreferenced only if their color is white, they have | 325 | * Objects are considered unreferenced only if their color is white, they have |
333 | * not be deleted and have a minimum age to avoid false positives caused by | 326 | * not be deleted and have a minimum age to avoid false positives caused by |
@@ -335,7 +328,7 @@ static bool color_black(const struct kmemleak_object *object) | |||
335 | */ | 328 | */ |
336 | static bool unreferenced_object(struct kmemleak_object *object) | 329 | static bool unreferenced_object(struct kmemleak_object *object) |
337 | { | 330 | { |
338 | return (object->flags & OBJECT_ALLOCATED) && color_white(object) && | 331 | return (color_white(object) && object->flags & OBJECT_ALLOCATED) && |
339 | time_before_eq(object->jiffies + jiffies_min_age, | 332 | time_before_eq(object->jiffies + jiffies_min_age, |
340 | jiffies_last_scan); | 333 | jiffies_last_scan); |
341 | } | 334 | } |
@@ -348,11 +341,13 @@ static void print_unreferenced(struct seq_file *seq, | |||
348 | struct kmemleak_object *object) | 341 | struct kmemleak_object *object) |
349 | { | 342 | { |
350 | int i; | 343 | int i; |
344 | unsigned int msecs_age = jiffies_to_msecs(jiffies - object->jiffies); | ||
351 | 345 | ||
352 | seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n", | 346 | seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n", |
353 | object->pointer, object->size); | 347 | object->pointer, object->size); |
354 | seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu\n", | 348 | seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu (age %d.%03ds)\n", |
355 | object->comm, object->pid, object->jiffies); | 349 | object->comm, object->pid, object->jiffies, |
350 | msecs_age / 1000, msecs_age % 1000); | ||
356 | hex_dump_object(seq, object); | 351 | hex_dump_object(seq, object); |
357 | seq_printf(seq, " backtrace:\n"); | 352 | seq_printf(seq, " backtrace:\n"); |
358 | 353 | ||
@@ -381,6 +376,7 @@ static void dump_object_info(struct kmemleak_object *object) | |||
381 | pr_notice(" min_count = %d\n", object->min_count); | 376 | pr_notice(" min_count = %d\n", object->min_count); |
382 | pr_notice(" count = %d\n", object->count); | 377 | pr_notice(" count = %d\n", object->count); |
383 | pr_notice(" flags = 0x%lx\n", object->flags); | 378 | pr_notice(" flags = 0x%lx\n", object->flags); |
379 | pr_notice(" checksum = %d\n", object->checksum); | ||
384 | pr_notice(" backtrace:\n"); | 380 | pr_notice(" backtrace:\n"); |
385 | print_stack_trace(&trace, 4); | 381 | print_stack_trace(&trace, 4); |
386 | } | 382 | } |
@@ -522,12 +518,13 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, | |||
522 | INIT_HLIST_HEAD(&object->area_list); | 518 | INIT_HLIST_HEAD(&object->area_list); |
523 | spin_lock_init(&object->lock); | 519 | spin_lock_init(&object->lock); |
524 | atomic_set(&object->use_count, 1); | 520 | atomic_set(&object->use_count, 1); |
525 | object->flags = OBJECT_ALLOCATED | OBJECT_NEW; | 521 | object->flags = OBJECT_ALLOCATED; |
526 | object->pointer = ptr; | 522 | object->pointer = ptr; |
527 | object->size = size; | 523 | object->size = size; |
528 | object->min_count = min_count; | 524 | object->min_count = min_count; |
529 | object->count = -1; /* no color initially */ | 525 | object->count = 0; /* white color initially */ |
530 | object->jiffies = jiffies; | 526 | object->jiffies = jiffies; |
527 | object->checksum = 0; | ||
531 | 528 | ||
532 | /* task information */ | 529 | /* task information */ |
533 | if (in_irq()) { | 530 | if (in_irq()) { |
@@ -720,14 +717,13 @@ static void make_black_object(unsigned long ptr) | |||
720 | * Add a scanning area to the object. If at least one such area is added, | 717 | * Add a scanning area to the object. If at least one such area is added, |
721 | * kmemleak will only scan these ranges rather than the whole memory block. | 718 | * kmemleak will only scan these ranges rather than the whole memory block. |
722 | */ | 719 | */ |
723 | static void add_scan_area(unsigned long ptr, unsigned long offset, | 720 | static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp) |
724 | size_t length, gfp_t gfp) | ||
725 | { | 721 | { |
726 | unsigned long flags; | 722 | unsigned long flags; |
727 | struct kmemleak_object *object; | 723 | struct kmemleak_object *object; |
728 | struct kmemleak_scan_area *area; | 724 | struct kmemleak_scan_area *area; |
729 | 725 | ||
730 | object = find_and_get_object(ptr, 0); | 726 | object = find_and_get_object(ptr, 1); |
731 | if (!object) { | 727 | if (!object) { |
732 | kmemleak_warn("Adding scan area to unknown object at 0x%08lx\n", | 728 | kmemleak_warn("Adding scan area to unknown object at 0x%08lx\n", |
733 | ptr); | 729 | ptr); |
@@ -741,7 +737,7 @@ static void add_scan_area(unsigned long ptr, unsigned long offset, | |||
741 | } | 737 | } |
742 | 738 | ||
743 | spin_lock_irqsave(&object->lock, flags); | 739 | spin_lock_irqsave(&object->lock, flags); |
744 | if (offset + length > object->size) { | 740 | if (ptr + size > object->pointer + object->size) { |
745 | kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr); | 741 | kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr); |
746 | dump_object_info(object); | 742 | dump_object_info(object); |
747 | kmem_cache_free(scan_area_cache, area); | 743 | kmem_cache_free(scan_area_cache, area); |
@@ -749,8 +745,8 @@ static void add_scan_area(unsigned long ptr, unsigned long offset, | |||
749 | } | 745 | } |
750 | 746 | ||
751 | INIT_HLIST_NODE(&area->node); | 747 | INIT_HLIST_NODE(&area->node); |
752 | area->offset = offset; | 748 | area->start = ptr; |
753 | area->length = length; | 749 | area->size = size; |
754 | 750 | ||
755 | hlist_add_head(&area->node, &object->area_list); | 751 | hlist_add_head(&area->node, &object->area_list); |
756 | out_unlock: | 752 | out_unlock: |
@@ -786,7 +782,7 @@ static void object_no_scan(unsigned long ptr) | |||
786 | * processed later once kmemleak is fully initialized. | 782 | * processed later once kmemleak is fully initialized. |
787 | */ | 783 | */ |
788 | static void __init log_early(int op_type, const void *ptr, size_t size, | 784 | static void __init log_early(int op_type, const void *ptr, size_t size, |
789 | int min_count, unsigned long offset, size_t length) | 785 | int min_count) |
790 | { | 786 | { |
791 | unsigned long flags; | 787 | unsigned long flags; |
792 | struct early_log *log; | 788 | struct early_log *log; |
@@ -808,8 +804,6 @@ static void __init log_early(int op_type, const void *ptr, size_t size, | |||
808 | log->ptr = ptr; | 804 | log->ptr = ptr; |
809 | log->size = size; | 805 | log->size = size; |
810 | log->min_count = min_count; | 806 | log->min_count = min_count; |
811 | log->offset = offset; | ||
812 | log->length = length; | ||
813 | if (op_type == KMEMLEAK_ALLOC) | 807 | if (op_type == KMEMLEAK_ALLOC) |
814 | log->trace_len = __save_stack_trace(log->trace); | 808 | log->trace_len = __save_stack_trace(log->trace); |
815 | crt_early_log++; | 809 | crt_early_log++; |
@@ -833,12 +827,15 @@ static void early_alloc(struct early_log *log) | |||
833 | */ | 827 | */ |
834 | rcu_read_lock(); | 828 | rcu_read_lock(); |
835 | object = create_object((unsigned long)log->ptr, log->size, | 829 | object = create_object((unsigned long)log->ptr, log->size, |
836 | log->min_count, GFP_KERNEL); | 830 | log->min_count, GFP_ATOMIC); |
831 | if (!object) | ||
832 | goto out; | ||
837 | spin_lock_irqsave(&object->lock, flags); | 833 | spin_lock_irqsave(&object->lock, flags); |
838 | for (i = 0; i < log->trace_len; i++) | 834 | for (i = 0; i < log->trace_len; i++) |
839 | object->trace[i] = log->trace[i]; | 835 | object->trace[i] = log->trace[i]; |
840 | object->trace_len = log->trace_len; | 836 | object->trace_len = log->trace_len; |
841 | spin_unlock_irqrestore(&object->lock, flags); | 837 | spin_unlock_irqrestore(&object->lock, flags); |
838 | out: | ||
842 | rcu_read_unlock(); | 839 | rcu_read_unlock(); |
843 | } | 840 | } |
844 | 841 | ||
@@ -855,7 +852,7 @@ void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count, | |||
855 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | 852 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) |
856 | create_object((unsigned long)ptr, size, min_count, gfp); | 853 | create_object((unsigned long)ptr, size, min_count, gfp); |
857 | else if (atomic_read(&kmemleak_early_log)) | 854 | else if (atomic_read(&kmemleak_early_log)) |
858 | log_early(KMEMLEAK_ALLOC, ptr, size, min_count, 0, 0); | 855 | log_early(KMEMLEAK_ALLOC, ptr, size, min_count); |
859 | } | 856 | } |
860 | EXPORT_SYMBOL_GPL(kmemleak_alloc); | 857 | EXPORT_SYMBOL_GPL(kmemleak_alloc); |
861 | 858 | ||
@@ -870,7 +867,7 @@ void __ref kmemleak_free(const void *ptr) | |||
870 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | 867 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) |
871 | delete_object_full((unsigned long)ptr); | 868 | delete_object_full((unsigned long)ptr); |
872 | else if (atomic_read(&kmemleak_early_log)) | 869 | else if (atomic_read(&kmemleak_early_log)) |
873 | log_early(KMEMLEAK_FREE, ptr, 0, 0, 0, 0); | 870 | log_early(KMEMLEAK_FREE, ptr, 0, 0); |
874 | } | 871 | } |
875 | EXPORT_SYMBOL_GPL(kmemleak_free); | 872 | EXPORT_SYMBOL_GPL(kmemleak_free); |
876 | 873 | ||
@@ -885,7 +882,7 @@ void __ref kmemleak_free_part(const void *ptr, size_t size) | |||
885 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | 882 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) |
886 | delete_object_part((unsigned long)ptr, size); | 883 | delete_object_part((unsigned long)ptr, size); |
887 | else if (atomic_read(&kmemleak_early_log)) | 884 | else if (atomic_read(&kmemleak_early_log)) |
888 | log_early(KMEMLEAK_FREE_PART, ptr, size, 0, 0, 0); | 885 | log_early(KMEMLEAK_FREE_PART, ptr, size, 0); |
889 | } | 886 | } |
890 | EXPORT_SYMBOL_GPL(kmemleak_free_part); | 887 | EXPORT_SYMBOL_GPL(kmemleak_free_part); |
891 | 888 | ||
@@ -900,7 +897,7 @@ void __ref kmemleak_not_leak(const void *ptr) | |||
900 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | 897 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) |
901 | make_gray_object((unsigned long)ptr); | 898 | make_gray_object((unsigned long)ptr); |
902 | else if (atomic_read(&kmemleak_early_log)) | 899 | else if (atomic_read(&kmemleak_early_log)) |
903 | log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0, 0, 0); | 900 | log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0); |
904 | } | 901 | } |
905 | EXPORT_SYMBOL(kmemleak_not_leak); | 902 | EXPORT_SYMBOL(kmemleak_not_leak); |
906 | 903 | ||
@@ -916,22 +913,21 @@ void __ref kmemleak_ignore(const void *ptr) | |||
916 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | 913 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) |
917 | make_black_object((unsigned long)ptr); | 914 | make_black_object((unsigned long)ptr); |
918 | else if (atomic_read(&kmemleak_early_log)) | 915 | else if (atomic_read(&kmemleak_early_log)) |
919 | log_early(KMEMLEAK_IGNORE, ptr, 0, 0, 0, 0); | 916 | log_early(KMEMLEAK_IGNORE, ptr, 0, 0); |
920 | } | 917 | } |
921 | EXPORT_SYMBOL(kmemleak_ignore); | 918 | EXPORT_SYMBOL(kmemleak_ignore); |
922 | 919 | ||
923 | /* | 920 | /* |
924 | * Limit the range to be scanned in an allocated memory block. | 921 | * Limit the range to be scanned in an allocated memory block. |
925 | */ | 922 | */ |
926 | void __ref kmemleak_scan_area(const void *ptr, unsigned long offset, | 923 | void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) |
927 | size_t length, gfp_t gfp) | ||
928 | { | 924 | { |
929 | pr_debug("%s(0x%p)\n", __func__, ptr); | 925 | pr_debug("%s(0x%p)\n", __func__, ptr); |
930 | 926 | ||
931 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | 927 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) |
932 | add_scan_area((unsigned long)ptr, offset, length, gfp); | 928 | add_scan_area((unsigned long)ptr, size, gfp); |
933 | else if (atomic_read(&kmemleak_early_log)) | 929 | else if (atomic_read(&kmemleak_early_log)) |
934 | log_early(KMEMLEAK_SCAN_AREA, ptr, 0, 0, offset, length); | 930 | log_early(KMEMLEAK_SCAN_AREA, ptr, size, 0); |
935 | } | 931 | } |
936 | EXPORT_SYMBOL(kmemleak_scan_area); | 932 | EXPORT_SYMBOL(kmemleak_scan_area); |
937 | 933 | ||
@@ -945,11 +941,25 @@ void __ref kmemleak_no_scan(const void *ptr) | |||
945 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | 941 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) |
946 | object_no_scan((unsigned long)ptr); | 942 | object_no_scan((unsigned long)ptr); |
947 | else if (atomic_read(&kmemleak_early_log)) | 943 | else if (atomic_read(&kmemleak_early_log)) |
948 | log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0, 0, 0); | 944 | log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0); |
949 | } | 945 | } |
950 | EXPORT_SYMBOL(kmemleak_no_scan); | 946 | EXPORT_SYMBOL(kmemleak_no_scan); |
951 | 947 | ||
952 | /* | 948 | /* |
949 | * Update an object's checksum and return true if it was modified. | ||
950 | */ | ||
951 | static bool update_checksum(struct kmemleak_object *object) | ||
952 | { | ||
953 | u32 old_csum = object->checksum; | ||
954 | |||
955 | if (!kmemcheck_is_obj_initialized(object->pointer, object->size)) | ||
956 | return false; | ||
957 | |||
958 | object->checksum = crc32(0, (void *)object->pointer, object->size); | ||
959 | return object->checksum != old_csum; | ||
960 | } | ||
961 | |||
962 | /* | ||
953 | * Memory scanning is a long process and it needs to be interruptable. This | 963 | * Memory scanning is a long process and it needs to be interruptable. This |
954 | * function checks whether such interrupt condition occured. | 964 | * function checks whether such interrupt condition occured. |
955 | */ | 965 | */ |
@@ -1028,11 +1038,14 @@ static void scan_block(void *_start, void *_end, | |||
1028 | * added to the gray_list. | 1038 | * added to the gray_list. |
1029 | */ | 1039 | */ |
1030 | object->count++; | 1040 | object->count++; |
1031 | if (color_gray(object)) | 1041 | if (color_gray(object)) { |
1032 | list_add_tail(&object->gray_list, &gray_list); | 1042 | list_add_tail(&object->gray_list, &gray_list); |
1033 | else | 1043 | spin_unlock_irqrestore(&object->lock, flags); |
1034 | put_object(object); | 1044 | continue; |
1045 | } | ||
1046 | |||
1035 | spin_unlock_irqrestore(&object->lock, flags); | 1047 | spin_unlock_irqrestore(&object->lock, flags); |
1048 | put_object(object); | ||
1036 | } | 1049 | } |
1037 | } | 1050 | } |
1038 | 1051 | ||
@@ -1047,8 +1060,8 @@ static void scan_object(struct kmemleak_object *object) | |||
1047 | unsigned long flags; | 1060 | unsigned long flags; |
1048 | 1061 | ||
1049 | /* | 1062 | /* |
1050 | * Once the object->lock is aquired, the corresponding memory block | 1063 | * Once the object->lock is acquired, the corresponding memory block |
1051 | * cannot be freed (the same lock is aquired in delete_object). | 1064 | * cannot be freed (the same lock is acquired in delete_object). |
1052 | */ | 1065 | */ |
1053 | spin_lock_irqsave(&object->lock, flags); | 1066 | spin_lock_irqsave(&object->lock, flags); |
1054 | if (object->flags & OBJECT_NO_SCAN) | 1067 | if (object->flags & OBJECT_NO_SCAN) |
@@ -1072,14 +1085,47 @@ static void scan_object(struct kmemleak_object *object) | |||
1072 | } | 1085 | } |
1073 | } else | 1086 | } else |
1074 | hlist_for_each_entry(area, elem, &object->area_list, node) | 1087 | hlist_for_each_entry(area, elem, &object->area_list, node) |
1075 | scan_block((void *)(object->pointer + area->offset), | 1088 | scan_block((void *)area->start, |
1076 | (void *)(object->pointer + area->offset | 1089 | (void *)(area->start + area->size), |
1077 | + area->length), object, 0); | 1090 | object, 0); |
1078 | out: | 1091 | out: |
1079 | spin_unlock_irqrestore(&object->lock, flags); | 1092 | spin_unlock_irqrestore(&object->lock, flags); |
1080 | } | 1093 | } |
1081 | 1094 | ||
1082 | /* | 1095 | /* |
1096 | * Scan the objects already referenced (gray objects). More objects will be | ||
1097 | * referenced and, if there are no memory leaks, all the objects are scanned. | ||
1098 | */ | ||
1099 | static void scan_gray_list(void) | ||
1100 | { | ||
1101 | struct kmemleak_object *object, *tmp; | ||
1102 | |||
1103 | /* | ||
1104 | * The list traversal is safe for both tail additions and removals | ||
1105 | * from inside the loop. The kmemleak objects cannot be freed from | ||
1106 | * outside the loop because their use_count was incremented. | ||
1107 | */ | ||
1108 | object = list_entry(gray_list.next, typeof(*object), gray_list); | ||
1109 | while (&object->gray_list != &gray_list) { | ||
1110 | cond_resched(); | ||
1111 | |||
1112 | /* may add new objects to the list */ | ||
1113 | if (!scan_should_stop()) | ||
1114 | scan_object(object); | ||
1115 | |||
1116 | tmp = list_entry(object->gray_list.next, typeof(*object), | ||
1117 | gray_list); | ||
1118 | |||
1119 | /* remove the object from the list and release it */ | ||
1120 | list_del(&object->gray_list); | ||
1121 | put_object(object); | ||
1122 | |||
1123 | object = tmp; | ||
1124 | } | ||
1125 | WARN_ON(!list_empty(&gray_list)); | ||
1126 | } | ||
1127 | |||
1128 | /* | ||
1083 | * Scan data sections and all the referenced memory blocks allocated via the | 1129 | * Scan data sections and all the referenced memory blocks allocated via the |
1084 | * kernel's standard allocators. This function must be called with the | 1130 | * kernel's standard allocators. This function must be called with the |
1085 | * scan_mutex held. | 1131 | * scan_mutex held. |
@@ -1087,10 +1133,9 @@ out: | |||
1087 | static void kmemleak_scan(void) | 1133 | static void kmemleak_scan(void) |
1088 | { | 1134 | { |
1089 | unsigned long flags; | 1135 | unsigned long flags; |
1090 | struct kmemleak_object *object, *tmp; | 1136 | struct kmemleak_object *object; |
1091 | int i; | 1137 | int i; |
1092 | int new_leaks = 0; | 1138 | int new_leaks = 0; |
1093 | int gray_list_pass = 0; | ||
1094 | 1139 | ||
1095 | jiffies_last_scan = jiffies; | 1140 | jiffies_last_scan = jiffies; |
1096 | 1141 | ||
@@ -1111,7 +1156,6 @@ static void kmemleak_scan(void) | |||
1111 | #endif | 1156 | #endif |
1112 | /* reset the reference count (whiten the object) */ | 1157 | /* reset the reference count (whiten the object) */ |
1113 | object->count = 0; | 1158 | object->count = 0; |
1114 | object->flags &= ~OBJECT_NEW; | ||
1115 | if (color_gray(object) && get_object(object)) | 1159 | if (color_gray(object) && get_object(object)) |
1116 | list_add_tail(&object->gray_list, &gray_list); | 1160 | list_add_tail(&object->gray_list, &gray_list); |
1117 | 1161 | ||
@@ -1169,62 +1213,36 @@ static void kmemleak_scan(void) | |||
1169 | 1213 | ||
1170 | /* | 1214 | /* |
1171 | * Scan the objects already referenced from the sections scanned | 1215 | * Scan the objects already referenced from the sections scanned |
1172 | * above. More objects will be referenced and, if there are no memory | 1216 | * above. |
1173 | * leaks, all the objects will be scanned. The list traversal is safe | ||
1174 | * for both tail additions and removals from inside the loop. The | ||
1175 | * kmemleak objects cannot be freed from outside the loop because their | ||
1176 | * use_count was increased. | ||
1177 | */ | 1217 | */ |
1178 | repeat: | 1218 | scan_gray_list(); |
1179 | object = list_entry(gray_list.next, typeof(*object), gray_list); | ||
1180 | while (&object->gray_list != &gray_list) { | ||
1181 | cond_resched(); | ||
1182 | |||
1183 | /* may add new objects to the list */ | ||
1184 | if (!scan_should_stop()) | ||
1185 | scan_object(object); | ||
1186 | |||
1187 | tmp = list_entry(object->gray_list.next, typeof(*object), | ||
1188 | gray_list); | ||
1189 | |||
1190 | /* remove the object from the list and release it */ | ||
1191 | list_del(&object->gray_list); | ||
1192 | put_object(object); | ||
1193 | |||
1194 | object = tmp; | ||
1195 | } | ||
1196 | |||
1197 | if (scan_should_stop() || ++gray_list_pass >= GRAY_LIST_PASSES) | ||
1198 | goto scan_end; | ||
1199 | 1219 | ||
1200 | /* | 1220 | /* |
1201 | * Check for new objects allocated during this scanning and add them | 1221 | * Check for new or unreferenced objects modified since the previous |
1202 | * to the gray list. | 1222 | * scan and color them gray until the next scan. |
1203 | */ | 1223 | */ |
1204 | rcu_read_lock(); | 1224 | rcu_read_lock(); |
1205 | list_for_each_entry_rcu(object, &object_list, object_list) { | 1225 | list_for_each_entry_rcu(object, &object_list, object_list) { |
1206 | spin_lock_irqsave(&object->lock, flags); | 1226 | spin_lock_irqsave(&object->lock, flags); |
1207 | if ((object->flags & OBJECT_NEW) && !color_black(object) && | 1227 | if (color_white(object) && (object->flags & OBJECT_ALLOCATED) |
1208 | get_object(object)) { | 1228 | && update_checksum(object) && get_object(object)) { |
1209 | object->flags &= ~OBJECT_NEW; | 1229 | /* color it gray temporarily */ |
1230 | object->count = object->min_count; | ||
1210 | list_add_tail(&object->gray_list, &gray_list); | 1231 | list_add_tail(&object->gray_list, &gray_list); |
1211 | } | 1232 | } |
1212 | spin_unlock_irqrestore(&object->lock, flags); | 1233 | spin_unlock_irqrestore(&object->lock, flags); |
1213 | } | 1234 | } |
1214 | rcu_read_unlock(); | 1235 | rcu_read_unlock(); |
1215 | 1236 | ||
1216 | if (!list_empty(&gray_list)) | 1237 | /* |
1217 | goto repeat; | 1238 | * Re-scan the gray list for modified unreferenced objects. |
1218 | 1239 | */ | |
1219 | scan_end: | 1240 | scan_gray_list(); |
1220 | WARN_ON(!list_empty(&gray_list)); | ||
1221 | 1241 | ||
1222 | /* | 1242 | /* |
1223 | * If scanning was stopped or new objects were being allocated at a | 1243 | * If scanning was stopped do not report any new unreferenced objects. |
1224 | * higher rate than gray list scanning, do not report any new | ||
1225 | * unreferenced objects. | ||
1226 | */ | 1244 | */ |
1227 | if (scan_should_stop() || gray_list_pass >= GRAY_LIST_PASSES) | 1245 | if (scan_should_stop()) |
1228 | return; | 1246 | return; |
1229 | 1247 | ||
1230 | /* | 1248 | /* |
@@ -1639,8 +1657,7 @@ void __init kmemleak_init(void) | |||
1639 | kmemleak_ignore(log->ptr); | 1657 | kmemleak_ignore(log->ptr); |
1640 | break; | 1658 | break; |
1641 | case KMEMLEAK_SCAN_AREA: | 1659 | case KMEMLEAK_SCAN_AREA: |
1642 | kmemleak_scan_area(log->ptr, log->offset, log->length, | 1660 | kmemleak_scan_area(log->ptr, log->size, GFP_KERNEL); |
1643 | GFP_KERNEL); | ||
1644 | break; | 1661 | break; |
1645 | case KMEMLEAK_NO_SCAN: | 1662 | case KMEMLEAK_NO_SCAN: |
1646 | kmemleak_no_scan(log->ptr); | 1663 | kmemleak_no_scan(log->ptr); |
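
update_checksum() above re-derives a CRC32 over the object and compares it with the value stored at the previous scan, so only objects whose contents actually changed are put back on the gray list. The user-space sketch below shows the same detect-change-by-checksum idea; it borrows zlib's crc32() as a stand-in for the kernel's crc32(), and the demo_* names are hypothetical.

    #include <stdio.h>
    #include <string.h>
    #include <zlib.h>               /* link with -lz; crc32() taken from zlib here */

    struct demo_object {
        unsigned char data[64];
        unsigned long checksum;     /* last CRC32 seen by the "scanner" */
    };

    /* Analogue of update_checksum(): returns 1 if the object changed since
     * the previous scan, updating the stored checksum either way. */
    static int demo_update_checksum(struct demo_object *obj)
    {
        unsigned long old = obj->checksum;

        obj->checksum = crc32(0L, obj->data, sizeof(obj->data));
        return obj->checksum != old;
    }

    int main(void)
    {
        struct demo_object obj = { .data = "hello" };

        demo_update_checksum(&obj);                 /* first scan: baseline */
        printf("changed after first scan? %d\n", demo_update_checksum(&obj));

        memcpy(obj.data, "world", 5);               /* mutate the object */
        printf("changed after mutation?  %d\n", demo_update_checksum(&obj));
        return 0;
    }
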
diff --git a/mm/ksm.c b/mm/ksm.c --- a/mm/ksm.c +++ b/mm/ksm.c | |||
@@ -29,11 +29,13 @@ | |||
29 | #include <linux/wait.h> | 29 | #include <linux/wait.h> |
30 | #include <linux/slab.h> | 30 | #include <linux/slab.h> |
31 | #include <linux/rbtree.h> | 31 | #include <linux/rbtree.h> |
32 | #include <linux/memory.h> | ||
32 | #include <linux/mmu_notifier.h> | 33 | #include <linux/mmu_notifier.h> |
33 | #include <linux/swap.h> | 34 | #include <linux/swap.h> |
34 | #include <linux/ksm.h> | 35 | #include <linux/ksm.h> |
35 | 36 | ||
36 | #include <asm/tlbflush.h> | 37 | #include <asm/tlbflush.h> |
38 | #include "internal.h" | ||
37 | 39 | ||
38 | /* | 40 | /* |
39 | * A few notes about the KSM scanning process, | 41 | * A few notes about the KSM scanning process, |
@@ -79,13 +81,13 @@ | |||
79 | * struct mm_slot - ksm information per mm that is being scanned | 81 | * struct mm_slot - ksm information per mm that is being scanned |
80 | * @link: link to the mm_slots hash list | 82 | * @link: link to the mm_slots hash list |
81 | * @mm_list: link into the mm_slots list, rooted in ksm_mm_head | 83 | * @mm_list: link into the mm_slots list, rooted in ksm_mm_head |
82 | * @rmap_list: head for this mm_slot's list of rmap_items | 84 | * @rmap_list: head for this mm_slot's singly-linked list of rmap_items |
83 | * @mm: the mm that this information is valid for | 85 | * @mm: the mm that this information is valid for |
84 | */ | 86 | */ |
85 | struct mm_slot { | 87 | struct mm_slot { |
86 | struct hlist_node link; | 88 | struct hlist_node link; |
87 | struct list_head mm_list; | 89 | struct list_head mm_list; |
88 | struct list_head rmap_list; | 90 | struct rmap_item *rmap_list; |
89 | struct mm_struct *mm; | 91 | struct mm_struct *mm; |
90 | }; | 92 | }; |
91 | 93 | ||
@@ -93,7 +95,7 @@ struct mm_slot { | |||
93 | * struct ksm_scan - cursor for scanning | 95 | * struct ksm_scan - cursor for scanning |
94 | * @mm_slot: the current mm_slot we are scanning | 96 | * @mm_slot: the current mm_slot we are scanning |
95 | * @address: the next address inside that to be scanned | 97 | * @address: the next address inside that to be scanned |
96 | * @rmap_item: the current rmap that we are scanning inside the rmap_list | 98 | * @rmap_list: link to the next rmap to be scanned in the rmap_list |
97 | * @seqnr: count of completed full scans (needed when removing unstable node) | 99 | * @seqnr: count of completed full scans (needed when removing unstable node) |
98 | * | 100 | * |
99 | * There is only the one ksm_scan instance of this cursor structure. | 101 | * There is only the one ksm_scan instance of this cursor structure. |
@@ -101,37 +103,51 @@ struct mm_slot { | |||
101 | struct ksm_scan { | 103 | struct ksm_scan { |
102 | struct mm_slot *mm_slot; | 104 | struct mm_slot *mm_slot; |
103 | unsigned long address; | 105 | unsigned long address; |
104 | struct rmap_item *rmap_item; | 106 | struct rmap_item **rmap_list; |
105 | unsigned long seqnr; | 107 | unsigned long seqnr; |
106 | }; | 108 | }; |
107 | 109 | ||
108 | /** | 110 | /** |
111 | * struct stable_node - node of the stable rbtree | ||
112 | * @node: rb node of this ksm page in the stable tree | ||
113 | * @hlist: hlist head of rmap_items using this ksm page | ||
114 | * @kpfn: page frame number of this ksm page | ||
115 | */ | ||
116 | struct stable_node { | ||
117 | struct rb_node node; | ||
118 | struct hlist_head hlist; | ||
119 | unsigned long kpfn; | ||
120 | }; | ||
121 | |||
122 | /** | ||
109 | * struct rmap_item - reverse mapping item for virtual addresses | 123 | * struct rmap_item - reverse mapping item for virtual addresses |
110 | * @link: link into mm_slot's rmap_list (rmap_list is per mm) | 124 | * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list |
125 | * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree | ||
111 | * @mm: the memory structure this rmap_item is pointing into | 126 | * @mm: the memory structure this rmap_item is pointing into |
112 | * @address: the virtual address this rmap_item tracks (+ flags in low bits) | 127 | * @address: the virtual address this rmap_item tracks (+ flags in low bits) |
113 | * @oldchecksum: previous checksum of the page at that virtual address | 128 | * @oldchecksum: previous checksum of the page at that virtual address |
114 | * @node: rb_node of this rmap_item in either unstable or stable tree | 129 | * @node: rb node of this rmap_item in the unstable tree |
115 | * @next: next rmap_item hanging off the same node of the stable tree | 130 | * @head: pointer to stable_node heading this list in the stable tree |
116 | * @prev: previous rmap_item hanging off the same node of the stable tree | 131 | * @hlist: link into hlist of rmap_items hanging off that stable_node |
117 | */ | 132 | */ |
118 | struct rmap_item { | 133 | struct rmap_item { |
119 | struct list_head link; | 134 | struct rmap_item *rmap_list; |
135 | struct anon_vma *anon_vma; /* when stable */ | ||
120 | struct mm_struct *mm; | 136 | struct mm_struct *mm; |
121 | unsigned long address; /* + low bits used for flags below */ | 137 | unsigned long address; /* + low bits used for flags below */ |
138 | unsigned int oldchecksum; /* when unstable */ | ||
122 | union { | 139 | union { |
123 | unsigned int oldchecksum; /* when unstable */ | 140 | struct rb_node node; /* when node of unstable tree */ |
124 | struct rmap_item *next; /* when stable */ | 141 | struct { /* when listed from stable tree */ |
125 | }; | 142 | struct stable_node *head; |
126 | union { | 143 | struct hlist_node hlist; |
127 | struct rb_node node; /* when tree node */ | 144 | }; |
128 | struct rmap_item *prev; /* in stable list */ | ||
129 | }; | 145 | }; |
130 | }; | 146 | }; |
131 | 147 | ||
132 | #define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */ | 148 | #define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */ |
133 | #define NODE_FLAG 0x100 /* is a node of unstable or stable tree */ | 149 | #define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */ |
134 | #define STABLE_FLAG 0x200 /* is a node or list item of stable tree */ | 150 | #define STABLE_FLAG 0x200 /* is listed from the stable tree */ |
135 | 151 | ||
136 | /* The stable and unstable tree heads */ | 152 | /* The stable and unstable tree heads */ |
137 | static struct rb_root root_stable_tree = RB_ROOT; | 153 | static struct rb_root root_stable_tree = RB_ROOT; |
@@ -148,6 +164,7 @@ static struct ksm_scan ksm_scan = { | |||
148 | }; | 164 | }; |
149 | 165 | ||
150 | static struct kmem_cache *rmap_item_cache; | 166 | static struct kmem_cache *rmap_item_cache; |
167 | static struct kmem_cache *stable_node_cache; | ||
151 | static struct kmem_cache *mm_slot_cache; | 168 | static struct kmem_cache *mm_slot_cache; |
152 | 169 | ||
153 | /* The number of nodes in the stable tree */ | 170 | /* The number of nodes in the stable tree */ |
@@ -162,9 +179,6 @@ static unsigned long ksm_pages_unshared; | |||
162 | /* The number of rmap_items in use: to calculate pages_volatile */ | 179 | /* The number of rmap_items in use: to calculate pages_volatile */ |
163 | static unsigned long ksm_rmap_items; | 180 | static unsigned long ksm_rmap_items; |
164 | 181 | ||
165 | /* Limit on the number of unswappable pages used */ | ||
166 | static unsigned long ksm_max_kernel_pages; | ||
167 | |||
168 | /* Number of pages ksmd should scan in one batch */ | 182 | /* Number of pages ksmd should scan in one batch */ |
169 | static unsigned int ksm_thread_pages_to_scan = 100; | 183 | static unsigned int ksm_thread_pages_to_scan = 100; |
170 | 184 | ||
@@ -184,24 +198,25 @@ static DEFINE_SPINLOCK(ksm_mmlist_lock); | |||
184 | sizeof(struct __struct), __alignof__(struct __struct),\ | 198 | sizeof(struct __struct), __alignof__(struct __struct),\ |
185 | (__flags), NULL) | 199 | (__flags), NULL) |
186 | 200 | ||
187 | static void __init ksm_init_max_kernel_pages(void) | ||
188 | { | ||
189 | ksm_max_kernel_pages = nr_free_buffer_pages() / 4; | ||
190 | } | ||
191 | |||
192 | static int __init ksm_slab_init(void) | 201 | static int __init ksm_slab_init(void) |
193 | { | 202 | { |
194 | rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0); | 203 | rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0); |
195 | if (!rmap_item_cache) | 204 | if (!rmap_item_cache) |
196 | goto out; | 205 | goto out; |
197 | 206 | ||
207 | stable_node_cache = KSM_KMEM_CACHE(stable_node, 0); | ||
208 | if (!stable_node_cache) | ||
209 | goto out_free1; | ||
210 | |||
198 | mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0); | 211 | mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0); |
199 | if (!mm_slot_cache) | 212 | if (!mm_slot_cache) |
200 | goto out_free; | 213 | goto out_free2; |
201 | 214 | ||
202 | return 0; | 215 | return 0; |
203 | 216 | ||
204 | out_free: | 217 | out_free2: |
218 | kmem_cache_destroy(stable_node_cache); | ||
219 | out_free1: | ||
205 | kmem_cache_destroy(rmap_item_cache); | 220 | kmem_cache_destroy(rmap_item_cache); |
206 | out: | 221 | out: |
207 | return -ENOMEM; | 222 | return -ENOMEM; |
@@ -210,6 +225,7 @@ out: | |||
210 | static void __init ksm_slab_free(void) | 225 | static void __init ksm_slab_free(void) |
211 | { | 226 | { |
212 | kmem_cache_destroy(mm_slot_cache); | 227 | kmem_cache_destroy(mm_slot_cache); |
228 | kmem_cache_destroy(stable_node_cache); | ||
213 | kmem_cache_destroy(rmap_item_cache); | 229 | kmem_cache_destroy(rmap_item_cache); |
214 | mm_slot_cache = NULL; | 230 | mm_slot_cache = NULL; |
215 | } | 231 | } |
@@ -231,6 +247,16 @@ static inline void free_rmap_item(struct rmap_item *rmap_item) | |||
231 | kmem_cache_free(rmap_item_cache, rmap_item); | 247 | kmem_cache_free(rmap_item_cache, rmap_item); |
232 | } | 248 | } |
233 | 249 | ||
250 | static inline struct stable_node *alloc_stable_node(void) | ||
251 | { | ||
252 | return kmem_cache_alloc(stable_node_cache, GFP_KERNEL); | ||
253 | } | ||
254 | |||
255 | static inline void free_stable_node(struct stable_node *stable_node) | ||
256 | { | ||
257 | kmem_cache_free(stable_node_cache, stable_node); | ||
258 | } | ||
259 | |||
234 | static inline struct mm_slot *alloc_mm_slot(void) | 260 | static inline struct mm_slot *alloc_mm_slot(void) |
235 | { | 261 | { |
236 | if (!mm_slot_cache) /* initialization failed */ | 262 | if (!mm_slot_cache) /* initialization failed */ |
@@ -280,7 +306,6 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm, | |||
280 | bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) | 306 | bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) |
281 | % MM_SLOTS_HASH_HEADS]; | 307 | % MM_SLOTS_HASH_HEADS]; |
282 | mm_slot->mm = mm; | 308 | mm_slot->mm = mm; |
283 | INIT_LIST_HEAD(&mm_slot->rmap_list); | ||
284 | hlist_add_head(&mm_slot->link, bucket); | 309 | hlist_add_head(&mm_slot->link, bucket); |
285 | } | 310 | } |
286 | 311 | ||
@@ -289,6 +314,25 @@ static inline int in_stable_tree(struct rmap_item *rmap_item) | |||
289 | return rmap_item->address & STABLE_FLAG; | 314 | return rmap_item->address & STABLE_FLAG; |
290 | } | 315 | } |
291 | 316 | ||
317 | static void hold_anon_vma(struct rmap_item *rmap_item, | ||
318 | struct anon_vma *anon_vma) | ||
319 | { | ||
320 | rmap_item->anon_vma = anon_vma; | ||
321 | atomic_inc(&anon_vma->ksm_refcount); | ||
322 | } | ||
323 | |||
324 | static void drop_anon_vma(struct rmap_item *rmap_item) | ||
325 | { | ||
326 | struct anon_vma *anon_vma = rmap_item->anon_vma; | ||
327 | |||
328 | if (atomic_dec_and_lock(&anon_vma->ksm_refcount, &anon_vma->lock)) { | ||
329 | int empty = list_empty(&anon_vma->head); | ||
330 | spin_unlock(&anon_vma->lock); | ||
331 | if (empty) | ||
332 | anon_vma_free(anon_vma); | ||
333 | } | ||
334 | } | ||
335 | |||
292 | /* | 336 | /* |
293 | * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's | 337 | * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's |
294 | * page tables after it has passed through ksm_exit() - which, if necessary, | 338 | * page tables after it has passed through ksm_exit() - which, if necessary, |
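
hold_anon_vma()/drop_anon_vma() above pin the anon_vma with a dedicated refcount and use atomic_dec_and_lock() so that only the final dropper, holding the lock, decides whether the structure can be freed. Below is a simplified pthread sketch of that drop-to-zero-under-lock shape; the demo_* names are hypothetical, and the lock-free fast path of the real atomic_dec_and_lock() is deliberately omitted.

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Hypothetical refcounted object standing in for anon_vma. */
    struct demo_obj {
        int refcount;
        int list_empty;             /* "no other users" indicator */
        pthread_mutex_t lock;
    };

    /* Simplified analogue of atomic_dec_and_lock(): returns 1 with the lock
     * held iff this drop took the count to zero. */
    static int demo_dec_and_lock(struct demo_obj *obj)
    {
        pthread_mutex_lock(&obj->lock);
        if (--obj->refcount == 0)
            return 1;               /* caller now owns the lock */
        pthread_mutex_unlock(&obj->lock);
        return 0;
    }

    /* Analogue of drop_anon_vma(): only the last dropper may free, and only
     * after confirming under the lock that nothing else still uses it. */
    static void demo_put(struct demo_obj *obj)
    {
        if (demo_dec_and_lock(obj)) {
            int empty = obj->list_empty;

            pthread_mutex_unlock(&obj->lock);
            if (empty) {
                pthread_mutex_destroy(&obj->lock);
                free(obj);
            }
        }
    }

    int main(void)
    {
        struct demo_obj *obj = calloc(1, sizeof(*obj));

        pthread_mutex_init(&obj->lock, NULL);
        obj->refcount = 2;          /* two holders, as after hold_anon_vma() */
        obj->list_empty = 1;

        demo_put(obj);              /* first drop: count 2 -> 1, nothing freed */
        demo_put(obj);              /* last drop: frees the object */
        printf("done\n");
        return 0;
    }
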
@@ -361,10 +405,18 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) | |||
361 | return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; | 405 | return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; |
362 | } | 406 | } |
363 | 407 | ||
364 | static void break_cow(struct mm_struct *mm, unsigned long addr) | 408 | static void break_cow(struct rmap_item *rmap_item) |
365 | { | 409 | { |
410 | struct mm_struct *mm = rmap_item->mm; | ||
411 | unsigned long addr = rmap_item->address; | ||
366 | struct vm_area_struct *vma; | 412 | struct vm_area_struct *vma; |
367 | 413 | ||
414 | /* | ||
415 | * It is not an accident that whenever we want to break COW | ||
416 | * to undo, we also need to drop a reference to the anon_vma. | ||
417 | */ | ||
418 | drop_anon_vma(rmap_item); | ||
419 | |||
368 | down_read(&mm->mmap_sem); | 420 | down_read(&mm->mmap_sem); |
369 | if (ksm_test_exit(mm)) | 421 | if (ksm_test_exit(mm)) |
370 | goto out; | 422 | goto out; |
@@ -408,21 +460,77 @@ out: page = NULL; | |||
408 | return page; | 460 | return page; |
409 | } | 461 | } |
410 | 462 | ||
463 | static void remove_node_from_stable_tree(struct stable_node *stable_node) | ||
464 | { | ||
465 | struct rmap_item *rmap_item; | ||
466 | struct hlist_node *hlist; | ||
467 | |||
468 | hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { | ||
469 | if (rmap_item->hlist.next) | ||
470 | ksm_pages_sharing--; | ||
471 | else | ||
472 | ksm_pages_shared--; | ||
473 | drop_anon_vma(rmap_item); | ||
474 | rmap_item->address &= PAGE_MASK; | ||
475 | cond_resched(); | ||
476 | } | ||
477 | |||
478 | rb_erase(&stable_node->node, &root_stable_tree); | ||
479 | free_stable_node(stable_node); | ||
480 | } | ||
481 | |||
411 | /* | 482 | /* |
412 | * get_ksm_page: checks if the page at the virtual address in rmap_item | 483 | * get_ksm_page: checks if the page indicated by the stable node |
413 | * is still PageKsm, in which case we can trust the content of the page, | 484 | * is still its ksm page, despite having held no reference to it. |
414 | * and it returns the gotten page; but NULL if the page has been zapped. | 485 | * In which case we can trust the content of the page, and it |
486 | * returns the gotten page; but if the page has now been zapped, | ||
487 | * remove the stale node from the stable tree and return NULL. | ||
488 | * | ||
489 | * You would expect the stable_node to hold a reference to the ksm page. | ||
490 | * But if it increments the page's count, swapping out has to wait for | ||
491 | * ksmd to come around again before it can free the page, which may take | ||
492 | * seconds or even minutes: much too unresponsive. So instead we use a | ||
493 | * "keyhole reference": access to the ksm page from the stable node peeps | ||
494 | * out through its keyhole to see if that page still holds the right key, | ||
495 | * pointing back to this stable node. This relies on freeing a PageAnon | ||
496 | * page to reset its page->mapping to NULL, and relies on no other use of | ||
497 | * a page to put something that might look like our key in page->mapping. | ||
498 | * | ||
499 | * include/linux/pagemap.h page_cache_get_speculative() is a good reference, | ||
500 | * but this is different - made simpler by ksm_thread_mutex being held, but | ||
501 | * interesting for assuming that no other use of the struct page could ever | ||
502 | * put our expected_mapping into page->mapping (or a field of the union which | ||
503 | * coincides with page->mapping). The RCU calls are not for KSM at all, but | ||
504 | * to keep the page_count protocol described with page_cache_get_speculative. | ||
505 | * | ||
506 | * Note: it is possible that get_ksm_page() will return NULL one moment, | ||
507 | * then page the next, if the page is in between page_freeze_refs() and | ||
508 | * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page | ||
509 | * is on its way to being freed; but it is an anomaly to bear in mind. | ||
415 | */ | 510 | */ |
416 | static struct page *get_ksm_page(struct rmap_item *rmap_item) | 511 | static struct page *get_ksm_page(struct stable_node *stable_node) |
417 | { | 512 | { |
418 | struct page *page; | 513 | struct page *page; |
419 | 514 | void *expected_mapping; | |
420 | page = get_mergeable_page(rmap_item); | 515 | |
421 | if (page && !PageKsm(page)) { | 516 | page = pfn_to_page(stable_node->kpfn); |
517 | expected_mapping = (void *)stable_node + | ||
518 | (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); | ||
519 | rcu_read_lock(); | ||
520 | if (page->mapping != expected_mapping) | ||
521 | goto stale; | ||
522 | if (!get_page_unless_zero(page)) | ||
523 | goto stale; | ||
524 | if (page->mapping != expected_mapping) { | ||
422 | put_page(page); | 525 | put_page(page); |
423 | page = NULL; | 526 | goto stale; |
424 | } | 527 | } |
528 | rcu_read_unlock(); | ||
425 | return page; | 529 | return page; |
530 | stale: | ||
531 | rcu_read_unlock(); | ||
532 | remove_node_from_stable_tree(stable_node); | ||
533 | return NULL; | ||
426 | } | 534 | } |
427 | 535 | ||
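The long comment above describes the "keyhole reference" the rewritten get_ksm_page() takes: the stable node does not pin its page, so the lookup checks page->mapping, takes a speculative reference with get_page_unless_zero(), then re-checks page->mapping before trusting the result, all inside rcu_read_lock() to respect the page_count freeing protocol. A compact userspace sketch of that check / speculative-get / re-check ordering, with hypothetical types and the RCU part omitted:

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stddef.h>

	struct frame {
		atomic_int refcount;            /* 0 means the frame is being freed */
		_Atomic(void *) owner;          /* like page->mapping */
	};

	static bool get_unless_zero(atomic_int *cnt)
	{
		int old = atomic_load(cnt);
		while (old != 0) {
			if (atomic_compare_exchange_weak(cnt, &old, old + 1))
				return true;
		}
		return false;
	}

	/* Returns the frame with an extra reference if it still belongs to
	 * 'expected_owner', or NULL if it was freed or repurposed meanwhile. */
	static struct frame *lookup_frame(struct frame *f, void *expected_owner)
	{
		if (atomic_load(&f->owner) != expected_owner)
			return NULL;
		if (!get_unless_zero(&f->refcount))
			return NULL;
		if (atomic_load(&f->owner) != expected_owner) {
			atomic_fetch_sub(&f->refcount, 1);  /* put the speculative ref */
			return NULL;
		}
		return f;
	}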
428 | /* | 536 | /* |
@@ -431,35 +539,29 @@ static struct page *get_ksm_page(struct rmap_item *rmap_item) | |||
431 | */ | 539 | */ |
432 | static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) | 540 | static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) |
433 | { | 541 | { |
434 | if (in_stable_tree(rmap_item)) { | 542 | if (rmap_item->address & STABLE_FLAG) { |
435 | struct rmap_item *next_item = rmap_item->next; | 543 | struct stable_node *stable_node; |
436 | 544 | struct page *page; | |
437 | if (rmap_item->address & NODE_FLAG) { | ||
438 | if (next_item) { | ||
439 | rb_replace_node(&rmap_item->node, | ||
440 | &next_item->node, | ||
441 | &root_stable_tree); | ||
442 | next_item->address |= NODE_FLAG; | ||
443 | ksm_pages_sharing--; | ||
444 | } else { | ||
445 | rb_erase(&rmap_item->node, &root_stable_tree); | ||
446 | ksm_pages_shared--; | ||
447 | } | ||
448 | } else { | ||
449 | struct rmap_item *prev_item = rmap_item->prev; | ||
450 | 545 | ||
451 | BUG_ON(prev_item->next != rmap_item); | 546 | stable_node = rmap_item->head; |
452 | prev_item->next = next_item; | 547 | page = get_ksm_page(stable_node); |
453 | if (next_item) { | 548 | if (!page) |
454 | BUG_ON(next_item->prev != rmap_item); | 549 | goto out; |
455 | next_item->prev = rmap_item->prev; | 550 | |
456 | } | 551 | lock_page(page); |
552 | hlist_del(&rmap_item->hlist); | ||
553 | unlock_page(page); | ||
554 | put_page(page); | ||
555 | |||
556 | if (stable_node->hlist.first) | ||
457 | ksm_pages_sharing--; | 557 | ksm_pages_sharing--; |
458 | } | 558 | else |
559 | ksm_pages_shared--; | ||
459 | 560 | ||
460 | rmap_item->next = NULL; | 561 | drop_anon_vma(rmap_item); |
562 | rmap_item->address &= PAGE_MASK; | ||
461 | 563 | ||
462 | } else if (rmap_item->address & NODE_FLAG) { | 564 | } else if (rmap_item->address & UNSTABLE_FLAG) { |
463 | unsigned char age; | 565 | unsigned char age; |
464 | /* | 566 | /* |
465 | * Usually ksmd can and must skip the rb_erase, because | 567 | * Usually ksmd can and must skip the rb_erase, because |
@@ -472,24 +574,21 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) | |||
472 | BUG_ON(age > 1); | 574 | BUG_ON(age > 1); |
473 | if (!age) | 575 | if (!age) |
474 | rb_erase(&rmap_item->node, &root_unstable_tree); | 576 | rb_erase(&rmap_item->node, &root_unstable_tree); |
577 | |||
475 | ksm_pages_unshared--; | 578 | ksm_pages_unshared--; |
579 | rmap_item->address &= PAGE_MASK; | ||
476 | } | 580 | } |
477 | 581 | out: | |
478 | rmap_item->address &= PAGE_MASK; | ||
479 | |||
480 | cond_resched(); /* we're called from many long loops */ | 582 | cond_resched(); /* we're called from many long loops */ |
481 | } | 583 | } |
482 | 584 | ||
483 | static void remove_trailing_rmap_items(struct mm_slot *mm_slot, | 585 | static void remove_trailing_rmap_items(struct mm_slot *mm_slot, |
484 | struct list_head *cur) | 586 | struct rmap_item **rmap_list) |
485 | { | 587 | { |
486 | struct rmap_item *rmap_item; | 588 | while (*rmap_list) { |
487 | 589 | struct rmap_item *rmap_item = *rmap_list; | |
488 | while (cur != &mm_slot->rmap_list) { | 590 | *rmap_list = rmap_item->rmap_list; |
489 | rmap_item = list_entry(cur, struct rmap_item, link); | ||
490 | cur = cur->next; | ||
491 | remove_rmap_item_from_tree(rmap_item); | 591 | remove_rmap_item_from_tree(rmap_item); |
492 | list_del(&rmap_item->link); | ||
493 | free_rmap_item(rmap_item); | 592 | free_rmap_item(rmap_item); |
494 | } | 593 | } |
495 | } | 594 | } |
@@ -555,7 +654,7 @@ static int unmerge_and_remove_all_rmap_items(void) | |||
555 | goto error; | 654 | goto error; |
556 | } | 655 | } |
557 | 656 | ||
558 | remove_trailing_rmap_items(mm_slot, mm_slot->rmap_list.next); | 657 | remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list); |
559 | 658 | ||
560 | spin_lock(&ksm_mmlist_lock); | 659 | spin_lock(&ksm_mmlist_lock); |
561 | ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next, | 660 | ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next, |
@@ -651,7 +750,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, | |||
651 | * Check that no O_DIRECT or similar I/O is in progress on the | 750 | * Check that no O_DIRECT or similar I/O is in progress on the |
652 | * page | 751 | * page |
653 | */ | 752 | */ |
654 | if ((page_mapcount(page) + 2 + swapped) != page_count(page)) { | 753 | if (page_mapcount(page) + 1 + swapped != page_count(page)) { |
655 | set_pte_at_notify(mm, addr, ptep, entry); | 754 | set_pte_at_notify(mm, addr, ptep, entry); |
656 | goto out_unlock; | 755 | goto out_unlock; |
657 | } | 756 | } |
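The relaxed check above (+1 instead of +2) reflects that try_to_merge_one_page() no longer takes its own extra get_page() on the page elsewhere in this patch, so only one caller reference is expected. As a rough worked example: a page mapped by a single pte, not in swap cache, and held once by the scanner has page_mapcount() == 1, swapped == 0 and page_count() == 2, so 1 + 1 + 0 matches and write protection may proceed; an O_DIRECT read pinning the same page raises page_count() to 3, the equality fails, and the merge is skipped.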
@@ -669,15 +768,15 @@ out: | |||
669 | 768 | ||
670 | /** | 769 | /** |
671 | * replace_page - replace page in vma by new ksm page | 770 | * replace_page - replace page in vma by new ksm page |
672 | * @vma: vma that holds the pte pointing to oldpage | 771 | * @vma: vma that holds the pte pointing to page |
673 | * @oldpage: the page we are replacing by newpage | 772 | * @page: the page we are replacing by kpage |
674 | * @newpage: the ksm page we replace oldpage by | 773 | * @kpage: the ksm page we replace page by |
675 | * @orig_pte: the original value of the pte | 774 | * @orig_pte: the original value of the pte |
676 | * | 775 | * |
677 | * Returns 0 on success, -EFAULT on failure. | 776 | * Returns 0 on success, -EFAULT on failure. |
678 | */ | 777 | */ |
679 | static int replace_page(struct vm_area_struct *vma, struct page *oldpage, | 778 | static int replace_page(struct vm_area_struct *vma, struct page *page, |
680 | struct page *newpage, pte_t orig_pte) | 779 | struct page *kpage, pte_t orig_pte) |
681 | { | 780 | { |
682 | struct mm_struct *mm = vma->vm_mm; | 781 | struct mm_struct *mm = vma->vm_mm; |
683 | pgd_t *pgd; | 782 | pgd_t *pgd; |
@@ -686,12 +785,9 @@ static int replace_page(struct vm_area_struct *vma, struct page *oldpage, | |||
686 | pte_t *ptep; | 785 | pte_t *ptep; |
687 | spinlock_t *ptl; | 786 | spinlock_t *ptl; |
688 | unsigned long addr; | 787 | unsigned long addr; |
689 | pgprot_t prot; | ||
690 | int err = -EFAULT; | 788 | int err = -EFAULT; |
691 | 789 | ||
692 | prot = vm_get_page_prot(vma->vm_flags & ~VM_WRITE); | 790 | addr = page_address_in_vma(page, vma); |
693 | |||
694 | addr = page_address_in_vma(oldpage, vma); | ||
695 | if (addr == -EFAULT) | 791 | if (addr == -EFAULT) |
696 | goto out; | 792 | goto out; |
697 | 793 | ||
@@ -713,15 +809,15 @@ static int replace_page(struct vm_area_struct *vma, struct page *oldpage, | |||
713 | goto out; | 809 | goto out; |
714 | } | 810 | } |
715 | 811 | ||
716 | get_page(newpage); | 812 | get_page(kpage); |
717 | page_add_ksm_rmap(newpage); | 813 | page_add_anon_rmap(kpage, vma, addr); |
718 | 814 | ||
719 | flush_cache_page(vma, addr, pte_pfn(*ptep)); | 815 | flush_cache_page(vma, addr, pte_pfn(*ptep)); |
720 | ptep_clear_flush(vma, addr, ptep); | 816 | ptep_clear_flush(vma, addr, ptep); |
721 | set_pte_at_notify(mm, addr, ptep, mk_pte(newpage, prot)); | 817 | set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); |
722 | 818 | ||
723 | page_remove_rmap(oldpage); | 819 | page_remove_rmap(page); |
724 | put_page(oldpage); | 820 | put_page(page); |
725 | 821 | ||
726 | pte_unmap_unlock(ptep, ptl); | 822 | pte_unmap_unlock(ptep, ptl); |
727 | err = 0; | 823 | err = 0; |
@@ -731,32 +827,27 @@ out: | |||
731 | 827 | ||
732 | /* | 828 | /* |
733 | * try_to_merge_one_page - take two pages and merge them into one | 829 | * try_to_merge_one_page - take two pages and merge them into one |
734 | * @vma: the vma that hold the pte pointing into oldpage | 830 | * @vma: the vma that holds the pte pointing to page |
735 | * @oldpage: the page that we want to replace with newpage | 831 | * @page: the PageAnon page that we want to replace with kpage |
736 | * @newpage: the page that we want to map instead of oldpage | 832 | * @kpage: the PageKsm page that we want to map instead of page, |
737 | * | 833 | * or NULL the first time when we want to use page as kpage. |
738 | * Note: | ||
739 | * oldpage should be a PageAnon page, while newpage should be a PageKsm page, | ||
740 | * or a newly allocated kernel page which page_add_ksm_rmap will make PageKsm. | ||
741 | * | 834 | * |
742 | * This function returns 0 if the pages were merged, -EFAULT otherwise. | 835 | * This function returns 0 if the pages were merged, -EFAULT otherwise. |
743 | */ | 836 | */ |
744 | static int try_to_merge_one_page(struct vm_area_struct *vma, | 837 | static int try_to_merge_one_page(struct vm_area_struct *vma, |
745 | struct page *oldpage, | 838 | struct page *page, struct page *kpage) |
746 | struct page *newpage) | ||
747 | { | 839 | { |
748 | pte_t orig_pte = __pte(0); | 840 | pte_t orig_pte = __pte(0); |
749 | int err = -EFAULT; | 841 | int err = -EFAULT; |
750 | 842 | ||
843 | if (page == kpage) /* ksm page forked */ | ||
844 | return 0; | ||
845 | |||
751 | if (!(vma->vm_flags & VM_MERGEABLE)) | 846 | if (!(vma->vm_flags & VM_MERGEABLE)) |
752 | goto out; | 847 | goto out; |
753 | 848 | if (!PageAnon(page)) | |
754 | if (!PageAnon(oldpage)) | ||
755 | goto out; | 849 | goto out; |
756 | 850 | ||
757 | get_page(newpage); | ||
758 | get_page(oldpage); | ||
759 | |||
760 | /* | 851 | /* |
761 | * We need the page lock to read a stable PageSwapCache in | 852 | * We need the page lock to read a stable PageSwapCache in |
762 | * write_protect_page(). We use trylock_page() instead of | 853 | * write_protect_page(). We use trylock_page() instead of |
@@ -764,26 +855,39 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, | |||
764 | * prefer to continue scanning and merging different pages, | 855 | * prefer to continue scanning and merging different pages, |
765 | * then come back to this page when it is unlocked. | 856 | * then come back to this page when it is unlocked. |
766 | */ | 857 | */ |
767 | if (!trylock_page(oldpage)) | 858 | if (!trylock_page(page)) |
768 | goto out_putpage; | 859 | goto out; |
769 | /* | 860 | /* |
770 | * If this anonymous page is mapped only here, its pte may need | 861 | * If this anonymous page is mapped only here, its pte may need |
771 | * to be write-protected. If it's mapped elsewhere, all of its | 862 | * to be write-protected. If it's mapped elsewhere, all of its |
772 | * ptes are necessarily already write-protected. But in either | 863 | * ptes are necessarily already write-protected. But in either |
773 | * case, we need to lock and check page_count is not raised. | 864 | * case, we need to lock and check page_count is not raised. |
774 | */ | 865 | */ |
775 | if (write_protect_page(vma, oldpage, &orig_pte)) { | 866 | if (write_protect_page(vma, page, &orig_pte) == 0) { |
776 | unlock_page(oldpage); | 867 | if (!kpage) { |
777 | goto out_putpage; | 868 | /* |
869 | * While we hold page lock, upgrade page from | ||
870 | * PageAnon+anon_vma to PageKsm+NULL stable_node: | ||
871 | * stable_tree_insert() will update stable_node. | ||
872 | */ | ||
873 | set_page_stable_node(page, NULL); | ||
874 | mark_page_accessed(page); | ||
875 | err = 0; | ||
876 | } else if (pages_identical(page, kpage)) | ||
877 | err = replace_page(vma, page, kpage, orig_pte); | ||
778 | } | 878 | } |
779 | unlock_page(oldpage); | ||
780 | 879 | ||
781 | if (pages_identical(oldpage, newpage)) | 880 | if ((vma->vm_flags & VM_LOCKED) && kpage && !err) { |
782 | err = replace_page(vma, oldpage, newpage, orig_pte); | 881 | munlock_vma_page(page); |
882 | if (!PageMlocked(kpage)) { | ||
883 | unlock_page(page); | ||
884 | lock_page(kpage); | ||
885 | mlock_vma_page(kpage); | ||
886 | page = kpage; /* for final unlock */ | ||
887 | } | ||
888 | } | ||
783 | 889 | ||
784 | out_putpage: | 890 | unlock_page(page); |
785 | put_page(oldpage); | ||
786 | put_page(newpage); | ||
787 | out: | 891 | out: |
788 | return err; | 892 | return err; |
789 | } | 893 | } |
@@ -791,26 +895,31 @@ out: | |||
791 | /* | 895 | /* |
792 | * try_to_merge_with_ksm_page - like try_to_merge_two_pages, | 896 | * try_to_merge_with_ksm_page - like try_to_merge_two_pages, |
793 | * but no new kernel page is allocated: kpage must already be a ksm page. | 897 | * but no new kernel page is allocated: kpage must already be a ksm page. |
898 | * | ||
899 | * This function returns 0 if the pages were merged, -EFAULT otherwise. | ||
794 | */ | 900 | */ |
795 | static int try_to_merge_with_ksm_page(struct mm_struct *mm1, | 901 | static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item, |
796 | unsigned long addr1, | 902 | struct page *page, struct page *kpage) |
797 | struct page *page1, | ||
798 | struct page *kpage) | ||
799 | { | 903 | { |
904 | struct mm_struct *mm = rmap_item->mm; | ||
800 | struct vm_area_struct *vma; | 905 | struct vm_area_struct *vma; |
801 | int err = -EFAULT; | 906 | int err = -EFAULT; |
802 | 907 | ||
803 | down_read(&mm1->mmap_sem); | 908 | down_read(&mm->mmap_sem); |
804 | if (ksm_test_exit(mm1)) | 909 | if (ksm_test_exit(mm)) |
910 | goto out; | ||
911 | vma = find_vma(mm, rmap_item->address); | ||
912 | if (!vma || vma->vm_start > rmap_item->address) | ||
805 | goto out; | 913 | goto out; |
806 | 914 | ||
807 | vma = find_vma(mm1, addr1); | 915 | err = try_to_merge_one_page(vma, page, kpage); |
808 | if (!vma || vma->vm_start > addr1) | 916 | if (err) |
809 | goto out; | 917 | goto out; |
810 | 918 | ||
811 | err = try_to_merge_one_page(vma, page1, kpage); | 919 | /* Must get reference to anon_vma while still holding mmap_sem */ |
920 | hold_anon_vma(rmap_item, vma->anon_vma); | ||
812 | out: | 921 | out: |
813 | up_read(&mm1->mmap_sem); | 922 | up_read(&mm->mmap_sem); |
814 | return err; | 923 | return err; |
815 | } | 924 | } |
816 | 925 | ||
@@ -818,109 +927,73 @@ out: | |||
818 | * try_to_merge_two_pages - take two identical pages and prepare them | 927 | * try_to_merge_two_pages - take two identical pages and prepare them |
819 | * to be merged into one page. | 928 | * to be merged into one page. |
820 | * | 929 | * |
821 | * This function returns 0 if we successfully mapped two identical pages | 930 | * This function returns the kpage if we successfully merged two identical |
822 | * into one page, -EFAULT otherwise. | 931 | * pages into one ksm page, NULL otherwise. |
823 | * | 932 | * |
824 | * Note that this function allocates a new kernel page: if one of the pages | 933 | * Note that this function upgrades page to ksm page: if one of the pages |
825 | * is already a ksm page, try_to_merge_with_ksm_page should be used. | 934 | * is already a ksm page, try_to_merge_with_ksm_page should be used. |
826 | */ | 935 | */ |
827 | static int try_to_merge_two_pages(struct mm_struct *mm1, unsigned long addr1, | 936 | static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item, |
828 | struct page *page1, struct mm_struct *mm2, | 937 | struct page *page, |
829 | unsigned long addr2, struct page *page2) | 938 | struct rmap_item *tree_rmap_item, |
939 | struct page *tree_page) | ||
830 | { | 940 | { |
831 | struct vm_area_struct *vma; | 941 | int err; |
832 | struct page *kpage; | ||
833 | int err = -EFAULT; | ||
834 | |||
835 | /* | ||
836 | * The number of nodes in the stable tree | ||
837 | * is the number of kernel pages that we hold. | ||
838 | */ | ||
839 | if (ksm_max_kernel_pages && | ||
840 | ksm_max_kernel_pages <= ksm_pages_shared) | ||
841 | return err; | ||
842 | |||
843 | kpage = alloc_page(GFP_HIGHUSER); | ||
844 | if (!kpage) | ||
845 | return err; | ||
846 | |||
847 | down_read(&mm1->mmap_sem); | ||
848 | if (ksm_test_exit(mm1)) { | ||
849 | up_read(&mm1->mmap_sem); | ||
850 | goto out; | ||
851 | } | ||
852 | vma = find_vma(mm1, addr1); | ||
853 | if (!vma || vma->vm_start > addr1) { | ||
854 | up_read(&mm1->mmap_sem); | ||
855 | goto out; | ||
856 | } | ||
857 | |||
858 | copy_user_highpage(kpage, page1, addr1, vma); | ||
859 | err = try_to_merge_one_page(vma, page1, kpage); | ||
860 | up_read(&mm1->mmap_sem); | ||
861 | 942 | ||
943 | err = try_to_merge_with_ksm_page(rmap_item, page, NULL); | ||
862 | if (!err) { | 944 | if (!err) { |
863 | err = try_to_merge_with_ksm_page(mm2, addr2, page2, kpage); | 945 | err = try_to_merge_with_ksm_page(tree_rmap_item, |
946 | tree_page, page); | ||
864 | /* | 947 | /* |
865 | * If that fails, we have a ksm page with only one pte | 948 | * If that fails, we have a ksm page with only one pte |
866 | * pointing to it: so break it. | 949 | * pointing to it: so break it. |
867 | */ | 950 | */ |
868 | if (err) | 951 | if (err) |
869 | break_cow(mm1, addr1); | 952 | break_cow(rmap_item); |
870 | } | 953 | } |
871 | out: | 954 | return err ? NULL : page; |
872 | put_page(kpage); | ||
873 | return err; | ||
874 | } | 955 | } |
875 | 956 | ||
876 | /* | 957 | /* |
877 | * stable_tree_search - search page inside the stable tree | 958 | * stable_tree_search - search for page inside the stable tree |
878 | * @page: the page that we are searching identical pages to. | ||
879 | * @page2: pointer into identical page that we are holding inside the stable | ||
880 | * tree that we have found. | ||
881 | * @rmap_item: the reverse mapping item | ||
882 | * | 959 | * |
883 | * This function checks if there is a page inside the stable tree | 960 | * This function checks if there is a page inside the stable tree |
884 | * with identical content to the page that we are scanning right now. | 961 | * with identical content to the page that we are scanning right now. |
885 | * | 962 | * |
886 | * This function return rmap_item pointer to the identical item if found, | 963 | * This function returns the stable tree node of identical content if found, |
887 | * NULL otherwise. | 964 | * NULL otherwise. |
888 | */ | 965 | */ |
889 | static struct rmap_item *stable_tree_search(struct page *page, | 966 | static struct page *stable_tree_search(struct page *page) |
890 | struct page **page2, | ||
891 | struct rmap_item *rmap_item) | ||
892 | { | 967 | { |
893 | struct rb_node *node = root_stable_tree.rb_node; | 968 | struct rb_node *node = root_stable_tree.rb_node; |
969 | struct stable_node *stable_node; | ||
970 | |||
971 | stable_node = page_stable_node(page); | ||
972 | if (stable_node) { /* ksm page forked */ | ||
973 | get_page(page); | ||
974 | return page; | ||
975 | } | ||
894 | 976 | ||
895 | while (node) { | 977 | while (node) { |
896 | struct rmap_item *tree_rmap_item, *next_rmap_item; | 978 | struct page *tree_page; |
897 | int ret; | 979 | int ret; |
898 | 980 | ||
899 | tree_rmap_item = rb_entry(node, struct rmap_item, node); | 981 | cond_resched(); |
900 | while (tree_rmap_item) { | 982 | stable_node = rb_entry(node, struct stable_node, node); |
901 | BUG_ON(!in_stable_tree(tree_rmap_item)); | 983 | tree_page = get_ksm_page(stable_node); |
902 | cond_resched(); | 984 | if (!tree_page) |
903 | page2[0] = get_ksm_page(tree_rmap_item); | ||
904 | if (page2[0]) | ||
905 | break; | ||
906 | next_rmap_item = tree_rmap_item->next; | ||
907 | remove_rmap_item_from_tree(tree_rmap_item); | ||
908 | tree_rmap_item = next_rmap_item; | ||
909 | } | ||
910 | if (!tree_rmap_item) | ||
911 | return NULL; | 985 | return NULL; |
912 | 986 | ||
913 | ret = memcmp_pages(page, page2[0]); | 987 | ret = memcmp_pages(page, tree_page); |
914 | 988 | ||
915 | if (ret < 0) { | 989 | if (ret < 0) { |
916 | put_page(page2[0]); | 990 | put_page(tree_page); |
917 | node = node->rb_left; | 991 | node = node->rb_left; |
918 | } else if (ret > 0) { | 992 | } else if (ret > 0) { |
919 | put_page(page2[0]); | 993 | put_page(tree_page); |
920 | node = node->rb_right; | 994 | node = node->rb_right; |
921 | } else { | 995 | } else |
922 | return tree_rmap_item; | 996 | return tree_page; |
923 | } | ||
924 | } | 997 | } |
925 | 998 | ||
926 | return NULL; | 999 | return NULL; |
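stable_tree_search() above now walks an rbtree of stable_nodes ordered purely by memcmp() of page contents, and any node whose page has meanwhile been freed is pruned when get_ksm_page() returns NULL. The core of the lookup is an ordinary content-keyed binary search; a simplified userspace sketch, using a plain BST instead of the kernel rb_node API:

	#include <stddef.h>
	#include <string.h>

	#define CHUNK_SIZE 4096

	struct content_node {
		struct content_node *left, *right;
		unsigned char data[CHUNK_SIZE];
	};

	/* Identical content always compares equal, so it lands on the same node. */
	static struct content_node *content_search(struct content_node *root,
						   const unsigned char *buf)
	{
		while (root) {
			int ret = memcmp(buf, root->data, CHUNK_SIZE);
			if (ret < 0)
				root = root->left;
			else if (ret > 0)
				root = root->right;
			else
				return root;    /* identical content found */
		}
		return NULL;
	}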
@@ -930,38 +1003,26 @@ static struct rmap_item *stable_tree_search(struct page *page, | |||
930 | * stable_tree_insert - insert rmap_item pointing to new ksm page | 1003 | * stable_tree_insert - insert rmap_item pointing to new ksm page |
931 | * into the stable tree. | 1004 | * into the stable tree. |
932 | * | 1005 | * |
933 | * @page: the page that we are searching identical page to inside the stable | 1006 | * This function returns the stable tree node just allocated on success, |
934 | * tree. | 1007 | * NULL otherwise. |
935 | * @rmap_item: pointer to the reverse mapping item. | ||
936 | * | ||
937 | * This function returns rmap_item if success, NULL otherwise. | ||
938 | */ | 1008 | */ |
939 | static struct rmap_item *stable_tree_insert(struct page *page, | 1009 | static struct stable_node *stable_tree_insert(struct page *kpage) |
940 | struct rmap_item *rmap_item) | ||
941 | { | 1010 | { |
942 | struct rb_node **new = &root_stable_tree.rb_node; | 1011 | struct rb_node **new = &root_stable_tree.rb_node; |
943 | struct rb_node *parent = NULL; | 1012 | struct rb_node *parent = NULL; |
1013 | struct stable_node *stable_node; | ||
944 | 1014 | ||
945 | while (*new) { | 1015 | while (*new) { |
946 | struct rmap_item *tree_rmap_item, *next_rmap_item; | ||
947 | struct page *tree_page; | 1016 | struct page *tree_page; |
948 | int ret; | 1017 | int ret; |
949 | 1018 | ||
950 | tree_rmap_item = rb_entry(*new, struct rmap_item, node); | 1019 | cond_resched(); |
951 | while (tree_rmap_item) { | 1020 | stable_node = rb_entry(*new, struct stable_node, node); |
952 | BUG_ON(!in_stable_tree(tree_rmap_item)); | 1021 | tree_page = get_ksm_page(stable_node); |
953 | cond_resched(); | 1022 | if (!tree_page) |
954 | tree_page = get_ksm_page(tree_rmap_item); | ||
955 | if (tree_page) | ||
956 | break; | ||
957 | next_rmap_item = tree_rmap_item->next; | ||
958 | remove_rmap_item_from_tree(tree_rmap_item); | ||
959 | tree_rmap_item = next_rmap_item; | ||
960 | } | ||
961 | if (!tree_rmap_item) | ||
962 | return NULL; | 1023 | return NULL; |
963 | 1024 | ||
964 | ret = memcmp_pages(page, tree_page); | 1025 | ret = memcmp_pages(kpage, tree_page); |
965 | put_page(tree_page); | 1026 | put_page(tree_page); |
966 | 1027 | ||
967 | parent = *new; | 1028 | parent = *new; |
@@ -979,22 +1040,24 @@ static struct rmap_item *stable_tree_insert(struct page *page, | |||
979 | } | 1040 | } |
980 | } | 1041 | } |
981 | 1042 | ||
982 | rmap_item->address |= NODE_FLAG | STABLE_FLAG; | 1043 | stable_node = alloc_stable_node(); |
983 | rmap_item->next = NULL; | 1044 | if (!stable_node) |
984 | rb_link_node(&rmap_item->node, parent, new); | 1045 | return NULL; |
985 | rb_insert_color(&rmap_item->node, &root_stable_tree); | ||
986 | 1046 | ||
987 | ksm_pages_shared++; | 1047 | rb_link_node(&stable_node->node, parent, new); |
988 | return rmap_item; | 1048 | rb_insert_color(&stable_node->node, &root_stable_tree); |
1049 | |||
1050 | INIT_HLIST_HEAD(&stable_node->hlist); | ||
1051 | |||
1052 | stable_node->kpfn = page_to_pfn(kpage); | ||
1053 | set_page_stable_node(kpage, stable_node); | ||
1054 | |||
1055 | return stable_node; | ||
989 | } | 1056 | } |
990 | 1057 | ||
991 | /* | 1058 | /* |
992 | * unstable_tree_search_insert - search and insert items into the unstable tree. | 1059 | * unstable_tree_search_insert - search for identical page, |
993 | * | 1060 | * else insert rmap_item into the unstable tree. |
994 | * @page: the page that we are going to search for identical page or to insert | ||
995 | * into the unstable tree | ||
996 | * @page2: pointer into identical page that was found inside the unstable tree | ||
997 | * @rmap_item: the reverse mapping item of page | ||
998 | * | 1061 | * |
999 | * This function searches for a page in the unstable tree identical to the | 1062 | * This function searches for a page in the unstable tree identical to the |
1000 | * page currently being scanned; and if no identical page is found in the | 1063 | * page currently being scanned; and if no identical page is found in the |
@@ -1006,46 +1069,50 @@ static struct rmap_item *stable_tree_insert(struct page *page, | |||
1006 | * This function does both searching and inserting, because they share | 1069 | * This function does both searching and inserting, because they share |
1007 | * the same walking algorithm in an rbtree. | 1070 | * the same walking algorithm in an rbtree. |
1008 | */ | 1071 | */ |
1009 | static struct rmap_item *unstable_tree_search_insert(struct page *page, | 1072 | static |
1010 | struct page **page2, | 1073 | struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, |
1011 | struct rmap_item *rmap_item) | 1074 | struct page *page, |
1075 | struct page **tree_pagep) | ||
1076 | |||
1012 | { | 1077 | { |
1013 | struct rb_node **new = &root_unstable_tree.rb_node; | 1078 | struct rb_node **new = &root_unstable_tree.rb_node; |
1014 | struct rb_node *parent = NULL; | 1079 | struct rb_node *parent = NULL; |
1015 | 1080 | ||
1016 | while (*new) { | 1081 | while (*new) { |
1017 | struct rmap_item *tree_rmap_item; | 1082 | struct rmap_item *tree_rmap_item; |
1083 | struct page *tree_page; | ||
1018 | int ret; | 1084 | int ret; |
1019 | 1085 | ||
1086 | cond_resched(); | ||
1020 | tree_rmap_item = rb_entry(*new, struct rmap_item, node); | 1087 | tree_rmap_item = rb_entry(*new, struct rmap_item, node); |
1021 | page2[0] = get_mergeable_page(tree_rmap_item); | 1088 | tree_page = get_mergeable_page(tree_rmap_item); |
1022 | if (!page2[0]) | 1089 | if (!tree_page) |
1023 | return NULL; | 1090 | return NULL; |
1024 | 1091 | ||
1025 | /* | 1092 | /* |
1026 | * Don't substitute an unswappable ksm page | 1093 | * Don't substitute a ksm page for a forked page. |
1027 | * just for one good swappable forked page. | ||
1028 | */ | 1094 | */ |
1029 | if (page == page2[0]) { | 1095 | if (page == tree_page) { |
1030 | put_page(page2[0]); | 1096 | put_page(tree_page); |
1031 | return NULL; | 1097 | return NULL; |
1032 | } | 1098 | } |
1033 | 1099 | ||
1034 | ret = memcmp_pages(page, page2[0]); | 1100 | ret = memcmp_pages(page, tree_page); |
1035 | 1101 | ||
1036 | parent = *new; | 1102 | parent = *new; |
1037 | if (ret < 0) { | 1103 | if (ret < 0) { |
1038 | put_page(page2[0]); | 1104 | put_page(tree_page); |
1039 | new = &parent->rb_left; | 1105 | new = &parent->rb_left; |
1040 | } else if (ret > 0) { | 1106 | } else if (ret > 0) { |
1041 | put_page(page2[0]); | 1107 | put_page(tree_page); |
1042 | new = &parent->rb_right; | 1108 | new = &parent->rb_right; |
1043 | } else { | 1109 | } else { |
1110 | *tree_pagep = tree_page; | ||
1044 | return tree_rmap_item; | 1111 | return tree_rmap_item; |
1045 | } | 1112 | } |
1046 | } | 1113 | } |
1047 | 1114 | ||
1048 | rmap_item->address |= NODE_FLAG; | 1115 | rmap_item->address |= UNSTABLE_FLAG; |
1049 | rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK); | 1116 | rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK); |
1050 | rb_link_node(&rmap_item->node, parent, new); | 1117 | rb_link_node(&rmap_item->node, parent, new); |
1051 | rb_insert_color(&rmap_item->node, &root_unstable_tree); | 1118 | rb_insert_color(&rmap_item->node, &root_unstable_tree); |
@@ -1060,18 +1127,16 @@ static struct rmap_item *unstable_tree_search_insert(struct page *page, | |||
1060 | * the same ksm page. | 1127 | * the same ksm page. |
1061 | */ | 1128 | */ |
1062 | static void stable_tree_append(struct rmap_item *rmap_item, | 1129 | static void stable_tree_append(struct rmap_item *rmap_item, |
1063 | struct rmap_item *tree_rmap_item) | 1130 | struct stable_node *stable_node) |
1064 | { | 1131 | { |
1065 | rmap_item->next = tree_rmap_item->next; | 1132 | rmap_item->head = stable_node; |
1066 | rmap_item->prev = tree_rmap_item; | ||
1067 | |||
1068 | if (tree_rmap_item->next) | ||
1069 | tree_rmap_item->next->prev = rmap_item; | ||
1070 | |||
1071 | tree_rmap_item->next = rmap_item; | ||
1072 | rmap_item->address |= STABLE_FLAG; | 1133 | rmap_item->address |= STABLE_FLAG; |
1134 | hlist_add_head(&rmap_item->hlist, &stable_node->hlist); | ||
1073 | 1135 | ||
1074 | ksm_pages_sharing++; | 1136 | if (rmap_item->hlist.next) |
1137 | ksm_pages_sharing++; | ||
1138 | else | ||
1139 | ksm_pages_shared++; | ||
1075 | } | 1140 | } |
1076 | 1141 | ||
1077 | /* | 1142 | /* |
@@ -1085,49 +1150,37 @@ static void stable_tree_append(struct rmap_item *rmap_item, | |||
1085 | */ | 1150 | */ |
1086 | static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) | 1151 | static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) |
1087 | { | 1152 | { |
1088 | struct page *page2[1]; | ||
1089 | struct rmap_item *tree_rmap_item; | 1153 | struct rmap_item *tree_rmap_item; |
1154 | struct page *tree_page = NULL; | ||
1155 | struct stable_node *stable_node; | ||
1156 | struct page *kpage; | ||
1090 | unsigned int checksum; | 1157 | unsigned int checksum; |
1091 | int err; | 1158 | int err; |
1092 | 1159 | ||
1093 | if (in_stable_tree(rmap_item)) | 1160 | remove_rmap_item_from_tree(rmap_item); |
1094 | remove_rmap_item_from_tree(rmap_item); | ||
1095 | 1161 | ||
1096 | /* We first start with searching the page inside the stable tree */ | 1162 | /* We first start with searching the page inside the stable tree */ |
1097 | tree_rmap_item = stable_tree_search(page, page2, rmap_item); | 1163 | kpage = stable_tree_search(page); |
1098 | if (tree_rmap_item) { | 1164 | if (kpage) { |
1099 | if (page == page2[0]) /* forked */ | 1165 | err = try_to_merge_with_ksm_page(rmap_item, page, kpage); |
1100 | err = 0; | ||
1101 | else | ||
1102 | err = try_to_merge_with_ksm_page(rmap_item->mm, | ||
1103 | rmap_item->address, | ||
1104 | page, page2[0]); | ||
1105 | put_page(page2[0]); | ||
1106 | |||
1107 | if (!err) { | 1166 | if (!err) { |
1108 | /* | 1167 | /* |
1109 | * The page was successfully merged: | 1168 | * The page was successfully merged: |
1110 | * add its rmap_item to the stable tree. | 1169 | * add its rmap_item to the stable tree. |
1111 | */ | 1170 | */ |
1112 | stable_tree_append(rmap_item, tree_rmap_item); | 1171 | lock_page(kpage); |
1172 | stable_tree_append(rmap_item, page_stable_node(kpage)); | ||
1173 | unlock_page(kpage); | ||
1113 | } | 1174 | } |
1175 | put_page(kpage); | ||
1114 | return; | 1176 | return; |
1115 | } | 1177 | } |
1116 | 1178 | ||
1117 | /* | 1179 | /* |
1118 | * A ksm page might have got here by fork, but its other | 1180 | * If the hash value of the page has changed from the last time |
1119 | * references have already been removed from the stable tree. | 1181 | * we calculated it, this page is changing frequently: therefore we |
1120 | * Or it might be left over from a break_ksm which failed | 1182 | * don't want to insert it in the unstable tree, and we don't want |
1121 | * when the mem_cgroup had reached its limit: try again now. | 1183 | * to waste our time searching for something identical to it there. |
1122 | */ | ||
1123 | if (PageKsm(page)) | ||
1124 | break_cow(rmap_item->mm, rmap_item->address); | ||
1125 | |||
1126 | /* | ||
1127 | * In case the hash value of the page was changed from the last time we | ||
1128 | * have calculated it, this page to be changed frequely, therefore we | ||
1129 | * don't want to insert it to the unstable tree, and we don't want to | ||
1130 | * waste our time to search if there is something identical to it there. | ||
1131 | */ | 1184 | */ |
1132 | checksum = calc_checksum(page); | 1185 | checksum = calc_checksum(page); |
1133 | if (rmap_item->oldchecksum != checksum) { | 1186 | if (rmap_item->oldchecksum != checksum) { |
@@ -1135,21 +1188,27 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) | |||
1135 | return; | 1188 | return; |
1136 | } | 1189 | } |
1137 | 1190 | ||
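The rewritten comment above captures the volatility filter: a page only proceeds toward the unstable tree once its checksum is unchanged since the previous scan, so frequently written pages are never inserted or searched for. A small sketch of that heuristic, where calc_checksum() is a stand-in (FNV-1a here) for the kernel's jhash2()-based helper:

	#include <stdbool.h>
	#include <stddef.h>
	#include <stdint.h>

	static uint32_t calc_checksum(const unsigned char *page, size_t len)
	{
		uint32_t sum = 2166136261u;             /* FNV-1a, illustration only */
		for (size_t i = 0; i < len; i++)
			sum = (sum ^ page[i]) * 16777619u;
		return sum;
	}

	struct scan_item {
		uint32_t oldchecksum;
	};

	static bool worth_merging(struct scan_item *item,
				  const unsigned char *page, size_t len)
	{
		uint32_t checksum = calc_checksum(page, len);

		if (item->oldchecksum != checksum) {
			item->oldchecksum = checksum;   /* remember, retry next pass */
			return false;                   /* page is still volatile */
		}
		return true;                            /* stable across two scans */
	}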
1138 | tree_rmap_item = unstable_tree_search_insert(page, page2, rmap_item); | 1191 | tree_rmap_item = |
1192 | unstable_tree_search_insert(rmap_item, page, &tree_page); | ||
1139 | if (tree_rmap_item) { | 1193 | if (tree_rmap_item) { |
1140 | err = try_to_merge_two_pages(rmap_item->mm, | 1194 | kpage = try_to_merge_two_pages(rmap_item, page, |
1141 | rmap_item->address, page, | 1195 | tree_rmap_item, tree_page); |
1142 | tree_rmap_item->mm, | 1196 | put_page(tree_page); |
1143 | tree_rmap_item->address, page2[0]); | ||
1144 | /* | 1197 | /* |
1145 | * As soon as we merge this page, we want to remove the | 1198 | * As soon as we merge this page, we want to remove the |
1146 | * rmap_item of the page we have merged with from the unstable | 1199 | * rmap_item of the page we have merged with from the unstable |
1147 | * tree, and insert it instead as new node in the stable tree. | 1200 | * tree, and insert it instead as new node in the stable tree. |
1148 | */ | 1201 | */ |
1149 | if (!err) { | 1202 | if (kpage) { |
1150 | rb_erase(&tree_rmap_item->node, &root_unstable_tree); | 1203 | remove_rmap_item_from_tree(tree_rmap_item); |
1151 | tree_rmap_item->address &= ~NODE_FLAG; | 1204 | |
1152 | ksm_pages_unshared--; | 1205 | lock_page(kpage); |
1206 | stable_node = stable_tree_insert(kpage); | ||
1207 | if (stable_node) { | ||
1208 | stable_tree_append(tree_rmap_item, stable_node); | ||
1209 | stable_tree_append(rmap_item, stable_node); | ||
1210 | } | ||
1211 | unlock_page(kpage); | ||
1153 | 1212 | ||
1154 | /* | 1213 | /* |
1155 | * If we fail to insert the page into the stable tree, | 1214 | * If we fail to insert the page into the stable tree, |
@@ -1157,37 +1216,28 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) | |||
1157 | * to a ksm page left outside the stable tree, | 1216 | * to a ksm page left outside the stable tree, |
1158 | * in which case we need to break_cow on both. | 1217 | * in which case we need to break_cow on both. |
1159 | */ | 1218 | */ |
1160 | if (stable_tree_insert(page2[0], tree_rmap_item)) | 1219 | if (!stable_node) { |
1161 | stable_tree_append(rmap_item, tree_rmap_item); | 1220 | break_cow(tree_rmap_item); |
1162 | else { | 1221 | break_cow(rmap_item); |
1163 | break_cow(tree_rmap_item->mm, | ||
1164 | tree_rmap_item->address); | ||
1165 | break_cow(rmap_item->mm, rmap_item->address); | ||
1166 | } | 1222 | } |
1167 | } | 1223 | } |
1168 | |||
1169 | put_page(page2[0]); | ||
1170 | } | 1224 | } |
1171 | } | 1225 | } |
1172 | 1226 | ||
1173 | static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot, | 1227 | static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot, |
1174 | struct list_head *cur, | 1228 | struct rmap_item **rmap_list, |
1175 | unsigned long addr) | 1229 | unsigned long addr) |
1176 | { | 1230 | { |
1177 | struct rmap_item *rmap_item; | 1231 | struct rmap_item *rmap_item; |
1178 | 1232 | ||
1179 | while (cur != &mm_slot->rmap_list) { | 1233 | while (*rmap_list) { |
1180 | rmap_item = list_entry(cur, struct rmap_item, link); | 1234 | rmap_item = *rmap_list; |
1181 | if ((rmap_item->address & PAGE_MASK) == addr) { | 1235 | if ((rmap_item->address & PAGE_MASK) == addr) |
1182 | if (!in_stable_tree(rmap_item)) | ||
1183 | remove_rmap_item_from_tree(rmap_item); | ||
1184 | return rmap_item; | 1236 | return rmap_item; |
1185 | } | ||
1186 | if (rmap_item->address > addr) | 1237 | if (rmap_item->address > addr) |
1187 | break; | 1238 | break; |
1188 | cur = cur->next; | 1239 | *rmap_list = rmap_item->rmap_list; |
1189 | remove_rmap_item_from_tree(rmap_item); | 1240 | remove_rmap_item_from_tree(rmap_item); |
1190 | list_del(&rmap_item->link); | ||
1191 | free_rmap_item(rmap_item); | 1241 | free_rmap_item(rmap_item); |
1192 | } | 1242 | } |
1193 | 1243 | ||
@@ -1196,7 +1246,8 @@ static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot, | |||
1196 | /* It has already been zeroed */ | 1246 | /* It has already been zeroed */ |
1197 | rmap_item->mm = mm_slot->mm; | 1247 | rmap_item->mm = mm_slot->mm; |
1198 | rmap_item->address = addr; | 1248 | rmap_item->address = addr; |
1199 | list_add_tail(&rmap_item->link, cur); | 1249 | rmap_item->rmap_list = *rmap_list; |
1250 | *rmap_list = rmap_item; | ||
1200 | } | 1251 | } |
1201 | return rmap_item; | 1252 | return rmap_item; |
1202 | } | 1253 | } |
@@ -1221,8 +1272,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) | |||
1221 | spin_unlock(&ksm_mmlist_lock); | 1272 | spin_unlock(&ksm_mmlist_lock); |
1222 | next_mm: | 1273 | next_mm: |
1223 | ksm_scan.address = 0; | 1274 | ksm_scan.address = 0; |
1224 | ksm_scan.rmap_item = list_entry(&slot->rmap_list, | 1275 | ksm_scan.rmap_list = &slot->rmap_list; |
1225 | struct rmap_item, link); | ||
1226 | } | 1276 | } |
1227 | 1277 | ||
1228 | mm = slot->mm; | 1278 | mm = slot->mm; |
@@ -1248,10 +1298,10 @@ next_mm: | |||
1248 | flush_anon_page(vma, *page, ksm_scan.address); | 1298 | flush_anon_page(vma, *page, ksm_scan.address); |
1249 | flush_dcache_page(*page); | 1299 | flush_dcache_page(*page); |
1250 | rmap_item = get_next_rmap_item(slot, | 1300 | rmap_item = get_next_rmap_item(slot, |
1251 | ksm_scan.rmap_item->link.next, | 1301 | ksm_scan.rmap_list, ksm_scan.address); |
1252 | ksm_scan.address); | ||
1253 | if (rmap_item) { | 1302 | if (rmap_item) { |
1254 | ksm_scan.rmap_item = rmap_item; | 1303 | ksm_scan.rmap_list = |
1304 | &rmap_item->rmap_list; | ||
1255 | ksm_scan.address += PAGE_SIZE; | 1305 | ksm_scan.address += PAGE_SIZE; |
1256 | } else | 1306 | } else |
1257 | put_page(*page); | 1307 | put_page(*page); |
@@ -1267,14 +1317,13 @@ next_mm: | |||
1267 | 1317 | ||
1268 | if (ksm_test_exit(mm)) { | 1318 | if (ksm_test_exit(mm)) { |
1269 | ksm_scan.address = 0; | 1319 | ksm_scan.address = 0; |
1270 | ksm_scan.rmap_item = list_entry(&slot->rmap_list, | 1320 | ksm_scan.rmap_list = &slot->rmap_list; |
1271 | struct rmap_item, link); | ||
1272 | } | 1321 | } |
1273 | /* | 1322 | /* |
1274 | * Nuke all the rmap_items that are above this current rmap: | 1323 | * Nuke all the rmap_items that are above this current rmap: |
1275 | * because there were no VM_MERGEABLE vmas with such addresses. | 1324 | * because there were no VM_MERGEABLE vmas with such addresses. |
1276 | */ | 1325 | */ |
1277 | remove_trailing_rmap_items(slot, ksm_scan.rmap_item->link.next); | 1326 | remove_trailing_rmap_items(slot, ksm_scan.rmap_list); |
1278 | 1327 | ||
1279 | spin_lock(&ksm_mmlist_lock); | 1328 | spin_lock(&ksm_mmlist_lock); |
1280 | ksm_scan.mm_slot = list_entry(slot->mm_list.next, | 1329 | ksm_scan.mm_slot = list_entry(slot->mm_list.next, |
@@ -1327,14 +1376,6 @@ static void ksm_do_scan(unsigned int scan_npages) | |||
1327 | return; | 1376 | return; |
1328 | if (!PageKsm(page) || !in_stable_tree(rmap_item)) | 1377 | if (!PageKsm(page) || !in_stable_tree(rmap_item)) |
1329 | cmp_and_merge_page(page, rmap_item); | 1378 | cmp_and_merge_page(page, rmap_item); |
1330 | else if (page_mapcount(page) == 1) { | ||
1331 | /* | ||
1332 | * Replace now-unshared ksm page by ordinary page. | ||
1333 | */ | ||
1334 | break_cow(rmap_item->mm, rmap_item->address); | ||
1335 | remove_rmap_item_from_tree(rmap_item); | ||
1336 | rmap_item->oldchecksum = calc_checksum(page); | ||
1337 | } | ||
1338 | put_page(page); | 1379 | put_page(page); |
1339 | } | 1380 | } |
1340 | } | 1381 | } |
@@ -1379,7 +1420,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, | |||
1379 | if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | | 1420 | if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | |
1380 | VM_PFNMAP | VM_IO | VM_DONTEXPAND | | 1421 | VM_PFNMAP | VM_IO | VM_DONTEXPAND | |
1381 | VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | | 1422 | VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | |
1382 | VM_MIXEDMAP | VM_SAO)) | 1423 | VM_NONLINEAR | VM_MIXEDMAP | VM_SAO)) |
1383 | return 0; /* just ignore the advice */ | 1424 | return 0; /* just ignore the advice */ |
1384 | 1425 | ||
1385 | if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { | 1426 | if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { |
@@ -1456,7 +1497,7 @@ void __ksm_exit(struct mm_struct *mm) | |||
1456 | spin_lock(&ksm_mmlist_lock); | 1497 | spin_lock(&ksm_mmlist_lock); |
1457 | mm_slot = get_mm_slot(mm); | 1498 | mm_slot = get_mm_slot(mm); |
1458 | if (mm_slot && ksm_scan.mm_slot != mm_slot) { | 1499 | if (mm_slot && ksm_scan.mm_slot != mm_slot) { |
1459 | if (list_empty(&mm_slot->rmap_list)) { | 1500 | if (!mm_slot->rmap_list) { |
1460 | hlist_del(&mm_slot->link); | 1501 | hlist_del(&mm_slot->link); |
1461 | list_del(&mm_slot->mm_list); | 1502 | list_del(&mm_slot->mm_list); |
1462 | easy_to_free = 1; | 1503 | easy_to_free = 1; |
@@ -1477,6 +1518,249 @@ void __ksm_exit(struct mm_struct *mm) | |||
1477 | } | 1518 | } |
1478 | } | 1519 | } |
1479 | 1520 | ||
1521 | struct page *ksm_does_need_to_copy(struct page *page, | ||
1522 | struct vm_area_struct *vma, unsigned long address) | ||
1523 | { | ||
1524 | struct page *new_page; | ||
1525 | |||
1526 | unlock_page(page); /* any racers will COW it, not modify it */ | ||
1527 | |||
1528 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); | ||
1529 | if (new_page) { | ||
1530 | copy_user_highpage(new_page, page, address, vma); | ||
1531 | |||
1532 | SetPageDirty(new_page); | ||
1533 | __SetPageUptodate(new_page); | ||
1534 | SetPageSwapBacked(new_page); | ||
1535 | __set_page_locked(new_page); | ||
1536 | |||
1537 | if (page_evictable(new_page, vma)) | ||
1538 | lru_cache_add_lru(new_page, LRU_ACTIVE_ANON); | ||
1539 | else | ||
1540 | add_page_to_unevictable_list(new_page); | ||
1541 | } | ||
1542 | |||
1543 | page_cache_release(page); | ||
1544 | return new_page; | ||
1545 | } | ||
1546 | |||
1547 | int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg, | ||
1548 | unsigned long *vm_flags) | ||
1549 | { | ||
1550 | struct stable_node *stable_node; | ||
1551 | struct rmap_item *rmap_item; | ||
1552 | struct hlist_node *hlist; | ||
1553 | unsigned int mapcount = page_mapcount(page); | ||
1554 | int referenced = 0; | ||
1555 | int search_new_forks = 0; | ||
1556 | |||
1557 | VM_BUG_ON(!PageKsm(page)); | ||
1558 | VM_BUG_ON(!PageLocked(page)); | ||
1559 | |||
1560 | stable_node = page_stable_node(page); | ||
1561 | if (!stable_node) | ||
1562 | return 0; | ||
1563 | again: | ||
1564 | hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { | ||
1565 | struct anon_vma *anon_vma = rmap_item->anon_vma; | ||
1566 | struct vm_area_struct *vma; | ||
1567 | |||
1568 | spin_lock(&anon_vma->lock); | ||
1569 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | ||
1570 | if (rmap_item->address < vma->vm_start || | ||
1571 | rmap_item->address >= vma->vm_end) | ||
1572 | continue; | ||
1573 | /* | ||
1574 | * Initially we examine only the vma which covers this | ||
1575 | * rmap_item; but later, if there is still work to do, | ||
1576 | * we examine covering vmas in other mms: in case they | ||
1577 | * were forked from the original since ksmd passed. | ||
1578 | */ | ||
1579 | if ((rmap_item->mm == vma->vm_mm) == search_new_forks) | ||
1580 | continue; | ||
1581 | |||
1582 | if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) | ||
1583 | continue; | ||
1584 | |||
1585 | referenced += page_referenced_one(page, vma, | ||
1586 | rmap_item->address, &mapcount, vm_flags); | ||
1587 | if (!search_new_forks || !mapcount) | ||
1588 | break; | ||
1589 | } | ||
1590 | spin_unlock(&anon_vma->lock); | ||
1591 | if (!mapcount) | ||
1592 | goto out; | ||
1593 | } | ||
1594 | if (!search_new_forks++) | ||
1595 | goto again; | ||
1596 | out: | ||
1597 | return referenced; | ||
1598 | } | ||
1599 | |||
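page_referenced_ksm() above, like try_to_unmap_ksm() and rmap_walk_ksm() below, uses the two-pass trick its comment mentions: pass 0 visits only the vma in the mm that ksmd recorded for each rmap_item, and pass 1 revisits the anon_vma list to catch vmas in mms forked after ksmd last scanned. A stripped-down sketch of that selection logic, with placeholder types:

	/* Pass 0 selects the rmap_item's own mm, pass 1 selects every other mm. */
	struct vma_stub  { struct vma_stub *next; void *mm; };
	struct item_stub { void *mm; };

	static int walk_vmas(struct item_stub *item, struct vma_stub *head,
			     int (*visit)(struct vma_stub *))
	{
		int done = 0;

		for (int search_new_forks = 0; search_new_forks < 2 && !done;
		     search_new_forks++) {
			for (struct vma_stub *vma = head; vma; vma = vma->next) {
				/* Same comparison as the kernel code: skip "foreign"
				 * vmas on pass 0 and "own" vmas on pass 1. */
				if ((item->mm == vma->mm) == search_new_forks)
					continue;
				done = visit(vma);
				if (done)
					break;
			}
		}
		return done;
	}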
1600 | int try_to_unmap_ksm(struct page *page, enum ttu_flags flags) | ||
1601 | { | ||
1602 | struct stable_node *stable_node; | ||
1603 | struct hlist_node *hlist; | ||
1604 | struct rmap_item *rmap_item; | ||
1605 | int ret = SWAP_AGAIN; | ||
1606 | int search_new_forks = 0; | ||
1607 | |||
1608 | VM_BUG_ON(!PageKsm(page)); | ||
1609 | VM_BUG_ON(!PageLocked(page)); | ||
1610 | |||
1611 | stable_node = page_stable_node(page); | ||
1612 | if (!stable_node) | ||
1613 | return SWAP_FAIL; | ||
1614 | again: | ||
1615 | hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { | ||
1616 | struct anon_vma *anon_vma = rmap_item->anon_vma; | ||
1617 | struct vm_area_struct *vma; | ||
1618 | |||
1619 | spin_lock(&anon_vma->lock); | ||
1620 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | ||
1621 | if (rmap_item->address < vma->vm_start || | ||
1622 | rmap_item->address >= vma->vm_end) | ||
1623 | continue; | ||
1624 | /* | ||
1625 | * Initially we examine only the vma which covers this | ||
1626 | * rmap_item; but later, if there is still work to do, | ||
1627 | * we examine covering vmas in other mms: in case they | ||
1628 | * were forked from the original since ksmd passed. | ||
1629 | */ | ||
1630 | if ((rmap_item->mm == vma->vm_mm) == search_new_forks) | ||
1631 | continue; | ||
1632 | |||
1633 | ret = try_to_unmap_one(page, vma, | ||
1634 | rmap_item->address, flags); | ||
1635 | if (ret != SWAP_AGAIN || !page_mapped(page)) { | ||
1636 | spin_unlock(&anon_vma->lock); | ||
1637 | goto out; | ||
1638 | } | ||
1639 | } | ||
1640 | spin_unlock(&anon_vma->lock); | ||
1641 | } | ||
1642 | if (!search_new_forks++) | ||
1643 | goto again; | ||
1644 | out: | ||
1645 | return ret; | ||
1646 | } | ||
1647 | |||
1648 | #ifdef CONFIG_MIGRATION | ||
1649 | int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *, | ||
1650 | struct vm_area_struct *, unsigned long, void *), void *arg) | ||
1651 | { | ||
1652 | struct stable_node *stable_node; | ||
1653 | struct hlist_node *hlist; | ||
1654 | struct rmap_item *rmap_item; | ||
1655 | int ret = SWAP_AGAIN; | ||
1656 | int search_new_forks = 0; | ||
1657 | |||
1658 | VM_BUG_ON(!PageKsm(page)); | ||
1659 | VM_BUG_ON(!PageLocked(page)); | ||
1660 | |||
1661 | stable_node = page_stable_node(page); | ||
1662 | if (!stable_node) | ||
1663 | return ret; | ||
1664 | again: | ||
1665 | hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { | ||
1666 | struct anon_vma *anon_vma = rmap_item->anon_vma; | ||
1667 | struct vm_area_struct *vma; | ||
1668 | |||
1669 | spin_lock(&anon_vma->lock); | ||
1670 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | ||
1671 | if (rmap_item->address < vma->vm_start || | ||
1672 | rmap_item->address >= vma->vm_end) | ||
1673 | continue; | ||
1674 | /* | ||
1675 | * Initially we examine only the vma which covers this | ||
1676 | * rmap_item; but later, if there is still work to do, | ||
1677 | * we examine covering vmas in other mms: in case they | ||
1678 | * were forked from the original since ksmd passed. | ||
1679 | */ | ||
1680 | if ((rmap_item->mm == vma->vm_mm) == search_new_forks) | ||
1681 | continue; | ||
1682 | |||
1683 | ret = rmap_one(page, vma, rmap_item->address, arg); | ||
1684 | if (ret != SWAP_AGAIN) { | ||
1685 | spin_unlock(&anon_vma->lock); | ||
1686 | goto out; | ||
1687 | } | ||
1688 | } | ||
1689 | spin_unlock(&anon_vma->lock); | ||
1690 | } | ||
1691 | if (!search_new_forks++) | ||
1692 | goto again; | ||
1693 | out: | ||
1694 | return ret; | ||
1695 | } | ||
1696 | |||
1697 | void ksm_migrate_page(struct page *newpage, struct page *oldpage) | ||
1698 | { | ||
1699 | struct stable_node *stable_node; | ||
1700 | |||
1701 | VM_BUG_ON(!PageLocked(oldpage)); | ||
1702 | VM_BUG_ON(!PageLocked(newpage)); | ||
1703 | VM_BUG_ON(newpage->mapping != oldpage->mapping); | ||
1704 | |||
1705 | stable_node = page_stable_node(newpage); | ||
1706 | if (stable_node) { | ||
1707 | VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage)); | ||
1708 | stable_node->kpfn = page_to_pfn(newpage); | ||
1709 | } | ||
1710 | } | ||
1711 | #endif /* CONFIG_MIGRATION */ | ||
1712 | |||
1713 | #ifdef CONFIG_MEMORY_HOTREMOVE | ||
1714 | static struct stable_node *ksm_check_stable_tree(unsigned long start_pfn, | ||
1715 | unsigned long end_pfn) | ||
1716 | { | ||
1717 | struct rb_node *node; | ||
1718 | |||
1719 | for (node = rb_first(&root_stable_tree); node; node = rb_next(node)) { | ||
1720 | struct stable_node *stable_node; | ||
1721 | |||
1722 | stable_node = rb_entry(node, struct stable_node, node); | ||
1723 | if (stable_node->kpfn >= start_pfn && | ||
1724 | stable_node->kpfn < end_pfn) | ||
1725 | return stable_node; | ||
1726 | } | ||
1727 | return NULL; | ||
1728 | } | ||
1729 | |||
1730 | static int ksm_memory_callback(struct notifier_block *self, | ||
1731 | unsigned long action, void *arg) | ||
1732 | { | ||
1733 | struct memory_notify *mn = arg; | ||
1734 | struct stable_node *stable_node; | ||
1735 | |||
1736 | switch (action) { | ||
1737 | case MEM_GOING_OFFLINE: | ||
1738 | /* | ||
1739 | * Keep it very simple for now: just lock out ksmd and | ||
1740 | * MADV_UNMERGEABLE while any memory is going offline. | ||
1741 | */ | ||
1742 | mutex_lock(&ksm_thread_mutex); | ||
1743 | break; | ||
1744 | |||
1745 | case MEM_OFFLINE: | ||
1746 | /* | ||
1747 | * Most of the work is done by page migration; but there might | ||
1748 | * be a few stable_nodes left over, still pointing to struct | ||
1749 | * pages which have been offlined: prune those from the tree. | ||
1750 | */ | ||
1751 | while ((stable_node = ksm_check_stable_tree(mn->start_pfn, | ||
1752 | mn->start_pfn + mn->nr_pages)) != NULL) | ||
1753 | remove_node_from_stable_tree(stable_node); | ||
1754 | /* fallthrough */ | ||
1755 | |||
1756 | case MEM_CANCEL_OFFLINE: | ||
1757 | mutex_unlock(&ksm_thread_mutex); | ||
1758 | break; | ||
1759 | } | ||
1760 | return NOTIFY_OK; | ||
1761 | } | ||
1762 | #endif /* CONFIG_MEMORY_HOTREMOVE */ | ||
1763 | |||
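ksm_memory_callback() above holds ksm_thread_mutex across the whole offline window: MEM_GOING_OFFLINE takes it, and either MEM_OFFLINE (after pruning stale stable_nodes in the offlined pfn range) or MEM_CANCEL_OFFLINE releases it, so ksmd and MADV_UNMERGEABLE cannot touch pages that are disappearing. A minimal userspace sketch of that lock-across-the-event pattern, with illustrative event codes rather than the kernel notifier API:

	#include <pthread.h>

	enum hot_event { GOING_OFFLINE, OFFLINE, CANCEL_OFFLINE };

	static pthread_mutex_t subsystem_mutex = PTHREAD_MUTEX_INITIALIZER;

	static void prune_stale_entries(unsigned long start, unsigned long end)
	{
		/* stand-in for dropping stable_nodes whose pfn fell in [start, end) */
		(void)start; (void)end;
	}

	static int hotplug_callback(enum hot_event ev,
				    unsigned long start, unsigned long end)
	{
		switch (ev) {
		case GOING_OFFLINE:
			pthread_mutex_lock(&subsystem_mutex);   /* quiesce scanners */
			break;
		case OFFLINE:
			prune_stale_entries(start, end);
			/* fallthrough: the offline window is over either way */
		case CANCEL_OFFLINE:
			pthread_mutex_unlock(&subsystem_mutex);
			break;
		}
		return 0;
	}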
1480 | #ifdef CONFIG_SYSFS | 1764 | #ifdef CONFIG_SYSFS |
1481 | /* | 1765 | /* |
1482 | * This all compiles without CONFIG_SYSFS, but is a waste of space. | 1766 | * This all compiles without CONFIG_SYSFS, but is a waste of space. |
@@ -1555,8 +1839,8 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
1555 | /* | 1839 | /* |
1556 | * KSM_RUN_MERGE sets ksmd running, and 0 stops it running. | 1840 | * KSM_RUN_MERGE sets ksmd running, and 0 stops it running. |
1557 | * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items, | 1841 | * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items, |
1558 | * breaking COW to free the unswappable pages_shared (but leaves | 1842 | * breaking COW to free the pages_shared (but leaves mm_slots |
1559 | * mm_slots on the list for when ksmd may be set running again). | 1843 | * on the list for when ksmd may be set running again). |
1560 | */ | 1844 | */ |
1561 | 1845 | ||
1562 | mutex_lock(&ksm_thread_mutex); | 1846 | mutex_lock(&ksm_thread_mutex); |
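For reference, the run modes described in the comment above are driven from user space through /sys/kernel/mm/ksm/run: 0 stops ksmd, 1 starts it, 2 unmerges everything and stops. A minimal sketch of flipping that knob, error handling trimmed:

	#include <fcntl.h>
	#include <unistd.h>

	static int ksm_set_run(const char *mode)	/* "0", "1" or "2" */
	{
		int fd = open("/sys/kernel/mm/ksm/run", O_WRONLY);
		ssize_t n;

		if (fd < 0)
			return -1;
		n = write(fd, mode, 1);
		close(fd);
		return n == 1 ? 0 : -1;
	}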
@@ -1581,29 +1865,6 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
1581 | } | 1865 | } |
1582 | KSM_ATTR(run); | 1866 | KSM_ATTR(run); |
1583 | 1867 | ||
1584 | static ssize_t max_kernel_pages_store(struct kobject *kobj, | ||
1585 | struct kobj_attribute *attr, | ||
1586 | const char *buf, size_t count) | ||
1587 | { | ||
1588 | int err; | ||
1589 | unsigned long nr_pages; | ||
1590 | |||
1591 | err = strict_strtoul(buf, 10, &nr_pages); | ||
1592 | if (err) | ||
1593 | return -EINVAL; | ||
1594 | |||
1595 | ksm_max_kernel_pages = nr_pages; | ||
1596 | |||
1597 | return count; | ||
1598 | } | ||
1599 | |||
1600 | static ssize_t max_kernel_pages_show(struct kobject *kobj, | ||
1601 | struct kobj_attribute *attr, char *buf) | ||
1602 | { | ||
1603 | return sprintf(buf, "%lu\n", ksm_max_kernel_pages); | ||
1604 | } | ||
1605 | KSM_ATTR(max_kernel_pages); | ||
1606 | |||
1607 | static ssize_t pages_shared_show(struct kobject *kobj, | 1868 | static ssize_t pages_shared_show(struct kobject *kobj, |
1608 | struct kobj_attribute *attr, char *buf) | 1869 | struct kobj_attribute *attr, char *buf) |
1609 | { | 1870 | { |
@@ -1653,7 +1914,6 @@ static struct attribute *ksm_attrs[] = { | |||
1653 | &sleep_millisecs_attr.attr, | 1914 | &sleep_millisecs_attr.attr, |
1654 | &pages_to_scan_attr.attr, | 1915 | &pages_to_scan_attr.attr, |
1655 | &run_attr.attr, | 1916 | &run_attr.attr, |
1656 | &max_kernel_pages_attr.attr, | ||
1657 | &pages_shared_attr.attr, | 1917 | &pages_shared_attr.attr, |
1658 | &pages_sharing_attr.attr, | 1918 | &pages_sharing_attr.attr, |
1659 | &pages_unshared_attr.attr, | 1919 | &pages_unshared_attr.attr, |
@@ -1673,8 +1933,6 @@ static int __init ksm_init(void) | |||
1673 | struct task_struct *ksm_thread; | 1933 | struct task_struct *ksm_thread; |
1674 | int err; | 1934 | int err; |
1675 | 1935 | ||
1676 | ksm_init_max_kernel_pages(); | ||
1677 | |||
1678 | err = ksm_slab_init(); | 1936 | err = ksm_slab_init(); |
1679 | if (err) | 1937 | if (err) |
1680 | goto out; | 1938 | goto out; |
@@ -1697,8 +1955,18 @@ static int __init ksm_init(void) | |||
1697 | kthread_stop(ksm_thread); | 1955 | kthread_stop(ksm_thread); |
1698 | goto out_free2; | 1956 | goto out_free2; |
1699 | } | 1957 | } |
1958 | #else | ||
1959 | ksm_run = KSM_RUN_MERGE; /* no way for user to start it */ | ||
1960 | |||
1700 | #endif /* CONFIG_SYSFS */ | 1961 | #endif /* CONFIG_SYSFS */ |
1701 | 1962 | ||
1963 | #ifdef CONFIG_MEMORY_HOTREMOVE | ||
1964 | /* | ||
1965 | * Choose a high priority since the callback takes ksm_thread_mutex: | ||
1966 | * later callbacks could only be taking locks which nest within that. | ||
1967 | */ | ||
1968 | hotplug_memory_notifier(ksm_memory_callback, 100); | ||
1969 | #endif | ||
1702 | return 0; | 1970 | return 0; |
1703 | 1971 | ||
1704 | out_free2: | 1972 | out_free2: |
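The ksm.c hunks above drop the max_kernel_pages tunable and its sysfs attribute, default ksm_run to KSM_RUN_MERGE when CONFIG_SYSFS is not built in (there is no other way to start ksmd), and register a memory-hotplug notifier at high priority because its callback takes ksm_thread_mutex. As a rough illustration of the sysfs interface that remains, here is a minimal userspace sketch, assuming the usual /sys/kernel/mm/ksm/ layout; it is not part of the patch and must run as root.

/* start ksmd and read back two of the remaining counters */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

static long read_counter(const char *path)
{
        char buf[64];
        ssize_t n;
        int fd = open(path, O_RDONLY);

        if (fd < 0)
                return -1;
        n = read(fd, buf, sizeof(buf) - 1);
        close(fd);
        if (n <= 0)
                return -1;
        buf[n] = '\0';
        return atol(buf);
}

int main(void)
{
        int fd = open("/sys/kernel/mm/ksm/run", O_WRONLY);

        if (fd < 0 || write(fd, "1", 1) != 1)   /* 1 == KSM_RUN_MERGE */
                perror("start ksmd");
        if (fd >= 0)
                close(fd);

        printf("pages_shared:  %ld\n",
               read_counter("/sys/kernel/mm/ksm/pages_shared"));
        printf("pages_sharing: %ld\n",
               read_counter("/sys/kernel/mm/ksm/pages_sharing"));
        return 0;
}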
diff --git a/mm/madvise.c b/mm/madvise.c index 35b1479b7c9d..319528b8db74 100644 --- a/mm/madvise.c +++ b/mm/madvise.c | |||
@@ -9,6 +9,7 @@ | |||
9 | #include <linux/pagemap.h> | 9 | #include <linux/pagemap.h> |
10 | #include <linux/syscalls.h> | 10 | #include <linux/syscalls.h> |
11 | #include <linux/mempolicy.h> | 11 | #include <linux/mempolicy.h> |
12 | #include <linux/page-isolation.h> | ||
12 | #include <linux/hugetlb.h> | 13 | #include <linux/hugetlb.h> |
13 | #include <linux/sched.h> | 14 | #include <linux/sched.h> |
14 | #include <linux/ksm.h> | 15 | #include <linux/ksm.h> |
@@ -222,7 +223,7 @@ static long madvise_remove(struct vm_area_struct *vma, | |||
222 | /* | 223 | /* |
223 | * Error injection support for memory error handling. | 224 | * Error injection support for memory error handling. |
224 | */ | 225 | */ |
225 | static int madvise_hwpoison(unsigned long start, unsigned long end) | 226 | static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end) |
226 | { | 227 | { |
227 | int ret = 0; | 228 | int ret = 0; |
228 | 229 | ||
@@ -230,15 +231,21 @@ static int madvise_hwpoison(unsigned long start, unsigned long end) | |||
230 | return -EPERM; | 231 | return -EPERM; |
231 | for (; start < end; start += PAGE_SIZE) { | 232 | for (; start < end; start += PAGE_SIZE) { |
232 | struct page *p; | 233 | struct page *p; |
233 | int ret = get_user_pages(current, current->mm, start, 1, | 234 | int ret = get_user_pages_fast(start, 1, 0, &p); |
234 | 0, 0, &p, NULL); | ||
235 | if (ret != 1) | 235 | if (ret != 1) |
236 | return ret; | 236 | return ret; |
237 | if (bhv == MADV_SOFT_OFFLINE) { | ||
238 | printk(KERN_INFO "Soft offlining page %lx at %lx\n", | ||
239 | page_to_pfn(p), start); | ||
240 | ret = soft_offline_page(p, MF_COUNT_INCREASED); | ||
241 | if (ret) | ||
242 | break; | ||
243 | continue; | ||
244 | } | ||
237 | printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n", | 245 | printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n", |
238 | page_to_pfn(p), start); | 246 | page_to_pfn(p), start); |
239 | /* Ignore return value for now */ | 247 | /* Ignore return value for now */ |
240 | __memory_failure(page_to_pfn(p), 0, 1); | 248 | __memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED); |
241 | put_page(p); | ||
242 | } | 249 | } |
243 | return ret; | 250 | return ret; |
244 | } | 251 | } |
@@ -335,8 +342,8 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) | |||
335 | size_t len; | 342 | size_t len; |
336 | 343 | ||
337 | #ifdef CONFIG_MEMORY_FAILURE | 344 | #ifdef CONFIG_MEMORY_FAILURE |
338 | if (behavior == MADV_HWPOISON) | 345 | if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) |
339 | return madvise_hwpoison(start, start+len_in); | 346 | return madvise_hwpoison(behavior, start, start+len_in); |
340 | #endif | 347 | #endif |
341 | if (!madvise_behavior_valid(behavior)) | 348 | if (!madvise_behavior_valid(behavior)) |
342 | return error; | 349 | return error; |
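The madvise() changes above route both MADV_HWPOISON and the new MADV_SOFT_OFFLINE through madvise_hwpoison(), which now pins the page with get_user_pages_fast() and passes MF_COUNT_INCREASED instead of dropping the reference locally. A hypothetical test program could exercise the soft-offline path roughly as below; the numeric fallbacks for the advice values are assumptions for headers that predate them, and the call requires CONFIG_MEMORY_FAILURE plus CAP_SYS_ADMIN.

#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_HWPOISON
#define MADV_HWPOISON 100           /* assumed value for older headers */
#endif
#ifndef MADV_SOFT_OFFLINE
#define MADV_SOFT_OFFLINE 101       /* assumed value for older headers */
#endif

int main(void)
{
        long psz = sysconf(_SC_PAGESIZE);
        char *p = mmap(NULL, psz, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED) {
                perror("mmap");
                return 1;
        }
        p[0] = 1;                               /* fault the page in */

        /* migrate the contents away and retire the physical page */
        if (madvise(p, psz, MADV_SOFT_OFFLINE))
                perror("MADV_SOFT_OFFLINE");
        /* madvise(p, psz, MADV_HWPOISON) would inject a real poison instead */
        return 0;
}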
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f99f5991d6bb..488b644e0e8e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -38,6 +38,7 @@ | |||
38 | #include <linux/vmalloc.h> | 38 | #include <linux/vmalloc.h> |
39 | #include <linux/mm_inline.h> | 39 | #include <linux/mm_inline.h> |
40 | #include <linux/page_cgroup.h> | 40 | #include <linux/page_cgroup.h> |
41 | #include <linux/cpu.h> | ||
41 | #include "internal.h" | 42 | #include "internal.h" |
42 | 43 | ||
43 | #include <asm/uaccess.h> | 44 | #include <asm/uaccess.h> |
@@ -54,7 +55,6 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/ | |||
54 | #define do_swap_account (0) | 55 | #define do_swap_account (0) |
55 | #endif | 56 | #endif |
56 | 57 | ||
57 | static DEFINE_MUTEX(memcg_tasklist); /* can be hold under cgroup_mutex */ | ||
58 | #define SOFTLIMIT_EVENTS_THRESH (1000) | 58 | #define SOFTLIMIT_EVENTS_THRESH (1000) |
59 | 59 | ||
60 | /* | 60 | /* |
@@ -66,7 +66,7 @@ enum mem_cgroup_stat_index { | |||
66 | */ | 66 | */ |
67 | MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ | 67 | MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ |
68 | MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ | 68 | MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ |
69 | MEM_CGROUP_STAT_MAPPED_FILE, /* # of pages charged as file rss */ | 69 | MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ |
70 | MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ | 70 | MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ |
71 | MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ | 71 | MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ |
72 | MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */ | 72 | MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */ |
@@ -209,7 +209,7 @@ struct mem_cgroup { | |||
209 | int prev_priority; /* for recording reclaim priority */ | 209 | int prev_priority; /* for recording reclaim priority */ |
210 | 210 | ||
211 | /* | 211 | /* |
212 | * While reclaiming in a hiearchy, we cache the last child we | 212 | * While reclaiming in a hierarchy, we cache the last child we |
213 | * reclaimed from. | 213 | * reclaimed from. |
214 | */ | 214 | */ |
215 | int last_scanned_child; | 215 | int last_scanned_child; |
@@ -275,6 +275,7 @@ enum charge_type { | |||
275 | static void mem_cgroup_get(struct mem_cgroup *mem); | 275 | static void mem_cgroup_get(struct mem_cgroup *mem); |
276 | static void mem_cgroup_put(struct mem_cgroup *mem); | 276 | static void mem_cgroup_put(struct mem_cgroup *mem); |
277 | static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); | 277 | static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); |
278 | static void drain_all_stock_async(void); | ||
278 | 279 | ||
279 | static struct mem_cgroup_per_zone * | 280 | static struct mem_cgroup_per_zone * |
280 | mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) | 281 | mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) |
@@ -282,6 +283,11 @@ mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) | |||
282 | return &mem->info.nodeinfo[nid]->zoneinfo[zid]; | 283 | return &mem->info.nodeinfo[nid]->zoneinfo[zid]; |
283 | } | 284 | } |
284 | 285 | ||
286 | struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem) | ||
287 | { | ||
288 | return &mem->css; | ||
289 | } | ||
290 | |||
285 | static struct mem_cgroup_per_zone * | 291 | static struct mem_cgroup_per_zone * |
286 | page_cgroup_zoneinfo(struct page_cgroup *pc) | 292 | page_cgroup_zoneinfo(struct page_cgroup *pc) |
287 | { | 293 | { |
@@ -758,7 +764,13 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) | |||
758 | task_unlock(task); | 764 | task_unlock(task); |
759 | if (!curr) | 765 | if (!curr) |
760 | return 0; | 766 | return 0; |
761 | if (curr->use_hierarchy) | 767 | /* |
768 | * We should check use_hierarchy of "mem" not "curr". Because checking | ||
769 | * use_hierarchy of "curr" here makes this function return true if hierarchy is ||
770 | * enabled in "curr" and "curr" is a child of "mem" in the *cgroup* ||
771 | * hierarchy (even if use_hierarchy is disabled in "mem"). ||
772 | */ | ||
773 | if (mem->use_hierarchy) | ||
762 | ret = css_is_ancestor(&curr->css, &mem->css); | 774 | ret = css_is_ancestor(&curr->css, &mem->css); |
763 | else | 775 | else |
764 | ret = (curr == mem); | 776 | ret = (curr == mem); |
@@ -1007,7 +1019,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) | |||
1007 | static char memcg_name[PATH_MAX]; | 1019 | static char memcg_name[PATH_MAX]; |
1008 | int ret; | 1020 | int ret; |
1009 | 1021 | ||
1010 | if (!memcg) | 1022 | if (!memcg || !p) |
1011 | return; | 1023 | return; |
1012 | 1024 | ||
1013 | 1025 | ||
@@ -1137,6 +1149,8 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
1137 | victim = mem_cgroup_select_victim(root_mem); | 1149 | victim = mem_cgroup_select_victim(root_mem); |
1138 | if (victim == root_mem) { | 1150 | if (victim == root_mem) { |
1139 | loop++; | 1151 | loop++; |
1152 | if (loop >= 1) | ||
1153 | drain_all_stock_async(); | ||
1140 | if (loop >= 2) { | 1154 | if (loop >= 2) { |
1141 | /* | 1155 | /* |
1142 | * If we have not been able to reclaim | 1156 | * If we have not been able to reclaim |
@@ -1223,7 +1237,7 @@ static void record_last_oom(struct mem_cgroup *mem) | |||
1223 | * Currently used to update mapped file statistics, but the routine can be | 1237 | * Currently used to update mapped file statistics, but the routine can be |
1224 | * generalized to update other statistics as well. | 1238 | * generalized to update other statistics as well. |
1225 | */ | 1239 | */ |
1226 | void mem_cgroup_update_mapped_file_stat(struct page *page, int val) | 1240 | void mem_cgroup_update_file_mapped(struct page *page, int val) |
1227 | { | 1241 | { |
1228 | struct mem_cgroup *mem; | 1242 | struct mem_cgroup *mem; |
1229 | struct mem_cgroup_stat *stat; | 1243 | struct mem_cgroup_stat *stat; |
@@ -1231,9 +1245,6 @@ void mem_cgroup_update_mapped_file_stat(struct page *page, int val) | |||
1231 | int cpu; | 1245 | int cpu; |
1232 | struct page_cgroup *pc; | 1246 | struct page_cgroup *pc; |
1233 | 1247 | ||
1234 | if (!page_is_file_cache(page)) | ||
1235 | return; | ||
1236 | |||
1237 | pc = lookup_page_cgroup(page); | 1248 | pc = lookup_page_cgroup(page); |
1238 | if (unlikely(!pc)) | 1249 | if (unlikely(!pc)) |
1239 | return; | 1250 | return; |
@@ -1253,12 +1264,139 @@ void mem_cgroup_update_mapped_file_stat(struct page *page, int val) | |||
1253 | stat = &mem->stat; | 1264 | stat = &mem->stat; |
1254 | cpustat = &stat->cpustat[cpu]; | 1265 | cpustat = &stat->cpustat[cpu]; |
1255 | 1266 | ||
1256 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, val); | 1267 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, val); |
1257 | done: | 1268 | done: |
1258 | unlock_page_cgroup(pc); | 1269 | unlock_page_cgroup(pc); |
1259 | } | 1270 | } |
1260 | 1271 | ||
1261 | /* | 1272 | /* |
1273 | * size of the first charge attempt. "32" comes from vmscan.c's magic value. ||
1274 | * TODO: larger values may be necessary on big-iron machines. ||
1275 | */ | ||
1276 | #define CHARGE_SIZE (32 * PAGE_SIZE) | ||
1277 | struct memcg_stock_pcp { | ||
1278 | struct mem_cgroup *cached; /* this is never the root cgroup */ ||
1279 | int charge; | ||
1280 | struct work_struct work; | ||
1281 | }; | ||
1282 | static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); | ||
1283 | static atomic_t memcg_drain_count; | ||
1284 | |||
1285 | /* | ||
1286 | * Try to consume the stocked charge on this cpu. On success, PAGE_SIZE is ||
1287 | * consumed from the local stock and true is returned. If the stock is empty or ||
1288 | * holds charges from a cgroup other than the current target, false is returned ||
1289 | * and the stock will be refilled. ||
1290 | */ | ||
1291 | static bool consume_stock(struct mem_cgroup *mem) | ||
1292 | { | ||
1293 | struct memcg_stock_pcp *stock; | ||
1294 | bool ret = true; | ||
1295 | |||
1296 | stock = &get_cpu_var(memcg_stock); | ||
1297 | if (mem == stock->cached && stock->charge) | ||
1298 | stock->charge -= PAGE_SIZE; | ||
1299 | else /* need to call res_counter_charge */ | ||
1300 | ret = false; | ||
1301 | put_cpu_var(memcg_stock); | ||
1302 | return ret; | ||
1303 | } | ||
1304 | |||
1305 | /* | ||
1306 | * Returns the stock cached in the per-cpu area to the res_counter and resets the cached information. ||
1307 | */ | ||
1308 | static void drain_stock(struct memcg_stock_pcp *stock) | ||
1309 | { | ||
1310 | struct mem_cgroup *old = stock->cached; | ||
1311 | |||
1312 | if (stock->charge) { | ||
1313 | res_counter_uncharge(&old->res, stock->charge); | ||
1314 | if (do_swap_account) | ||
1315 | res_counter_uncharge(&old->memsw, stock->charge); | ||
1316 | } | ||
1317 | stock->cached = NULL; | ||
1318 | stock->charge = 0; | ||
1319 | } | ||
1320 | |||
1321 | /* | ||
1322 | * This must be called with preemption disabled, or by a thread ||
1323 | * which is pinned to the local cpu. ||
1324 | */ | ||
1325 | static void drain_local_stock(struct work_struct *dummy) | ||
1326 | { | ||
1327 | struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock); | ||
1328 | drain_stock(stock); | ||
1329 | } | ||
1330 | |||
1331 | /* | ||
1332 | * Cache charges(val) which is from res_counter, to local per_cpu area. | ||
1333 | * This will be consumed by consumt_stock() function, later. | ||
1334 | */ | ||
1335 | static void refill_stock(struct mem_cgroup *mem, int val) | ||
1336 | { | ||
1337 | struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); | ||
1338 | |||
1339 | if (stock->cached != mem) { /* reset if necessary */ | ||
1340 | drain_stock(stock); | ||
1341 | stock->cached = mem; | ||
1342 | } | ||
1343 | stock->charge += val; | ||
1344 | put_cpu_var(memcg_stock); | ||
1345 | } | ||
1346 | |||
1347 | /* | ||
1348 | * Tries to drain the charges stocked on other cpus. This function is asynchronous ||
1349 | * and just schedules a work item per cpu to drain locally on each cpu. The caller ||
1350 | * can expect some charges to be returned to the res_counter later but cannot ||
1351 | * wait for that. ||
1352 | */ | ||
1353 | static void drain_all_stock_async(void) | ||
1354 | { | ||
1355 | int cpu; | ||
1356 | /* This function schedules "drain" asynchronously; the result of the ||
1357 | * drain is not handled directly by callers, so if someone else is ||
1358 | * already draining we don't have to schedule it again. In any case the ||
1359 | * WORK_STRUCT_PENDING check in queue_work_on() will catch a race; ||
1360 | * we only do a loose check here. ||
1361 | */ | ||
1362 | if (atomic_read(&memcg_drain_count)) | ||
1363 | return; | ||
1364 | /* Notify other cpus that system-wide "drain" is running */ | ||
1365 | atomic_inc(&memcg_drain_count); | ||
1366 | get_online_cpus(); | ||
1367 | for_each_online_cpu(cpu) { | ||
1368 | struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); | ||
1369 | schedule_work_on(cpu, &stock->work); | ||
1370 | } | ||
1371 | put_online_cpus(); | ||
1372 | atomic_dec(&memcg_drain_count); | ||
1373 | /* We don't wait for flush_work */ | ||
1374 | } | ||
1375 | |||
1376 | /* This is a synchronous drain interface. */ | ||
1377 | static void drain_all_stock_sync(void) | ||
1378 | { | ||
1379 | /* called when force_empty is called */ | ||
1380 | atomic_inc(&memcg_drain_count); | ||
1381 | schedule_on_each_cpu(drain_local_stock); | ||
1382 | atomic_dec(&memcg_drain_count); | ||
1383 | } | ||
1384 | |||
1385 | static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb, | ||
1386 | unsigned long action, | ||
1387 | void *hcpu) | ||
1388 | { | ||
1389 | int cpu = (unsigned long)hcpu; | ||
1390 | struct memcg_stock_pcp *stock; | ||
1391 | |||
1392 | if (action != CPU_DEAD) | ||
1393 | return NOTIFY_OK; | ||
1394 | stock = &per_cpu(memcg_stock, cpu); | ||
1395 | drain_stock(stock); | ||
1396 | return NOTIFY_OK; | ||
1397 | } | ||
1398 | |||
1399 | /* | ||
1262 | * Unlike exported interface, "oom" parameter is added. if oom==true, | 1400 | * Unlike exported interface, "oom" parameter is added. if oom==true, |
1263 | * oom-killer can be invoked. | 1401 | * oom-killer can be invoked. |
1264 | */ | 1402 | */ |
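The block above adds a per-cpu "stock" of pre-charged bytes so the common charge path can stay off the shared res_counter, together with asynchronous and synchronous drain paths and a CPU_DEAD notifier that returns whatever a dead cpu had cached. The toy model below is plain userspace C, not memcg API: a fake cpu index stands in for get_cpu_var(), a plain long for the res_counter, and the bulk size mirrors CHARGE_SIZE from the patch.

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SIZE       4096
#define CHARGE_SIZE     (32 * PAGE_SIZE)
#define NR_CPUS         4

struct counter { long usage; };
struct stock   { struct counter *cached; long charge; };

static struct counter memcg_res;                /* stands in for res_counter */
static struct stock memcg_stock[NR_CPUS];

static bool consume_stock(int cpu, struct counter *res)
{
        struct stock *s = &memcg_stock[cpu];

        if (s->cached == res && s->charge >= PAGE_SIZE) {
                s->charge -= PAGE_SIZE;         /* charged locally, no shared update */
                return true;
        }
        return false;
}

static void refill_stock(int cpu, struct counter *res, long val)
{
        struct stock *s = &memcg_stock[cpu];

        if (s->cached != res) {                 /* drain a foreign stock first */
                if (s->cached)
                        s->cached->usage -= s->charge;
                s->cached = res;
                s->charge = 0;
        }
        s->charge += val;
}

static void charge_page(int cpu, struct counter *res)
{
        if (consume_stock(cpu, res))
                return;
        res->usage += CHARGE_SIZE;                       /* one bulk charge ... */
        refill_stock(cpu, res, CHARGE_SIZE - PAGE_SIZE); /* ... stock the rest  */
}

int main(void)
{
        for (int i = 0; i < 100; i++)
                charge_page(0, &memcg_res);
        printf("usage after 100 pages: %ld bytes (%ld bulk charges)\n",
               memcg_res.usage, memcg_res.usage / CHARGE_SIZE);
        return 0;
}

A hundred single-page charges on one cpu touch the shared counter only four times here; in the real code the unused remainder is handed back by drain_all_stock_async() once reclaim starts looping, and by drain_all_stock_sync() on force_empty.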
@@ -1269,6 +1407,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
1269 | struct mem_cgroup *mem, *mem_over_limit; | 1407 | struct mem_cgroup *mem, *mem_over_limit; |
1270 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | 1408 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; |
1271 | struct res_counter *fail_res; | 1409 | struct res_counter *fail_res; |
1410 | int csize = CHARGE_SIZE; | ||
1272 | 1411 | ||
1273 | if (unlikely(test_thread_flag(TIF_MEMDIE))) { | 1412 | if (unlikely(test_thread_flag(TIF_MEMDIE))) { |
1274 | /* Don't account this! */ | 1413 | /* Don't account this! */ |
@@ -1293,23 +1432,25 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
1293 | return 0; | 1432 | return 0; |
1294 | 1433 | ||
1295 | VM_BUG_ON(css_is_removed(&mem->css)); | 1434 | VM_BUG_ON(css_is_removed(&mem->css)); |
1435 | if (mem_cgroup_is_root(mem)) | ||
1436 | goto done; | ||
1296 | 1437 | ||
1297 | while (1) { | 1438 | while (1) { |
1298 | int ret = 0; | 1439 | int ret = 0; |
1299 | unsigned long flags = 0; | 1440 | unsigned long flags = 0; |
1300 | 1441 | ||
1301 | if (mem_cgroup_is_root(mem)) | 1442 | if (consume_stock(mem)) |
1302 | goto done; | 1443 | goto charged; |
1303 | ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res); | 1444 | |
1445 | ret = res_counter_charge(&mem->res, csize, &fail_res); | ||
1304 | if (likely(!ret)) { | 1446 | if (likely(!ret)) { |
1305 | if (!do_swap_account) | 1447 | if (!do_swap_account) |
1306 | break; | 1448 | break; |
1307 | ret = res_counter_charge(&mem->memsw, PAGE_SIZE, | 1449 | ret = res_counter_charge(&mem->memsw, csize, &fail_res); |
1308 | &fail_res); | ||
1309 | if (likely(!ret)) | 1450 | if (likely(!ret)) |
1310 | break; | 1451 | break; |
1311 | /* mem+swap counter fails */ | 1452 | /* mem+swap counter fails */ |
1312 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 1453 | res_counter_uncharge(&mem->res, csize); |
1313 | flags |= MEM_CGROUP_RECLAIM_NOSWAP; | 1454 | flags |= MEM_CGROUP_RECLAIM_NOSWAP; |
1314 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, | 1455 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, |
1315 | memsw); | 1456 | memsw); |
@@ -1318,6 +1459,11 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
1318 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, | 1459 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, |
1319 | res); | 1460 | res); |
1320 | 1461 | ||
1462 | /* reduce request size and retry */ | ||
1463 | if (csize > PAGE_SIZE) { | ||
1464 | csize = PAGE_SIZE; | ||
1465 | continue; | ||
1466 | } | ||
1321 | if (!(gfp_mask & __GFP_WAIT)) | 1467 | if (!(gfp_mask & __GFP_WAIT)) |
1322 | goto nomem; | 1468 | goto nomem; |
1323 | 1469 | ||
@@ -1339,14 +1485,15 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
1339 | 1485 | ||
1340 | if (!nr_retries--) { | 1486 | if (!nr_retries--) { |
1341 | if (oom) { | 1487 | if (oom) { |
1342 | mutex_lock(&memcg_tasklist); | ||
1343 | mem_cgroup_out_of_memory(mem_over_limit, gfp_mask); | 1488 | mem_cgroup_out_of_memory(mem_over_limit, gfp_mask); |
1344 | mutex_unlock(&memcg_tasklist); | ||
1345 | record_last_oom(mem_over_limit); | 1489 | record_last_oom(mem_over_limit); |
1346 | } | 1490 | } |
1347 | goto nomem; | 1491 | goto nomem; |
1348 | } | 1492 | } |
1349 | } | 1493 | } |
1494 | if (csize > PAGE_SIZE) | ||
1495 | refill_stock(mem, csize - PAGE_SIZE); | ||
1496 | charged: | ||
1350 | /* | 1497 | /* |
1351 | * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. | 1498 | * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. |
1352 | * if they exceeds softlimit. | 1499 | * if they exceeds softlimit. |
@@ -1361,6 +1508,21 @@ nomem: | |||
1361 | } | 1508 | } |
1362 | 1509 | ||
1363 | /* | 1510 | /* |
1511 | * Sometimes we have to undo a charge we got from try_charge(). ||
1512 | * This function does that uncharge and puts the css refcount ||
1513 | * taken by try_charge(). ||
1514 | */ | ||
1515 | static void mem_cgroup_cancel_charge(struct mem_cgroup *mem) | ||
1516 | { | ||
1517 | if (!mem_cgroup_is_root(mem)) { | ||
1518 | res_counter_uncharge(&mem->res, PAGE_SIZE); | ||
1519 | if (do_swap_account) | ||
1520 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | ||
1521 | } | ||
1522 | css_put(&mem->css); | ||
1523 | } | ||
1524 | |||
1525 | /* | ||
1364 | * A helper function to get a mem_cgroup from an ID. Must be called under | 1526 | * A helper function to get a mem_cgroup from an ID. Must be called under |
1365 | * rcu_read_lock(). The caller must check css_is_removed() or similar if | 1527 | * rcu_read_lock(). The caller must check css_is_removed() or similar if |
1366 | * that is a concern. (dropping refcnt from swap can be called against removed | 1528 | * that is a concern. (dropping refcnt from swap can be called against removed |
@@ -1379,25 +1541,22 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) | |||
1379 | return container_of(css, struct mem_cgroup, css); | 1541 | return container_of(css, struct mem_cgroup, css); |
1380 | } | 1542 | } |
1381 | 1543 | ||
1382 | static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page) | 1544 | struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) |
1383 | { | 1545 | { |
1384 | struct mem_cgroup *mem; | 1546 | struct mem_cgroup *mem = NULL; |
1385 | struct page_cgroup *pc; | 1547 | struct page_cgroup *pc; |
1386 | unsigned short id; | 1548 | unsigned short id; |
1387 | swp_entry_t ent; | 1549 | swp_entry_t ent; |
1388 | 1550 | ||
1389 | VM_BUG_ON(!PageLocked(page)); | 1551 | VM_BUG_ON(!PageLocked(page)); |
1390 | 1552 | ||
1391 | if (!PageSwapCache(page)) | ||
1392 | return NULL; | ||
1393 | |||
1394 | pc = lookup_page_cgroup(page); | 1553 | pc = lookup_page_cgroup(page); |
1395 | lock_page_cgroup(pc); | 1554 | lock_page_cgroup(pc); |
1396 | if (PageCgroupUsed(pc)) { | 1555 | if (PageCgroupUsed(pc)) { |
1397 | mem = pc->mem_cgroup; | 1556 | mem = pc->mem_cgroup; |
1398 | if (mem && !css_tryget(&mem->css)) | 1557 | if (mem && !css_tryget(&mem->css)) |
1399 | mem = NULL; | 1558 | mem = NULL; |
1400 | } else { | 1559 | } else if (PageSwapCache(page)) { |
1401 | ent.val = page_private(page); | 1560 | ent.val = page_private(page); |
1402 | id = lookup_swap_cgroup(ent); | 1561 | id = lookup_swap_cgroup(ent); |
1403 | rcu_read_lock(); | 1562 | rcu_read_lock(); |
@@ -1426,12 +1585,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | |||
1426 | lock_page_cgroup(pc); | 1585 | lock_page_cgroup(pc); |
1427 | if (unlikely(PageCgroupUsed(pc))) { | 1586 | if (unlikely(PageCgroupUsed(pc))) { |
1428 | unlock_page_cgroup(pc); | 1587 | unlock_page_cgroup(pc); |
1429 | if (!mem_cgroup_is_root(mem)) { | 1588 | mem_cgroup_cancel_charge(mem); |
1430 | res_counter_uncharge(&mem->res, PAGE_SIZE); | ||
1431 | if (do_swap_account) | ||
1432 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | ||
1433 | } | ||
1434 | css_put(&mem->css); | ||
1435 | return; | 1589 | return; |
1436 | } | 1590 | } |
1437 | 1591 | ||
@@ -1464,27 +1618,22 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | |||
1464 | } | 1618 | } |
1465 | 1619 | ||
1466 | /** | 1620 | /** |
1467 | * mem_cgroup_move_account - move account of the page | 1621 | * __mem_cgroup_move_account - move account of the page |
1468 | * @pc: page_cgroup of the page. | 1622 | * @pc: page_cgroup of the page. |
1469 | * @from: mem_cgroup which the page is moved from. | 1623 | * @from: mem_cgroup which the page is moved from. |
1470 | * @to: mem_cgroup which the page is moved to. @from != @to. | 1624 | * @to: mem_cgroup which the page is moved to. @from != @to. |
1471 | * | 1625 | * |
1472 | * The caller must confirm following. | 1626 | * The caller must confirm following. |
1473 | * - page is not on LRU (isolate_page() is useful.) | 1627 | * - page is not on LRU (isolate_page() is useful.) |
1474 | * | 1628 | * - the pc is locked, used, and ->mem_cgroup points to @from. |
1475 | * returns 0 at success, | ||
1476 | * returns -EBUSY when lock is busy or "pc" is unstable. | ||
1477 | * | 1629 | * |
1478 | * This function does "uncharge" from old cgroup but doesn't do "charge" to | 1630 | * This function does "uncharge" from old cgroup but doesn't do "charge" to |
1479 | * new cgroup. It should be done by a caller. | 1631 | * new cgroup. It should be done by a caller. |
1480 | */ | 1632 | */ |
1481 | 1633 | ||
1482 | static int mem_cgroup_move_account(struct page_cgroup *pc, | 1634 | static void __mem_cgroup_move_account(struct page_cgroup *pc, |
1483 | struct mem_cgroup *from, struct mem_cgroup *to) | 1635 | struct mem_cgroup *from, struct mem_cgroup *to) |
1484 | { | 1636 | { |
1485 | struct mem_cgroup_per_zone *from_mz, *to_mz; | ||
1486 | int nid, zid; | ||
1487 | int ret = -EBUSY; | ||
1488 | struct page *page; | 1637 | struct page *page; |
1489 | int cpu; | 1638 | int cpu; |
1490 | struct mem_cgroup_stat *stat; | 1639 | struct mem_cgroup_stat *stat; |
@@ -1492,38 +1641,27 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, | |||
1492 | 1641 | ||
1493 | VM_BUG_ON(from == to); | 1642 | VM_BUG_ON(from == to); |
1494 | VM_BUG_ON(PageLRU(pc->page)); | 1643 | VM_BUG_ON(PageLRU(pc->page)); |
1495 | 1644 | VM_BUG_ON(!PageCgroupLocked(pc)); | |
1496 | nid = page_cgroup_nid(pc); | 1645 | VM_BUG_ON(!PageCgroupUsed(pc)); |
1497 | zid = page_cgroup_zid(pc); | 1646 | VM_BUG_ON(pc->mem_cgroup != from); |
1498 | from_mz = mem_cgroup_zoneinfo(from, nid, zid); | ||
1499 | to_mz = mem_cgroup_zoneinfo(to, nid, zid); | ||
1500 | |||
1501 | if (!trylock_page_cgroup(pc)) | ||
1502 | return ret; | ||
1503 | |||
1504 | if (!PageCgroupUsed(pc)) | ||
1505 | goto out; | ||
1506 | |||
1507 | if (pc->mem_cgroup != from) | ||
1508 | goto out; | ||
1509 | 1647 | ||
1510 | if (!mem_cgroup_is_root(from)) | 1648 | if (!mem_cgroup_is_root(from)) |
1511 | res_counter_uncharge(&from->res, PAGE_SIZE); | 1649 | res_counter_uncharge(&from->res, PAGE_SIZE); |
1512 | mem_cgroup_charge_statistics(from, pc, false); | 1650 | mem_cgroup_charge_statistics(from, pc, false); |
1513 | 1651 | ||
1514 | page = pc->page; | 1652 | page = pc->page; |
1515 | if (page_is_file_cache(page) && page_mapped(page)) { | 1653 | if (page_mapped(page) && !PageAnon(page)) { |
1516 | cpu = smp_processor_id(); | 1654 | cpu = smp_processor_id(); |
1517 | /* Update mapped_file data for mem_cgroup "from" */ | 1655 | /* Update mapped_file data for mem_cgroup "from" */ |
1518 | stat = &from->stat; | 1656 | stat = &from->stat; |
1519 | cpustat = &stat->cpustat[cpu]; | 1657 | cpustat = &stat->cpustat[cpu]; |
1520 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, | 1658 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, |
1521 | -1); | 1659 | -1); |
1522 | 1660 | ||
1523 | /* Update mapped_file data for mem_cgroup "to" */ | 1661 | /* Update mapped_file data for mem_cgroup "to" */ |
1524 | stat = &to->stat; | 1662 | stat = &to->stat; |
1525 | cpustat = &stat->cpustat[cpu]; | 1663 | cpustat = &stat->cpustat[cpu]; |
1526 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, | 1664 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, |
1527 | 1); | 1665 | 1); |
1528 | } | 1666 | } |
1529 | 1667 | ||
@@ -1534,15 +1672,28 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, | |||
1534 | css_get(&to->css); | 1672 | css_get(&to->css); |
1535 | pc->mem_cgroup = to; | 1673 | pc->mem_cgroup = to; |
1536 | mem_cgroup_charge_statistics(to, pc, true); | 1674 | mem_cgroup_charge_statistics(to, pc, true); |
1537 | ret = 0; | ||
1538 | out: | ||
1539 | unlock_page_cgroup(pc); | ||
1540 | /* | 1675 | /* |
1541 | * We charge against "to", which may not have any tasks. Then "to" | 1676 | * We charge against "to", which may not have any tasks. Then "to" |
1542 | * can be under rmdir(). But in the current implementation, the caller of | 1677 | * can be under rmdir(). But in the current implementation, the caller of |
1543 | * this function is just force_empty() and it's guaranteed that | 1678 | * this function is just force_empty() and it's guaranteed that |
1544 | * "to" is never removed. So we don't check rmdir status here. | 1679 | * "to" is never removed. So we don't check rmdir status here. |
1545 | */ | 1680 | */ |
1681 | } | ||
1682 | |||
1683 | /* | ||
1684 | * check whether the @pc is valid for moving account and call | ||
1685 | * __mem_cgroup_move_account() | ||
1686 | */ | ||
1687 | static int mem_cgroup_move_account(struct page_cgroup *pc, | ||
1688 | struct mem_cgroup *from, struct mem_cgroup *to) | ||
1689 | { | ||
1690 | int ret = -EINVAL; | ||
1691 | lock_page_cgroup(pc); | ||
1692 | if (PageCgroupUsed(pc) && pc->mem_cgroup == from) { | ||
1693 | __mem_cgroup_move_account(pc, from, to); | ||
1694 | ret = 0; | ||
1695 | } | ||
1696 | unlock_page_cgroup(pc); | ||
1546 | return ret; | 1697 | return ret; |
1547 | } | 1698 | } |
1548 | 1699 | ||
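mem_cgroup_move_account() used to trylock the page_cgroup, validate it and bail out with -EBUSY; the rewrite above splits it into a locked wrapper that validates and returns -EINVAL on mismatch, and an __mem_cgroup_move_account() worker that may assume the pc is locked, used, and owned by @from. A stubbed pthreads sketch of that wrapper/worker split (build with -pthread; nothing here is memcg API, the names are invented):

#include <assert.h>
#include <pthread.h>
#include <stdio.h>

struct page_cgroup {
        pthread_mutex_t lock;
        int used;
        int owner;                      /* stands in for pc->mem_cgroup */
};

/* caller must hold pc->lock; pc must be used and owned by 'from' */
static void __move_account(struct page_cgroup *pc, int from, int to)
{
        assert(pc->used && pc->owner == from);
        pc->owner = to;                 /* uncharge 'from', charge 'to' in the real code */
}

static int move_account(struct page_cgroup *pc, int from, int to)
{
        int ret = -1;                   /* -EINVAL in the kernel */

        pthread_mutex_lock(&pc->lock);
        if (pc->used && pc->owner == from) {
                __move_account(pc, from, to);
                ret = 0;
        }
        pthread_mutex_unlock(&pc->lock);
        return ret;
}

int main(void)
{
        struct page_cgroup pc = { PTHREAD_MUTEX_INITIALIZER, 1, 1 };

        printf("move 1->2: %d\n", move_account(&pc, 1, 2));       /* 0  */
        printf("move 1->2 again: %d\n", move_account(&pc, 1, 2)); /* -1 */
        return 0;
}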
@@ -1564,45 +1715,27 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, | |||
1564 | if (!pcg) | 1715 | if (!pcg) |
1565 | return -EINVAL; | 1716 | return -EINVAL; |
1566 | 1717 | ||
1718 | ret = -EBUSY; | ||
1719 | if (!get_page_unless_zero(page)) | ||
1720 | goto out; | ||
1721 | if (isolate_lru_page(page)) | ||
1722 | goto put; | ||
1567 | 1723 | ||
1568 | parent = mem_cgroup_from_cont(pcg); | 1724 | parent = mem_cgroup_from_cont(pcg); |
1569 | |||
1570 | |||
1571 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page); | 1725 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page); |
1572 | if (ret || !parent) | 1726 | if (ret || !parent) |
1573 | return ret; | 1727 | goto put_back; |
1574 | |||
1575 | if (!get_page_unless_zero(page)) { | ||
1576 | ret = -EBUSY; | ||
1577 | goto uncharge; | ||
1578 | } | ||
1579 | |||
1580 | ret = isolate_lru_page(page); | ||
1581 | |||
1582 | if (ret) | ||
1583 | goto cancel; | ||
1584 | 1728 | ||
1585 | ret = mem_cgroup_move_account(pc, child, parent); | 1729 | ret = mem_cgroup_move_account(pc, child, parent); |
1586 | 1730 | if (!ret) | |
1731 | css_put(&parent->css); /* drop extra refcnt by try_charge() */ | ||
1732 | else | ||
1733 | mem_cgroup_cancel_charge(parent); /* does css_put */ | ||
1734 | put_back: | ||
1587 | putback_lru_page(page); | 1735 | putback_lru_page(page); |
1588 | if (!ret) { | 1736 | put: |
1589 | put_page(page); | ||
1590 | /* drop extra refcnt by try_charge() */ | ||
1591 | css_put(&parent->css); | ||
1592 | return 0; | ||
1593 | } | ||
1594 | |||
1595 | cancel: | ||
1596 | put_page(page); | 1737 | put_page(page); |
1597 | uncharge: | 1738 | out: |
1598 | /* drop extra refcnt by try_charge() */ | ||
1599 | css_put(&parent->css); | ||
1600 | /* uncharge if move fails */ | ||
1601 | if (!mem_cgroup_is_root(parent)) { | ||
1602 | res_counter_uncharge(&parent->res, PAGE_SIZE); | ||
1603 | if (do_swap_account) | ||
1604 | res_counter_uncharge(&parent->memsw, PAGE_SIZE); | ||
1605 | } | ||
1606 | return ret; | 1739 | return ret; |
1607 | } | 1740 | } |
1608 | 1741 | ||
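mem_cgroup_move_parent() above is reordered so the page reference and LRU isolation are taken before the parent is charged, and every failure unwinds through one chain of labels instead of the old cancel/uncharge branches. A stubbed sketch of that control flow (each helper is a stand-in that fakes an outcome, not the real function):

#include <stdio.h>

static int  get_page(void)       { return 1; }  /* pretend the ref is taken */
static int  isolate_page(void)   { return 0; }  /* 0 == isolated from LRU   */
static int  charge_parent(void)  { return 0; }
static int  move_account(void)   { return -1; } /* pretend the move fails   */
static void cancel_charge(void)  { puts("cancel parent charge"); }
static void putback_page(void)   { puts("putback to LRU"); }
static void put_page(void)       { puts("drop page ref"); }

static int move_parent(void)
{
        int ret = -1;                   /* -EBUSY */

        if (!get_page())
                goto out;
        if (isolate_page())
                goto put;

        ret = charge_parent();
        if (ret)
                goto put_back;

        ret = move_account();
        if (ret)
                cancel_charge();        /* undo the parent charge on failure */
put_back:
        putback_page();
put:
        put_page();
out:
        return ret;
}

int main(void)
{
        printf("move_parent: %d\n", move_parent());
        return 0;
}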
@@ -1720,7 +1853,7 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | |||
1720 | /* | 1853 | /* |
1721 | * While swap-in, try_charge -> commit or cancel, the page is locked. | 1854 | * While swap-in, try_charge -> commit or cancel, the page is locked. |
1722 | * And when try_charge() successfully returns, one refcnt to memcg without | 1855 | * And when try_charge() successfully returns, one refcnt to memcg without |
1723 | * struct page_cgroup is aquired. This refcnt will be cumsumed by | 1856 | * struct page_cgroup is acquired. This refcnt will be consumed by |
1724 | * "commit()" or removed by "cancel()" | 1857 | * "commit()" or removed by "cancel()" |
1725 | */ | 1858 | */ |
1726 | int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | 1859 | int mem_cgroup_try_charge_swapin(struct mm_struct *mm, |
@@ -1737,12 +1870,13 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | |||
1737 | goto charge_cur_mm; | 1870 | goto charge_cur_mm; |
1738 | /* | 1871 | /* |
1739 | * A racing thread's fault, or swapoff, may have already updated | 1872 | * A racing thread's fault, or swapoff, may have already updated |
1740 | * the pte, and even removed page from swap cache: return success | 1873 | * the pte, and even removed page from swap cache: in those cases |
1741 | * to go on to do_swap_page()'s pte_same() test, which should fail. | 1874 | * do_swap_page()'s pte_same() test will fail; but there's also a |
1875 | * KSM case which does need to charge the page. | ||
1742 | */ | 1876 | */ |
1743 | if (!PageSwapCache(page)) | 1877 | if (!PageSwapCache(page)) |
1744 | return 0; | 1878 | goto charge_cur_mm; |
1745 | mem = try_get_mem_cgroup_from_swapcache(page); | 1879 | mem = try_get_mem_cgroup_from_page(page); |
1746 | if (!mem) | 1880 | if (!mem) |
1747 | goto charge_cur_mm; | 1881 | goto charge_cur_mm; |
1748 | *ptr = mem; | 1882 | *ptr = mem; |
@@ -1818,14 +1952,53 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) | |||
1818 | return; | 1952 | return; |
1819 | if (!mem) | 1953 | if (!mem) |
1820 | return; | 1954 | return; |
1821 | if (!mem_cgroup_is_root(mem)) { | 1955 | mem_cgroup_cancel_charge(mem); |
1822 | res_counter_uncharge(&mem->res, PAGE_SIZE); | ||
1823 | if (do_swap_account) | ||
1824 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | ||
1825 | } | ||
1826 | css_put(&mem->css); | ||
1827 | } | 1956 | } |
1828 | 1957 | ||
1958 | static void | ||
1959 | __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) | ||
1960 | { | ||
1961 | struct memcg_batch_info *batch = NULL; | ||
1962 | bool uncharge_memsw = true; | ||
1963 | /* If swapout, usage of swap doesn't decrease */ | ||
1964 | if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) | ||
1965 | uncharge_memsw = false; | ||
1966 | /* | ||
1967 | * do_batch > 0 when unmapping pages or during inode invalidate/truncate. ||
1968 | * In those cases, all pages freed continuously can be expected to be in ||
1969 | * the same cgroup and we have a chance to coalesce uncharges. ||
1970 | * But we uncharge one by one if the task is being killed by OOM (TIF_MEMDIE) ||
1971 | * because we want to do the uncharge as soon as possible. ||
1972 | */ | ||
1973 | if (!current->memcg_batch.do_batch || test_thread_flag(TIF_MEMDIE)) | ||
1974 | goto direct_uncharge; | ||
1975 | |||
1976 | batch = ¤t->memcg_batch; | ||
1977 | /* | ||
1978 | * Usually, we do css_get() when we remember a memcg pointer. ||
1979 | * But in this case, we keep res->usage until the end of a series of ||
1980 | * uncharges, so it's ok to ignore the memcg's refcnt. ||
1981 | */ | ||
1982 | if (!batch->memcg) | ||
1983 | batch->memcg = mem; | ||
1984 | /* | ||
1985 | * In the typical case, batch->memcg == mem. This means we can ||
1986 | * merge a series of uncharges into a single uncharge of the res_counter. ||
1987 | * If not, we uncharge the res_counter one by one. ||
1988 | */ | ||
1989 | if (batch->memcg != mem) | ||
1990 | goto direct_uncharge; | ||
1991 | /* remember freed charge and uncharge it later */ | ||
1992 | batch->bytes += PAGE_SIZE; | ||
1993 | if (uncharge_memsw) | ||
1994 | batch->memsw_bytes += PAGE_SIZE; | ||
1995 | return; | ||
1996 | direct_uncharge: | ||
1997 | res_counter_uncharge(&mem->res, PAGE_SIZE); | ||
1998 | if (uncharge_memsw) | ||
1999 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | ||
2000 | return; | ||
2001 | } | ||
1829 | 2002 | ||
1830 | /* | 2003 | /* |
1831 | * uncharge if !page_mapped(page) | 2004 | * uncharge if !page_mapped(page) |
@@ -1874,12 +2047,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
1874 | break; | 2047 | break; |
1875 | } | 2048 | } |
1876 | 2049 | ||
1877 | if (!mem_cgroup_is_root(mem)) { | 2050 | if (!mem_cgroup_is_root(mem)) |
1878 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 2051 | __do_uncharge(mem, ctype); |
1879 | if (do_swap_account && | ||
1880 | (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) | ||
1881 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | ||
1882 | } | ||
1883 | if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) | 2052 | if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) |
1884 | mem_cgroup_swap_statistics(mem, true); | 2053 | mem_cgroup_swap_statistics(mem, true); |
1885 | mem_cgroup_charge_statistics(mem, pc, false); | 2054 | mem_cgroup_charge_statistics(mem, pc, false); |
@@ -1925,6 +2094,50 @@ void mem_cgroup_uncharge_cache_page(struct page *page) | |||
1925 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); | 2094 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); |
1926 | } | 2095 | } |
1927 | 2096 | ||
2097 | /* | ||
2098 | * The batch start/end pair is called in unmap_page_range/invalidate/truncate. ||
2099 | * In those cases, pages are freed continuously and we can expect them ||
2100 | * to be in the same memcg. Each of these callers itself limits the number ||
2101 | * of pages freed at once, so uncharge_start/end() is called properly. ||
2102 | * This pair may be entered more than once (nested) in a context. ||
2103 | */ ||
2104 | |||
2105 | void mem_cgroup_uncharge_start(void) | ||
2106 | { | ||
2107 | current->memcg_batch.do_batch++; | ||
2108 | /* We can do nest. */ | ||
2109 | if (current->memcg_batch.do_batch == 1) { | ||
2110 | current->memcg_batch.memcg = NULL; | ||
2111 | current->memcg_batch.bytes = 0; | ||
2112 | current->memcg_batch.memsw_bytes = 0; | ||
2113 | } | ||
2114 | } | ||
2115 | |||
2116 | void mem_cgroup_uncharge_end(void) | ||
2117 | { | ||
2118 | struct memcg_batch_info *batch = ¤t->memcg_batch; | ||
2119 | |||
2120 | if (!batch->do_batch) | ||
2121 | return; | ||
2122 | |||
2123 | batch->do_batch--; | ||
2124 | if (batch->do_batch) /* If stacked, do nothing. */ | ||
2125 | return; | ||
2126 | |||
2127 | if (!batch->memcg) | ||
2128 | return; | ||
2129 | /* | ||
2130 | * This "batch->memcg" is valid without any css_get/put etc... | ||
2131 | * because we hide charges behind us. ||
2132 | */ | ||
2133 | if (batch->bytes) | ||
2134 | res_counter_uncharge(&batch->memcg->res, batch->bytes); | ||
2135 | if (batch->memsw_bytes) | ||
2136 | res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes); | ||
2137 | /* forget this pointer (for sanity check) */ | ||
2138 | batch->memcg = NULL; | ||
2139 | } | ||
2140 | |||
1928 | #ifdef CONFIG_SWAP | 2141 | #ifdef CONFIG_SWAP |
1929 | /* | 2142 | /* |
1930 | * called after __delete_from_swap_cache() and drop "page" account. | 2143 | * called after __delete_from_swap_cache() and drop "page" account. |
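__do_uncharge() and the mem_cgroup_uncharge_start/end pair above let a run of frees from the same memcg collapse into a single res_counter update, with a nesting counter so the pair can be entered recursively and a fall-back to direct uncharge for OOM-killed tasks or pages from a different memcg. The toy model below is single-threaded userspace C with no kernel types; it only shows the nesting and coalescing behaviour.

#include <stdio.h>

#define PAGE_SIZE 4096

struct counter    { long usage; };
struct batch_info { int do_batch; struct counter *memcg; long bytes; };

static struct batch_info current_batch;         /* current->memcg_batch */

static void uncharge_start(void)
{
        if (++current_batch.do_batch == 1) {    /* batches may nest */
                current_batch.memcg = NULL;
                current_batch.bytes = 0;
        }
}

static void uncharge_page(struct counter *memcg)
{
        if (!current_batch.do_batch) {
                memcg->usage -= PAGE_SIZE;      /* no batch open: direct uncharge */
                return;
        }
        if (current_batch.memcg && current_batch.memcg != memcg) {
                memcg->usage -= PAGE_SIZE;      /* foreign memcg: direct uncharge */
                return;
        }
        if (!current_batch.memcg)
                current_batch.memcg = memcg;
        current_batch.bytes += PAGE_SIZE;       /* coalesce for later */
}

static void uncharge_end(void)
{
        if (--current_batch.do_batch)
                return;                         /* still nested */
        if (current_batch.memcg && current_batch.bytes)
                current_batch.memcg->usage -= current_batch.bytes;
        current_batch.memcg = NULL;
}

int main(void)
{
        struct counter memcg = { .usage = 64 * PAGE_SIZE };

        uncharge_start();
        for (int i = 0; i < 16; i++)
                uncharge_page(&memcg);          /* accumulates 16 pages */
        uncharge_end();                         /* one update happens here */
        printf("usage: %ld pages\n", memcg.usage / PAGE_SIZE);
        return 0;
}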
@@ -2100,7 +2313,6 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
2100 | unsigned long long val) | 2313 | unsigned long long val) |
2101 | { | 2314 | { |
2102 | int retry_count; | 2315 | int retry_count; |
2103 | int progress; | ||
2104 | u64 memswlimit; | 2316 | u64 memswlimit; |
2105 | int ret = 0; | 2317 | int ret = 0; |
2106 | int children = mem_cgroup_count_children(memcg); | 2318 | int children = mem_cgroup_count_children(memcg); |
@@ -2144,8 +2356,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
2144 | if (!ret) | 2356 | if (!ret) |
2145 | break; | 2357 | break; |
2146 | 2358 | ||
2147 | progress = mem_cgroup_hierarchical_reclaim(memcg, NULL, | 2359 | mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, |
2148 | GFP_KERNEL, | ||
2149 | MEM_CGROUP_RECLAIM_SHRINK); | 2360 | MEM_CGROUP_RECLAIM_SHRINK); |
2150 | curusage = res_counter_read_u64(&memcg->res, RES_USAGE); | 2361 | curusage = res_counter_read_u64(&memcg->res, RES_USAGE); |
2151 | /* Usage is reduced ? */ | 2362 | /* Usage is reduced ? */ |
@@ -2384,6 +2595,7 @@ move_account: | |||
2384 | goto out; | 2595 | goto out; |
2385 | /* This is for making all *used* pages to be on LRU. */ | 2596 | /* This is for making all *used* pages to be on LRU. */ |
2386 | lru_add_drain_all(); | 2597 | lru_add_drain_all(); |
2598 | drain_all_stock_sync(); | ||
2387 | ret = 0; | 2599 | ret = 0; |
2388 | for_each_node_state(node, N_HIGH_MEMORY) { | 2600 | for_each_node_state(node, N_HIGH_MEMORY) { |
2389 | for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { | 2601 | for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { |
@@ -2466,7 +2678,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, | |||
2466 | 2678 | ||
2467 | cgroup_lock(); | 2679 | cgroup_lock(); |
2468 | /* | 2680 | /* |
2469 | * If parent's use_hiearchy is set, we can't make any modifications | 2681 | * If parent's use_hierarchy is set, we can't make any modifications |
2470 | * in the child subtrees. If it is unset, then the change can | 2682 | * in the child subtrees. If it is unset, then the change can |
2471 | * occur, provided the current cgroup has no children. | 2683 | * occur, provided the current cgroup has no children. |
2472 | * | 2684 | * |
@@ -2541,6 +2753,7 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) | |||
2541 | val += idx_val; | 2753 | val += idx_val; |
2542 | mem_cgroup_get_recursive_idx_stat(mem, | 2754 | mem_cgroup_get_recursive_idx_stat(mem, |
2543 | MEM_CGROUP_STAT_SWAPOUT, &idx_val); | 2755 | MEM_CGROUP_STAT_SWAPOUT, &idx_val); |
2756 | val += idx_val; | ||
2544 | val <<= PAGE_SHIFT; | 2757 | val <<= PAGE_SHIFT; |
2545 | } else | 2758 | } else |
2546 | val = res_counter_read_u64(&mem->memsw, name); | 2759 | val = res_counter_read_u64(&mem->memsw, name); |
@@ -2660,7 +2873,7 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) | |||
2660 | enum { | 2873 | enum { |
2661 | MCS_CACHE, | 2874 | MCS_CACHE, |
2662 | MCS_RSS, | 2875 | MCS_RSS, |
2663 | MCS_MAPPED_FILE, | 2876 | MCS_FILE_MAPPED, |
2664 | MCS_PGPGIN, | 2877 | MCS_PGPGIN, |
2665 | MCS_PGPGOUT, | 2878 | MCS_PGPGOUT, |
2666 | MCS_SWAP, | 2879 | MCS_SWAP, |
@@ -2704,8 +2917,8 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data) | |||
2704 | s->stat[MCS_CACHE] += val * PAGE_SIZE; | 2917 | s->stat[MCS_CACHE] += val * PAGE_SIZE; |
2705 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); | 2918 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); |
2706 | s->stat[MCS_RSS] += val * PAGE_SIZE; | 2919 | s->stat[MCS_RSS] += val * PAGE_SIZE; |
2707 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_MAPPED_FILE); | 2920 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_FILE_MAPPED); |
2708 | s->stat[MCS_MAPPED_FILE] += val * PAGE_SIZE; | 2921 | s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; |
2709 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT); | 2922 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT); |
2710 | s->stat[MCS_PGPGIN] += val; | 2923 | s->stat[MCS_PGPGIN] += val; |
2711 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT); | 2924 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT); |
@@ -3097,11 +3310,18 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
3097 | 3310 | ||
3098 | /* root ? */ | 3311 | /* root ? */ |
3099 | if (cont->parent == NULL) { | 3312 | if (cont->parent == NULL) { |
3313 | int cpu; | ||
3100 | enable_swap_cgroup(); | 3314 | enable_swap_cgroup(); |
3101 | parent = NULL; | 3315 | parent = NULL; |
3102 | root_mem_cgroup = mem; | 3316 | root_mem_cgroup = mem; |
3103 | if (mem_cgroup_soft_limit_tree_init()) | 3317 | if (mem_cgroup_soft_limit_tree_init()) |
3104 | goto free_out; | 3318 | goto free_out; |
3319 | for_each_possible_cpu(cpu) { | ||
3320 | struct memcg_stock_pcp *stock = | ||
3321 | &per_cpu(memcg_stock, cpu); | ||
3322 | INIT_WORK(&stock->work, drain_local_stock); | ||
3323 | } | ||
3324 | hotcpu_notifier(memcg_stock_cpu_callback, 0); | ||
3105 | 3325 | ||
3106 | } else { | 3326 | } else { |
3107 | parent = mem_cgroup_from_cont(cont->parent); | 3327 | parent = mem_cgroup_from_cont(cont->parent); |
@@ -3170,12 +3390,10 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss, | |||
3170 | struct task_struct *p, | 3390 | struct task_struct *p, |
3171 | bool threadgroup) | 3391 | bool threadgroup) |
3172 | { | 3392 | { |
3173 | mutex_lock(&memcg_tasklist); | ||
3174 | /* | 3393 | /* |
3175 | * FIXME: It's better to move charges of this process from old | 3394 | * FIXME: It's better to move charges of this process from old |
3176 | * memcg to new memcg. But it's just on TODO-List now. | 3395 | * memcg to new memcg. But it's just on TODO-List now. |
3177 | */ | 3396 | */ |
3178 | mutex_unlock(&memcg_tasklist); | ||
3179 | } | 3397 | } |
3180 | 3398 | ||
3181 | struct cgroup_subsys mem_cgroup_subsys = { | 3399 | struct cgroup_subsys mem_cgroup_subsys = { |
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 729d4b15b645..17299fd4577c 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -34,11 +34,16 @@ | |||
34 | #include <linux/kernel.h> | 34 | #include <linux/kernel.h> |
35 | #include <linux/mm.h> | 35 | #include <linux/mm.h> |
36 | #include <linux/page-flags.h> | 36 | #include <linux/page-flags.h> |
37 | #include <linux/kernel-page-flags.h> | ||
37 | #include <linux/sched.h> | 38 | #include <linux/sched.h> |
39 | #include <linux/ksm.h> | ||
38 | #include <linux/rmap.h> | 40 | #include <linux/rmap.h> |
39 | #include <linux/pagemap.h> | 41 | #include <linux/pagemap.h> |
40 | #include <linux/swap.h> | 42 | #include <linux/swap.h> |
41 | #include <linux/backing-dev.h> | 43 | #include <linux/backing-dev.h> |
44 | #include <linux/migrate.h> | ||
45 | #include <linux/page-isolation.h> | ||
46 | #include <linux/suspend.h> | ||
42 | #include "internal.h" | 47 | #include "internal.h" |
43 | 48 | ||
44 | int sysctl_memory_failure_early_kill __read_mostly = 0; | 49 | int sysctl_memory_failure_early_kill __read_mostly = 0; |
@@ -47,6 +52,129 @@ int sysctl_memory_failure_recovery __read_mostly = 1; | |||
47 | 52 | ||
48 | atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); | 53 | atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); |
49 | 54 | ||
55 | #if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE) | ||
56 | |||
57 | u32 hwpoison_filter_enable = 0; | ||
58 | u32 hwpoison_filter_dev_major = ~0U; | ||
59 | u32 hwpoison_filter_dev_minor = ~0U; | ||
60 | u64 hwpoison_filter_flags_mask; | ||
61 | u64 hwpoison_filter_flags_value; | ||
62 | EXPORT_SYMBOL_GPL(hwpoison_filter_enable); | ||
63 | EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major); | ||
64 | EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor); | ||
65 | EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask); | ||
66 | EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value); | ||
67 | |||
68 | static int hwpoison_filter_dev(struct page *p) | ||
69 | { | ||
70 | struct address_space *mapping; | ||
71 | dev_t dev; | ||
72 | |||
73 | if (hwpoison_filter_dev_major == ~0U && | ||
74 | hwpoison_filter_dev_minor == ~0U) | ||
75 | return 0; | ||
76 | |||
77 | /* | ||
78 | * page_mapping() does not accept slab page | ||
79 | */ | ||
80 | if (PageSlab(p)) | ||
81 | return -EINVAL; | ||
82 | |||
83 | mapping = page_mapping(p); | ||
84 | if (mapping == NULL || mapping->host == NULL) | ||
85 | return -EINVAL; | ||
86 | |||
87 | dev = mapping->host->i_sb->s_dev; | ||
88 | if (hwpoison_filter_dev_major != ~0U && | ||
89 | hwpoison_filter_dev_major != MAJOR(dev)) | ||
90 | return -EINVAL; | ||
91 | if (hwpoison_filter_dev_minor != ~0U && | ||
92 | hwpoison_filter_dev_minor != MINOR(dev)) | ||
93 | return -EINVAL; | ||
94 | |||
95 | return 0; | ||
96 | } | ||
97 | |||
98 | static int hwpoison_filter_flags(struct page *p) | ||
99 | { | ||
100 | if (!hwpoison_filter_flags_mask) | ||
101 | return 0; | ||
102 | |||
103 | if ((stable_page_flags(p) & hwpoison_filter_flags_mask) == | ||
104 | hwpoison_filter_flags_value) | ||
105 | return 0; | ||
106 | else | ||
107 | return -EINVAL; | ||
108 | } | ||
109 | |||
110 | /* | ||
111 | * This allows stress tests to limit test scope to a collection of tasks | ||
112 | * by putting them under some memcg. This prevents killing unrelated/important | ||
113 | * processes such as /sbin/init. Note that the target task may share clean | ||
114 | * pages with init (e.g. libc text), which is harmless. If the target task ||
115 | * shares _dirty_ pages with another task B, the test scheme must make sure B ||
116 | * is also included in the memcg. Finally, due to race conditions this filter ||
117 | * can only guarantee that the page either belongs to the memcg tasks, or is | ||
118 | * a freed page. | ||
119 | */ | ||
120 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | ||
121 | u64 hwpoison_filter_memcg; | ||
122 | EXPORT_SYMBOL_GPL(hwpoison_filter_memcg); | ||
123 | static int hwpoison_filter_task(struct page *p) | ||
124 | { | ||
125 | struct mem_cgroup *mem; | ||
126 | struct cgroup_subsys_state *css; | ||
127 | unsigned long ino; | ||
128 | |||
129 | if (!hwpoison_filter_memcg) | ||
130 | return 0; | ||
131 | |||
132 | mem = try_get_mem_cgroup_from_page(p); | ||
133 | if (!mem) | ||
134 | return -EINVAL; | ||
135 | |||
136 | css = mem_cgroup_css(mem); | ||
137 | /* root_mem_cgroup has NULL dentries */ | ||
138 | if (!css->cgroup->dentry) | ||
139 | return -EINVAL; | ||
140 | |||
141 | ino = css->cgroup->dentry->d_inode->i_ino; | ||
142 | css_put(css); | ||
143 | |||
144 | if (ino != hwpoison_filter_memcg) | ||
145 | return -EINVAL; | ||
146 | |||
147 | return 0; | ||
148 | } | ||
149 | #else | ||
150 | static int hwpoison_filter_task(struct page *p) { return 0; } | ||
151 | #endif | ||
152 | |||
153 | int hwpoison_filter(struct page *p) | ||
154 | { | ||
155 | if (!hwpoison_filter_enable) | ||
156 | return 0; | ||
157 | |||
158 | if (hwpoison_filter_dev(p)) | ||
159 | return -EINVAL; | ||
160 | |||
161 | if (hwpoison_filter_flags(p)) | ||
162 | return -EINVAL; | ||
163 | |||
164 | if (hwpoison_filter_task(p)) | ||
165 | return -EINVAL; | ||
166 | |||
167 | return 0; | ||
168 | } | ||
169 | #else | ||
170 | int hwpoison_filter(struct page *p) | ||
171 | { | ||
172 | return 0; | ||
173 | } | ||
174 | #endif | ||
175 | |||
176 | EXPORT_SYMBOL_GPL(hwpoison_filter); | ||
177 | |||
50 | /* | 178 | /* |
51 | * Send all the processes who have the page mapped an ``action optional'' | 179 | * Send all the processes who have the page mapped an ``action optional'' |
52 | * signal. | 180 | * signal. |
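hwpoison_filter() above lets stress tests restrict injection to pages on a given block device, with a given page-flag pattern, or belonging to tasks in a chosen memcg; the knobs are exported for the companion hwpoison-inject module. A test harness might set them through debugfs roughly as sketched below. The /sys/kernel/debug/hwpoison/* file names are an assumption based on that injector module, not something shown in this hunk.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static void put(const char *name, const char *val)
{
        char path[256];
        int fd;

        snprintf(path, sizeof(path), "/sys/kernel/debug/hwpoison/%s", name);
        fd = open(path, O_WRONLY);
        if (fd < 0) {
                perror(path);
                return;
        }
        if (write(fd, val, strlen(val)) < 0)
                perror(path);
        close(fd);
}

int main(void)
{
        put("corrupt-filter-enable", "1");
        /* only act on pages backed by device 8:0 (an assumed test disk) */
        put("corrupt-filter-dev-major", "8");
        put("corrupt-filter-dev-minor", "0");
        /* then inject into a pfn found e.g. via /proc/self/pagemap */
        put("corrupt-pfn", "0x12345");
        return 0;
}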
@@ -82,6 +210,36 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno, | |||
82 | } | 210 | } |
83 | 211 | ||
84 | /* | 212 | /* |
213 | * When an unknown page type is encountered, drain as many buffers as possible ||
214 | * in the hope of turning the page into an LRU or free page, which we can handle. ||
215 | */ | ||
216 | void shake_page(struct page *p, int access) | ||
217 | { | ||
218 | if (!PageSlab(p)) { | ||
219 | lru_add_drain_all(); | ||
220 | if (PageLRU(p)) | ||
221 | return; | ||
222 | drain_all_pages(); | ||
223 | if (PageLRU(p) || is_free_buddy_page(p)) | ||
224 | return; | ||
225 | } | ||
226 | |||
227 | /* | ||
228 | * Only call shrink_slab here (which would also ||
229 | * shrink other caches) if access is not potentially fatal. | ||
230 | */ | ||
231 | if (access) { | ||
232 | int nr; | ||
233 | do { | ||
234 | nr = shrink_slab(1000, GFP_KERNEL, 1000); | ||
235 | if (page_count(p) == 0) | ||
236 | break; | ||
237 | } while (nr > 10); | ||
238 | } | ||
239 | } | ||
240 | EXPORT_SYMBOL_GPL(shake_page); | ||
241 | |||
242 | /* | ||
85 | * Kill all processes that have a poisoned page mapped and then isolate | 243 | * Kill all processes that have a poisoned page mapped and then isolate |
86 | * the page. | 244 | * the page. |
87 | * | 245 | * |
@@ -173,10 +331,9 @@ static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno, | |||
173 | list_for_each_entry_safe (tk, next, to_kill, nd) { | 331 | list_for_each_entry_safe (tk, next, to_kill, nd) { |
174 | if (doit) { | 332 | if (doit) { |
175 | /* | 333 | /* |
176 | * In case something went wrong with munmaping | 334 | * In case something went wrong with munmapping |
177 | * make sure the process doesn't catch the | 335 | * make sure the process doesn't catch the |
178 | * signal and then access the memory. Just kill it. | 336 | * signal and then access the memory. Just kill it. |
179 | * the signal handlers | ||
180 | */ | 337 | */ |
181 | if (fail || tk->addr_valid == 0) { | 338 | if (fail || tk->addr_valid == 0) { |
182 | printk(KERN_ERR | 339 | printk(KERN_ERR |
@@ -313,33 +470,49 @@ static void collect_procs(struct page *page, struct list_head *tokill) | |||
313 | */ | 470 | */ |
314 | 471 | ||
315 | enum outcome { | 472 | enum outcome { |
316 | FAILED, /* Error handling failed */ | 473 | IGNORED, /* Error: cannot be handled */ |
474 | FAILED, /* Error: handling failed */ | ||
317 | DELAYED, /* Will be handled later */ | 475 | DELAYED, /* Will be handled later */ |
318 | IGNORED, /* Error safely ignored */ | ||
319 | RECOVERED, /* Successfully recovered */ | 476 | RECOVERED, /* Successfully recovered */ |
320 | }; | 477 | }; |
321 | 478 | ||
322 | static const char *action_name[] = { | 479 | static const char *action_name[] = { |
480 | [IGNORED] = "Ignored", | ||
323 | [FAILED] = "Failed", | 481 | [FAILED] = "Failed", |
324 | [DELAYED] = "Delayed", | 482 | [DELAYED] = "Delayed", |
325 | [IGNORED] = "Ignored", | ||
326 | [RECOVERED] = "Recovered", | 483 | [RECOVERED] = "Recovered", |
327 | }; | 484 | }; |
328 | 485 | ||
329 | /* | 486 | /* |
330 | * Error hit kernel page. | 487 | * XXX: It is possible that a page is isolated from LRU cache, |
331 | * Do nothing, try to be lucky and not touch this instead. For a few cases we | 488 | * and then kept in swap cache or failed to remove from page cache. |
332 | * could be more sophisticated. | 489 | * The page count will stop it from being freed by unpoison. |
490 | * Stress tests should be aware of this memory leak problem. | ||
333 | */ | 491 | */ |
334 | static int me_kernel(struct page *p, unsigned long pfn) | 492 | static int delete_from_lru_cache(struct page *p) |
335 | { | 493 | { |
336 | return DELAYED; | 494 | if (!isolate_lru_page(p)) { |
495 | /* | ||
496 | * Clear sensible page flags, so that the buddy system won't | ||
497 | * complain when the page is unpoison-and-freed. | ||
498 | */ | ||
499 | ClearPageActive(p); | ||
500 | ClearPageUnevictable(p); | ||
501 | /* | ||
502 | * drop the page count elevated by isolate_lru_page() | ||
503 | */ | ||
504 | page_cache_release(p); | ||
505 | return 0; | ||
506 | } | ||
507 | return -EIO; | ||
337 | } | 508 | } |
338 | 509 | ||
339 | /* | 510 | /* |
340 | * Already poisoned page. | 511 | * Error hit kernel page. |
512 | * Do nothing, try to be lucky and not touch this instead. For a few cases we | ||
513 | * could be more sophisticated. | ||
341 | */ | 514 | */ |
342 | static int me_ignore(struct page *p, unsigned long pfn) | 515 | static int me_kernel(struct page *p, unsigned long pfn) |
343 | { | 516 | { |
344 | return IGNORED; | 517 | return IGNORED; |
345 | } | 518 | } |
@@ -354,14 +527,6 @@ static int me_unknown(struct page *p, unsigned long pfn) | |||
354 | } | 527 | } |
355 | 528 | ||
356 | /* | 529 | /* |
357 | * Free memory | ||
358 | */ | ||
359 | static int me_free(struct page *p, unsigned long pfn) | ||
360 | { | ||
361 | return DELAYED; | ||
362 | } | ||
363 | |||
364 | /* | ||
365 | * Clean (or cleaned) page cache page. | 530 | * Clean (or cleaned) page cache page. |
366 | */ | 531 | */ |
367 | static int me_pagecache_clean(struct page *p, unsigned long pfn) | 532 | static int me_pagecache_clean(struct page *p, unsigned long pfn) |
@@ -370,8 +535,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) | |||
370 | int ret = FAILED; | 535 | int ret = FAILED; |
371 | struct address_space *mapping; | 536 | struct address_space *mapping; |
372 | 537 | ||
373 | if (!isolate_lru_page(p)) | 538 | delete_from_lru_cache(p); |
374 | page_cache_release(p); | ||
375 | 539 | ||
376 | /* | 540 | /* |
377 | * For anonymous pages we're done the only reference left | 541 | * For anonymous pages we're done the only reference left |
@@ -498,30 +662,24 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn) | |||
498 | */ | 662 | */ |
499 | static int me_swapcache_dirty(struct page *p, unsigned long pfn) | 663 | static int me_swapcache_dirty(struct page *p, unsigned long pfn) |
500 | { | 664 | { |
501 | int ret = FAILED; | ||
502 | |||
503 | ClearPageDirty(p); | 665 | ClearPageDirty(p); |
504 | /* Trigger EIO in shmem: */ | 666 | /* Trigger EIO in shmem: */ |
505 | ClearPageUptodate(p); | 667 | ClearPageUptodate(p); |
506 | 668 | ||
507 | if (!isolate_lru_page(p)) { | 669 | if (!delete_from_lru_cache(p)) |
508 | page_cache_release(p); | 670 | return DELAYED; |
509 | ret = DELAYED; | 671 | else |
510 | } | 672 | return FAILED; |
511 | |||
512 | return ret; | ||
513 | } | 673 | } |
514 | 674 | ||
515 | static int me_swapcache_clean(struct page *p, unsigned long pfn) | 675 | static int me_swapcache_clean(struct page *p, unsigned long pfn) |
516 | { | 676 | { |
517 | int ret = FAILED; | ||
518 | |||
519 | if (!isolate_lru_page(p)) { | ||
520 | page_cache_release(p); | ||
521 | ret = RECOVERED; | ||
522 | } | ||
523 | delete_from_swap_cache(p); | 677 | delete_from_swap_cache(p); |
524 | return ret; | 678 | |
679 | if (!delete_from_lru_cache(p)) | ||
680 | return RECOVERED; | ||
681 | else | ||
682 | return FAILED; | ||
525 | } | 683 | } |
526 | 684 | ||
527 | /* | 685 | /* |
@@ -564,7 +722,6 @@ static int me_huge_page(struct page *p, unsigned long pfn) | |||
564 | #define tail (1UL << PG_tail) | 722 | #define tail (1UL << PG_tail) |
565 | #define compound (1UL << PG_compound) | 723 | #define compound (1UL << PG_compound) |
566 | #define slab (1UL << PG_slab) | 724 | #define slab (1UL << PG_slab) |
567 | #define buddy (1UL << PG_buddy) | ||
568 | #define reserved (1UL << PG_reserved) | 725 | #define reserved (1UL << PG_reserved) |
569 | 726 | ||
570 | static struct page_state { | 727 | static struct page_state { |
@@ -573,8 +730,11 @@ static struct page_state { | |||
573 | char *msg; | 730 | char *msg; |
574 | int (*action)(struct page *p, unsigned long pfn); | 731 | int (*action)(struct page *p, unsigned long pfn); |
575 | } error_states[] = { | 732 | } error_states[] = { |
576 | { reserved, reserved, "reserved kernel", me_ignore }, | 733 | { reserved, reserved, "reserved kernel", me_kernel }, |
577 | { buddy, buddy, "free kernel", me_free }, | 734 | /* |
735 | * free pages are specially detected outside this table: | ||
736 | * PG_buddy pages only make a small fraction of all free pages. | ||
737 | */ | ||
578 | 738 | ||
579 | /* | 739 | /* |
580 | * Could in theory check if slab page is free or if we can drop | 740 | * Could in theory check if slab page is free or if we can drop |
@@ -596,14 +756,11 @@ static struct page_state { | |||
596 | { unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty}, | 756 | { unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty}, |
597 | { unevict, unevict, "unevictable LRU", me_pagecache_clean}, | 757 | { unevict, unevict, "unevictable LRU", me_pagecache_clean}, |
598 | 758 | ||
599 | #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT | ||
600 | { mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty }, | 759 | { mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty }, |
601 | { mlock, mlock, "mlocked LRU", me_pagecache_clean }, | 760 | { mlock, mlock, "mlocked LRU", me_pagecache_clean }, |
602 | #endif | ||
603 | 761 | ||
604 | { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty }, | 762 | { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty }, |
605 | { lru|dirty, lru, "clean LRU", me_pagecache_clean }, | 763 | { lru|dirty, lru, "clean LRU", me_pagecache_clean }, |
606 | { swapbacked, swapbacked, "anonymous", me_pagecache_clean }, | ||
607 | 764 | ||
608 | /* | 765 | /* |
609 | * Catchall entry: must be at end. | 766 | * Catchall entry: must be at end. |
@@ -611,38 +768,54 @@ static struct page_state { | |||
611 | { 0, 0, "unknown page state", me_unknown }, | 768 | { 0, 0, "unknown page state", me_unknown }, |
612 | }; | 769 | }; |
613 | 770 | ||
771 | #undef dirty | ||
772 | #undef sc | ||
773 | #undef unevict | ||
774 | #undef mlock | ||
775 | #undef writeback | ||
614 | #undef lru | 776 | #undef lru |
777 | #undef swapbacked | ||
778 | #undef head | ||
779 | #undef tail | ||
780 | #undef compound | ||
781 | #undef slab | ||
782 | #undef reserved | ||
615 | 783 | ||
616 | static void action_result(unsigned long pfn, char *msg, int result) | 784 | static void action_result(unsigned long pfn, char *msg, int result) |
617 | { | 785 | { |
618 | struct page *page = NULL; | 786 | struct page *page = pfn_to_page(pfn); |
619 | if (pfn_valid(pfn)) | ||
620 | page = pfn_to_page(pfn); | ||
621 | 787 | ||
622 | printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n", | 788 | printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n", |
623 | pfn, | 789 | pfn, |
624 | page && PageDirty(page) ? "dirty " : "", | 790 | PageDirty(page) ? "dirty " : "", |
625 | msg, action_name[result]); | 791 | msg, action_name[result]); |
626 | } | 792 | } |
627 | 793 | ||
628 | static int page_action(struct page_state *ps, struct page *p, | 794 | static int page_action(struct page_state *ps, struct page *p, |
629 | unsigned long pfn, int ref) | 795 | unsigned long pfn) |
630 | { | 796 | { |
631 | int result; | 797 | int result; |
798 | int count; | ||
632 | 799 | ||
633 | result = ps->action(p, pfn); | 800 | result = ps->action(p, pfn); |
634 | action_result(pfn, ps->msg, result); | 801 | action_result(pfn, ps->msg, result); |
635 | if (page_count(p) != 1 + ref) | 802 | |
803 | count = page_count(p) - 1; | ||
804 | if (ps->action == me_swapcache_dirty && result == DELAYED) | ||
805 | count--; | ||
806 | if (count != 0) { | ||
636 | printk(KERN_ERR | 807 | printk(KERN_ERR |
637 | "MCE %#lx: %s page still referenced by %d users\n", | 808 | "MCE %#lx: %s page still referenced by %d users\n", |
638 | pfn, ps->msg, page_count(p) - 1); | 809 | pfn, ps->msg, count); |
810 | result = FAILED; | ||
811 | } | ||
639 | 812 | ||
640 | /* Could do more checks here if page looks ok */ | 813 | /* Could do more checks here if page looks ok */ |
641 | /* | 814 | /* |
642 | * Could adjust zone counters here to correct for the missing page. | 815 | * Could adjust zone counters here to correct for the missing page. |
643 | */ | 816 | */ |
644 | 817 | ||
645 | return result == RECOVERED ? 0 : -EBUSY; | 818 | return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY; |
646 | } | 819 | } |
647 | 820 | ||
648 | #define N_UNMAP_TRIES 5 | 821 | #define N_UNMAP_TRIES 5 |
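The error_states[] table above pairs a flag mask with the value those masked bits must take, and page_action() uses the first entry that matches; the { 0, 0, "unknown page state", me_unknown } catchall matches everything, which is why it must stay last. The stand-alone sketch below shows only that first-match lookup, with hypothetical flag bits in place of the kernel's PG_* values (the swapcache refcount adjustment in page_action() is separate bookkeeping and is not modelled here):

    #include <stdio.h>

    /* Hypothetical stand-ins for the kernel's page flag bits. */
    #define F_DIRTY  (1UL << 0)
    #define F_LRU    (1UL << 1)
    #define F_SLAB   (1UL << 2)

    struct state {
        unsigned long mask;   /* which bits to look at */
        unsigned long res;    /* required value of those bits */
        const char *msg;
    };

    static const struct state states[] = {
        { F_SLAB,          F_SLAB,          "slab"      },
        { F_LRU | F_DIRTY, F_LRU | F_DIRTY, "dirty LRU" },
        { F_LRU | F_DIRTY, F_LRU,           "clean LRU" },
        { 0, 0, "unknown page state" },   /* catchall: must be last */
    };

    static const char *classify(unsigned long flags)
    {
        const struct state *s;

        /* First match wins, exactly like the error_states[] walk. */
        for (s = states; ; s++)
            if ((flags & s->mask) == s->res)
                return s->msg;
    }

    int main(void)
    {
        printf("%s\n", classify(F_LRU));            /* clean LRU */
        printf("%s\n", classify(F_LRU | F_DIRTY));  /* dirty LRU */
        printf("%s\n", classify(F_DIRTY));          /* unknown page state */
        return 0;
    }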
@@ -651,7 +824,7 @@ static int page_action(struct page_state *ps, struct page *p, | |||
651 | * Do all that is necessary to remove user space mappings. Unmap | 824 | * Do all that is necessary to remove user space mappings. Unmap |
652 | * the pages and send SIGBUS to the processes if the data was dirty. | 825 | * the pages and send SIGBUS to the processes if the data was dirty. |
653 | */ | 826 | */ |
654 | static void hwpoison_user_mappings(struct page *p, unsigned long pfn, | 827 | static int hwpoison_user_mappings(struct page *p, unsigned long pfn, |
655 | int trapno) | 828 | int trapno) |
656 | { | 829 | { |
657 | enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; | 830 | enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; |
@@ -661,18 +834,18 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
661 | int i; | 834 | int i; |
662 | int kill = 1; | 835 | int kill = 1; |
663 | 836 | ||
664 | if (PageReserved(p) || PageCompound(p) || PageSlab(p)) | 837 | if (PageReserved(p) || PageSlab(p)) |
665 | return; | 838 | return SWAP_SUCCESS; |
666 | |||
667 | if (!PageLRU(p)) | ||
668 | lru_add_drain_all(); | ||
669 | 839 | ||
670 | /* | 840 | /* |
671 | * This check implies we don't kill processes if their pages | 841 | * This check implies we don't kill processes if their pages |
672 | * are in the swap cache early. Those are always late kills. | 842 | * are in the swap cache early. Those are always late kills. |
673 | */ | 843 | */ |
674 | if (!page_mapped(p)) | 844 | if (!page_mapped(p)) |
675 | return; | 845 | return SWAP_SUCCESS; |
846 | |||
847 | if (PageCompound(p) || PageKsm(p)) | ||
848 | return SWAP_FAIL; | ||
676 | 849 | ||
677 | if (PageSwapCache(p)) { | 850 | if (PageSwapCache(p)) { |
678 | printk(KERN_ERR | 851 | printk(KERN_ERR |
@@ -683,6 +856,8 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
683 | /* | 856 | /* |
684 | * Propagate the dirty bit from PTEs to struct page first, because we | 857 | * Propagate the dirty bit from PTEs to struct page first, because we |
685 | * need this to decide if we should kill or just drop the page. | 858 | * need this to decide if we should kill or just drop the page. |
859 | * XXX: the dirty test could be racy: set_page_dirty() may not always | ||
860 | * be called inside page lock (it's recommended but not enforced). | ||
686 | */ | 861 | */ |
687 | mapping = page_mapping(p); | 862 | mapping = page_mapping(p); |
688 | if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) { | 863 | if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) { |
@@ -734,9 +909,11 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
734 | */ | 909 | */ |
735 | kill_procs_ao(&tokill, !!PageDirty(p), trapno, | 910 | kill_procs_ao(&tokill, !!PageDirty(p), trapno, |
736 | ret != SWAP_SUCCESS, pfn); | 911 | ret != SWAP_SUCCESS, pfn); |
912 | |||
913 | return ret; | ||
737 | } | 914 | } |
738 | 915 | ||
739 | int __memory_failure(unsigned long pfn, int trapno, int ref) | 916 | int __memory_failure(unsigned long pfn, int trapno, int flags) |
740 | { | 917 | { |
741 | struct page_state *ps; | 918 | struct page_state *ps; |
742 | struct page *p; | 919 | struct page *p; |
@@ -746,13 +923,15 @@ int __memory_failure(unsigned long pfn, int trapno, int ref) | |||
746 | panic("Memory failure from trap %d on page %lx", trapno, pfn); | 923 | panic("Memory failure from trap %d on page %lx", trapno, pfn); |
747 | 924 | ||
748 | if (!pfn_valid(pfn)) { | 925 | if (!pfn_valid(pfn)) { |
749 | action_result(pfn, "memory outside kernel control", IGNORED); | 926 | printk(KERN_ERR |
750 | return -EIO; | 927 | "MCE %#lx: memory outside kernel control\n", |
928 | pfn); | ||
929 | return -ENXIO; | ||
751 | } | 930 | } |
752 | 931 | ||
753 | p = pfn_to_page(pfn); | 932 | p = pfn_to_page(pfn); |
754 | if (TestSetPageHWPoison(p)) { | 933 | if (TestSetPageHWPoison(p)) { |
755 | action_result(pfn, "already hardware poisoned", IGNORED); | 934 | printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn); |
756 | return 0; | 935 | return 0; |
757 | } | 936 | } |
758 | 937 | ||
@@ -769,9 +948,38 @@ int __memory_failure(unsigned long pfn, int trapno, int ref) | |||
769 | * In fact it's dangerous to directly bump up page count from 0, | 948 | * In fact it's dangerous to directly bump up page count from 0, |
770 | * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. | 949 | * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. |
771 | */ | 950 | */ |
772 | if (!get_page_unless_zero(compound_head(p))) { | 951 | if (!(flags & MF_COUNT_INCREASED) && |
773 | action_result(pfn, "free or high order kernel", IGNORED); | 952 | !get_page_unless_zero(compound_head(p))) { |
774 | return PageBuddy(compound_head(p)) ? 0 : -EBUSY; | 953 | if (is_free_buddy_page(p)) { |
954 | action_result(pfn, "free buddy", DELAYED); | ||
955 | return 0; | ||
956 | } else { | ||
957 | action_result(pfn, "high order kernel", IGNORED); | ||
958 | return -EBUSY; | ||
959 | } | ||
960 | } | ||
961 | |||
962 | /* | ||
963 | * We ignore non-LRU pages for good reasons. | ||
964 | * - PG_locked is only well defined for LRU pages and a few others | ||
965 | * - to avoid races with __set_page_locked() | ||
966 | * - to avoid races with __SetPageSlab*() (and more non-atomic ops) | ||
967 | * The check (unnecessarily) ignores LRU pages being isolated and | ||
968 | * walked by the page reclaim code, however that's not a big loss. | ||
969 | */ | ||
970 | if (!PageLRU(p)) | ||
971 | shake_page(p, 0); | ||
972 | if (!PageLRU(p)) { | ||
973 | /* | ||
974 | * shake_page could have turned it free. | ||
975 | */ | ||
976 | if (is_free_buddy_page(p)) { | ||
977 | action_result(pfn, "free buddy, 2nd try", DELAYED); | ||
978 | return 0; | ||
979 | } | ||
980 | action_result(pfn, "non LRU", IGNORED); | ||
981 | put_page(p); | ||
982 | return -EBUSY; | ||
775 | } | 983 | } |
776 | 984 | ||
777 | /* | 985 | /* |
@@ -780,26 +988,48 @@ int __memory_failure(unsigned long pfn, int trapno, int ref) | |||
780 | * and in many cases impossible, so we just avoid it here. | 988 | * and in many cases impossible, so we just avoid it here. |
781 | */ | 989 | */ |
782 | lock_page_nosync(p); | 990 | lock_page_nosync(p); |
991 | |||
992 | /* | ||
993 | * unpoison always clear PG_hwpoison inside page lock | ||
994 | */ | ||
995 | if (!PageHWPoison(p)) { | ||
996 | printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn); | ||
997 | res = 0; | ||
998 | goto out; | ||
999 | } | ||
1000 | if (hwpoison_filter(p)) { | ||
1001 | if (TestClearPageHWPoison(p)) | ||
1002 | atomic_long_dec(&mce_bad_pages); | ||
1003 | unlock_page(p); | ||
1004 | put_page(p); | ||
1005 | return 0; | ||
1006 | } | ||
1007 | |||
783 | wait_on_page_writeback(p); | 1008 | wait_on_page_writeback(p); |
784 | 1009 | ||
785 | /* | 1010 | /* |
786 | * Now take care of user space mappings. | 1011 | * Now take care of user space mappings. |
1012 | * Abort on fail: __remove_from_page_cache() assumes unmapped page. | ||
787 | */ | 1013 | */ |
788 | hwpoison_user_mappings(p, pfn, trapno); | 1014 | if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) { |
1015 | printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn); | ||
1016 | res = -EBUSY; | ||
1017 | goto out; | ||
1018 | } | ||
789 | 1019 | ||
790 | /* | 1020 | /* |
791 | * Torn down by someone else? | 1021 | * Torn down by someone else? |
792 | */ | 1022 | */ |
793 | if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { | 1023 | if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { |
794 | action_result(pfn, "already truncated LRU", IGNORED); | 1024 | action_result(pfn, "already truncated LRU", IGNORED); |
795 | res = 0; | 1025 | res = -EBUSY; |
796 | goto out; | 1026 | goto out; |
797 | } | 1027 | } |
798 | 1028 | ||
799 | res = -EBUSY; | 1029 | res = -EBUSY; |
800 | for (ps = error_states;; ps++) { | 1030 | for (ps = error_states;; ps++) { |
801 | if ((p->flags & ps->mask) == ps->res) { | 1031 | if ((p->flags & ps->mask) == ps->res) { |
802 | res = page_action(ps, p, pfn, ref); | 1032 | res = page_action(ps, p, pfn); |
803 | break; | 1033 | break; |
804 | } | 1034 | } |
805 | } | 1035 | } |
@@ -830,3 +1060,235 @@ void memory_failure(unsigned long pfn, int trapno) | |||
830 | { | 1060 | { |
831 | __memory_failure(pfn, trapno, 0); | 1061 | __memory_failure(pfn, trapno, 0); |
832 | } | 1062 | } |
1063 | |||
1064 | /** | ||
1065 | * unpoison_memory - Unpoison a previously poisoned page | ||
1066 | * @pfn: Page number of the to be unpoisoned page | ||
1067 | * | ||
1068 | * Software-unpoison a page that has been poisoned by | ||
1069 | * memory_failure() earlier. | ||
1070 | * | ||
1071 | * This is only done at the software level, so it only works | ||
1072 | * for Linux-injected failures, not real hardware failures | ||
1073 | * | ||
1074 | * Returns 0 for success, otherwise -errno. | ||
1075 | */ | ||
1076 | int unpoison_memory(unsigned long pfn) | ||
1077 | { | ||
1078 | struct page *page; | ||
1079 | struct page *p; | ||
1080 | int freeit = 0; | ||
1081 | |||
1082 | if (!pfn_valid(pfn)) | ||
1083 | return -ENXIO; | ||
1084 | |||
1085 | p = pfn_to_page(pfn); | ||
1086 | page = compound_head(p); | ||
1087 | |||
1088 | if (!PageHWPoison(p)) { | ||
1089 | pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn); | ||
1090 | return 0; | ||
1091 | } | ||
1092 | |||
1093 | if (!get_page_unless_zero(page)) { | ||
1094 | if (TestClearPageHWPoison(p)) | ||
1095 | atomic_long_dec(&mce_bad_pages); | ||
1096 | pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn); | ||
1097 | return 0; | ||
1098 | } | ||
1099 | |||
1100 | lock_page_nosync(page); | ||
1101 | /* | ||
1102 | * This test is racy because PG_hwpoison is set outside of page lock. | ||
1103 | * That's acceptable because that won't trigger kernel panic. Instead, | ||
1104 | * the PG_hwpoison page will be caught and isolated on the entrance to | ||
1105 | * the free buddy page pool. | ||
1106 | */ | ||
1107 | if (TestClearPageHWPoison(p)) { | ||
1108 | pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn); | ||
1109 | atomic_long_dec(&mce_bad_pages); | ||
1110 | freeit = 1; | ||
1111 | } | ||
1112 | unlock_page(page); | ||
1113 | |||
1114 | put_page(page); | ||
1115 | if (freeit) | ||
1116 | put_page(page); | ||
1117 | |||
1118 | return 0; | ||
1119 | } | ||
1120 | EXPORT_SYMBOL(unpoison_memory); | ||
1121 | |||
1122 | static struct page *new_page(struct page *p, unsigned long private, int **x) | ||
1123 | { | ||
1124 | int nid = page_to_nid(p); | ||
1125 | return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0); | ||
1126 | } | ||
1127 | |||
1128 | /* | ||
1129 | * Safely get reference count of an arbitrary page. | ||
1130 | * Returns 0 for a free page, -EIO for a zero refcount page | ||
1131 | * that is not free, and 1 for any other page type. | ||
1132 | * For 1 the page is returned with increased page count, otherwise not. | ||
1133 | */ | ||
1134 | static int get_any_page(struct page *p, unsigned long pfn, int flags) | ||
1135 | { | ||
1136 | int ret; | ||
1137 | |||
1138 | if (flags & MF_COUNT_INCREASED) | ||
1139 | return 1; | ||
1140 | |||
1141 | /* | ||
1142 | * The lock_system_sleep prevents a race with memory hotplug, | ||
1143 | * because the isolation assumes there's only a single user. | ||
1144 | * This is a big hammer; a better solution would be nicer. | ||
1145 | */ | ||
1146 | lock_system_sleep(); | ||
1147 | |||
1148 | /* | ||
1149 | * Isolate the page, so that it doesn't get reallocated if it | ||
1150 | * was free. | ||
1151 | */ | ||
1152 | set_migratetype_isolate(p); | ||
1153 | if (!get_page_unless_zero(compound_head(p))) { | ||
1154 | if (is_free_buddy_page(p)) { | ||
1155 | pr_debug("get_any_page: %#lx free buddy page\n", pfn); | ||
1156 | /* Set hwpoison bit while page is still isolated */ | ||
1157 | SetPageHWPoison(p); | ||
1158 | ret = 0; | ||
1159 | } else { | ||
1160 | pr_debug("get_any_page: %#lx: unknown zero refcount page type %lx\n", | ||
1161 | pfn, p->flags); | ||
1162 | ret = -EIO; | ||
1163 | } | ||
1164 | } else { | ||
1165 | /* Not a free page */ | ||
1166 | ret = 1; | ||
1167 | } | ||
1168 | unset_migratetype_isolate(p); | ||
1169 | unlock_system_sleep(); | ||
1170 | return ret; | ||
1171 | } | ||
1172 | |||
1173 | /** | ||
1174 | * soft_offline_page - Soft offline a page. | ||
1175 | * @page: page to offline | ||
1176 | * @flags: flags. Same as memory_failure(). | ||
1177 | * | ||
1178 | * Returns 0 on success, otherwise negated errno. | ||
1179 | * | ||
1180 | * Soft offline a page, by migration or invalidation, | ||
1181 | * without killing anything. This is for the case when | ||
1182 | * a page is not corrupted yet (so it's still valid to access), | ||
1183 | * but has had a number of corrected errors and is better taken | ||
1184 | * out. | ||
1185 | * | ||
1186 | * The actual policy on when to do that is maintained by | ||
1187 | * user space. | ||
1188 | * | ||
1189 | * This should never impact any application or cause data loss, | ||
1190 | * however it might take some time. | ||
1191 | * | ||
1192 | * This is not a 100% solution for all memory, but tries to be | ||
1193 | * ``good enough'' for the majority of memory. | ||
1194 | */ | ||
1195 | int soft_offline_page(struct page *page, int flags) | ||
1196 | { | ||
1197 | int ret; | ||
1198 | unsigned long pfn = page_to_pfn(page); | ||
1199 | |||
1200 | ret = get_any_page(page, pfn, flags); | ||
1201 | if (ret < 0) | ||
1202 | return ret; | ||
1203 | if (ret == 0) | ||
1204 | goto done; | ||
1205 | |||
1206 | /* | ||
1207 | * Page cache page we can handle? | ||
1208 | */ | ||
1209 | if (!PageLRU(page)) { | ||
1210 | /* | ||
1211 | * Try to free it. | ||
1212 | */ | ||
1213 | put_page(page); | ||
1214 | shake_page(page, 1); | ||
1215 | |||
1216 | /* | ||
1217 | * Did it turn free? | ||
1218 | */ | ||
1219 | ret = get_any_page(page, pfn, 0); | ||
1220 | if (ret < 0) | ||
1221 | return ret; | ||
1222 | if (ret == 0) | ||
1223 | goto done; | ||
1224 | } | ||
1225 | if (!PageLRU(page)) { | ||
1226 | pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n", | ||
1227 | pfn, page->flags); | ||
1228 | return -EIO; | ||
1229 | } | ||
1230 | |||
1231 | lock_page(page); | ||
1232 | wait_on_page_writeback(page); | ||
1233 | |||
1234 | /* | ||
1235 | * Synchronized using the page lock with memory_failure() | ||
1236 | */ | ||
1237 | if (PageHWPoison(page)) { | ||
1238 | unlock_page(page); | ||
1239 | put_page(page); | ||
1240 | pr_debug("soft offline: %#lx page already poisoned\n", pfn); | ||
1241 | return -EBUSY; | ||
1242 | } | ||
1243 | |||
1244 | /* | ||
1245 | * Try to invalidate first. This should work for | ||
1246 | * non dirty unmapped page cache pages. | ||
1247 | */ | ||
1248 | ret = invalidate_inode_page(page); | ||
1249 | unlock_page(page); | ||
1250 | |||
1251 | /* | ||
1252 | * Drop count because page migration doesn't like raised | ||
1253 | * counts. The page could get re-allocated, but if it becomes | ||
1254 | * LRU the isolation will just fail. | ||
1255 | * RED-PEN would be better to keep it isolated here, but we | ||
1256 | * would need to fix isolation locking first. | ||
1257 | */ | ||
1258 | put_page(page); | ||
1259 | if (ret == 1) { | ||
1260 | ret = 0; | ||
1261 | pr_debug("soft_offline: %#lx: invalidated\n", pfn); | ||
1262 | goto done; | ||
1263 | } | ||
1264 | |||
1265 | /* | ||
1266 | * Simple invalidation didn't work. | ||
1267 | * Try to migrate to a new page instead. migrate.c | ||
1268 | * handles a large number of cases for us. | ||
1269 | */ | ||
1270 | ret = isolate_lru_page(page); | ||
1271 | if (!ret) { | ||
1272 | LIST_HEAD(pagelist); | ||
1273 | |||
1274 | list_add(&page->lru, &pagelist); | ||
1275 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); | ||
1276 | if (ret) { | ||
1277 | pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", | ||
1278 | pfn, ret, page->flags); | ||
1279 | if (ret > 0) | ||
1280 | ret = -EIO; | ||
1281 | } | ||
1282 | } else { | ||
1283 | pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", | ||
1284 | pfn, ret, page_count(page), page->flags); | ||
1285 | } | ||
1286 | if (ret) | ||
1287 | return ret; | ||
1288 | |||
1289 | done: | ||
1290 | atomic_long_add(1, &mce_bad_pages); | ||
1291 | SetPageHWPoison(page); | ||
1292 | /* keep elevated page count for bad page */ | ||
1293 | return ret; | ||
1294 | } | ||
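The memory-failure.c changes above give __memory_failure() a flags argument and add the soft_offline_page()/unpoison_memory() entry points. Below is a hedged sketch of a caller that already holds a reference on the page, for example from get_user_pages(); report_pinned_page() is illustrative only and not a function added by this commit, and it assumes the declarations and MF_COUNT_INCREASED definition this tree carries in include/linux/mm.h:

    #include <linux/mm.h>

    /*
     * Sketch only: MF_COUNT_INCREASED tells the handlers that the caller
     * has already elevated the page count, so __memory_failure() and
     * get_any_page() skip their own get_page_unless_zero().
     */
    static int report_pinned_page(struct page *page, int trapno, bool soft)
    {
        unsigned long pfn = page_to_pfn(page);

        if (soft)
            /* migrate or invalidate the page without killing anything */
            return soft_offline_page(page, MF_COUNT_INCREASED);

        /* unmap the page, kill dirty users if needed, isolate it */
        return __memory_failure(pfn, trapno, MF_COUNT_INCREASED);
    }

Callers that start from a bare pfn with no reference of their own simply pass flags of 0, which is what the memory_failure() wrapper above does.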
diff --git a/mm/memory.c b/mm/memory.c index 7e91b5f9f690..09e4b1be7b67 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -572,7 +572,7 @@ out: | |||
572 | * covered by this vma. | 572 | * covered by this vma. |
573 | */ | 573 | */ |
574 | 574 | ||
575 | static inline void | 575 | static inline unsigned long |
576 | copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, | 576 | copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, |
577 | pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, | 577 | pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, |
578 | unsigned long addr, int *rss) | 578 | unsigned long addr, int *rss) |
@@ -586,7 +586,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
586 | if (!pte_file(pte)) { | 586 | if (!pte_file(pte)) { |
587 | swp_entry_t entry = pte_to_swp_entry(pte); | 587 | swp_entry_t entry = pte_to_swp_entry(pte); |
588 | 588 | ||
589 | swap_duplicate(entry); | 589 | if (swap_duplicate(entry) < 0) |
590 | return entry.val; | ||
591 | |||
590 | /* make sure dst_mm is on swapoff's mmlist. */ | 592 | /* make sure dst_mm is on swapoff's mmlist. */ |
591 | if (unlikely(list_empty(&dst_mm->mmlist))) { | 593 | if (unlikely(list_empty(&dst_mm->mmlist))) { |
592 | spin_lock(&mmlist_lock); | 594 | spin_lock(&mmlist_lock); |
@@ -635,16 +637,19 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
635 | 637 | ||
636 | out_set_pte: | 638 | out_set_pte: |
637 | set_pte_at(dst_mm, addr, dst_pte, pte); | 639 | set_pte_at(dst_mm, addr, dst_pte, pte); |
640 | return 0; | ||
638 | } | 641 | } |
639 | 642 | ||
640 | static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | 643 | static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, |
641 | pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, | 644 | pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, |
642 | unsigned long addr, unsigned long end) | 645 | unsigned long addr, unsigned long end) |
643 | { | 646 | { |
647 | pte_t *orig_src_pte, *orig_dst_pte; | ||
644 | pte_t *src_pte, *dst_pte; | 648 | pte_t *src_pte, *dst_pte; |
645 | spinlock_t *src_ptl, *dst_ptl; | 649 | spinlock_t *src_ptl, *dst_ptl; |
646 | int progress = 0; | 650 | int progress = 0; |
647 | int rss[2]; | 651 | int rss[2]; |
652 | swp_entry_t entry = (swp_entry_t){0}; | ||
648 | 653 | ||
649 | again: | 654 | again: |
650 | rss[1] = rss[0] = 0; | 655 | rss[1] = rss[0] = 0; |
@@ -654,6 +659,8 @@ again: | |||
654 | src_pte = pte_offset_map_nested(src_pmd, addr); | 659 | src_pte = pte_offset_map_nested(src_pmd, addr); |
655 | src_ptl = pte_lockptr(src_mm, src_pmd); | 660 | src_ptl = pte_lockptr(src_mm, src_pmd); |
656 | spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); | 661 | spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); |
662 | orig_src_pte = src_pte; | ||
663 | orig_dst_pte = dst_pte; | ||
657 | arch_enter_lazy_mmu_mode(); | 664 | arch_enter_lazy_mmu_mode(); |
658 | 665 | ||
659 | do { | 666 | do { |
@@ -671,16 +678,25 @@ again: | |||
671 | progress++; | 678 | progress++; |
672 | continue; | 679 | continue; |
673 | } | 680 | } |
674 | copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss); | 681 | entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, |
682 | vma, addr, rss); | ||
683 | if (entry.val) | ||
684 | break; | ||
675 | progress += 8; | 685 | progress += 8; |
676 | } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); | 686 | } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); |
677 | 687 | ||
678 | arch_leave_lazy_mmu_mode(); | 688 | arch_leave_lazy_mmu_mode(); |
679 | spin_unlock(src_ptl); | 689 | spin_unlock(src_ptl); |
680 | pte_unmap_nested(src_pte - 1); | 690 | pte_unmap_nested(orig_src_pte); |
681 | add_mm_rss(dst_mm, rss[0], rss[1]); | 691 | add_mm_rss(dst_mm, rss[0], rss[1]); |
682 | pte_unmap_unlock(dst_pte - 1, dst_ptl); | 692 | pte_unmap_unlock(orig_dst_pte, dst_ptl); |
683 | cond_resched(); | 693 | cond_resched(); |
694 | |||
695 | if (entry.val) { | ||
696 | if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) | ||
697 | return -ENOMEM; | ||
698 | progress = 0; | ||
699 | } | ||
684 | if (addr != end) | 700 | if (addr != end) |
685 | goto again; | 701 | goto again; |
686 | return 0; | 702 | return 0; |
@@ -940,6 +956,7 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb, | |||
940 | details = NULL; | 956 | details = NULL; |
941 | 957 | ||
942 | BUG_ON(addr >= end); | 958 | BUG_ON(addr >= end); |
959 | mem_cgroup_uncharge_start(); | ||
943 | tlb_start_vma(tlb, vma); | 960 | tlb_start_vma(tlb, vma); |
944 | pgd = pgd_offset(vma->vm_mm, addr); | 961 | pgd = pgd_offset(vma->vm_mm, addr); |
945 | do { | 962 | do { |
@@ -952,6 +969,7 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb, | |||
952 | zap_work, details); | 969 | zap_work, details); |
953 | } while (pgd++, addr = next, (addr != end && *zap_work > 0)); | 970 | } while (pgd++, addr = next, (addr != end && *zap_work > 0)); |
954 | tlb_end_vma(tlb, vma); | 971 | tlb_end_vma(tlb, vma); |
972 | mem_cgroup_uncharge_end(); | ||
955 | 973 | ||
956 | return addr; | 974 | return addr; |
957 | } | 975 | } |
@@ -1820,10 +1838,10 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
1820 | token = pmd_pgtable(*pmd); | 1838 | token = pmd_pgtable(*pmd); |
1821 | 1839 | ||
1822 | do { | 1840 | do { |
1823 | err = fn(pte, token, addr, data); | 1841 | err = fn(pte++, token, addr, data); |
1824 | if (err) | 1842 | if (err) |
1825 | break; | 1843 | break; |
1826 | } while (pte++, addr += PAGE_SIZE, addr != end); | 1844 | } while (addr += PAGE_SIZE, addr != end); |
1827 | 1845 | ||
1828 | arch_leave_lazy_mmu_mode(); | 1846 | arch_leave_lazy_mmu_mode(); |
1829 | 1847 | ||
@@ -2511,7 +2529,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2511 | ret = VM_FAULT_HWPOISON; | 2529 | ret = VM_FAULT_HWPOISON; |
2512 | } else { | 2530 | } else { |
2513 | print_bad_pte(vma, address, orig_pte, NULL); | 2531 | print_bad_pte(vma, address, orig_pte, NULL); |
2514 | ret = VM_FAULT_OOM; | 2532 | ret = VM_FAULT_SIGBUS; |
2515 | } | 2533 | } |
2516 | goto out; | 2534 | goto out; |
2517 | } | 2535 | } |
@@ -2537,14 +2555,24 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2537 | ret = VM_FAULT_MAJOR; | 2555 | ret = VM_FAULT_MAJOR; |
2538 | count_vm_event(PGMAJFAULT); | 2556 | count_vm_event(PGMAJFAULT); |
2539 | } else if (PageHWPoison(page)) { | 2557 | } else if (PageHWPoison(page)) { |
2558 | /* | ||
2559 | * hwpoisoned dirty swapcache pages are kept for killing | ||
2560 | * owner processes (which may be unknown at hwpoison time) | ||
2561 | */ | ||
2540 | ret = VM_FAULT_HWPOISON; | 2562 | ret = VM_FAULT_HWPOISON; |
2541 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); | 2563 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); |
2542 | goto out; | 2564 | goto out_release; |
2543 | } | 2565 | } |
2544 | 2566 | ||
2545 | lock_page(page); | 2567 | lock_page(page); |
2546 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); | 2568 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); |
2547 | 2569 | ||
2570 | page = ksm_might_need_to_copy(page, vma, address); | ||
2571 | if (!page) { | ||
2572 | ret = VM_FAULT_OOM; | ||
2573 | goto out; | ||
2574 | } | ||
2575 | |||
2548 | if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { | 2576 | if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { |
2549 | ret = VM_FAULT_OOM; | 2577 | ret = VM_FAULT_OOM; |
2550 | goto out_page; | 2578 | goto out_page; |
@@ -2611,6 +2639,7 @@ out_nomap: | |||
2611 | pte_unmap_unlock(page_table, ptl); | 2639 | pte_unmap_unlock(page_table, ptl); |
2612 | out_page: | 2640 | out_page: |
2613 | unlock_page(page); | 2641 | unlock_page(page); |
2642 | out_release: | ||
2614 | page_cache_release(page); | 2643 | page_cache_release(page); |
2615 | return ret; | 2644 | return ret; |
2616 | } | 2645 | } |
@@ -2906,7 +2935,7 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2906 | * Page table corrupted: show pte and kill process. | 2935 | * Page table corrupted: show pte and kill process. |
2907 | */ | 2936 | */ |
2908 | print_bad_pte(vma, address, orig_pte, NULL); | 2937 | print_bad_pte(vma, address, orig_pte, NULL); |
2909 | return VM_FAULT_OOM; | 2938 | return VM_FAULT_SIGBUS; |
2910 | } | 2939 | } |
2911 | 2940 | ||
2912 | pgoff = pte_to_pgoff(orig_pte); | 2941 | pgoff = pte_to_pgoff(orig_pte); |
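The copy_one_pte()/copy_pte_range() rework above turns a swap_duplicate() failure into an early exit: the offending swap entry is returned, the page-table locks are dropped, add_swap_count_continuation() allocates with GFP_KERNEL, and the copy restarts at the entry that failed. A sketch of that general shape follows; struct table, table_entries(), copy_one_entry() and alloc_continuation() are hypothetical placeholders, and only the "drop the spinlock before a sleeping allocation, then retry" pattern is the point:

    #include <linux/spinlock.h>
    #include <linux/gfp.h>

    struct table;                                                  /* hypothetical */
    extern unsigned long table_entries(struct table *t);           /* hypothetical */
    extern int copy_one_entry(struct table *t, unsigned long idx); /* hypothetical */
    extern int alloc_continuation(struct table *t, unsigned long idx,
                                  gfp_t gfp);                      /* hypothetical */

    static int copy_with_continuation(struct table *t, spinlock_t *lock)
    {
        unsigned long idx = 0;
        int need_alloc;

    again:
        need_alloc = 0;
        spin_lock(lock);
        for (; idx < table_entries(t); idx++) {
            if (copy_one_entry(t, idx) < 0) {  /* would need to allocate */
                need_alloc = 1;
                break;
            }
        }
        spin_unlock(lock);

        if (need_alloc) {
            /* may sleep, so it must happen with the lock dropped */
            if (alloc_continuation(t, idx, GFP_KERNEL) < 0)
                return -ENOMEM;
            goto again;        /* resumes at the entry that failed */
        }
        return 0;
    }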
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 821dee596377..030ce8a5bb0e 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -26,6 +26,8 @@ | |||
26 | #include <linux/migrate.h> | 26 | #include <linux/migrate.h> |
27 | #include <linux/page-isolation.h> | 27 | #include <linux/page-isolation.h> |
28 | #include <linux/pfn.h> | 28 | #include <linux/pfn.h> |
29 | #include <linux/suspend.h> | ||
30 | #include <linux/mm_inline.h> | ||
29 | 31 | ||
30 | #include <asm/tlbflush.h> | 32 | #include <asm/tlbflush.h> |
31 | 33 | ||
@@ -70,7 +72,9 @@ static void get_page_bootmem(unsigned long info, struct page *page, int type) | |||
70 | atomic_inc(&page->_count); | 72 | atomic_inc(&page->_count); |
71 | } | 73 | } |
72 | 74 | ||
73 | void put_page_bootmem(struct page *page) | 75 | /* reference to __meminit __free_pages_bootmem is valid |
76 | * so use __ref to tell modpost not to generate a warning */ | ||
77 | void __ref put_page_bootmem(struct page *page) | ||
74 | { | 78 | { |
75 | int type; | 79 | int type; |
76 | 80 | ||
@@ -447,7 +451,8 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) | |||
447 | } | 451 | } |
448 | #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ | 452 | #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ |
449 | 453 | ||
450 | static pg_data_t *hotadd_new_pgdat(int nid, u64 start) | 454 | /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ |
455 | static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) | ||
451 | { | 456 | { |
452 | struct pglist_data *pgdat; | 457 | struct pglist_data *pgdat; |
453 | unsigned long zones_size[MAX_NR_ZONES] = {0}; | 458 | unsigned long zones_size[MAX_NR_ZONES] = {0}; |
@@ -484,14 +489,18 @@ int __ref add_memory(int nid, u64 start, u64 size) | |||
484 | struct resource *res; | 489 | struct resource *res; |
485 | int ret; | 490 | int ret; |
486 | 491 | ||
492 | lock_system_sleep(); | ||
493 | |||
487 | res = register_memory_resource(start, size); | 494 | res = register_memory_resource(start, size); |
495 | ret = -EEXIST; | ||
488 | if (!res) | 496 | if (!res) |
489 | return -EEXIST; | 497 | goto out; |
490 | 498 | ||
491 | if (!node_online(nid)) { | 499 | if (!node_online(nid)) { |
492 | pgdat = hotadd_new_pgdat(nid, start); | 500 | pgdat = hotadd_new_pgdat(nid, start); |
501 | ret = -ENOMEM; | ||
493 | if (!pgdat) | 502 | if (!pgdat) |
494 | return -ENOMEM; | 503 | goto out; |
495 | new_pgdat = 1; | 504 | new_pgdat = 1; |
496 | } | 505 | } |
497 | 506 | ||
@@ -514,7 +523,8 @@ int __ref add_memory(int nid, u64 start, u64 size) | |||
514 | BUG_ON(ret); | 523 | BUG_ON(ret); |
515 | } | 524 | } |
516 | 525 | ||
517 | return ret; | 526 | goto out; |
527 | |||
518 | error: | 528 | error: |
519 | /* rollback pgdat allocation and others */ | 529 | /* rollback pgdat allocation and others */ |
520 | if (new_pgdat) | 530 | if (new_pgdat) |
@@ -522,6 +532,8 @@ error: | |||
522 | if (res) | 532 | if (res) |
523 | release_memory_resource(res); | 533 | release_memory_resource(res); |
524 | 534 | ||
535 | out: | ||
536 | unlock_system_sleep(); | ||
525 | return ret; | 537 | return ret; |
526 | } | 538 | } |
527 | EXPORT_SYMBOL_GPL(add_memory); | 539 | EXPORT_SYMBOL_GPL(add_memory); |
@@ -663,6 +675,9 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | |||
663 | if (!ret) { /* Success */ | 675 | if (!ret) { /* Success */ |
664 | list_add_tail(&page->lru, &source); | 676 | list_add_tail(&page->lru, &source); |
665 | move_pages--; | 677 | move_pages--; |
678 | inc_zone_page_state(page, NR_ISOLATED_ANON + | ||
679 | page_is_file_cache(page)); | ||
680 | |||
666 | } else { | 681 | } else { |
667 | /* Because we don't have big zone->lock. we should | 682 | /* Because we don't have big zone->lock. we should |
668 | check this again here. */ | 683 | check this again here. */ |
@@ -685,7 +700,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | |||
685 | if (list_empty(&source)) | 700 | if (list_empty(&source)) |
686 | goto out; | 701 | goto out; |
687 | /* this function returns # of failed pages */ | 702 | /* this function returns # of failed pages */ |
688 | ret = migrate_pages(&source, hotremove_migrate_alloc, 0); | 703 | ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1); |
689 | 704 | ||
690 | out: | 705 | out: |
691 | return ret; | 706 | return ret; |
@@ -738,7 +753,7 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) | |||
738 | return offlined; | 753 | return offlined; |
739 | } | 754 | } |
740 | 755 | ||
741 | int offline_pages(unsigned long start_pfn, | 756 | static int offline_pages(unsigned long start_pfn, |
742 | unsigned long end_pfn, unsigned long timeout) | 757 | unsigned long end_pfn, unsigned long timeout) |
743 | { | 758 | { |
744 | unsigned long pfn, nr_pages, expire; | 759 | unsigned long pfn, nr_pages, expire; |
@@ -758,6 +773,8 @@ int offline_pages(unsigned long start_pfn, | |||
758 | if (!test_pages_in_a_zone(start_pfn, end_pfn)) | 773 | if (!test_pages_in_a_zone(start_pfn, end_pfn)) |
759 | return -EINVAL; | 774 | return -EINVAL; |
760 | 775 | ||
776 | lock_system_sleep(); | ||
777 | |||
761 | zone = page_zone(pfn_to_page(start_pfn)); | 778 | zone = page_zone(pfn_to_page(start_pfn)); |
762 | node = zone_to_nid(zone); | 779 | node = zone_to_nid(zone); |
763 | nr_pages = end_pfn - start_pfn; | 780 | nr_pages = end_pfn - start_pfn; |
@@ -765,7 +782,7 @@ int offline_pages(unsigned long start_pfn, | |||
765 | /* set above range as isolated */ | 782 | /* set above range as isolated */ |
766 | ret = start_isolate_page_range(start_pfn, end_pfn); | 783 | ret = start_isolate_page_range(start_pfn, end_pfn); |
767 | if (ret) | 784 | if (ret) |
768 | return ret; | 785 | goto out; |
769 | 786 | ||
770 | arg.start_pfn = start_pfn; | 787 | arg.start_pfn = start_pfn; |
771 | arg.nr_pages = nr_pages; | 788 | arg.nr_pages = nr_pages; |
@@ -838,11 +855,16 @@ repeat: | |||
838 | 855 | ||
839 | setup_per_zone_wmarks(); | 856 | setup_per_zone_wmarks(); |
840 | calculate_zone_inactive_ratio(zone); | 857 | calculate_zone_inactive_ratio(zone); |
858 | if (!node_present_pages(node)) { | ||
859 | node_clear_state(node, N_HIGH_MEMORY); | ||
860 | kswapd_stop(node); | ||
861 | } | ||
841 | 862 | ||
842 | vm_total_pages = nr_free_pagecache_pages(); | 863 | vm_total_pages = nr_free_pagecache_pages(); |
843 | writeback_set_ratelimit(); | 864 | writeback_set_ratelimit(); |
844 | 865 | ||
845 | memory_notify(MEM_OFFLINE, &arg); | 866 | memory_notify(MEM_OFFLINE, &arg); |
867 | unlock_system_sleep(); | ||
846 | return 0; | 868 | return 0; |
847 | 869 | ||
848 | failed_removal: | 870 | failed_removal: |
@@ -852,6 +874,8 @@ failed_removal: | |||
852 | /* pushback to free area */ | 874 | /* pushback to free area */ |
853 | undo_isolate_page_range(start_pfn, end_pfn); | 875 | undo_isolate_page_range(start_pfn, end_pfn); |
854 | 876 | ||
877 | out: | ||
878 | unlock_system_sleep(); | ||
855 | return ret; | 879 | return ret; |
856 | } | 880 | } |
857 | 881 | ||
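add_memory() and offline_pages() above now hold lock_system_sleep() for their whole duration and funnel every failure through a single exit label, so the unlock cannot be skipped on an error path. A minimal sketch of that shape; do_step1() and do_step2() are placeholders, not kernel functions:

    #include <linux/suspend.h>

    extern int do_step1(void);   /* hypothetical */
    extern int do_step2(void);   /* hypothetical */

    static int hotplug_op(void)
    {
        int ret;

        lock_system_sleep();     /* keep hibernation out while we work */

        ret = do_step1();
        if (ret)
            goto out;

        ret = do_step2();
        if (ret)
            goto out;

        ret = 0;
    out:
        unlock_system_sleep();
        return ret;
    }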
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 7dd9d9f80694..290fb5bf0440 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -85,10 +85,12 @@ | |||
85 | #include <linux/seq_file.h> | 85 | #include <linux/seq_file.h> |
86 | #include <linux/proc_fs.h> | 86 | #include <linux/proc_fs.h> |
87 | #include <linux/migrate.h> | 87 | #include <linux/migrate.h> |
88 | #include <linux/ksm.h> | ||
88 | #include <linux/rmap.h> | 89 | #include <linux/rmap.h> |
89 | #include <linux/security.h> | 90 | #include <linux/security.h> |
90 | #include <linux/syscalls.h> | 91 | #include <linux/syscalls.h> |
91 | #include <linux/ctype.h> | 92 | #include <linux/ctype.h> |
93 | #include <linux/mm_inline.h> | ||
92 | 94 | ||
93 | #include <asm/tlbflush.h> | 95 | #include <asm/tlbflush.h> |
94 | #include <asm/uaccess.h> | 96 | #include <asm/uaccess.h> |
@@ -412,17 +414,11 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
412 | if (!page) | 414 | if (!page) |
413 | continue; | 415 | continue; |
414 | /* | 416 | /* |
415 | * The check for PageReserved here is important to avoid | 417 | * vm_normal_page() filters out zero pages, but there might |
416 | * handling zero pages and other pages that may have been | 418 | * still be PageReserved pages to skip, perhaps in a VDSO. |
417 | * marked special by the system. | 419 | * And we cannot move PageKsm pages sensibly or safely yet. |
418 | * | ||
419 | * If the PageReserved would not be checked here then f.e. | ||
420 | * the location of the zero page could have an influence | ||
421 | * on MPOL_MF_STRICT, zero pages would be counted for | ||
422 | * the per node stats, and there would be useless attempts | ||
423 | * to put zero pages on the migration list. | ||
424 | */ | 420 | */ |
425 | if (PageReserved(page)) | 421 | if (PageReserved(page) || PageKsm(page)) |
426 | continue; | 422 | continue; |
427 | nid = page_to_nid(page); | 423 | nid = page_to_nid(page); |
428 | if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) | 424 | if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) |
@@ -809,6 +805,8 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, | |||
809 | if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { | 805 | if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { |
810 | if (!isolate_lru_page(page)) { | 806 | if (!isolate_lru_page(page)) { |
811 | list_add_tail(&page->lru, pagelist); | 807 | list_add_tail(&page->lru, pagelist); |
808 | inc_zone_page_state(page, NR_ISOLATED_ANON + | ||
809 | page_is_file_cache(page)); | ||
812 | } | 810 | } |
813 | } | 811 | } |
814 | } | 812 | } |
@@ -836,7 +834,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, | |||
836 | flags | MPOL_MF_DISCONTIG_OK, &pagelist); | 834 | flags | MPOL_MF_DISCONTIG_OK, &pagelist); |
837 | 835 | ||
838 | if (!list_empty(&pagelist)) | 836 | if (!list_empty(&pagelist)) |
839 | err = migrate_pages(&pagelist, new_node_page, dest); | 837 | err = migrate_pages(&pagelist, new_node_page, dest, 0); |
840 | 838 | ||
841 | return err; | 839 | return err; |
842 | } | 840 | } |
@@ -1024,7 +1022,7 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
1024 | 1022 | ||
1025 | err = migrate_prep(); | 1023 | err = migrate_prep(); |
1026 | if (err) | 1024 | if (err) |
1027 | return err; | 1025 | goto mpol_out; |
1028 | } | 1026 | } |
1029 | { | 1027 | { |
1030 | NODEMASK_SCRATCH(scratch); | 1028 | NODEMASK_SCRATCH(scratch); |
@@ -1039,10 +1037,9 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
1039 | err = -ENOMEM; | 1037 | err = -ENOMEM; |
1040 | NODEMASK_SCRATCH_FREE(scratch); | 1038 | NODEMASK_SCRATCH_FREE(scratch); |
1041 | } | 1039 | } |
1042 | if (err) { | 1040 | if (err) |
1043 | mpol_put(new); | 1041 | goto mpol_out; |
1044 | return err; | 1042 | |
1045 | } | ||
1046 | vma = check_range(mm, start, end, nmask, | 1043 | vma = check_range(mm, start, end, nmask, |
1047 | flags | MPOL_MF_INVERT, &pagelist); | 1044 | flags | MPOL_MF_INVERT, &pagelist); |
1048 | 1045 | ||
@@ -1054,13 +1051,15 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
1054 | 1051 | ||
1055 | if (!list_empty(&pagelist)) | 1052 | if (!list_empty(&pagelist)) |
1056 | nr_failed = migrate_pages(&pagelist, new_vma_page, | 1053 | nr_failed = migrate_pages(&pagelist, new_vma_page, |
1057 | (unsigned long)vma); | 1054 | (unsigned long)vma, 0); |
1058 | 1055 | ||
1059 | if (!err && nr_failed && (flags & MPOL_MF_STRICT)) | 1056 | if (!err && nr_failed && (flags & MPOL_MF_STRICT)) |
1060 | err = -EIO; | 1057 | err = -EIO; |
1061 | } | 1058 | } else |
1059 | putback_lru_pages(&pagelist); | ||
1062 | 1060 | ||
1063 | up_write(&mm->mmap_sem); | 1061 | up_write(&mm->mmap_sem); |
1062 | mpol_out: | ||
1064 | mpol_put(new); | 1063 | mpol_put(new); |
1065 | return err; | 1064 | return err; |
1066 | } | 1065 | } |
@@ -1564,6 +1563,53 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, | |||
1564 | } | 1563 | } |
1565 | return zl; | 1564 | return zl; |
1566 | } | 1565 | } |
1566 | |||
1567 | /* | ||
1568 | * init_nodemask_of_mempolicy | ||
1569 | * | ||
1570 | * If the current task's mempolicy is "default" [NULL], return 'false' | ||
1571 | * to indicate default policy. Otherwise, extract the policy nodemask | ||
1572 | * for 'bind' or 'interleave' policy into the argument nodemask, or | ||
1573 | * initialize the argument nodemask to contain the single node for | ||
1574 | * 'preferred' or 'local' policy and return 'true' to indicate presence | ||
1575 | * of non-default mempolicy. | ||
1576 | * | ||
1577 | * We don't bother with reference counting the mempolicy [mpol_get/put] | ||
1578 | * because the current task is examining its own mempolicy and a task's | ||
1579 | * mempolicy is only ever changed by the task itself. | ||
1580 | * | ||
1581 | * N.B., it is the caller's responsibility to free a returned nodemask. | ||
1582 | */ | ||
1583 | bool init_nodemask_of_mempolicy(nodemask_t *mask) | ||
1584 | { | ||
1585 | struct mempolicy *mempolicy; | ||
1586 | int nid; | ||
1587 | |||
1588 | if (!(mask && current->mempolicy)) | ||
1589 | return false; | ||
1590 | |||
1591 | mempolicy = current->mempolicy; | ||
1592 | switch (mempolicy->mode) { | ||
1593 | case MPOL_PREFERRED: | ||
1594 | if (mempolicy->flags & MPOL_F_LOCAL) | ||
1595 | nid = numa_node_id(); | ||
1596 | else | ||
1597 | nid = mempolicy->v.preferred_node; | ||
1598 | init_nodemask_of_node(mask, nid); | ||
1599 | break; | ||
1600 | |||
1601 | case MPOL_BIND: | ||
1602 | /* Fall through */ | ||
1603 | case MPOL_INTERLEAVE: | ||
1604 | *mask = mempolicy->v.nodes; | ||
1605 | break; | ||
1606 | |||
1607 | default: | ||
1608 | BUG(); | ||
1609 | } | ||
1610 | |||
1611 | return true; | ||
1612 | } | ||
1567 | #endif | 1613 | #endif |
1568 | 1614 | ||
1569 | /* Allocate a page in interleaved policy. | 1615 | /* Allocate a page in interleaved policy. |
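Below is a hedged sketch of the caller pattern init_nodemask_of_mempolicy() is written for (the hugetlb "nodes allowed" handling elsewhere in this merge is the intended in-tree user): restrict a node walk to the task's policy nodes when a non-default policy exists, otherwise fall back to every node with memory. use_node() is supplied by the caller, and a real user would avoid putting a potentially large nodemask_t on the stack:

    #include <linux/mempolicy.h>
    #include <linux/nodemask.h>

    static void for_each_allowed_node(void (*use_node)(int nid))
    {
        nodemask_t allowed;
        int nid;

        /* false means the task runs the default policy: use all nodes */
        if (!init_nodemask_of_mempolicy(&allowed))
            allowed = node_states[N_HIGH_MEMORY];

        for_each_node_mask(nid, allowed)
            use_node(nid);
    }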
diff --git a/mm/migrate.c b/mm/migrate.c index 1a4bf4813780..efddbf0926b2 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/mm_inline.h> | 21 | #include <linux/mm_inline.h> |
22 | #include <linux/nsproxy.h> | 22 | #include <linux/nsproxy.h> |
23 | #include <linux/pagevec.h> | 23 | #include <linux/pagevec.h> |
24 | #include <linux/ksm.h> | ||
24 | #include <linux/rmap.h> | 25 | #include <linux/rmap.h> |
25 | #include <linux/topology.h> | 26 | #include <linux/topology.h> |
26 | #include <linux/cpu.h> | 27 | #include <linux/cpu.h> |
@@ -78,8 +79,8 @@ int putback_lru_pages(struct list_head *l) | |||
78 | /* | 79 | /* |
79 | * Restore a potential migration pte to a working pte entry | 80 | * Restore a potential migration pte to a working pte entry |
80 | */ | 81 | */ |
81 | static void remove_migration_pte(struct vm_area_struct *vma, | 82 | static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, |
82 | struct page *old, struct page *new) | 83 | unsigned long addr, void *old) |
83 | { | 84 | { |
84 | struct mm_struct *mm = vma->vm_mm; | 85 | struct mm_struct *mm = vma->vm_mm; |
85 | swp_entry_t entry; | 86 | swp_entry_t entry; |
@@ -88,40 +89,37 @@ static void remove_migration_pte(struct vm_area_struct *vma, | |||
88 | pmd_t *pmd; | 89 | pmd_t *pmd; |
89 | pte_t *ptep, pte; | 90 | pte_t *ptep, pte; |
90 | spinlock_t *ptl; | 91 | spinlock_t *ptl; |
91 | unsigned long addr = page_address_in_vma(new, vma); | ||
92 | |||
93 | if (addr == -EFAULT) | ||
94 | return; | ||
95 | 92 | ||
96 | pgd = pgd_offset(mm, addr); | 93 | pgd = pgd_offset(mm, addr); |
97 | if (!pgd_present(*pgd)) | 94 | if (!pgd_present(*pgd)) |
98 | return; | 95 | goto out; |
99 | 96 | ||
100 | pud = pud_offset(pgd, addr); | 97 | pud = pud_offset(pgd, addr); |
101 | if (!pud_present(*pud)) | 98 | if (!pud_present(*pud)) |
102 | return; | 99 | goto out; |
103 | 100 | ||
104 | pmd = pmd_offset(pud, addr); | 101 | pmd = pmd_offset(pud, addr); |
105 | if (!pmd_present(*pmd)) | 102 | if (!pmd_present(*pmd)) |
106 | return; | 103 | goto out; |
107 | 104 | ||
108 | ptep = pte_offset_map(pmd, addr); | 105 | ptep = pte_offset_map(pmd, addr); |
109 | 106 | ||
110 | if (!is_swap_pte(*ptep)) { | 107 | if (!is_swap_pte(*ptep)) { |
111 | pte_unmap(ptep); | 108 | pte_unmap(ptep); |
112 | return; | 109 | goto out; |
113 | } | 110 | } |
114 | 111 | ||
115 | ptl = pte_lockptr(mm, pmd); | 112 | ptl = pte_lockptr(mm, pmd); |
116 | spin_lock(ptl); | 113 | spin_lock(ptl); |
117 | pte = *ptep; | 114 | pte = *ptep; |
118 | if (!is_swap_pte(pte)) | 115 | if (!is_swap_pte(pte)) |
119 | goto out; | 116 | goto unlock; |
120 | 117 | ||
121 | entry = pte_to_swp_entry(pte); | 118 | entry = pte_to_swp_entry(pte); |
122 | 119 | ||
123 | if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old) | 120 | if (!is_migration_entry(entry) || |
124 | goto out; | 121 | migration_entry_to_page(entry) != old) |
122 | goto unlock; | ||
125 | 123 | ||
126 | get_page(new); | 124 | get_page(new); |
127 | pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); | 125 | pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); |
@@ -137,58 +135,10 @@ static void remove_migration_pte(struct vm_area_struct *vma, | |||
137 | 135 | ||
138 | /* No need to invalidate - it was non-present before */ | 136 | /* No need to invalidate - it was non-present before */ |
139 | update_mmu_cache(vma, addr, pte); | 137 | update_mmu_cache(vma, addr, pte); |
140 | 138 | unlock: | |
141 | out: | ||
142 | pte_unmap_unlock(ptep, ptl); | 139 | pte_unmap_unlock(ptep, ptl); |
143 | } | 140 | out: |
144 | 141 | return SWAP_AGAIN; | |
145 | /* | ||
146 | * Note that remove_file_migration_ptes will only work on regular mappings, | ||
147 | * Nonlinear mappings do not use migration entries. | ||
148 | */ | ||
149 | static void remove_file_migration_ptes(struct page *old, struct page *new) | ||
150 | { | ||
151 | struct vm_area_struct *vma; | ||
152 | struct address_space *mapping = new->mapping; | ||
153 | struct prio_tree_iter iter; | ||
154 | pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
155 | |||
156 | if (!mapping) | ||
157 | return; | ||
158 | |||
159 | spin_lock(&mapping->i_mmap_lock); | ||
160 | |||
161 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) | ||
162 | remove_migration_pte(vma, old, new); | ||
163 | |||
164 | spin_unlock(&mapping->i_mmap_lock); | ||
165 | } | ||
166 | |||
167 | /* | ||
168 | * Must hold mmap_sem lock on at least one of the vmas containing | ||
169 | * the page so that the anon_vma cannot vanish. | ||
170 | */ | ||
171 | static void remove_anon_migration_ptes(struct page *old, struct page *new) | ||
172 | { | ||
173 | struct anon_vma *anon_vma; | ||
174 | struct vm_area_struct *vma; | ||
175 | unsigned long mapping; | ||
176 | |||
177 | mapping = (unsigned long)new->mapping; | ||
178 | |||
179 | if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0) | ||
180 | return; | ||
181 | |||
182 | /* | ||
183 | * We hold the mmap_sem lock. So no need to call page_lock_anon_vma. | ||
184 | */ | ||
185 | anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON); | ||
186 | spin_lock(&anon_vma->lock); | ||
187 | |||
188 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) | ||
189 | remove_migration_pte(vma, old, new); | ||
190 | |||
191 | spin_unlock(&anon_vma->lock); | ||
192 | } | 142 | } |
193 | 143 | ||
194 | /* | 144 | /* |
@@ -197,10 +147,7 @@ static void remove_anon_migration_ptes(struct page *old, struct page *new) | |||
197 | */ | 147 | */ |
198 | static void remove_migration_ptes(struct page *old, struct page *new) | 148 | static void remove_migration_ptes(struct page *old, struct page *new) |
199 | { | 149 | { |
200 | if (PageAnon(new)) | 150 | rmap_walk(new, remove_migration_pte, old); |
201 | remove_anon_migration_ptes(old, new); | ||
202 | else | ||
203 | remove_file_migration_ptes(old, new); | ||
204 | } | 151 | } |
205 | 152 | ||
206 | /* | 153 | /* |
@@ -341,8 +288,8 @@ static void migrate_page_copy(struct page *newpage, struct page *page) | |||
341 | if (TestClearPageActive(page)) { | 288 | if (TestClearPageActive(page)) { |
342 | VM_BUG_ON(PageUnevictable(page)); | 289 | VM_BUG_ON(PageUnevictable(page)); |
343 | SetPageActive(newpage); | 290 | SetPageActive(newpage); |
344 | } else | 291 | } else if (TestClearPageUnevictable(page)) |
345 | unevictable_migrate_page(newpage, page); | 292 | SetPageUnevictable(newpage); |
346 | if (PageChecked(page)) | 293 | if (PageChecked(page)) |
347 | SetPageChecked(newpage); | 294 | SetPageChecked(newpage); |
348 | if (PageMappedToDisk(page)) | 295 | if (PageMappedToDisk(page)) |
@@ -361,6 +308,7 @@ static void migrate_page_copy(struct page *newpage, struct page *page) | |||
361 | } | 308 | } |
362 | 309 | ||
363 | mlock_migrate_page(newpage, page); | 310 | mlock_migrate_page(newpage, page); |
311 | ksm_migrate_page(newpage, page); | ||
364 | 312 | ||
365 | ClearPageSwapCache(page); | 313 | ClearPageSwapCache(page); |
366 | ClearPagePrivate(page); | 314 | ClearPagePrivate(page); |
@@ -580,9 +528,9 @@ static int move_to_new_page(struct page *newpage, struct page *page) | |||
580 | else | 528 | else |
581 | rc = fallback_migrate_page(mapping, newpage, page); | 529 | rc = fallback_migrate_page(mapping, newpage, page); |
582 | 530 | ||
583 | if (!rc) { | 531 | if (!rc) |
584 | remove_migration_ptes(page, newpage); | 532 | remove_migration_ptes(page, newpage); |
585 | } else | 533 | else |
586 | newpage->mapping = NULL; | 534 | newpage->mapping = NULL; |
587 | 535 | ||
588 | unlock_page(newpage); | 536 | unlock_page(newpage); |
@@ -595,14 +543,14 @@ static int move_to_new_page(struct page *newpage, struct page *page) | |||
595 | * to the newly allocated page in newpage. | 543 | * to the newly allocated page in newpage. |
596 | */ | 544 | */ |
597 | static int unmap_and_move(new_page_t get_new_page, unsigned long private, | 545 | static int unmap_and_move(new_page_t get_new_page, unsigned long private, |
598 | struct page *page, int force) | 546 | struct page *page, int force, int offlining) |
599 | { | 547 | { |
600 | int rc = 0; | 548 | int rc = 0; |
601 | int *result = NULL; | 549 | int *result = NULL; |
602 | struct page *newpage = get_new_page(page, private, &result); | 550 | struct page *newpage = get_new_page(page, private, &result); |
603 | int rcu_locked = 0; | 551 | int rcu_locked = 0; |
604 | int charge = 0; | 552 | int charge = 0; |
605 | struct mem_cgroup *mem; | 553 | struct mem_cgroup *mem = NULL; |
606 | 554 | ||
607 | if (!newpage) | 555 | if (!newpage) |
608 | return -ENOMEM; | 556 | return -ENOMEM; |
@@ -621,6 +569,20 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
621 | lock_page(page); | 569 | lock_page(page); |
622 | } | 570 | } |
623 | 571 | ||
572 | /* | ||
573 | * Only memory hotplug's offline_pages() caller has locked out KSM, | ||
574 | * and can safely migrate a KSM page. The other cases have skipped | ||
575 | * PageKsm along with PageReserved - but it is only now when we have | ||
576 | * the page lock that we can be certain it will not go KSM beneath us | ||
577 | * (KSM will not upgrade a page from PageAnon to PageKsm when it sees | ||
578 | * its pagecount raised, but only here do we take the page lock which | ||
579 | * serializes that). | ||
580 | */ | ||
581 | if (PageKsm(page) && !offlining) { | ||
582 | rc = -EBUSY; | ||
583 | goto unlock; | ||
584 | } | ||
585 | |||
624 | /* charge against new page */ | 586 | /* charge against new page */ |
625 | charge = mem_cgroup_prepare_migration(page, &mem); | 587 | charge = mem_cgroup_prepare_migration(page, &mem); |
626 | if (charge == -ENOMEM) { | 588 | if (charge == -ENOMEM) { |
@@ -737,7 +699,7 @@ move_newpage: | |||
737 | * Return: Number of pages not migrated or error code. | 699 | * Return: Number of pages not migrated or error code. |
738 | */ | 700 | */ |
739 | int migrate_pages(struct list_head *from, | 701 | int migrate_pages(struct list_head *from, |
740 | new_page_t get_new_page, unsigned long private) | 702 | new_page_t get_new_page, unsigned long private, int offlining) |
741 | { | 703 | { |
742 | int retry = 1; | 704 | int retry = 1; |
743 | int nr_failed = 0; | 705 | int nr_failed = 0; |
@@ -746,13 +708,6 @@ int migrate_pages(struct list_head *from, | |||
746 | struct page *page2; | 708 | struct page *page2; |
747 | int swapwrite = current->flags & PF_SWAPWRITE; | 709 | int swapwrite = current->flags & PF_SWAPWRITE; |
748 | int rc; | 710 | int rc; |
749 | unsigned long flags; | ||
750 | |||
751 | local_irq_save(flags); | ||
752 | list_for_each_entry(page, from, lru) | ||
753 | __inc_zone_page_state(page, NR_ISOLATED_ANON + | ||
754 | page_is_file_cache(page)); | ||
755 | local_irq_restore(flags); | ||
756 | 711 | ||
757 | if (!swapwrite) | 712 | if (!swapwrite) |
758 | current->flags |= PF_SWAPWRITE; | 713 | current->flags |= PF_SWAPWRITE; |
@@ -764,7 +719,7 @@ int migrate_pages(struct list_head *from, | |||
764 | cond_resched(); | 719 | cond_resched(); |
765 | 720 | ||
766 | rc = unmap_and_move(get_new_page, private, | 721 | rc = unmap_and_move(get_new_page, private, |
767 | page, pass > 2); | 722 | page, pass > 2, offlining); |
768 | 723 | ||
769 | switch(rc) { | 724 | switch(rc) { |
770 | case -ENOMEM: | 725 | case -ENOMEM: |
@@ -860,7 +815,8 @@ static int do_move_page_to_node_array(struct mm_struct *mm, | |||
860 | if (!page) | 815 | if (!page) |
861 | goto set_status; | 816 | goto set_status; |
862 | 817 | ||
863 | if (PageReserved(page)) /* Check for zero page */ | 818 | /* Use PageReserved to check for zero page */ |
819 | if (PageReserved(page) || PageKsm(page)) | ||
864 | goto put_and_set; | 820 | goto put_and_set; |
865 | 821 | ||
866 | pp->page = page; | 822 | pp->page = page; |
@@ -878,8 +834,11 @@ static int do_move_page_to_node_array(struct mm_struct *mm, | |||
878 | goto put_and_set; | 834 | goto put_and_set; |
879 | 835 | ||
880 | err = isolate_lru_page(page); | 836 | err = isolate_lru_page(page); |
881 | if (!err) | 837 | if (!err) { |
882 | list_add_tail(&page->lru, &pagelist); | 838 | list_add_tail(&page->lru, &pagelist); |
839 | inc_zone_page_state(page, NR_ISOLATED_ANON + | ||
840 | page_is_file_cache(page)); | ||
841 | } | ||
883 | put_and_set: | 842 | put_and_set: |
884 | /* | 843 | /* |
885 | * Either remove the duplicate refcount from | 844 | * Either remove the duplicate refcount from |
@@ -894,7 +853,7 @@ set_status: | |||
894 | err = 0; | 853 | err = 0; |
895 | if (!list_empty(&pagelist)) | 854 | if (!list_empty(&pagelist)) |
896 | err = migrate_pages(&pagelist, new_page_node, | 855 | err = migrate_pages(&pagelist, new_page_node, |
897 | (unsigned long)pm); | 856 | (unsigned long)pm, 0); |
898 | 857 | ||
899 | up_read(&mm->mmap_sem); | 858 | up_read(&mm->mmap_sem); |
900 | return err; | 859 | return err; |
@@ -1015,7 +974,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, | |||
1015 | 974 | ||
1016 | err = -ENOENT; | 975 | err = -ENOENT; |
1017 | /* Use PageReserved to check for zero page */ | 976 | /* Use PageReserved to check for zero page */ |
1018 | if (!page || PageReserved(page)) | 977 | if (!page || PageReserved(page) || PageKsm(page)) |
1019 | goto set_status; | 978 | goto set_status; |
1020 | 979 | ||
1021 | err = page_to_nid(page); | 980 | err = page_to_nid(page); |
@@ -1044,7 +1003,7 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages, | |||
1044 | int err; | 1003 | int err; |
1045 | 1004 | ||
1046 | for (i = 0; i < nr_pages; i += chunk_nr) { | 1005 | for (i = 0; i < nr_pages; i += chunk_nr) { |
1047 | if (chunk_nr + i > nr_pages) | 1006 | if (chunk_nr > nr_pages - i) |
1048 | chunk_nr = nr_pages - i; | 1007 | chunk_nr = nr_pages - i; |
1049 | 1008 | ||
1050 | err = copy_from_user(chunk_pages, &pages[i], | 1009 | err = copy_from_user(chunk_pages, &pages[i], |
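The migrate.c hunks above change two caller-visible conventions: migrate_pages() gains an offlining argument (only the memory-offlining path passes 1, which is what permits migrating PageKsm pages), and the NR_ISOLATED_ANON/NR_ISOLATED_FILE accounting now happens where a page is isolated instead of inside migrate_pages(). A minimal sketch of an updated caller, modelled on the do_move_page_to_node_array() hunk above; the new_node_page() callback and move_one_page() helper are illustrative, not part of the patch:

        /* Allocation callback matching new_page_t: give back a page on 'node'. */
        static struct page *new_node_page(struct page *page, unsigned long node, int **x)
        {
                return alloc_pages_exact_node((int)node,
                                GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0);
        }

        static int move_one_page(struct page *page, int target_node)
        {
                LIST_HEAD(pagelist);
                int err;

                err = isolate_lru_page(page);
                if (err)
                        return err;

                /* the isolating caller bumps the NR_ISOLATED_* counter itself now */
                inc_zone_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page));
                list_add_tail(&page->lru, &pagelist);

                /* offlining == 0: KSM pages are refused (-EBUSY) in unmap_and_move() */
                return migrate_pages(&pagelist, new_node_page, target_node, 0);
        }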
diff --git a/mm/mincore.c b/mm/mincore.c index 8cb508f84ea4..7a3436ef39eb 100644 --- a/mm/mincore.c +++ b/mm/mincore.c | |||
@@ -14,6 +14,7 @@ | |||
14 | #include <linux/syscalls.h> | 14 | #include <linux/syscalls.h> |
15 | #include <linux/swap.h> | 15 | #include <linux/swap.h> |
16 | #include <linux/swapops.h> | 16 | #include <linux/swapops.h> |
17 | #include <linux/hugetlb.h> | ||
17 | 18 | ||
18 | #include <asm/uaccess.h> | 19 | #include <asm/uaccess.h> |
19 | #include <asm/pgtable.h> | 20 | #include <asm/pgtable.h> |
@@ -72,6 +73,42 @@ static long do_mincore(unsigned long addr, unsigned char *vec, unsigned long pag | |||
72 | if (!vma || addr < vma->vm_start) | 73 | if (!vma || addr < vma->vm_start) |
73 | return -ENOMEM; | 74 | return -ENOMEM; |
74 | 75 | ||
76 | #ifdef CONFIG_HUGETLB_PAGE | ||
77 | if (is_vm_hugetlb_page(vma)) { | ||
78 | struct hstate *h; | ||
79 | unsigned long nr_huge; | ||
80 | unsigned char present; | ||
81 | |||
82 | i = 0; | ||
83 | nr = min(pages, (vma->vm_end - addr) >> PAGE_SHIFT); | ||
84 | h = hstate_vma(vma); | ||
85 | nr_huge = ((addr + pages * PAGE_SIZE - 1) >> huge_page_shift(h)) | ||
86 | - (addr >> huge_page_shift(h)) + 1; | ||
87 | nr_huge = min(nr_huge, | ||
88 | (vma->vm_end - addr) >> huge_page_shift(h)); | ||
89 | while (1) { | ||
90 | /* hugepages are always in RAM for now, | ||
91 | * but generally this needs to be checked */ | ||
92 | ptep = huge_pte_offset(current->mm, | ||
93 | addr & huge_page_mask(h)); | ||
94 | present = !!(ptep && | ||
95 | !huge_pte_none(huge_ptep_get(ptep))); | ||
96 | while (1) { | ||
97 | vec[i++] = present; | ||
98 | addr += PAGE_SIZE; | ||
99 | /* reach buffer limit */ | ||
100 | if (i == nr) | ||
101 | return nr; | ||
102 | /* check hugepage border */ | ||
103 | if (!((addr & ~huge_page_mask(h)) | ||
104 | >> PAGE_SHIFT)) | ||
105 | break; | ||
106 | } | ||
107 | } | ||
108 | return nr; | ||
109 | } | ||
110 | #endif | ||
111 | |||
75 | /* | 112 | /* |
76 | * Calculate how many pages there are left in the last level of the | 113 | * Calculate how many pages there are left in the last level of the |
77 | * PTE array for our address. | 114 | * PTE array for our address. |
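With the hunk above, mincore() covers hugetlb VMAs: every small-page slot backed by a present hugepage is reported as resident. A userspace sketch, assuming a 2MB default hugepage size, 4KB base pages and pre-reserved hugepages (the MAP_HUGETLB value is the x86 one and is only illustrative):

        #include <stdio.h>
        #include <string.h>
        #include <sys/mman.h>

        #ifndef MAP_HUGETLB
        #define MAP_HUGETLB 0x40000             /* x86 value, illustrative */
        #endif

        int main(void)
        {
                size_t len = 2 * 1024 * 1024;   /* one 2MB hugepage (assumed) */
                unsigned char vec[512];         /* len / 4KB slots */
                void *p;

                p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
                if (p == MAP_FAILED)
                        return 1;
                memset(p, 0, len);              /* fault the hugepage in */

                if (mincore(p, len, vec) == 0)
                        printf("first slot resident: %d\n", vec[0]);
                munmap(p, len);
                return 0;
        }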
diff --git a/mm/mlock.c b/mm/mlock.c index bd6f0e466f6c..2b8335a89400 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
@@ -88,25 +88,22 @@ void mlock_vma_page(struct page *page) | |||
88 | } | 88 | } |
89 | } | 89 | } |
90 | 90 | ||
91 | /* | 91 | /** |
92 | * called from munlock()/munmap() path with page supposedly on the LRU. | 92 | * munlock_vma_page - munlock a vma page |
93 | * @page - page to be unlocked | ||
93 | * | 94 | * |
94 | * Note: unlike mlock_vma_page(), we can't just clear the PageMlocked | 95 | * called from munlock()/munmap() path with page supposedly on the LRU. |
95 | * [in try_to_munlock()] and then attempt to isolate the page. We must | 96 | * When we munlock a page, because the vma where we found the page is being |
96 | * isolate the page to keep others from messing with its unevictable | 97 | * munlock()ed or munmap()ed, we want to check whether other vmas hold the |
97 | * and mlocked state while trying to munlock. However, we pre-clear the | 98 | * page locked so that we can leave it on the unevictable lru list and not |
98 | * mlocked state anyway as we might lose the isolation race and we might | 99 | * bother vmscan with it. However, to walk the page's rmap list in |
99 | * not get another chance to clear PageMlocked. If we successfully | 100 | * try_to_munlock() we must isolate the page from the LRU. If some other |
100 | * isolate the page and try_to_munlock() detects other VM_LOCKED vmas | 101 | * task has removed the page from the LRU, we won't be able to do that. |
101 | * mapping the page, it will restore the PageMlocked state, unless the page | 102 | * So we clear the PageMlocked as we might not get another chance. If we |
102 | * is mapped in a non-linear vma. So, we go ahead and SetPageMlocked(), | 103 | * can't isolate the page, we leave it for putback_lru_page() and vmscan |
103 | * perhaps redundantly. | 104 | * [page_referenced()/try_to_unmap()] to deal with. |
104 | * If we lose the isolation race, and the page is mapped by other VM_LOCKED | ||
105 | * vmas, we'll detect this in vmscan--via try_to_munlock() or try_to_unmap() | ||
106 | * either of which will restore the PageMlocked state by calling | ||
107 | * mlock_vma_page() above, if it can grab the vma's mmap sem. | ||
108 | */ | 105 | */ |
109 | static void munlock_vma_page(struct page *page) | 106 | void munlock_vma_page(struct page *page) |
110 | { | 107 | { |
111 | BUG_ON(!PageLocked(page)); | 108 | BUG_ON(!PageLocked(page)); |
112 | 109 | ||
@@ -117,18 +114,18 @@ static void munlock_vma_page(struct page *page) | |||
117 | /* | 114 | /* |
118 | * did try_to_unlock() succeed or punt? | 115 | * did try_to_unlock() succeed or punt? |
119 | */ | 116 | */ |
120 | if (ret == SWAP_SUCCESS || ret == SWAP_AGAIN) | 117 | if (ret != SWAP_MLOCK) |
121 | count_vm_event(UNEVICTABLE_PGMUNLOCKED); | 118 | count_vm_event(UNEVICTABLE_PGMUNLOCKED); |
122 | 119 | ||
123 | putback_lru_page(page); | 120 | putback_lru_page(page); |
124 | } else { | 121 | } else { |
125 | /* | 122 | /* |
126 | * We lost the race. let try_to_unmap() deal | 123 | * Some other task has removed the page from the LRU. |
127 | * with it. At least we get the page state and | 124 | * putback_lru_page() will take care of removing the |
128 | * mlock stats right. However, page is still on | 125 | * page from the unevictable list, if necessary. |
129 | * the noreclaim list. We'll fix that up when | 126 | * vmscan [page_referenced()] will move the page back |
130 | * the page is eventually freed or we scan the | 127 | * to the unevictable list if some other vma has it |
131 | * noreclaim list. | 128 | * mlocked. |
132 | */ | 129 | */ |
133 | if (PageUnevictable(page)) | 130 | if (PageUnevictable(page)) |
134 | count_vm_event(UNEVICTABLE_PGSTRANDED); | 131 | count_vm_event(UNEVICTABLE_PGSTRANDED); |
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
@@ -20,7 +20,6 @@ | |||
20 | #include <linux/fs.h> | 20 | #include <linux/fs.h> |
21 | #include <linux/personality.h> | 21 | #include <linux/personality.h> |
22 | #include <linux/security.h> | 22 | #include <linux/security.h> |
23 | #include <linux/ima.h> | ||
24 | #include <linux/hugetlb.h> | 23 | #include <linux/hugetlb.h> |
25 | #include <linux/profile.h> | 24 | #include <linux/profile.h> |
26 | #include <linux/module.h> | 25 | #include <linux/module.h> |
@@ -932,13 +931,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | |||
932 | if (!(flags & MAP_FIXED)) | 931 | if (!(flags & MAP_FIXED)) |
933 | addr = round_hint_to_min(addr); | 932 | addr = round_hint_to_min(addr); |
934 | 933 | ||
935 | error = arch_mmap_check(addr, len, flags); | ||
936 | if (error) | ||
937 | return error; | ||
938 | |||
939 | /* Careful about overflows.. */ | 934 | /* Careful about overflows.. */ |
940 | len = PAGE_ALIGN(len); | 935 | len = PAGE_ALIGN(len); |
941 | if (!len || len > TASK_SIZE) | 936 | if (!len) |
942 | return -ENOMEM; | 937 | return -ENOMEM; |
943 | 938 | ||
944 | /* offset overflow? */ | 939 | /* offset overflow? */ |
@@ -949,24 +944,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | |||
949 | if (mm->map_count > sysctl_max_map_count) | 944 | if (mm->map_count > sysctl_max_map_count) |
950 | return -ENOMEM; | 945 | return -ENOMEM; |
951 | 946 | ||
952 | if (flags & MAP_HUGETLB) { | ||
953 | struct user_struct *user = NULL; | ||
954 | if (file) | ||
955 | return -EINVAL; | ||
956 | |||
957 | /* | ||
958 | * VM_NORESERVE is used because the reservations will be | ||
959 | * taken when vm_ops->mmap() is called | ||
960 | * A dummy user value is used because we are not locking | ||
961 | * memory so no accounting is necessary | ||
962 | */ | ||
963 | len = ALIGN(len, huge_page_size(&default_hstate)); | ||
964 | file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE, | ||
965 | &user, HUGETLB_ANONHUGE_INODE); | ||
966 | if (IS_ERR(file)) | ||
967 | return PTR_ERR(file); | ||
968 | } | ||
969 | |||
970 | /* Obtain the address to map to. we verify (or select) it and ensure | 947 | /* Obtain the address to map to. we verify (or select) it and ensure |
971 | * that it represents a valid section of the address space. | 948 | * that it represents a valid section of the address space. |
972 | */ | 949 | */ |
@@ -1061,14 +1038,51 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | |||
1061 | error = security_file_mmap(file, reqprot, prot, flags, addr, 0); | 1038 | error = security_file_mmap(file, reqprot, prot, flags, addr, 0); |
1062 | if (error) | 1039 | if (error) |
1063 | return error; | 1040 | return error; |
1064 | error = ima_file_mmap(file, prot); | ||
1065 | if (error) | ||
1066 | return error; | ||
1067 | 1041 | ||
1068 | return mmap_region(file, addr, len, flags, vm_flags, pgoff); | 1042 | return mmap_region(file, addr, len, flags, vm_flags, pgoff); |
1069 | } | 1043 | } |
1070 | EXPORT_SYMBOL(do_mmap_pgoff); | 1044 | EXPORT_SYMBOL(do_mmap_pgoff); |
1071 | 1045 | ||
1046 | SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, | ||
1047 | unsigned long, prot, unsigned long, flags, | ||
1048 | unsigned long, fd, unsigned long, pgoff) | ||
1049 | { | ||
1050 | struct file *file = NULL; | ||
1051 | unsigned long retval = -EBADF; | ||
1052 | |||
1053 | if (!(flags & MAP_ANONYMOUS)) { | ||
1054 | if (unlikely(flags & MAP_HUGETLB)) | ||
1055 | return -EINVAL; | ||
1056 | file = fget(fd); | ||
1057 | if (!file) | ||
1058 | goto out; | ||
1059 | } else if (flags & MAP_HUGETLB) { | ||
1060 | struct user_struct *user = NULL; | ||
1061 | /* | ||
1062 | * VM_NORESERVE is used because the reservations will be | ||
1063 | * taken when vm_ops->mmap() is called | ||
1064 | * A dummy user value is used because we are not locking | ||
1065 | * memory so no accounting is necessary | ||
1066 | */ | ||
1067 | len = ALIGN(len, huge_page_size(&default_hstate)); | ||
1068 | file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE, | ||
1069 | &user, HUGETLB_ANONHUGE_INODE); | ||
1070 | if (IS_ERR(file)) | ||
1071 | return PTR_ERR(file); | ||
1072 | } | ||
1073 | |||
1074 | flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); | ||
1075 | |||
1076 | down_write(¤t->mm->mmap_sem); | ||
1077 | retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); | ||
1078 | up_write(¤t->mm->mmap_sem); | ||
1079 | |||
1080 | if (file) | ||
1081 | fput(file); | ||
1082 | out: | ||
1083 | return retval; | ||
1084 | } | ||
1085 | |||
1072 | /* | 1086 | /* |
1073 | * Some shared mappings will want the pages marked read-only | 1087 | * Some shared mappings will want the pages marked read-only |
1074 | * to track write events. If so, we'll downgrade vm_page_prot | 1088 | * to track write events. If so, we'll downgrade vm_page_prot |
@@ -1224,8 +1238,20 @@ munmap_back: | |||
1224 | goto free_vma; | 1238 | goto free_vma; |
1225 | } | 1239 | } |
1226 | 1240 | ||
1227 | if (vma_wants_writenotify(vma)) | 1241 | if (vma_wants_writenotify(vma)) { |
1242 | pgprot_t pprot = vma->vm_page_prot; | ||
1243 | |||
1244 | /* Can vma->vm_page_prot have changed?? | ||
1245 | * | ||
1246 | * Answer: Yes, drivers may have changed it in their | ||
1247 | * f_op->mmap method. | ||
1248 | * | ||
1249 | * Ensures that vmas marked as uncached stay that way. | ||
1250 | */ | ||
1228 | vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED); | 1251 | vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED); |
1252 | if (pgprot_val(pprot) == pgprot_val(pgprot_noncached(pprot))) | ||
1253 | vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); | ||
1254 | } | ||
1229 | 1255 | ||
1230 | vma_link(mm, vma, prev, rb_link, rb_parent); | 1256 | vma_link(mm, vma, prev, rb_link, rb_parent); |
1231 | file = vma->vm_file; | 1257 | file = vma->vm_file; |
@@ -1459,6 +1485,14 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, | |||
1459 | unsigned long (*get_area)(struct file *, unsigned long, | 1485 | unsigned long (*get_area)(struct file *, unsigned long, |
1460 | unsigned long, unsigned long, unsigned long); | 1486 | unsigned long, unsigned long, unsigned long); |
1461 | 1487 | ||
1488 | unsigned long error = arch_mmap_check(addr, len, flags); | ||
1489 | if (error) | ||
1490 | return error; | ||
1491 | |||
1492 | /* Careful about overflows.. */ | ||
1493 | if (len > TASK_SIZE) | ||
1494 | return -ENOMEM; | ||
1495 | |||
1462 | get_area = current->mm->get_unmapped_area; | 1496 | get_area = current->mm->get_unmapped_area; |
1463 | if (file && file->f_op && file->f_op->get_unmapped_area) | 1497 | if (file && file->f_op && file->f_op->get_unmapped_area) |
1464 | get_area = file->f_op->get_unmapped_area; | 1498 | get_area = file->f_op->get_unmapped_area; |
@@ -1829,10 +1863,10 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1829 | } | 1863 | } |
1830 | 1864 | ||
1831 | /* | 1865 | /* |
1832 | * Split a vma into two pieces at address 'addr', a new vma is allocated | 1866 | * __split_vma() bypasses sysctl_max_map_count checking. We use this on the |
1833 | * either for the first part or the tail. | 1867 | * munmap path where it doesn't make sense to fail. |
1834 | */ | 1868 | */ |
1835 | int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, | 1869 | static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, |
1836 | unsigned long addr, int new_below) | 1870 | unsigned long addr, int new_below) |
1837 | { | 1871 | { |
1838 | struct mempolicy *pol; | 1872 | struct mempolicy *pol; |
@@ -1842,9 +1876,6 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, | |||
1842 | ~(huge_page_mask(hstate_vma(vma))))) | 1876 | ~(huge_page_mask(hstate_vma(vma))))) |
1843 | return -EINVAL; | 1877 | return -EINVAL; |
1844 | 1878 | ||
1845 | if (mm->map_count >= sysctl_max_map_count) | ||
1846 | return -ENOMEM; | ||
1847 | |||
1848 | new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); | 1879 | new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); |
1849 | if (!new) | 1880 | if (!new) |
1850 | return -ENOMEM; | 1881 | return -ENOMEM; |
@@ -1884,6 +1915,19 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, | |||
1884 | return 0; | 1915 | return 0; |
1885 | } | 1916 | } |
1886 | 1917 | ||
1918 | /* | ||
1919 | * Split a vma into two pieces at address 'addr', a new vma is allocated | ||
1920 | * either for the first part or the tail. | ||
1921 | */ | ||
1922 | int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, | ||
1923 | unsigned long addr, int new_below) | ||
1924 | { | ||
1925 | if (mm->map_count >= sysctl_max_map_count) | ||
1926 | return -ENOMEM; | ||
1927 | |||
1928 | return __split_vma(mm, vma, addr, new_below); | ||
1929 | } | ||
1930 | |||
1887 | /* Munmap is split into 2 main parts -- this part which finds | 1931 | /* Munmap is split into 2 main parts -- this part which finds |
1888 | * what needs doing, and the areas themselves, which do the | 1932 | * what needs doing, and the areas themselves, which do the |
1889 | * work. This now handles partial unmappings. | 1933 | * work. This now handles partial unmappings. |
@@ -1919,7 +1963,17 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) | |||
1919 | * places tmp vma above, and higher split_vma places tmp vma below. | 1963 | * places tmp vma above, and higher split_vma places tmp vma below. |
1920 | */ | 1964 | */ |
1921 | if (start > vma->vm_start) { | 1965 | if (start > vma->vm_start) { |
1922 | int error = split_vma(mm, vma, start, 0); | 1966 | int error; |
1967 | |||
1968 | /* | ||
1969 | * Make sure that map_count on return from munmap() will | ||
1970 | * not exceed its limit; but let map_count go just above | ||
1971 | * its limit temporarily, to help free resources as expected. | ||
1972 | */ | ||
1973 | if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count) | ||
1974 | return -ENOMEM; | ||
1975 | |||
1976 | error = __split_vma(mm, vma, start, 0); | ||
1923 | if (error) | 1977 | if (error) |
1924 | return error; | 1978 | return error; |
1925 | prev = vma; | 1979 | prev = vma; |
@@ -1928,7 +1982,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) | |||
1928 | /* Does it split the last one? */ | 1982 | /* Does it split the last one? */ |
1929 | last = find_vma(mm, end); | 1983 | last = find_vma(mm, end); |
1930 | if (last && end > last->vm_start) { | 1984 | if (last && end > last->vm_start) { |
1931 | int error = split_vma(mm, last, end, 1); | 1985 | int error = __split_vma(mm, last, end, 1); |
1932 | if (error) | 1986 | if (error) |
1933 | return error; | 1987 | return error; |
1934 | } | 1988 | } |
@@ -2003,20 +2057,14 @@ unsigned long do_brk(unsigned long addr, unsigned long len) | |||
2003 | if (!len) | 2057 | if (!len) |
2004 | return addr; | 2058 | return addr; |
2005 | 2059 | ||
2006 | if ((addr + len) > TASK_SIZE || (addr + len) < addr) | ||
2007 | return -EINVAL; | ||
2008 | |||
2009 | if (is_hugepage_only_range(mm, addr, len)) | ||
2010 | return -EINVAL; | ||
2011 | |||
2012 | error = security_file_mmap(NULL, 0, 0, 0, addr, 1); | 2060 | error = security_file_mmap(NULL, 0, 0, 0, addr, 1); |
2013 | if (error) | 2061 | if (error) |
2014 | return error; | 2062 | return error; |
2015 | 2063 | ||
2016 | flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; | 2064 | flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; |
2017 | 2065 | ||
2018 | error = arch_mmap_check(addr, len, flags); | 2066 | error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); |
2019 | if (error) | 2067 | if (error & ~PAGE_MASK) |
2020 | return error; | 2068 | return error; |
2021 | 2069 | ||
2022 | /* | 2070 | /* |
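The mmap.c side of the series moves file lookup and the anonymous MAP_HUGETLB setup out of do_mmap_pgoff() into the new generic sys_mmap_pgoff(), and pushes the arch_mmap_check()/TASK_SIZE validation down into get_unmapped_area(). An architecture's private mmap entry point can then shrink to a thin wrapper along these lines (the syscall name here is illustrative; real arches differ in how they pass the offset):

        SYSCALL_DEFINE6(old_mmap_bytes, unsigned long, addr, unsigned long, len,
                        unsigned long, prot, unsigned long, flags,
                        unsigned long, fd, unsigned long, offset)
        {
                /* the byte offset must itself be page aligned */
                if (offset & ~PAGE_MASK)
                        return -EINVAL;
                return sys_mmap_pgoff(addr, len, prot, flags, fd,
                                      offset >> PAGE_SHIFT);
        }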
diff --git a/mm/mremap.c b/mm/mremap.c index 97bff2547719..845190898d59 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -261,6 +261,137 @@ static unsigned long move_vma(struct vm_area_struct *vma, | |||
261 | return new_addr; | 261 | return new_addr; |
262 | } | 262 | } |
263 | 263 | ||
264 | static struct vm_area_struct *vma_to_resize(unsigned long addr, | ||
265 | unsigned long old_len, unsigned long new_len, unsigned long *p) | ||
266 | { | ||
267 | struct mm_struct *mm = current->mm; | ||
268 | struct vm_area_struct *vma = find_vma(mm, addr); | ||
269 | |||
270 | if (!vma || vma->vm_start > addr) | ||
271 | goto Efault; | ||
272 | |||
273 | if (is_vm_hugetlb_page(vma)) | ||
274 | goto Einval; | ||
275 | |||
276 | /* We can't remap across vm area boundaries */ | ||
277 | if (old_len > vma->vm_end - addr) | ||
278 | goto Efault; | ||
279 | |||
280 | if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) { | ||
281 | if (new_len > old_len) | ||
282 | goto Efault; | ||
283 | } | ||
284 | |||
285 | if (vma->vm_flags & VM_LOCKED) { | ||
286 | unsigned long locked, lock_limit; | ||
287 | locked = mm->locked_vm << PAGE_SHIFT; | ||
288 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; | ||
289 | locked += new_len - old_len; | ||
290 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) | ||
291 | goto Eagain; | ||
292 | } | ||
293 | |||
294 | if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) | ||
295 | goto Enomem; | ||
296 | |||
297 | if (vma->vm_flags & VM_ACCOUNT) { | ||
298 | unsigned long charged = (new_len - old_len) >> PAGE_SHIFT; | ||
299 | if (security_vm_enough_memory(charged)) | ||
300 | goto Efault; | ||
301 | *p = charged; | ||
302 | } | ||
303 | |||
304 | return vma; | ||
305 | |||
306 | Efault: /* very odd choice for most of the cases, but... */ | ||
307 | return ERR_PTR(-EFAULT); | ||
308 | Einval: | ||
309 | return ERR_PTR(-EINVAL); | ||
310 | Enomem: | ||
311 | return ERR_PTR(-ENOMEM); | ||
312 | Eagain: | ||
313 | return ERR_PTR(-EAGAIN); | ||
314 | } | ||
315 | |||
316 | static unsigned long mremap_to(unsigned long addr, | ||
317 | unsigned long old_len, unsigned long new_addr, | ||
318 | unsigned long new_len) | ||
319 | { | ||
320 | struct mm_struct *mm = current->mm; | ||
321 | struct vm_area_struct *vma; | ||
322 | unsigned long ret = -EINVAL; | ||
323 | unsigned long charged = 0; | ||
324 | unsigned long map_flags; | ||
325 | |||
326 | if (new_addr & ~PAGE_MASK) | ||
327 | goto out; | ||
328 | |||
329 | if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len) | ||
330 | goto out; | ||
331 | |||
332 | /* Check if the location we're moving into overlaps the | ||
333 | * old location at all, and fail if it does. | ||
334 | */ | ||
335 | if ((new_addr <= addr) && (new_addr+new_len) > addr) | ||
336 | goto out; | ||
337 | |||
338 | if ((addr <= new_addr) && (addr+old_len) > new_addr) | ||
339 | goto out; | ||
340 | |||
341 | ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1); | ||
342 | if (ret) | ||
343 | goto out; | ||
344 | |||
345 | ret = do_munmap(mm, new_addr, new_len); | ||
346 | if (ret) | ||
347 | goto out; | ||
348 | |||
349 | if (old_len >= new_len) { | ||
350 | ret = do_munmap(mm, addr+new_len, old_len - new_len); | ||
351 | if (ret && old_len != new_len) | ||
352 | goto out; | ||
353 | old_len = new_len; | ||
354 | } | ||
355 | |||
356 | vma = vma_to_resize(addr, old_len, new_len, &charged); | ||
357 | if (IS_ERR(vma)) { | ||
358 | ret = PTR_ERR(vma); | ||
359 | goto out; | ||
360 | } | ||
361 | |||
362 | map_flags = MAP_FIXED; | ||
363 | if (vma->vm_flags & VM_MAYSHARE) | ||
364 | map_flags |= MAP_SHARED; | ||
365 | |||
366 | ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff + | ||
367 | ((addr - vma->vm_start) >> PAGE_SHIFT), | ||
368 | map_flags); | ||
369 | if (ret & ~PAGE_MASK) | ||
370 | goto out1; | ||
371 | |||
372 | ret = move_vma(vma, addr, old_len, new_len, new_addr); | ||
373 | if (!(ret & ~PAGE_MASK)) | ||
374 | goto out; | ||
375 | out1: | ||
376 | vm_unacct_memory(charged); | ||
377 | |||
378 | out: | ||
379 | return ret; | ||
380 | } | ||
381 | |||
382 | static int vma_expandable(struct vm_area_struct *vma, unsigned long delta) | ||
383 | { | ||
384 | unsigned long end = vma->vm_end + delta; | ||
385 | if (end < vma->vm_end) /* overflow */ | ||
386 | return 0; | ||
387 | if (vma->vm_next && vma->vm_next->vm_start < end) /* intersection */ | ||
388 | return 0; | ||
389 | if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start, | ||
390 | 0, MAP_FIXED) & ~PAGE_MASK) | ||
391 | return 0; | ||
392 | return 1; | ||
393 | } | ||
394 | |||
264 | /* | 395 | /* |
265 | * Expand (or shrink) an existing mapping, potentially moving it at the | 396 | * Expand (or shrink) an existing mapping, potentially moving it at the |
266 | * same time (controlled by the MREMAP_MAYMOVE flag and available VM space) | 397 | * same time (controlled by the MREMAP_MAYMOVE flag and available VM space) |
@@ -294,32 +425,10 @@ unsigned long do_mremap(unsigned long addr, | |||
294 | if (!new_len) | 425 | if (!new_len) |
295 | goto out; | 426 | goto out; |
296 | 427 | ||
297 | /* new_addr is only valid if MREMAP_FIXED is specified */ | ||
298 | if (flags & MREMAP_FIXED) { | 428 | if (flags & MREMAP_FIXED) { |
299 | if (new_addr & ~PAGE_MASK) | 429 | if (flags & MREMAP_MAYMOVE) |
300 | goto out; | 430 | ret = mremap_to(addr, old_len, new_addr, new_len); |
301 | if (!(flags & MREMAP_MAYMOVE)) | 431 | goto out; |
302 | goto out; | ||
303 | |||
304 | if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len) | ||
305 | goto out; | ||
306 | |||
307 | /* Check if the location we're moving into overlaps the | ||
308 | * old location at all, and fail if it does. | ||
309 | */ | ||
310 | if ((new_addr <= addr) && (new_addr+new_len) > addr) | ||
311 | goto out; | ||
312 | |||
313 | if ((addr <= new_addr) && (addr+old_len) > new_addr) | ||
314 | goto out; | ||
315 | |||
316 | ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1); | ||
317 | if (ret) | ||
318 | goto out; | ||
319 | |||
320 | ret = do_munmap(mm, new_addr, new_len); | ||
321 | if (ret) | ||
322 | goto out; | ||
323 | } | 432 | } |
324 | 433 | ||
325 | /* | 434 | /* |
@@ -332,60 +441,23 @@ unsigned long do_mremap(unsigned long addr, | |||
332 | if (ret && old_len != new_len) | 441 | if (ret && old_len != new_len) |
333 | goto out; | 442 | goto out; |
334 | ret = addr; | 443 | ret = addr; |
335 | if (!(flags & MREMAP_FIXED) || (new_addr == addr)) | 444 | goto out; |
336 | goto out; | ||
337 | old_len = new_len; | ||
338 | } | 445 | } |
339 | 446 | ||
340 | /* | 447 | /* |
341 | * Ok, we need to grow.. or relocate. | 448 | * Ok, we need to grow.. |
342 | */ | 449 | */ |
343 | ret = -EFAULT; | 450 | vma = vma_to_resize(addr, old_len, new_len, &charged); |
344 | vma = find_vma(mm, addr); | 451 | if (IS_ERR(vma)) { |
345 | if (!vma || vma->vm_start > addr) | 452 | ret = PTR_ERR(vma); |
346 | goto out; | ||
347 | if (is_vm_hugetlb_page(vma)) { | ||
348 | ret = -EINVAL; | ||
349 | goto out; | ||
350 | } | ||
351 | /* We can't remap across vm area boundaries */ | ||
352 | if (old_len > vma->vm_end - addr) | ||
353 | goto out; | ||
354 | if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) { | ||
355 | if (new_len > old_len) | ||
356 | goto out; | ||
357 | } | ||
358 | if (vma->vm_flags & VM_LOCKED) { | ||
359 | unsigned long locked, lock_limit; | ||
360 | locked = mm->locked_vm << PAGE_SHIFT; | ||
361 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; | ||
362 | locked += new_len - old_len; | ||
363 | ret = -EAGAIN; | ||
364 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) | ||
365 | goto out; | ||
366 | } | ||
367 | if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) { | ||
368 | ret = -ENOMEM; | ||
369 | goto out; | 453 | goto out; |
370 | } | 454 | } |
371 | 455 | ||
372 | if (vma->vm_flags & VM_ACCOUNT) { | ||
373 | charged = (new_len - old_len) >> PAGE_SHIFT; | ||
374 | if (security_vm_enough_memory(charged)) | ||
375 | goto out_nc; | ||
376 | } | ||
377 | |||
378 | /* old_len exactly to the end of the area.. | 456 | /* old_len exactly to the end of the area.. |
379 | * And we're not relocating the area. | ||
380 | */ | 457 | */ |
381 | if (old_len == vma->vm_end - addr && | 458 | if (old_len == vma->vm_end - addr) { |
382 | !((flags & MREMAP_FIXED) && (addr != new_addr)) && | ||
383 | (old_len != new_len || !(flags & MREMAP_MAYMOVE))) { | ||
384 | unsigned long max_addr = TASK_SIZE; | ||
385 | if (vma->vm_next) | ||
386 | max_addr = vma->vm_next->vm_start; | ||
387 | /* can we just expand the current mapping? */ | 459 | /* can we just expand the current mapping? */ |
388 | if (max_addr - addr >= new_len) { | 460 | if (vma_expandable(vma, new_len - old_len)) { |
389 | int pages = (new_len - old_len) >> PAGE_SHIFT; | 461 | int pages = (new_len - old_len) >> PAGE_SHIFT; |
390 | 462 | ||
391 | vma_adjust(vma, vma->vm_start, | 463 | vma_adjust(vma, vma->vm_start, |
@@ -409,28 +481,27 @@ unsigned long do_mremap(unsigned long addr, | |||
409 | */ | 481 | */ |
410 | ret = -ENOMEM; | 482 | ret = -ENOMEM; |
411 | if (flags & MREMAP_MAYMOVE) { | 483 | if (flags & MREMAP_MAYMOVE) { |
412 | if (!(flags & MREMAP_FIXED)) { | 484 | unsigned long map_flags = 0; |
413 | unsigned long map_flags = 0; | 485 | if (vma->vm_flags & VM_MAYSHARE) |
414 | if (vma->vm_flags & VM_MAYSHARE) | 486 | map_flags |= MAP_SHARED; |
415 | map_flags |= MAP_SHARED; | 487 | |
416 | 488 | new_addr = get_unmapped_area(vma->vm_file, 0, new_len, | |
417 | new_addr = get_unmapped_area(vma->vm_file, 0, new_len, | 489 | vma->vm_pgoff + |
418 | vma->vm_pgoff, map_flags); | 490 | ((addr - vma->vm_start) >> PAGE_SHIFT), |
419 | if (new_addr & ~PAGE_MASK) { | 491 | map_flags); |
420 | ret = new_addr; | 492 | if (new_addr & ~PAGE_MASK) { |
421 | goto out; | 493 | ret = new_addr; |
422 | } | 494 | goto out; |
423 | |||
424 | ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1); | ||
425 | if (ret) | ||
426 | goto out; | ||
427 | } | 495 | } |
496 | |||
497 | ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1); | ||
498 | if (ret) | ||
499 | goto out; | ||
428 | ret = move_vma(vma, addr, old_len, new_len, new_addr); | 500 | ret = move_vma(vma, addr, old_len, new_len, new_addr); |
429 | } | 501 | } |
430 | out: | 502 | out: |
431 | if (ret & ~PAGE_MASK) | 503 | if (ret & ~PAGE_MASK) |
432 | vm_unacct_memory(charged); | 504 | vm_unacct_memory(charged); |
433 | out_nc: | ||
434 | return ret; | 505 | return ret; |
435 | } | 506 | } |
436 | 507 | ||
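The mremap.c refactor splits do_mremap() into vma_to_resize() (lookup, limits and accounting), mremap_to() (the MREMAP_FIXED path) and vma_expandable() (in-place growth test) without changing user-visible behaviour. A userspace sketch of the call that exercises the new mremap_to() path; MREMAP_FIXED still requires MREMAP_MAYMOVE and a page-aligned target that does not overlap the old range:

        #define _GNU_SOURCE
        #include <sys/mman.h>

        /* Move (and optionally resize) a mapping to a caller-chosen address;
         * returns the new address or MAP_FAILED. */
        static void *move_mapping(void *old, size_t old_len,
                                  void *target, size_t new_len)
        {
                return mremap(old, old_len, new_len,
                              MREMAP_MAYMOVE | MREMAP_FIXED, target);
        }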
diff --git a/mm/nommu.c b/mm/nommu.c index 5189b5aed8c0..6f9248f89bde 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -1143,9 +1143,6 @@ static int do_mmap_private(struct vm_area_struct *vma, | |||
1143 | if (ret < rlen) | 1143 | if (ret < rlen) |
1144 | memset(base + ret, 0, rlen - ret); | 1144 | memset(base + ret, 0, rlen - ret); |
1145 | 1145 | ||
1146 | } else { | ||
1147 | /* if it's an anonymous mapping, then just clear it */ | ||
1148 | memset(base, 0, rlen); | ||
1149 | } | 1146 | } |
1150 | 1147 | ||
1151 | return 0; | 1148 | return 0; |
@@ -1343,6 +1340,11 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
1343 | goto error_just_free; | 1340 | goto error_just_free; |
1344 | add_nommu_region(region); | 1341 | add_nommu_region(region); |
1345 | 1342 | ||
1343 | /* clear anonymous mappings that don't ask for uninitialized data */ | ||
1344 | if (!vma->vm_file && !(flags & MAP_UNINITIALIZED)) | ||
1345 | memset((void *)region->vm_start, 0, | ||
1346 | region->vm_end - region->vm_start); | ||
1347 | |||
1346 | /* okay... we have a mapping; now we have to register it */ | 1348 | /* okay... we have a mapping; now we have to register it */ |
1347 | result = vma->vm_start; | 1349 | result = vma->vm_start; |
1348 | 1350 | ||
@@ -1362,9 +1364,11 @@ share: | |||
1362 | error_just_free: | 1364 | error_just_free: |
1363 | up_write(&nommu_region_sem); | 1365 | up_write(&nommu_region_sem); |
1364 | error: | 1366 | error: |
1365 | fput(region->vm_file); | 1367 | if (region->vm_file) |
1368 | fput(region->vm_file); | ||
1366 | kmem_cache_free(vm_region_jar, region); | 1369 | kmem_cache_free(vm_region_jar, region); |
1367 | fput(vma->vm_file); | 1370 | if (vma->vm_file) |
1371 | fput(vma->vm_file); | ||
1368 | if (vma->vm_flags & VM_EXECUTABLE) | 1372 | if (vma->vm_flags & VM_EXECUTABLE) |
1369 | removed_exe_file_vma(vma->vm_mm); | 1373 | removed_exe_file_vma(vma->vm_mm); |
1370 | kmem_cache_free(vm_area_cachep, vma); | 1374 | kmem_cache_free(vm_area_cachep, vma); |
@@ -1394,6 +1398,31 @@ error_getting_region: | |||
1394 | } | 1398 | } |
1395 | EXPORT_SYMBOL(do_mmap_pgoff); | 1399 | EXPORT_SYMBOL(do_mmap_pgoff); |
1396 | 1400 | ||
1401 | SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, | ||
1402 | unsigned long, prot, unsigned long, flags, | ||
1403 | unsigned long, fd, unsigned long, pgoff) | ||
1404 | { | ||
1405 | struct file *file = NULL; | ||
1406 | unsigned long retval = -EBADF; | ||
1407 | |||
1408 | if (!(flags & MAP_ANONYMOUS)) { | ||
1409 | file = fget(fd); | ||
1410 | if (!file) | ||
1411 | goto out; | ||
1412 | } | ||
1413 | |||
1414 | flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); | ||
1415 | |||
1416 | down_write(¤t->mm->mmap_sem); | ||
1417 | retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); | ||
1418 | up_write(¤t->mm->mmap_sem); | ||
1419 | |||
1420 | if (file) | ||
1421 | fput(file); | ||
1422 | out: | ||
1423 | return retval; | ||
1424 | } | ||
1425 | |||
1397 | /* | 1426 | /* |
1398 | * split a vma into two pieces at address 'addr', a new vma is allocated either | 1427 | * split a vma into two pieces at address 'addr', a new vma is allocated either |
1399 | * for the first part or the tail. | 1428 | * for the first part or the tail. |
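On !MMU kernels the anonymous-mapping clear now happens once, when the region is set up, and can be skipped with the new MAP_UNINITIALIZED flag; kernels not built to allow this define the flag as 0, so the request silently degrades to zeroed memory. A userspace sketch under that assumption:

        #include <sys/mman.h>

        #ifndef MAP_UNINITIALIZED
        #define MAP_UNINITIALIZED 0     /* no-op where the kernel forbids it */
        #endif

        static void *fast_anon_alloc(size_t len)
        {
                /* may return uninitialized memory on a !MMU kernel that allows it */
                return mmap(NULL, len, PROT_READ | PROT_WRITE,
                            MAP_PRIVATE | MAP_ANONYMOUS | MAP_UNINITIALIZED, -1, 0);
        }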
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index ea2147dabba6..f52481b1c1e5 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -196,27 +196,46 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) | |||
196 | /* | 196 | /* |
197 | * Determine the type of allocation constraint. | 197 | * Determine the type of allocation constraint. |
198 | */ | 198 | */ |
199 | static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist, | ||
200 | gfp_t gfp_mask) | ||
201 | { | ||
202 | #ifdef CONFIG_NUMA | 199 | #ifdef CONFIG_NUMA |
200 | static enum oom_constraint constrained_alloc(struct zonelist *zonelist, | ||
201 | gfp_t gfp_mask, nodemask_t *nodemask) | ||
202 | { | ||
203 | struct zone *zone; | 203 | struct zone *zone; |
204 | struct zoneref *z; | 204 | struct zoneref *z; |
205 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | 205 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); |
206 | nodemask_t nodes = node_states[N_HIGH_MEMORY]; | ||
207 | 206 | ||
208 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) | 207 | /* |
209 | if (cpuset_zone_allowed_softwall(zone, gfp_mask)) | 208 | * Reach here only when __GFP_NOFAIL is used. So, we should avoid |
210 | node_clear(zone_to_nid(zone), nodes); | 209 | * killing current. We have to fall back to a random task kill in this case. |
211 | else | 210 | * Hopefully CONSTRAINT_THISNODE would help, but there is no way to handle it now. |
212 | return CONSTRAINT_CPUSET; | 211 | */ |
212 | if (gfp_mask & __GFP_THISNODE) | ||
213 | return CONSTRAINT_NONE; | ||
213 | 214 | ||
214 | if (!nodes_empty(nodes)) | 215 | /* |
216 | * The nodemask here is a nodemask passed to alloc_pages(). Now, | ||
217 | * cpuset doesn't use this nodemask for its hardwall/softwall/hierarchy | ||
218 | * feature. mempolicy is the only user of nodemask here. | ||
219 | * Check that mempolicy's nodemask contains all N_HIGH_MEMORY nodes. | ||
220 | */ | ||
221 | if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask)) | ||
215 | return CONSTRAINT_MEMORY_POLICY; | 222 | return CONSTRAINT_MEMORY_POLICY; |
216 | #endif | ||
217 | 223 | ||
224 | /* Check this allocation failure is caused by cpuset's wall function */ | ||
225 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | ||
226 | high_zoneidx, nodemask) | ||
227 | if (!cpuset_zone_allowed_softwall(zone, gfp_mask)) | ||
228 | return CONSTRAINT_CPUSET; | ||
229 | |||
230 | return CONSTRAINT_NONE; | ||
231 | } | ||
232 | #else | ||
233 | static enum oom_constraint constrained_alloc(struct zonelist *zonelist, | ||
234 | gfp_t gfp_mask, nodemask_t *nodemask) | ||
235 | { | ||
218 | return CONSTRAINT_NONE; | 236 | return CONSTRAINT_NONE; |
219 | } | 237 | } |
238 | #endif | ||
220 | 239 | ||
221 | /* | 240 | /* |
222 | * Simple selection loop. We chose the process with the highest | 241 | * Simple selection loop. We chose the process with the highest |
@@ -337,6 +356,24 @@ static void dump_tasks(const struct mem_cgroup *mem) | |||
337 | } while_each_thread(g, p); | 356 | } while_each_thread(g, p); |
338 | } | 357 | } |
339 | 358 | ||
359 | static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, | ||
360 | struct mem_cgroup *mem) | ||
361 | { | ||
362 | pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " | ||
363 | "oom_adj=%d\n", | ||
364 | current->comm, gfp_mask, order, current->signal->oom_adj); | ||
365 | task_lock(current); | ||
366 | cpuset_print_task_mems_allowed(current); | ||
367 | task_unlock(current); | ||
368 | dump_stack(); | ||
369 | mem_cgroup_print_oom_info(mem, p); | ||
370 | show_mem(); | ||
371 | if (sysctl_oom_dump_tasks) | ||
372 | dump_tasks(mem); | ||
373 | } | ||
374 | |||
375 | #define K(x) ((x) << (PAGE_SHIFT-10)) | ||
376 | |||
340 | /* | 377 | /* |
341 | * Send SIGKILL to the selected process irrespective of CAP_SYS_RAW_IO | 378 | * Send SIGKILL to the selected process irrespective of CAP_SYS_RAW_IO |
342 | * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO | 379 | * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO |
@@ -350,15 +387,23 @@ static void __oom_kill_task(struct task_struct *p, int verbose) | |||
350 | return; | 387 | return; |
351 | } | 388 | } |
352 | 389 | ||
390 | task_lock(p); | ||
353 | if (!p->mm) { | 391 | if (!p->mm) { |
354 | WARN_ON(1); | 392 | WARN_ON(1); |
355 | printk(KERN_WARNING "tried to kill an mm-less task!\n"); | 393 | printk(KERN_WARNING "tried to kill an mm-less task %d (%s)!\n", |
394 | task_pid_nr(p), p->comm); | ||
395 | task_unlock(p); | ||
356 | return; | 396 | return; |
357 | } | 397 | } |
358 | 398 | ||
359 | if (verbose) | 399 | if (verbose) |
360 | printk(KERN_ERR "Killed process %d (%s)\n", | 400 | printk(KERN_ERR "Killed process %d (%s) " |
361 | task_pid_nr(p), p->comm); | 401 | "vsz:%lukB, anon-rss:%lukB, file-rss:%lukB\n", |
402 | task_pid_nr(p), p->comm, | ||
403 | K(p->mm->total_vm), | ||
404 | K(get_mm_counter(p->mm, anon_rss)), | ||
405 | K(get_mm_counter(p->mm, file_rss))); | ||
406 | task_unlock(p); | ||
362 | 407 | ||
363 | /* | 408 | /* |
364 | * We give our sacrificial lamb high priority and access to | 409 | * We give our sacrificial lamb high priority and access to |
@@ -395,20 +440,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
395 | { | 440 | { |
396 | struct task_struct *c; | 441 | struct task_struct *c; |
397 | 442 | ||
398 | if (printk_ratelimit()) { | 443 | if (printk_ratelimit()) |
399 | printk(KERN_WARNING "%s invoked oom-killer: " | 444 | dump_header(p, gfp_mask, order, mem); |
400 | "gfp_mask=0x%x, order=%d, oom_adj=%d\n", | ||
401 | current->comm, gfp_mask, order, | ||
402 | current->signal->oom_adj); | ||
403 | task_lock(current); | ||
404 | cpuset_print_task_mems_allowed(current); | ||
405 | task_unlock(current); | ||
406 | dump_stack(); | ||
407 | mem_cgroup_print_oom_info(mem, current); | ||
408 | show_mem(); | ||
409 | if (sysctl_oom_dump_tasks) | ||
410 | dump_tasks(mem); | ||
411 | } | ||
412 | 445 | ||
413 | /* | 446 | /* |
414 | * If the task is already exiting, don't alarm the sysadmin or kill | 447 | * If the task is already exiting, don't alarm the sysadmin or kill |
@@ -544,6 +577,7 @@ retry: | |||
544 | /* Found nothing?!?! Either we hang forever, or we panic. */ | 577 | /* Found nothing?!?! Either we hang forever, or we panic. */ |
545 | if (!p) { | 578 | if (!p) { |
546 | read_unlock(&tasklist_lock); | 579 | read_unlock(&tasklist_lock); |
580 | dump_header(NULL, gfp_mask, order, NULL); | ||
547 | panic("Out of memory and no killable processes...\n"); | 581 | panic("Out of memory and no killable processes...\n"); |
548 | } | 582 | } |
549 | 583 | ||
@@ -599,7 +633,8 @@ rest_and_return: | |||
599 | * OR try to be smart about which process to kill. Note that we | 633 | * OR try to be smart about which process to kill. Note that we |
600 | * don't have to be perfect here, we just have to be good. | 634 | * don't have to be perfect here, we just have to be good. |
601 | */ | 635 | */ |
602 | void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) | 636 | void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, |
637 | int order, nodemask_t *nodemask) | ||
603 | { | 638 | { |
604 | unsigned long freed = 0; | 639 | unsigned long freed = 0; |
605 | enum oom_constraint constraint; | 640 | enum oom_constraint constraint; |
@@ -609,14 +644,16 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) | |||
609 | /* Got some memory back in the last second. */ | 644 | /* Got some memory back in the last second. */ |
610 | return; | 645 | return; |
611 | 646 | ||
612 | if (sysctl_panic_on_oom == 2) | 647 | if (sysctl_panic_on_oom == 2) { |
648 | dump_header(NULL, gfp_mask, order, NULL); | ||
613 | panic("out of memory. Compulsory panic_on_oom is selected.\n"); | 649 | panic("out of memory. Compulsory panic_on_oom is selected.\n"); |
650 | } | ||
614 | 651 | ||
615 | /* | 652 | /* |
616 | * Check if there were limitations on the allocation (only relevant for | 653 | * Check if there were limitations on the allocation (only relevant for |
617 | * NUMA) that may require different handling. | 654 | * NUMA) that may require different handling. |
618 | */ | 655 | */ |
619 | constraint = constrained_alloc(zonelist, gfp_mask); | 656 | constraint = constrained_alloc(zonelist, gfp_mask, nodemask); |
620 | read_lock(&tasklist_lock); | 657 | read_lock(&tasklist_lock); |
621 | 658 | ||
622 | switch (constraint) { | 659 | switch (constraint) { |
@@ -626,8 +663,10 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) | |||
626 | break; | 663 | break; |
627 | 664 | ||
628 | case CONSTRAINT_NONE: | 665 | case CONSTRAINT_NONE: |
629 | if (sysctl_panic_on_oom) | 666 | if (sysctl_panic_on_oom) { |
667 | dump_header(NULL, gfp_mask, order, NULL); | ||
630 | panic("out of memory. panic_on_oom is selected\n"); | 668 | panic("out of memory. panic_on_oom is selected\n"); |
669 | } | ||
631 | /* Fall-through */ | 670 | /* Fall-through */ |
632 | case CONSTRAINT_CPUSET: | 671 | case CONSTRAINT_CPUSET: |
633 | __out_of_memory(gfp_mask, order); | 672 | __out_of_memory(gfp_mask, order); |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index a3b14090b1fb..0b19943ecf8b 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -566,7 +566,8 @@ static void balance_dirty_pages(struct address_space *mapping, | |||
566 | if (pages_written >= write_chunk) | 566 | if (pages_written >= write_chunk) |
567 | break; /* We've done our duty */ | 567 | break; /* We've done our duty */ |
568 | 568 | ||
569 | schedule_timeout_interruptible(pause); | 569 | __set_current_state(TASK_INTERRUPTIBLE); |
570 | io_schedule_timeout(pause); | ||
570 | 571 | ||
571 | /* | 572 | /* |
572 | * Increase the delay for each loop, up to our previous | 573 | * Increase the delay for each loop, up to our previous |
@@ -820,7 +821,6 @@ int write_cache_pages(struct address_space *mapping, | |||
820 | struct writeback_control *wbc, writepage_t writepage, | 821 | struct writeback_control *wbc, writepage_t writepage, |
821 | void *data) | 822 | void *data) |
822 | { | 823 | { |
823 | struct backing_dev_info *bdi = mapping->backing_dev_info; | ||
824 | int ret = 0; | 824 | int ret = 0; |
825 | int done = 0; | 825 | int done = 0; |
826 | struct pagevec pvec; | 826 | struct pagevec pvec; |
@@ -833,11 +833,6 @@ int write_cache_pages(struct address_space *mapping, | |||
833 | int range_whole = 0; | 833 | int range_whole = 0; |
834 | long nr_to_write = wbc->nr_to_write; | 834 | long nr_to_write = wbc->nr_to_write; |
835 | 835 | ||
836 | if (wbc->nonblocking && bdi_write_congested(bdi)) { | ||
837 | wbc->encountered_congestion = 1; | ||
838 | return 0; | ||
839 | } | ||
840 | |||
841 | pagevec_init(&pvec, 0); | 836 | pagevec_init(&pvec, 0); |
842 | if (wbc->range_cyclic) { | 837 | if (wbc->range_cyclic) { |
843 | writeback_index = mapping->writeback_index; /* prev offset */ | 838 | writeback_index = mapping->writeback_index; /* prev offset */ |
@@ -956,12 +951,6 @@ continue_unlock: | |||
956 | break; | 951 | break; |
957 | } | 952 | } |
958 | } | 953 | } |
959 | |||
960 | if (wbc->nonblocking && bdi_write_congested(bdi)) { | ||
961 | wbc->encountered_congestion = 1; | ||
962 | done = 1; | ||
963 | break; | ||
964 | } | ||
965 | } | 954 | } |
966 | pagevec_release(&pvec); | 955 | pagevec_release(&pvec); |
967 | cond_resched(); | 956 | cond_resched(); |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bf720550b44d..4e9f5cc5fb59 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -48,6 +48,7 @@ | |||
48 | #include <linux/page_cgroup.h> | 48 | #include <linux/page_cgroup.h> |
49 | #include <linux/debugobjects.h> | 49 | #include <linux/debugobjects.h> |
50 | #include <linux/kmemleak.h> | 50 | #include <linux/kmemleak.h> |
51 | #include <linux/memory.h> | ||
51 | #include <trace/events/kmem.h> | 52 | #include <trace/events/kmem.h> |
52 | 53 | ||
53 | #include <asm/tlbflush.h> | 54 | #include <asm/tlbflush.h> |
@@ -486,7 +487,6 @@ static inline void __free_one_page(struct page *page, | |||
486 | zone->free_area[order].nr_free++; | 487 | zone->free_area[order].nr_free++; |
487 | } | 488 | } |
488 | 489 | ||
489 | #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT | ||
490 | /* | 490 | /* |
491 | * free_page_mlock() -- clean up attempts to free and mlocked() page. | 491 | * free_page_mlock() -- clean up attempts to free and mlocked() page. |
492 | * Page should not be on lru, so no need to fix that up. | 492 | * Page should not be on lru, so no need to fix that up. |
@@ -497,9 +497,6 @@ static inline void free_page_mlock(struct page *page) | |||
497 | __dec_zone_page_state(page, NR_MLOCK); | 497 | __dec_zone_page_state(page, NR_MLOCK); |
498 | __count_vm_event(UNEVICTABLE_MLOCKFREED); | 498 | __count_vm_event(UNEVICTABLE_MLOCKFREED); |
499 | } | 499 | } |
500 | #else | ||
501 | static void free_page_mlock(struct page *page) { } | ||
502 | #endif | ||
503 | 500 | ||
504 | static inline int free_pages_check(struct page *page) | 501 | static inline int free_pages_check(struct page *page) |
505 | { | 502 | { |
@@ -1658,12 +1655,22 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, | |||
1658 | if (page) | 1655 | if (page) |
1659 | goto out; | 1656 | goto out; |
1660 | 1657 | ||
1661 | /* The OOM killer will not help higher order allocs */ | 1658 | if (!(gfp_mask & __GFP_NOFAIL)) { |
1662 | if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_NOFAIL)) | 1659 | /* The OOM killer will not help higher order allocs */ |
1663 | goto out; | 1660 | if (order > PAGE_ALLOC_COSTLY_ORDER) |
1664 | 1661 | goto out; | |
1662 | /* | ||
1663 | * GFP_THISNODE contains __GFP_NORETRY and we never hit this. | ||
1664 | * Sanity check for bare calls of __GFP_THISNODE, not real OOM. | ||
1665 | * The caller should handle page allocation failure by itself if | ||
1666 | * it specifies __GFP_THISNODE. | ||
1667 | * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER. | ||
1668 | */ | ||
1669 | if (gfp_mask & __GFP_THISNODE) | ||
1670 | goto out; | ||
1671 | } | ||
1665 | /* Exhausted what can be done so it's blamo time */ | 1672 | /* Exhausted what can be done so it's blamo time */ |
1666 | out_of_memory(zonelist, gfp_mask, order); | 1673 | out_of_memory(zonelist, gfp_mask, order, nodemask); |
1667 | 1674 | ||
1668 | out: | 1675 | out: |
1669 | clear_zonelist_oom(zonelist, gfp_mask); | 1676 | clear_zonelist_oom(zonelist, gfp_mask); |
@@ -1769,7 +1776,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask) | |||
1769 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. | 1776 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. |
1770 | */ | 1777 | */ |
1771 | alloc_flags &= ~ALLOC_CPUSET; | 1778 | alloc_flags &= ~ALLOC_CPUSET; |
1772 | } else if (unlikely(rt_task(p))) | 1779 | } else if (unlikely(rt_task(p)) && !in_interrupt()) |
1773 | alloc_flags |= ALLOC_HARDER; | 1780 | alloc_flags |= ALLOC_HARDER; |
1774 | 1781 | ||
1775 | if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { | 1782 | if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { |
@@ -1817,9 +1824,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | |||
1817 | if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) | 1824 | if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) |
1818 | goto nopage; | 1825 | goto nopage; |
1819 | 1826 | ||
1827 | restart: | ||
1820 | wake_all_kswapd(order, zonelist, high_zoneidx); | 1828 | wake_all_kswapd(order, zonelist, high_zoneidx); |
1821 | 1829 | ||
1822 | restart: | ||
1823 | /* | 1830 | /* |
1824 | * OK, we're below the kswapd watermark and have kicked background | 1831 | * OK, we're below the kswapd watermark and have kicked background |
1825 | * reclaim. Now things get more complex, so set up alloc_flags according | 1832 | * reclaim. Now things get more complex, so set up alloc_flags according |
@@ -2183,7 +2190,7 @@ void show_free_areas(void) | |||
2183 | printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" | 2190 | printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" |
2184 | " active_file:%lu inactive_file:%lu isolated_file:%lu\n" | 2191 | " active_file:%lu inactive_file:%lu isolated_file:%lu\n" |
2185 | " unevictable:%lu" | 2192 | " unevictable:%lu" |
2186 | " dirty:%lu writeback:%lu unstable:%lu buffer:%lu\n" | 2193 | " dirty:%lu writeback:%lu unstable:%lu\n" |
2187 | " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" | 2194 | " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" |
2188 | " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n", | 2195 | " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n", |
2189 | global_page_state(NR_ACTIVE_ANON), | 2196 | global_page_state(NR_ACTIVE_ANON), |
@@ -2196,7 +2203,6 @@ void show_free_areas(void) | |||
2196 | global_page_state(NR_FILE_DIRTY), | 2203 | global_page_state(NR_FILE_DIRTY), |
2197 | global_page_state(NR_WRITEBACK), | 2204 | global_page_state(NR_WRITEBACK), |
2198 | global_page_state(NR_UNSTABLE_NFS), | 2205 | global_page_state(NR_UNSTABLE_NFS), |
2199 | nr_blockdev_pages(), | ||
2200 | global_page_state(NR_FREE_PAGES), | 2206 | global_page_state(NR_FREE_PAGES), |
2201 | global_page_state(NR_SLAB_RECLAIMABLE), | 2207 | global_page_state(NR_SLAB_RECLAIMABLE), |
2202 | global_page_state(NR_SLAB_UNRECLAIMABLE), | 2208 | global_page_state(NR_SLAB_UNRECLAIMABLE), |
@@ -2396,13 +2402,14 @@ int numa_zonelist_order_handler(ctl_table *table, int write, | |||
2396 | { | 2402 | { |
2397 | char saved_string[NUMA_ZONELIST_ORDER_LEN]; | 2403 | char saved_string[NUMA_ZONELIST_ORDER_LEN]; |
2398 | int ret; | 2404 | int ret; |
2405 | static DEFINE_MUTEX(zl_order_mutex); | ||
2399 | 2406 | ||
2407 | mutex_lock(&zl_order_mutex); | ||
2400 | if (write) | 2408 | if (write) |
2401 | strncpy(saved_string, (char*)table->data, | 2409 | strcpy(saved_string, (char*)table->data); |
2402 | NUMA_ZONELIST_ORDER_LEN); | ||
2403 | ret = proc_dostring(table, write, buffer, length, ppos); | 2410 | ret = proc_dostring(table, write, buffer, length, ppos); |
2404 | if (ret) | 2411 | if (ret) |
2405 | return ret; | 2412 | goto out; |
2406 | if (write) { | 2413 | if (write) { |
2407 | int oldval = user_zonelist_order; | 2414 | int oldval = user_zonelist_order; |
2408 | if (__parse_numa_zonelist_order((char*)table->data)) { | 2415 | if (__parse_numa_zonelist_order((char*)table->data)) { |
@@ -2415,7 +2422,9 @@ int numa_zonelist_order_handler(ctl_table *table, int write, | |||
2415 | } else if (oldval != user_zonelist_order) | 2422 | } else if (oldval != user_zonelist_order) |
2416 | build_all_zonelists(); | 2423 | build_all_zonelists(); |
2417 | } | 2424 | } |
2418 | return 0; | 2425 | out: |
2426 | mutex_unlock(&zl_order_mutex); | ||
2427 | return ret; | ||
2419 | } | 2428 | } |
2420 | 2429 | ||
2421 | 2430 | ||
@@ -3128,7 +3137,7 @@ static int __cpuinit process_zones(int cpu) | |||
3128 | 3137 | ||
3129 | if (percpu_pagelist_fraction) | 3138 | if (percpu_pagelist_fraction) |
3130 | setup_pagelist_highmark(zone_pcp(zone, cpu), | 3139 | setup_pagelist_highmark(zone_pcp(zone, cpu), |
3131 | (zone->present_pages / percpu_pagelist_fraction)); | 3140 | (zone->present_pages / percpu_pagelist_fraction)); |
3132 | } | 3141 | } |
3133 | 3142 | ||
3134 | return 0; | 3143 | return 0; |
@@ -3574,7 +3583,7 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid, | |||
3574 | * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, | 3583 | * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, |
3575 | * then all holes in the requested range will be accounted for. | 3584 | * then all holes in the requested range will be accounted for. |
3576 | */ | 3585 | */ |
3577 | static unsigned long __meminit __absent_pages_in_range(int nid, | 3586 | unsigned long __meminit __absent_pages_in_range(int nid, |
3578 | unsigned long range_start_pfn, | 3587 | unsigned long range_start_pfn, |
3579 | unsigned long range_end_pfn) | 3588 | unsigned long range_end_pfn) |
3580 | { | 3589 | { |
@@ -4103,7 +4112,7 @@ static int __init cmp_node_active_region(const void *a, const void *b) | |||
4103 | } | 4112 | } |
4104 | 4113 | ||
4105 | /* sort the node_map by start_pfn */ | 4114 | /* sort the node_map by start_pfn */ |
4106 | static void __init sort_node_map(void) | 4115 | void __init sort_node_map(void) |
4107 | { | 4116 | { |
4108 | sort(early_node_map, (size_t)nr_nodemap_entries, | 4117 | sort(early_node_map, (size_t)nr_nodemap_entries, |
4109 | sizeof(struct node_active_region), | 4118 | sizeof(struct node_active_region), |
@@ -5003,23 +5012,65 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags, | |||
5003 | int set_migratetype_isolate(struct page *page) | 5012 | int set_migratetype_isolate(struct page *page) |
5004 | { | 5013 | { |
5005 | struct zone *zone; | 5014 | struct zone *zone; |
5006 | unsigned long flags; | 5015 | struct page *curr_page; |
5016 | unsigned long flags, pfn, iter; | ||
5017 | unsigned long immobile = 0; | ||
5018 | struct memory_isolate_notify arg; | ||
5019 | int notifier_ret; | ||
5007 | int ret = -EBUSY; | 5020 | int ret = -EBUSY; |
5008 | int zone_idx; | 5021 | int zone_idx; |
5009 | 5022 | ||
5010 | zone = page_zone(page); | 5023 | zone = page_zone(page); |
5011 | zone_idx = zone_idx(zone); | 5024 | zone_idx = zone_idx(zone); |
5025 | |||
5012 | spin_lock_irqsave(&zone->lock, flags); | 5026 | spin_lock_irqsave(&zone->lock, flags); |
5027 | if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE || | ||
5028 | zone_idx == ZONE_MOVABLE) { | ||
5029 | ret = 0; | ||
5030 | goto out; | ||
5031 | } | ||
5032 | |||
5033 | pfn = page_to_pfn(page); | ||
5034 | arg.start_pfn = pfn; | ||
5035 | arg.nr_pages = pageblock_nr_pages; | ||
5036 | arg.pages_found = 0; | ||
5037 | |||
5013 | /* | 5038 | /* |
5014 | * In future, more migrate types will be able to be isolation target. | 5039 | * It may be possible to isolate a pageblock even if the |
5040 | * migratetype is not MIGRATE_MOVABLE. The memory isolation | ||
5041 | * notifier chain is used by balloon drivers to return the | ||
5042 | * number of pages in a range that are held by the balloon | ||
5043 | * driver to shrink memory. If all the pages are accounted for | ||
5044 | * by balloons, are free, or on the LRU, isolation can continue. | ||
5045 | * Later, for example, when memory hotplug notifier runs, these | ||
5046 | * pages reported as "can be isolated" should be isolated(freed) | ||
5047 | * by the balloon driver through the memory notifier chain. | ||
5015 | */ | 5048 | */ |
5016 | if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE && | 5049 | notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg); |
5017 | zone_idx != ZONE_MOVABLE) | 5050 | notifier_ret = notifier_to_errno(notifier_ret); |
5051 | if (notifier_ret || !arg.pages_found) | ||
5018 | goto out; | 5052 | goto out; |
5019 | set_pageblock_migratetype(page, MIGRATE_ISOLATE); | 5053 | |
5020 | move_freepages_block(zone, page, MIGRATE_ISOLATE); | 5054 | for (iter = pfn; iter < (pfn + pageblock_nr_pages); iter++) { |
5021 | ret = 0; | 5055 | if (!pfn_valid_within(pfn)) |
5056 | continue; | ||
5057 | |||
5058 | curr_page = pfn_to_page(iter); | ||
5059 | if (!page_count(curr_page) || PageLRU(curr_page)) | ||
5060 | continue; | ||
5061 | |||
5062 | immobile++; | ||
5063 | } | ||
5064 | |||
5065 | if (arg.pages_found == immobile) | ||
5066 | ret = 0; | ||
5067 | |||
5022 | out: | 5068 | out: |
5069 | if (!ret) { | ||
5070 | set_pageblock_migratetype(page, MIGRATE_ISOLATE); | ||
5071 | move_freepages_block(zone, page, MIGRATE_ISOLATE); | ||
5072 | } | ||
5073 | |||
5023 | spin_unlock_irqrestore(&zone->lock, flags); | 5074 | spin_unlock_irqrestore(&zone->lock, flags); |
5024 | if (!ret) | 5075 | if (!ret) |
5025 | drain_all_pages(); | 5076 | drain_all_pages(); |
@@ -5086,3 +5137,24 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) | |||
5086 | spin_unlock_irqrestore(&zone->lock, flags); | 5137 | spin_unlock_irqrestore(&zone->lock, flags); |
5087 | } | 5138 | } |
5088 | #endif | 5139 | #endif |
5140 | |||
5141 | #ifdef CONFIG_MEMORY_FAILURE | ||
5142 | bool is_free_buddy_page(struct page *page) | ||
5143 | { | ||
5144 | struct zone *zone = page_zone(page); | ||
5145 | unsigned long pfn = page_to_pfn(page); | ||
5146 | unsigned long flags; | ||
5147 | int order; | ||
5148 | |||
5149 | spin_lock_irqsave(&zone->lock, flags); | ||
5150 | for (order = 0; order < MAX_ORDER; order++) { | ||
5151 | struct page *page_head = page - (pfn & ((1 << order) - 1)); | ||
5152 | |||
5153 | if (PageBuddy(page_head) && page_order(page_head) >= order) | ||
5154 | break; | ||
5155 | } | ||
5156 | spin_unlock_irqrestore(&zone->lock, flags); | ||
5157 | |||
5158 | return order < MAX_ORDER; | ||
5159 | } | ||
5160 | #endif | ||
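
The is_free_buddy_page() helper added above leans on a buddy-allocator invariant: a free block of a given order is represented by its head page, which has PageBuddy set and page_order() equal to the block's order. A small standalone sketch of the head-page arithmetic the loop performs, expressed on raw pfns (illustration only, not kernel code):

/*
 * For a candidate order, the 2^order-aligned block that would contain
 * @pfn starts at the pfn with the low 'order' bits cleared; this is the
 * same computation as 'page - (pfn & ((1 << order) - 1))' above, done
 * on pfns instead of struct page pointers.
 */
static unsigned long buddy_block_head(unsigned long pfn, int order)
{
	return pfn & ~((1UL << order) - 1);
}

is_free_buddy_page() walks orders 0..MAX_ORDER-1 and succeeds as soon as one candidate head is a free buddy of at least that order, so a page that has been merged into a larger free block is still detected.
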
diff --git a/mm/page_io.c b/mm/page_io.c index c6f3e5071de3..a19af956ee1b 100644 --- a/mm/page_io.c +++ b/mm/page_io.c | |||
@@ -19,20 +19,15 @@ | |||
19 | #include <linux/writeback.h> | 19 | #include <linux/writeback.h> |
20 | #include <asm/pgtable.h> | 20 | #include <asm/pgtable.h> |
21 | 21 | ||
22 | static struct bio *get_swap_bio(gfp_t gfp_flags, pgoff_t index, | 22 | static struct bio *get_swap_bio(gfp_t gfp_flags, |
23 | struct page *page, bio_end_io_t end_io) | 23 | struct page *page, bio_end_io_t end_io) |
24 | { | 24 | { |
25 | struct bio *bio; | 25 | struct bio *bio; |
26 | 26 | ||
27 | bio = bio_alloc(gfp_flags, 1); | 27 | bio = bio_alloc(gfp_flags, 1); |
28 | if (bio) { | 28 | if (bio) { |
29 | struct swap_info_struct *sis; | 29 | bio->bi_sector = map_swap_page(page, &bio->bi_bdev); |
30 | swp_entry_t entry = { .val = index, }; | 30 | bio->bi_sector <<= PAGE_SHIFT - 9; |
31 | |||
32 | sis = get_swap_info_struct(swp_type(entry)); | ||
33 | bio->bi_sector = map_swap_page(sis, swp_offset(entry)) * | ||
34 | (PAGE_SIZE >> 9); | ||
35 | bio->bi_bdev = sis->bdev; | ||
36 | bio->bi_io_vec[0].bv_page = page; | 31 | bio->bi_io_vec[0].bv_page = page; |
37 | bio->bi_io_vec[0].bv_len = PAGE_SIZE; | 32 | bio->bi_io_vec[0].bv_len = PAGE_SIZE; |
38 | bio->bi_io_vec[0].bv_offset = 0; | 33 | bio->bi_io_vec[0].bv_offset = 0; |
@@ -102,8 +97,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) | |||
102 | unlock_page(page); | 97 | unlock_page(page); |
103 | goto out; | 98 | goto out; |
104 | } | 99 | } |
105 | bio = get_swap_bio(GFP_NOIO, page_private(page), page, | 100 | bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write); |
106 | end_swap_bio_write); | ||
107 | if (bio == NULL) { | 101 | if (bio == NULL) { |
108 | set_page_dirty(page); | 102 | set_page_dirty(page); |
109 | unlock_page(page); | 103 | unlock_page(page); |
@@ -127,8 +121,7 @@ int swap_readpage(struct page *page) | |||
127 | 121 | ||
128 | VM_BUG_ON(!PageLocked(page)); | 122 | VM_BUG_ON(!PageLocked(page)); |
129 | VM_BUG_ON(PageUptodate(page)); | 123 | VM_BUG_ON(PageUptodate(page)); |
130 | bio = get_swap_bio(GFP_KERNEL, page_private(page), page, | 124 | bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); |
131 | end_swap_bio_read); | ||
132 | if (bio == NULL) { | 125 | if (bio == NULL) { |
133 | unlock_page(page); | 126 | unlock_page(page); |
134 | ret = -ENOMEM; | 127 | ret = -ENOMEM; |
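
With the reworked get_swap_bio() above, map_swap_page() reports the swap slot's device offset in page-sized units and fills in bio->bi_bdev itself, so the only arithmetic left in page_io.c is the page-to-sector scaling. A throwaway userspace check of that conversion (values invented for illustration):

#include <assert.h>

int main(void)
{
	unsigned long page_shift = 12;		/* 4 KiB pages */
	unsigned long swap_page_offset = 300;	/* as returned by map_swap_page() */
	unsigned long long sector;

	/* PAGE_SHIFT - 9 converts page units to 512-byte sectors */
	sector = (unsigned long long)swap_page_offset << (page_shift - 9);
	assert(sector == 300ULL * 8);		/* 8 sectors per 4 KiB page */
	return 0;
}
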
diff --git a/mm/pagewalk.c b/mm/pagewalk.c index d5878bed7841..7b47a57b6646 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c | |||
@@ -1,6 +1,7 @@ | |||
1 | #include <linux/mm.h> | 1 | #include <linux/mm.h> |
2 | #include <linux/highmem.h> | 2 | #include <linux/highmem.h> |
3 | #include <linux/sched.h> | 3 | #include <linux/sched.h> |
4 | #include <linux/hugetlb.h> | ||
4 | 5 | ||
5 | static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, | 6 | static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, |
6 | struct mm_walk *walk) | 7 | struct mm_walk *walk) |
@@ -107,6 +108,7 @@ int walk_page_range(unsigned long addr, unsigned long end, | |||
107 | pgd_t *pgd; | 108 | pgd_t *pgd; |
108 | unsigned long next; | 109 | unsigned long next; |
109 | int err = 0; | 110 | int err = 0; |
111 | struct vm_area_struct *vma; | ||
110 | 112 | ||
111 | if (addr >= end) | 113 | if (addr >= end) |
112 | return err; | 114 | return err; |
@@ -117,11 +119,38 @@ int walk_page_range(unsigned long addr, unsigned long end, | |||
117 | pgd = pgd_offset(walk->mm, addr); | 119 | pgd = pgd_offset(walk->mm, addr); |
118 | do { | 120 | do { |
119 | next = pgd_addr_end(addr, end); | 121 | next = pgd_addr_end(addr, end); |
122 | |||
123 | /* | ||
124 | * handle hugetlb vma individually because pagetable walk for | ||
125 | * the hugetlb page is dependent on the architecture and | ||
126 | * we can't handle it in the same manner as non-huge pages. | ||
127 | */ | ||
128 | vma = find_vma(walk->mm, addr); | ||
129 | #ifdef CONFIG_HUGETLB_PAGE | ||
130 | if (vma && is_vm_hugetlb_page(vma)) { | ||
131 | pte_t *pte; | ||
132 | struct hstate *hs; | ||
133 | |||
134 | if (vma->vm_end < next) | ||
135 | next = vma->vm_end; | ||
136 | hs = hstate_vma(vma); | ||
137 | pte = huge_pte_offset(walk->mm, | ||
138 | addr & huge_page_mask(hs)); | ||
139 | if (pte && !huge_pte_none(huge_ptep_get(pte)) | ||
140 | && walk->hugetlb_entry) | ||
141 | err = walk->hugetlb_entry(pte, addr, | ||
142 | next, walk); | ||
143 | if (err) | ||
144 | break; | ||
145 | continue; | ||
146 | } | ||
147 | #endif | ||
120 | if (pgd_none_or_clear_bad(pgd)) { | 148 | if (pgd_none_or_clear_bad(pgd)) { |
121 | if (walk->pte_hole) | 149 | if (walk->pte_hole) |
122 | err = walk->pte_hole(addr, next, walk); | 150 | err = walk->pte_hole(addr, next, walk); |
123 | if (err) | 151 | if (err) |
124 | break; | 152 | break; |
153 | pgd++; | ||
125 | continue; | 154 | continue; |
126 | } | 155 | } |
127 | if (walk->pgd_entry) | 156 | if (walk->pgd_entry) |
@@ -131,7 +160,8 @@ int walk_page_range(unsigned long addr, unsigned long end, | |||
131 | err = walk_pud_range(pgd, addr, next, walk); | 160 | err = walk_pud_range(pgd, addr, next, walk); |
132 | if (err) | 161 | if (err) |
133 | break; | 162 | break; |
134 | } while (pgd++, addr = next, addr != end); | 163 | pgd++; |
164 | } while (addr = next, addr != end); | ||
135 | 165 | ||
136 | return err; | 166 | return err; |
137 | } | 167 | } |
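
A hypothetical user of the new hugetlb_entry hook, assuming the struct mm_walk layout extended by this series (both function names and the counter are invented; the caller is assumed to hold mmap_sem for read, as walk_page_range() requires):

/* invoked for hugetlb ranges instead of the usual pud/pmd/pte descent */
static int note_huge_range(pte_t *pte, unsigned long addr,
			   unsigned long end, struct mm_walk *walk)
{
	unsigned long *ranges = walk->private;

	(*ranges)++;
	return 0;		/* non-zero would abort the walk */
}

static unsigned long count_hugetlb_ranges(struct mm_struct *mm,
					  unsigned long start,
					  unsigned long end)
{
	unsigned long ranges = 0;
	struct mm_walk walk = {
		.hugetlb_entry	= note_huge_range,
		.mm		= mm,
		.private	= &ranges,
	};

	walk_page_range(start, end, &walk);
	return ranges;
}

Because the walker clamps the reported end to vma->vm_end for hugetlb VMAs, the callback only ever sees addresses inside the hugetlb mapping.
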
diff --git a/mm/percpu.c b/mm/percpu.c index 77c6f7994a46..626e43c99498 100644 --- a/mm/percpu.c +++ b/mm/percpu.c | |||
@@ -72,6 +72,7 @@ | |||
72 | #include <asm/cacheflush.h> | 72 | #include <asm/cacheflush.h> |
73 | #include <asm/sections.h> | 73 | #include <asm/sections.h> |
74 | #include <asm/tlbflush.h> | 74 | #include <asm/tlbflush.h> |
75 | #include <asm/io.h> | ||
75 | 76 | ||
76 | #define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ | 77 | #define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ |
77 | #define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ | 78 | #define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ |
@@ -151,7 +152,10 @@ static int pcpu_reserved_chunk_limit; | |||
151 | * | 152 | * |
152 | * During allocation, pcpu_alloc_mutex is kept locked all the time and | 153 | * During allocation, pcpu_alloc_mutex is kept locked all the time and |
153 | * pcpu_lock is grabbed and released as necessary. All actual memory | 154 | * pcpu_lock is grabbed and released as necessary. All actual memory |
154 | * allocations are done using GFP_KERNEL with pcpu_lock released. | 155 | * allocations are done using GFP_KERNEL with pcpu_lock released. In |
156 | * general, percpu memory can't be allocated with irq off but | ||
157 | * irqsave/restore are still used in alloc path so that it can be used | ||
158 | * from early init path - sched_init() specifically. | ||
155 | * | 159 | * |
156 | * Free path accesses and alters only the index data structures, so it | 160 | * Free path accesses and alters only the index data structures, so it |
157 | * can be safely called from atomic context. When memory needs to be | 161 | * can be safely called from atomic context. When memory needs to be |
@@ -350,63 +354,86 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) | |||
350 | } | 354 | } |
351 | 355 | ||
352 | /** | 356 | /** |
353 | * pcpu_extend_area_map - extend area map for allocation | 357 | * pcpu_need_to_extend - determine whether chunk area map needs to be extended |
354 | * @chunk: target chunk | 358 | * @chunk: chunk of interest |
355 | * | 359 | * |
356 | * Extend area map of @chunk so that it can accomodate an allocation. | 360 | * Determine whether area map of @chunk needs to be extended to |
357 | * A single allocation can split an area into three areas, so this | 361 | * accommodate a new allocation. |
358 | * function makes sure that @chunk->map has at least two extra slots. | ||
359 | * | 362 | * |
360 | * CONTEXT: | 363 | * CONTEXT: |
361 | * pcpu_alloc_mutex, pcpu_lock. pcpu_lock is released and reacquired | 364 | * pcpu_lock. |
362 | * if area map is extended. | ||
363 | * | 365 | * |
364 | * RETURNS: | 366 | * RETURNS: |
365 | * 0 if noop, 1 if successfully extended, -errno on failure. | 367 | * New target map allocation length if extension is necessary, 0 |
368 | * otherwise. | ||
366 | */ | 369 | */ |
367 | static int pcpu_extend_area_map(struct pcpu_chunk *chunk) | 370 | static int pcpu_need_to_extend(struct pcpu_chunk *chunk) |
368 | __releases(lock) __acquires(lock) | ||
369 | { | 371 | { |
370 | int new_alloc; | 372 | int new_alloc; |
371 | int *new; | ||
372 | size_t size; | ||
373 | 373 | ||
374 | /* has enough? */ | ||
375 | if (chunk->map_alloc >= chunk->map_used + 2) | 374 | if (chunk->map_alloc >= chunk->map_used + 2) |
376 | return 0; | 375 | return 0; |
377 | 376 | ||
378 | spin_unlock_irq(&pcpu_lock); | ||
379 | |||
380 | new_alloc = PCPU_DFL_MAP_ALLOC; | 377 | new_alloc = PCPU_DFL_MAP_ALLOC; |
381 | while (new_alloc < chunk->map_used + 2) | 378 | while (new_alloc < chunk->map_used + 2) |
382 | new_alloc *= 2; | 379 | new_alloc *= 2; |
383 | 380 | ||
384 | new = pcpu_mem_alloc(new_alloc * sizeof(new[0])); | 381 | return new_alloc; |
385 | if (!new) { | 382 | } |
386 | spin_lock_irq(&pcpu_lock); | 383 | |
384 | /** | ||
385 | * pcpu_extend_area_map - extend area map of a chunk | ||
386 | * @chunk: chunk of interest | ||
387 | * @new_alloc: new target allocation length of the area map | ||
388 | * | ||
389 | * Extend area map of @chunk to have @new_alloc entries. | ||
390 | * | ||
391 | * CONTEXT: | ||
392 | * Does GFP_KERNEL allocation. Grabs and releases pcpu_lock. | ||
393 | * | ||
394 | * RETURNS: | ||
395 | * 0 on success, -errno on failure. | ||
396 | */ | ||
397 | static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc) | ||
398 | { | ||
399 | int *old = NULL, *new = NULL; | ||
400 | size_t old_size = 0, new_size = new_alloc * sizeof(new[0]); | ||
401 | unsigned long flags; | ||
402 | |||
403 | new = pcpu_mem_alloc(new_size); | ||
404 | if (!new) | ||
387 | return -ENOMEM; | 405 | return -ENOMEM; |
388 | } | ||
389 | 406 | ||
390 | /* | 407 | /* acquire pcpu_lock and switch to new area map */ |
391 | * Acquire pcpu_lock and switch to new area map. Only free | 408 | spin_lock_irqsave(&pcpu_lock, flags); |
392 | * could have happened inbetween, so map_used couldn't have | ||
393 | * grown. | ||
394 | */ | ||
395 | spin_lock_irq(&pcpu_lock); | ||
396 | BUG_ON(new_alloc < chunk->map_used + 2); | ||
397 | 409 | ||
398 | size = chunk->map_alloc * sizeof(chunk->map[0]); | 410 | if (new_alloc <= chunk->map_alloc) |
399 | memcpy(new, chunk->map, size); | 411 | goto out_unlock; |
412 | |||
413 | old_size = chunk->map_alloc * sizeof(chunk->map[0]); | ||
414 | memcpy(new, chunk->map, old_size); | ||
400 | 415 | ||
401 | /* | 416 | /* |
402 | * map_alloc < PCPU_DFL_MAP_ALLOC indicates that the chunk is | 417 | * map_alloc < PCPU_DFL_MAP_ALLOC indicates that the chunk is |
403 | * one of the first chunks and still using static map. | 418 | * one of the first chunks and still using static map. |
404 | */ | 419 | */ |
405 | if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC) | 420 | if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC) |
406 | pcpu_mem_free(chunk->map, size); | 421 | old = chunk->map; |
407 | 422 | ||
408 | chunk->map_alloc = new_alloc; | 423 | chunk->map_alloc = new_alloc; |
409 | chunk->map = new; | 424 | chunk->map = new; |
425 | new = NULL; | ||
426 | |||
427 | out_unlock: | ||
428 | spin_unlock_irqrestore(&pcpu_lock, flags); | ||
429 | |||
430 | /* | ||
431 | * pcpu_mem_free() might end up calling vfree() which uses | ||
432 | * IRQ-unsafe lock and thus can't be called under pcpu_lock. | ||
433 | */ | ||
434 | pcpu_mem_free(old, old_size); | ||
435 | pcpu_mem_free(new, new_size); | ||
436 | |||
410 | return 0; | 437 | return 0; |
411 | } | 438 | } |
412 | 439 | ||
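
The pcpu_need_to_extend()/pcpu_extend_area_map() split above is an instance of the usual "check cheaply under the lock, allocate with the lock dropped, recheck after retaking it" idiom. A self-contained userspace analogue of the same shape (all names invented; a pthread mutex stands in for pcpu_lock, and the old buffer is freed only after the lock is dropped, mirroring the vfree() restriction noted in the comment):

#include <pthread.h>
#include <stdlib.h>
#include <string.h>

struct vec {
	pthread_mutex_t lock;
	int *items;
	size_t used, cap;
};

/* cheap check, call with the lock held; returns required capacity or 0 */
static size_t vec_need_to_extend(struct vec *v)
{
	size_t new_cap = v->cap ? v->cap : 16;

	if (v->cap >= v->used + 2)
		return 0;
	while (new_cap < v->used + 2)
		new_cap *= 2;
	return new_cap;
}

/* may block in malloc(), so call with the lock dropped */
static int vec_extend(struct vec *v, size_t new_cap)
{
	int *new_items = malloc(new_cap * sizeof(*new_items));
	int *old_items = NULL;

	if (!new_items)
		return -1;

	pthread_mutex_lock(&v->lock);
	if (new_cap <= v->cap) {		/* another thread grew it first */
		pthread_mutex_unlock(&v->lock);
		free(new_items);
		return 0;
	}
	if (v->used)
		memcpy(new_items, v->items, v->used * sizeof(*new_items));
	old_items = v->items;
	v->items = new_items;
	v->cap = new_cap;
	pthread_mutex_unlock(&v->lock);

	free(old_items);	/* freed outside the lock, like pcpu_mem_free() */
	return 0;
}

The recheck of new_cap against the current capacity is what makes dropping the lock safe: a competing thread may already have grown the map, in which case the freshly allocated buffer is simply discarded.
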
@@ -1043,7 +1070,8 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved) | |||
1043 | static int warn_limit = 10; | 1070 | static int warn_limit = 10; |
1044 | struct pcpu_chunk *chunk; | 1071 | struct pcpu_chunk *chunk; |
1045 | const char *err; | 1072 | const char *err; |
1046 | int slot, off; | 1073 | int slot, off, new_alloc; |
1074 | unsigned long flags; | ||
1047 | 1075 | ||
1048 | if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) { | 1076 | if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) { |
1049 | WARN(true, "illegal size (%zu) or align (%zu) for " | 1077 | WARN(true, "illegal size (%zu) or align (%zu) for " |
@@ -1052,19 +1080,30 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved) | |||
1052 | } | 1080 | } |
1053 | 1081 | ||
1054 | mutex_lock(&pcpu_alloc_mutex); | 1082 | mutex_lock(&pcpu_alloc_mutex); |
1055 | spin_lock_irq(&pcpu_lock); | 1083 | spin_lock_irqsave(&pcpu_lock, flags); |
1056 | 1084 | ||
1057 | /* serve reserved allocations from the reserved chunk if available */ | 1085 | /* serve reserved allocations from the reserved chunk if available */ |
1058 | if (reserved && pcpu_reserved_chunk) { | 1086 | if (reserved && pcpu_reserved_chunk) { |
1059 | chunk = pcpu_reserved_chunk; | 1087 | chunk = pcpu_reserved_chunk; |
1060 | if (size > chunk->contig_hint || | 1088 | |
1061 | pcpu_extend_area_map(chunk) < 0) { | 1089 | if (size > chunk->contig_hint) { |
1062 | err = "failed to extend area map of reserved chunk"; | 1090 | err = "alloc from reserved chunk failed"; |
1063 | goto fail_unlock; | 1091 | goto fail_unlock; |
1064 | } | 1092 | } |
1093 | |||
1094 | while ((new_alloc = pcpu_need_to_extend(chunk))) { | ||
1095 | spin_unlock_irqrestore(&pcpu_lock, flags); | ||
1096 | if (pcpu_extend_area_map(chunk, new_alloc) < 0) { | ||
1097 | err = "failed to extend area map of reserved chunk"; | ||
1098 | goto fail_unlock_mutex; | ||
1099 | } | ||
1100 | spin_lock_irqsave(&pcpu_lock, flags); | ||
1101 | } | ||
1102 | |||
1065 | off = pcpu_alloc_area(chunk, size, align); | 1103 | off = pcpu_alloc_area(chunk, size, align); |
1066 | if (off >= 0) | 1104 | if (off >= 0) |
1067 | goto area_found; | 1105 | goto area_found; |
1106 | |||
1068 | err = "alloc from reserved chunk failed"; | 1107 | err = "alloc from reserved chunk failed"; |
1069 | goto fail_unlock; | 1108 | goto fail_unlock; |
1070 | } | 1109 | } |
@@ -1076,14 +1115,20 @@ restart: | |||
1076 | if (size > chunk->contig_hint) | 1115 | if (size > chunk->contig_hint) |
1077 | continue; | 1116 | continue; |
1078 | 1117 | ||
1079 | switch (pcpu_extend_area_map(chunk)) { | 1118 | new_alloc = pcpu_need_to_extend(chunk); |
1080 | case 0: | 1119 | if (new_alloc) { |
1081 | break; | 1120 | spin_unlock_irqrestore(&pcpu_lock, flags); |
1082 | case 1: | 1121 | if (pcpu_extend_area_map(chunk, |
1083 | goto restart; /* pcpu_lock dropped, restart */ | 1122 | new_alloc) < 0) { |
1084 | default: | 1123 | err = "failed to extend area map"; |
1085 | err = "failed to extend area map"; | 1124 | goto fail_unlock_mutex; |
1086 | goto fail_unlock; | 1125 | } |
1126 | spin_lock_irqsave(&pcpu_lock, flags); | ||
1127 | /* | ||
1128 | * pcpu_lock has been dropped, need to | ||
1129 | * restart cpu_slot list walking. | ||
1130 | */ | ||
1131 | goto restart; | ||
1087 | } | 1132 | } |
1088 | 1133 | ||
1089 | off = pcpu_alloc_area(chunk, size, align); | 1134 | off = pcpu_alloc_area(chunk, size, align); |
@@ -1093,7 +1138,7 @@ restart: | |||
1093 | } | 1138 | } |
1094 | 1139 | ||
1095 | /* hmmm... no space left, create a new chunk */ | 1140 | /* hmmm... no space left, create a new chunk */ |
1096 | spin_unlock_irq(&pcpu_lock); | 1141 | spin_unlock_irqrestore(&pcpu_lock, flags); |
1097 | 1142 | ||
1098 | chunk = alloc_pcpu_chunk(); | 1143 | chunk = alloc_pcpu_chunk(); |
1099 | if (!chunk) { | 1144 | if (!chunk) { |
@@ -1101,16 +1146,16 @@ restart: | |||
1101 | goto fail_unlock_mutex; | 1146 | goto fail_unlock_mutex; |
1102 | } | 1147 | } |
1103 | 1148 | ||
1104 | spin_lock_irq(&pcpu_lock); | 1149 | spin_lock_irqsave(&pcpu_lock, flags); |
1105 | pcpu_chunk_relocate(chunk, -1); | 1150 | pcpu_chunk_relocate(chunk, -1); |
1106 | goto restart; | 1151 | goto restart; |
1107 | 1152 | ||
1108 | area_found: | 1153 | area_found: |
1109 | spin_unlock_irq(&pcpu_lock); | 1154 | spin_unlock_irqrestore(&pcpu_lock, flags); |
1110 | 1155 | ||
1111 | /* populate, map and clear the area */ | 1156 | /* populate, map and clear the area */ |
1112 | if (pcpu_populate_chunk(chunk, off, size)) { | 1157 | if (pcpu_populate_chunk(chunk, off, size)) { |
1113 | spin_lock_irq(&pcpu_lock); | 1158 | spin_lock_irqsave(&pcpu_lock, flags); |
1114 | pcpu_free_area(chunk, off); | 1159 | pcpu_free_area(chunk, off); |
1115 | err = "failed to populate"; | 1160 | err = "failed to populate"; |
1116 | goto fail_unlock; | 1161 | goto fail_unlock; |
@@ -1122,7 +1167,7 @@ area_found: | |||
1122 | return __addr_to_pcpu_ptr(chunk->base_addr + off); | 1167 | return __addr_to_pcpu_ptr(chunk->base_addr + off); |
1123 | 1168 | ||
1124 | fail_unlock: | 1169 | fail_unlock: |
1125 | spin_unlock_irq(&pcpu_lock); | 1170 | spin_unlock_irqrestore(&pcpu_lock, flags); |
1126 | fail_unlock_mutex: | 1171 | fail_unlock_mutex: |
1127 | mutex_unlock(&pcpu_alloc_mutex); | 1172 | mutex_unlock(&pcpu_alloc_mutex); |
1128 | if (warn_limit) { | 1173 | if (warn_limit) { |
@@ -1254,6 +1299,27 @@ void free_percpu(void *ptr) | |||
1254 | } | 1299 | } |
1255 | EXPORT_SYMBOL_GPL(free_percpu); | 1300 | EXPORT_SYMBOL_GPL(free_percpu); |
1256 | 1301 | ||
1302 | /** | ||
1303 | * per_cpu_ptr_to_phys - convert translated percpu address to physical address | ||
1304 | * @addr: the address to be converted to physical address | ||
1305 | * | ||
1306 | * Given @addr, which is a dereferenceable address obtained via one of | ||
1307 | * percpu access macros, this function translates it into its physical | ||
1308 | * address. The caller is responsible for ensuring @addr stays valid | ||
1309 | * until this function finishes. | ||
1310 | * | ||
1311 | * RETURNS: | ||
1312 | * The physical address for @addr. | ||
1313 | */ | ||
1314 | phys_addr_t per_cpu_ptr_to_phys(void *addr) | ||
1315 | { | ||
1316 | if ((unsigned long)addr < VMALLOC_START || | ||
1317 | (unsigned long)addr >= VMALLOC_END) | ||
1318 | return __pa(addr); | ||
1319 | else | ||
1320 | return page_to_phys(vmalloc_to_page(addr)); | ||
1321 | } | ||
1322 | |||
1257 | static inline size_t pcpu_calc_fc_sizes(size_t static_size, | 1323 | static inline size_t pcpu_calc_fc_sizes(size_t static_size, |
1258 | size_t reserved_size, | 1324 | size_t reserved_size, |
1259 | ssize_t *dyn_sizep) | 1325 | ssize_t *dyn_sizep) |
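
A hypothetical caller of the per_cpu_ptr_to_phys() helper added above. Firmware and hypervisor interfaces sometimes want a physical address for a per-CPU buffer that the kernel otherwise touches only through percpu accessors; the buffer layout and the firmware_register_stats_page() call below are invented for illustration:

struct stats_buf {
	u64 packets;
	u64 bytes;
};

static DEFINE_PER_CPU(struct stats_buf, stats_area);

static int register_cpu_stats(unsigned int cpu)
{
	void *vaddr = &per_cpu(stats_area, cpu);
	phys_addr_t paddr = per_cpu_ptr_to_phys(vaddr);

	/* hypothetical firmware call that wants a physical address */
	return firmware_register_stats_page(cpu, paddr);
}

The helper needs the two-way test shown above because first-chunk percpu memory may sit in the kernel linear mapping, where __pa() is valid, while dynamically created chunks live in vmalloc space, where the backing page has to be looked up first.
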
diff --git a/mm/readahead.c b/mm/readahead.c index aa1aa2345235..033bc135a41f 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
@@ -547,5 +547,17 @@ page_cache_async_readahead(struct address_space *mapping, | |||
547 | 547 | ||
548 | /* do read-ahead */ | 548 | /* do read-ahead */ |
549 | ondemand_readahead(mapping, ra, filp, true, offset, req_size); | 549 | ondemand_readahead(mapping, ra, filp, true, offset, req_size); |
550 | |||
551 | #ifdef CONFIG_BLOCK | ||
552 | /* | ||
553 | * Normally the current page is !uptodate and lock_page() will be | ||
554 | * immediately called to implicitly unplug the device. However this | ||
555 | * is not always true for RAID configurations, where data arrives | ||
556 | * not strictly in their submission order. In this case we need to | ||
557 | * explicitly kick off the IO. | ||
558 | */ | ||
559 | if (PageUptodate(page)) | ||
560 | blk_run_backing_dev(mapping->backing_dev_info, NULL); | ||
561 | #endif | ||
550 | } | 562 | } |
551 | EXPORT_SYMBOL_GPL(page_cache_async_readahead); | 563 | EXPORT_SYMBOL_GPL(page_cache_async_readahead); |
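
For context, page_cache_async_readahead() is normally invoked from a read path when it hits a page that was tagged with PG_readahead on an earlier readahead pass; the helper below is a hedged sketch of that caller pattern (the surrounding function is invented, modelled loosely on the generic file read path):

static void maybe_kick_readahead(struct file *filp, struct page *page,
				 pgoff_t index, unsigned long nr_to_read)
{
	struct address_space *mapping = filp->f_mapping;
	struct file_ra_state *ra = &filp->f_ra;

	if (PageReadahead(page))
		page_cache_async_readahead(mapping, ra, filp, page,
					   index, nr_to_read);
}

The new PageUptodate() check above matters in exactly this path: if out-of-order completion (as on some RAID setups) already made the page uptodate, no lock_page() will follow to unplug the queue, so the readahead I/O has to be kicked explicitly.
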
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
@@ -49,6 +49,7 @@ | |||
49 | #include <linux/swapops.h> | 49 | #include <linux/swapops.h> |
50 | #include <linux/slab.h> | 50 | #include <linux/slab.h> |
51 | #include <linux/init.h> | 51 | #include <linux/init.h> |
52 | #include <linux/ksm.h> | ||
52 | #include <linux/rmap.h> | 53 | #include <linux/rmap.h> |
53 | #include <linux/rcupdate.h> | 54 | #include <linux/rcupdate.h> |
54 | #include <linux/module.h> | 55 | #include <linux/module.h> |
@@ -67,7 +68,7 @@ static inline struct anon_vma *anon_vma_alloc(void) | |||
67 | return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); | 68 | return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); |
68 | } | 69 | } |
69 | 70 | ||
70 | static inline void anon_vma_free(struct anon_vma *anon_vma) | 71 | void anon_vma_free(struct anon_vma *anon_vma) |
71 | { | 72 | { |
72 | kmem_cache_free(anon_vma_cachep, anon_vma); | 73 | kmem_cache_free(anon_vma_cachep, anon_vma); |
73 | } | 74 | } |
@@ -171,7 +172,7 @@ void anon_vma_unlink(struct vm_area_struct *vma) | |||
171 | list_del(&vma->anon_vma_node); | 172 | list_del(&vma->anon_vma_node); |
172 | 173 | ||
173 | /* We must garbage collect the anon_vma if it's empty */ | 174 | /* We must garbage collect the anon_vma if it's empty */ |
174 | empty = list_empty(&anon_vma->head); | 175 | empty = list_empty(&anon_vma->head) && !ksm_refcount(anon_vma); |
175 | spin_unlock(&anon_vma->lock); | 176 | spin_unlock(&anon_vma->lock); |
176 | 177 | ||
177 | if (empty) | 178 | if (empty) |
@@ -183,6 +184,7 @@ static void anon_vma_ctor(void *data) | |||
183 | struct anon_vma *anon_vma = data; | 184 | struct anon_vma *anon_vma = data; |
184 | 185 | ||
185 | spin_lock_init(&anon_vma->lock); | 186 | spin_lock_init(&anon_vma->lock); |
187 | ksm_refcount_init(anon_vma); | ||
186 | INIT_LIST_HEAD(&anon_vma->head); | 188 | INIT_LIST_HEAD(&anon_vma->head); |
187 | } | 189 | } |
188 | 190 | ||
@@ -202,8 +204,8 @@ struct anon_vma *page_lock_anon_vma(struct page *page) | |||
202 | unsigned long anon_mapping; | 204 | unsigned long anon_mapping; |
203 | 205 | ||
204 | rcu_read_lock(); | 206 | rcu_read_lock(); |
205 | anon_mapping = (unsigned long) page->mapping; | 207 | anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping); |
206 | if (!(anon_mapping & PAGE_MAPPING_ANON)) | 208 | if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) |
207 | goto out; | 209 | goto out; |
208 | if (!page_mapped(page)) | 210 | if (!page_mapped(page)) |
209 | goto out; | 211 | goto out; |
@@ -248,8 +250,7 @@ vma_address(struct page *page, struct vm_area_struct *vma) | |||
248 | unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) | 250 | unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) |
249 | { | 251 | { |
250 | if (PageAnon(page)) { | 252 | if (PageAnon(page)) { |
251 | if ((void *)vma->anon_vma != | 253 | if (vma->anon_vma != page_anon_vma(page)) |
252 | (void *)page->mapping - PAGE_MAPPING_ANON) | ||
253 | return -EFAULT; | 254 | return -EFAULT; |
254 | } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { | 255 | } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { |
255 | if (!vma->vm_file || | 256 | if (!vma->vm_file || |
@@ -337,21 +338,15 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) | |||
337 | * Subfunctions of page_referenced: page_referenced_one called | 338 | * Subfunctions of page_referenced: page_referenced_one called |
338 | * repeatedly from either page_referenced_anon or page_referenced_file. | 339 | * repeatedly from either page_referenced_anon or page_referenced_file. |
339 | */ | 340 | */ |
340 | static int page_referenced_one(struct page *page, | 341 | int page_referenced_one(struct page *page, struct vm_area_struct *vma, |
341 | struct vm_area_struct *vma, | 342 | unsigned long address, unsigned int *mapcount, |
342 | unsigned int *mapcount, | 343 | unsigned long *vm_flags) |
343 | unsigned long *vm_flags) | ||
344 | { | 344 | { |
345 | struct mm_struct *mm = vma->vm_mm; | 345 | struct mm_struct *mm = vma->vm_mm; |
346 | unsigned long address; | ||
347 | pte_t *pte; | 346 | pte_t *pte; |
348 | spinlock_t *ptl; | 347 | spinlock_t *ptl; |
349 | int referenced = 0; | 348 | int referenced = 0; |
350 | 349 | ||
351 | address = vma_address(page, vma); | ||
352 | if (address == -EFAULT) | ||
353 | goto out; | ||
354 | |||
355 | pte = page_check_address(page, mm, address, &ptl, 0); | 350 | pte = page_check_address(page, mm, address, &ptl, 0); |
356 | if (!pte) | 351 | if (!pte) |
357 | goto out; | 352 | goto out; |
@@ -388,9 +383,10 @@ static int page_referenced_one(struct page *page, | |||
388 | out_unmap: | 383 | out_unmap: |
389 | (*mapcount)--; | 384 | (*mapcount)--; |
390 | pte_unmap_unlock(pte, ptl); | 385 | pte_unmap_unlock(pte, ptl); |
391 | out: | 386 | |
392 | if (referenced) | 387 | if (referenced) |
393 | *vm_flags |= vma->vm_flags; | 388 | *vm_flags |= vma->vm_flags; |
389 | out: | ||
394 | return referenced; | 390 | return referenced; |
395 | } | 391 | } |
396 | 392 | ||
@@ -409,6 +405,9 @@ static int page_referenced_anon(struct page *page, | |||
409 | 405 | ||
410 | mapcount = page_mapcount(page); | 406 | mapcount = page_mapcount(page); |
411 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | 407 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { |
408 | unsigned long address = vma_address(page, vma); | ||
409 | if (address == -EFAULT) | ||
410 | continue; | ||
412 | /* | 411 | /* |
413 | * If we are reclaiming on behalf of a cgroup, skip | 412 | * If we are reclaiming on behalf of a cgroup, skip |
414 | * counting on behalf of references from different | 413 | * counting on behalf of references from different |
@@ -416,7 +415,7 @@ static int page_referenced_anon(struct page *page, | |||
416 | */ | 415 | */ |
417 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) | 416 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) |
418 | continue; | 417 | continue; |
419 | referenced += page_referenced_one(page, vma, | 418 | referenced += page_referenced_one(page, vma, address, |
420 | &mapcount, vm_flags); | 419 | &mapcount, vm_flags); |
421 | if (!mapcount) | 420 | if (!mapcount) |
422 | break; | 421 | break; |
@@ -474,6 +473,9 @@ static int page_referenced_file(struct page *page, | |||
474 | mapcount = page_mapcount(page); | 473 | mapcount = page_mapcount(page); |
475 | 474 | ||
476 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 475 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { |
476 | unsigned long address = vma_address(page, vma); | ||
477 | if (address == -EFAULT) | ||
478 | continue; | ||
477 | /* | 479 | /* |
478 | * If we are reclaiming on behalf of a cgroup, skip | 480 | * If we are reclaiming on behalf of a cgroup, skip |
479 | * counting on behalf of references from different | 481 | * counting on behalf of references from different |
@@ -481,7 +483,7 @@ static int page_referenced_file(struct page *page, | |||
481 | */ | 483 | */ |
482 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) | 484 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) |
483 | continue; | 485 | continue; |
484 | referenced += page_referenced_one(page, vma, | 486 | referenced += page_referenced_one(page, vma, address, |
485 | &mapcount, vm_flags); | 487 | &mapcount, vm_flags); |
486 | if (!mapcount) | 488 | if (!mapcount) |
487 | break; | 489 | break; |
@@ -507,46 +509,47 @@ int page_referenced(struct page *page, | |||
507 | unsigned long *vm_flags) | 509 | unsigned long *vm_flags) |
508 | { | 510 | { |
509 | int referenced = 0; | 511 | int referenced = 0; |
512 | int we_locked = 0; | ||
510 | 513 | ||
511 | if (TestClearPageReferenced(page)) | 514 | if (TestClearPageReferenced(page)) |
512 | referenced++; | 515 | referenced++; |
513 | 516 | ||
514 | *vm_flags = 0; | 517 | *vm_flags = 0; |
515 | if (page_mapped(page) && page->mapping) { | 518 | if (page_mapped(page) && page_rmapping(page)) { |
516 | if (PageAnon(page)) | 519 | if (!is_locked && (!PageAnon(page) || PageKsm(page))) { |
520 | we_locked = trylock_page(page); | ||
521 | if (!we_locked) { | ||
522 | referenced++; | ||
523 | goto out; | ||
524 | } | ||
525 | } | ||
526 | if (unlikely(PageKsm(page))) | ||
527 | referenced += page_referenced_ksm(page, mem_cont, | ||
528 | vm_flags); | ||
529 | else if (PageAnon(page)) | ||
517 | referenced += page_referenced_anon(page, mem_cont, | 530 | referenced += page_referenced_anon(page, mem_cont, |
518 | vm_flags); | 531 | vm_flags); |
519 | else if (is_locked) | 532 | else if (page->mapping) |
520 | referenced += page_referenced_file(page, mem_cont, | 533 | referenced += page_referenced_file(page, mem_cont, |
521 | vm_flags); | 534 | vm_flags); |
522 | else if (!trylock_page(page)) | 535 | if (we_locked) |
523 | referenced++; | ||
524 | else { | ||
525 | if (page->mapping) | ||
526 | referenced += page_referenced_file(page, | ||
527 | mem_cont, vm_flags); | ||
528 | unlock_page(page); | 536 | unlock_page(page); |
529 | } | ||
530 | } | 537 | } |
531 | 538 | out: | |
532 | if (page_test_and_clear_young(page)) | 539 | if (page_test_and_clear_young(page)) |
533 | referenced++; | 540 | referenced++; |
534 | 541 | ||
535 | return referenced; | 542 | return referenced; |
536 | } | 543 | } |
537 | 544 | ||
538 | static int page_mkclean_one(struct page *page, struct vm_area_struct *vma) | 545 | static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, |
546 | unsigned long address) | ||
539 | { | 547 | { |
540 | struct mm_struct *mm = vma->vm_mm; | 548 | struct mm_struct *mm = vma->vm_mm; |
541 | unsigned long address; | ||
542 | pte_t *pte; | 549 | pte_t *pte; |
543 | spinlock_t *ptl; | 550 | spinlock_t *ptl; |
544 | int ret = 0; | 551 | int ret = 0; |
545 | 552 | ||
546 | address = vma_address(page, vma); | ||
547 | if (address == -EFAULT) | ||
548 | goto out; | ||
549 | |||
550 | pte = page_check_address(page, mm, address, &ptl, 1); | 553 | pte = page_check_address(page, mm, address, &ptl, 1); |
551 | if (!pte) | 554 | if (!pte) |
552 | goto out; | 555 | goto out; |
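
A hedged sketch of a reclaim-style caller of the reworked page_referenced() (the wrapper is invented; vmscan is the real caller): with is_locked set, the function can walk the rmap directly, and the OR of the mapping VMAs' flags comes back through vm_flags so the caller can, for example, treat references through executable mappings specially:

static int page_recently_used(struct page *page, struct mem_cgroup *memcg)
{
	unsigned long vm_flags = 0;
	int referenced;

	/* page is locked by the caller, hence is_locked == 1 */
	referenced = page_referenced(page, 1, memcg, &vm_flags);
	if (referenced && (vm_flags & VM_EXEC))
		return 2;	/* referenced via an executable mapping */
	return referenced ? 1 : 0;
}
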
@@ -578,8 +581,12 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page) | |||
578 | 581 | ||
579 | spin_lock(&mapping->i_mmap_lock); | 582 | spin_lock(&mapping->i_mmap_lock); |
580 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 583 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { |
581 | if (vma->vm_flags & VM_SHARED) | 584 | if (vma->vm_flags & VM_SHARED) { |
582 | ret += page_mkclean_one(page, vma); | 585 | unsigned long address = vma_address(page, vma); |
586 | if (address == -EFAULT) | ||
587 | continue; | ||
588 | ret += page_mkclean_one(page, vma, address); | ||
589 | } | ||
583 | } | 590 | } |
584 | spin_unlock(&mapping->i_mmap_lock); | 591 | spin_unlock(&mapping->i_mmap_lock); |
585 | return ret; | 592 | return ret; |
@@ -620,14 +627,7 @@ static void __page_set_anon_rmap(struct page *page, | |||
620 | BUG_ON(!anon_vma); | 627 | BUG_ON(!anon_vma); |
621 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; | 628 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; |
622 | page->mapping = (struct address_space *) anon_vma; | 629 | page->mapping = (struct address_space *) anon_vma; |
623 | |||
624 | page->index = linear_page_index(vma, address); | 630 | page->index = linear_page_index(vma, address); |
625 | |||
626 | /* | ||
627 | * nr_mapped state can be updated without turning off | ||
628 | * interrupts because it is not modified via interrupt. | ||
629 | */ | ||
630 | __inc_zone_page_state(page, NR_ANON_PAGES); | ||
631 | } | 631 | } |
632 | 632 | ||
633 | /** | 633 | /** |
@@ -665,14 +665,23 @@ static void __page_check_anon_rmap(struct page *page, | |||
665 | * @vma: the vm area in which the mapping is added | 665 | * @vma: the vm area in which the mapping is added |
666 | * @address: the user virtual address mapped | 666 | * @address: the user virtual address mapped |
667 | * | 667 | * |
668 | * The caller needs to hold the pte lock and the page must be locked. | 668 | * The caller needs to hold the pte lock, and the page must be locked in |
669 | * the anon_vma case: to serialize mapping,index checking after setting, | ||
670 | * and to ensure that PageAnon is not being upgraded racily to PageKsm | ||
671 | * (but PageKsm is never downgraded to PageAnon). | ||
669 | */ | 672 | */ |
670 | void page_add_anon_rmap(struct page *page, | 673 | void page_add_anon_rmap(struct page *page, |
671 | struct vm_area_struct *vma, unsigned long address) | 674 | struct vm_area_struct *vma, unsigned long address) |
672 | { | 675 | { |
676 | int first = atomic_inc_and_test(&page->_mapcount); | ||
677 | if (first) | ||
678 | __inc_zone_page_state(page, NR_ANON_PAGES); | ||
679 | if (unlikely(PageKsm(page))) | ||
680 | return; | ||
681 | |||
673 | VM_BUG_ON(!PageLocked(page)); | 682 | VM_BUG_ON(!PageLocked(page)); |
674 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); | 683 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); |
675 | if (atomic_inc_and_test(&page->_mapcount)) | 684 | if (first) |
676 | __page_set_anon_rmap(page, vma, address); | 685 | __page_set_anon_rmap(page, vma, address); |
677 | else | 686 | else |
678 | __page_check_anon_rmap(page, vma, address); | 687 | __page_check_anon_rmap(page, vma, address); |
@@ -694,6 +703,7 @@ void page_add_new_anon_rmap(struct page *page, | |||
694 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); | 703 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); |
695 | SetPageSwapBacked(page); | 704 | SetPageSwapBacked(page); |
696 | atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ | 705 | atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ |
706 | __inc_zone_page_state(page, NR_ANON_PAGES); | ||
697 | __page_set_anon_rmap(page, vma, address); | 707 | __page_set_anon_rmap(page, vma, address); |
698 | if (page_evictable(page, vma)) | 708 | if (page_evictable(page, vma)) |
699 | lru_cache_add_lru(page, LRU_ACTIVE_ANON); | 709 | lru_cache_add_lru(page, LRU_ACTIVE_ANON); |
@@ -711,7 +721,7 @@ void page_add_file_rmap(struct page *page) | |||
711 | { | 721 | { |
712 | if (atomic_inc_and_test(&page->_mapcount)) { | 722 | if (atomic_inc_and_test(&page->_mapcount)) { |
713 | __inc_zone_page_state(page, NR_FILE_MAPPED); | 723 | __inc_zone_page_state(page, NR_FILE_MAPPED); |
714 | mem_cgroup_update_mapped_file_stat(page, 1); | 724 | mem_cgroup_update_file_mapped(page, 1); |
715 | } | 725 | } |
716 | } | 726 | } |
717 | 727 | ||
@@ -743,8 +753,8 @@ void page_remove_rmap(struct page *page) | |||
743 | __dec_zone_page_state(page, NR_ANON_PAGES); | 753 | __dec_zone_page_state(page, NR_ANON_PAGES); |
744 | } else { | 754 | } else { |
745 | __dec_zone_page_state(page, NR_FILE_MAPPED); | 755 | __dec_zone_page_state(page, NR_FILE_MAPPED); |
756 | mem_cgroup_update_file_mapped(page, -1); | ||
746 | } | 757 | } |
747 | mem_cgroup_update_mapped_file_stat(page, -1); | ||
748 | /* | 758 | /* |
749 | * It would be tidy to reset the PageAnon mapping here, | 759 | * It would be tidy to reset the PageAnon mapping here, |
750 | * but that might overwrite a racing page_add_anon_rmap | 760 | * but that might overwrite a racing page_add_anon_rmap |
@@ -760,20 +770,15 @@ void page_remove_rmap(struct page *page) | |||
760 | * Subfunctions of try_to_unmap: try_to_unmap_one called | 770 | * Subfunctions of try_to_unmap: try_to_unmap_one called |
761 | * repeatedly from either try_to_unmap_anon or try_to_unmap_file. | 771 | * repeatedly from either try_to_unmap_anon or try_to_unmap_file. |
762 | */ | 772 | */ |
763 | static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | 773 | int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, |
764 | enum ttu_flags flags) | 774 | unsigned long address, enum ttu_flags flags) |
765 | { | 775 | { |
766 | struct mm_struct *mm = vma->vm_mm; | 776 | struct mm_struct *mm = vma->vm_mm; |
767 | unsigned long address; | ||
768 | pte_t *pte; | 777 | pte_t *pte; |
769 | pte_t pteval; | 778 | pte_t pteval; |
770 | spinlock_t *ptl; | 779 | spinlock_t *ptl; |
771 | int ret = SWAP_AGAIN; | 780 | int ret = SWAP_AGAIN; |
772 | 781 | ||
773 | address = vma_address(page, vma); | ||
774 | if (address == -EFAULT) | ||
775 | goto out; | ||
776 | |||
777 | pte = page_check_address(page, mm, address, &ptl, 0); | 782 | pte = page_check_address(page, mm, address, &ptl, 0); |
778 | if (!pte) | 783 | if (!pte) |
779 | goto out; | 784 | goto out; |
@@ -784,10 +789,11 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
784 | * skipped over this mm) then we should reactivate it. | 789 | * skipped over this mm) then we should reactivate it. |
785 | */ | 790 | */ |
786 | if (!(flags & TTU_IGNORE_MLOCK)) { | 791 | if (!(flags & TTU_IGNORE_MLOCK)) { |
787 | if (vma->vm_flags & VM_LOCKED) { | 792 | if (vma->vm_flags & VM_LOCKED) |
788 | ret = SWAP_MLOCK; | 793 | goto out_mlock; |
794 | |||
795 | if (TTU_ACTION(flags) == TTU_MUNLOCK) | ||
789 | goto out_unmap; | 796 | goto out_unmap; |
790 | } | ||
791 | } | 797 | } |
792 | if (!(flags & TTU_IGNORE_ACCESS)) { | 798 | if (!(flags & TTU_IGNORE_ACCESS)) { |
793 | if (ptep_clear_flush_young_notify(vma, address, pte)) { | 799 | if (ptep_clear_flush_young_notify(vma, address, pte)) { |
@@ -822,7 +828,11 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
822 | * Store the swap location in the pte. | 828 | * Store the swap location in the pte. |
823 | * See handle_pte_fault() ... | 829 | * See handle_pte_fault() ... |
824 | */ | 830 | */ |
825 | swap_duplicate(entry); | 831 | if (swap_duplicate(entry) < 0) { |
832 | set_pte_at(mm, address, pte, pteval); | ||
833 | ret = SWAP_FAIL; | ||
834 | goto out_unmap; | ||
835 | } | ||
826 | if (list_empty(&mm->mmlist)) { | 836 | if (list_empty(&mm->mmlist)) { |
827 | spin_lock(&mmlist_lock); | 837 | spin_lock(&mmlist_lock); |
828 | if (list_empty(&mm->mmlist)) | 838 | if (list_empty(&mm->mmlist)) |
@@ -849,7 +859,6 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
849 | } else | 859 | } else |
850 | dec_mm_counter(mm, file_rss); | 860 | dec_mm_counter(mm, file_rss); |
851 | 861 | ||
852 | |||
853 | page_remove_rmap(page); | 862 | page_remove_rmap(page); |
854 | page_cache_release(page); | 863 | page_cache_release(page); |
855 | 864 | ||
@@ -857,6 +866,27 @@ out_unmap: | |||
857 | pte_unmap_unlock(pte, ptl); | 866 | pte_unmap_unlock(pte, ptl); |
858 | out: | 867 | out: |
859 | return ret; | 868 | return ret; |
869 | |||
870 | out_mlock: | ||
871 | pte_unmap_unlock(pte, ptl); | ||
872 | |||
873 | |||
874 | /* | ||
875 | * We need mmap_sem locking; otherwise the VM_LOCKED check gives | ||
876 | * an unstable, racy result. Plus, we can't wait here because | ||
877 | * we now hold anon_vma->lock or mapping->i_mmap_lock. | ||
878 | * If the trylock fails, the page remains on the evictable LRU and | ||
879 | * vmscan can later retry moving it to the unevictable LRU if the | ||
880 | * page is actually mlocked. | ||
881 | */ | ||
882 | if (down_read_trylock(&vma->vm_mm->mmap_sem)) { | ||
883 | if (vma->vm_flags & VM_LOCKED) { | ||
884 | mlock_vma_page(page); | ||
885 | ret = SWAP_MLOCK; | ||
886 | } | ||
887 | up_read(&vma->vm_mm->mmap_sem); | ||
888 | } | ||
889 | return ret; | ||
860 | } | 890 | } |
861 | 891 | ||
862 | /* | 892 | /* |
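
With the out_mlock path above, SWAP_MLOCK is only reported once the page has really been moved under mlock accounting, and SWAP_FAIL now also covers the swap_duplicate() failure. A hedged sketch of how a caller might interpret the results (illustrative wrapper, not the vmscan code):

static int shrink_one_page(struct page *page)
{
	switch (try_to_unmap(page, TTU_UNMAP)) {
	case SWAP_SUCCESS:
		return 0;		/* all ptes gone, page can be reclaimed */
	case SWAP_AGAIN:
		return -EAGAIN;		/* transient failure, retry later */
	case SWAP_MLOCK:
		return -EPERM;		/* page turned out to be mlocked, leave it */
	case SWAP_FAIL:
	default:
		return -EBUSY;		/* keep and reactivate the page */
	}
}
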
@@ -922,11 +952,10 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
922 | return ret; | 952 | return ret; |
923 | 953 | ||
924 | /* | 954 | /* |
925 | * MLOCK_PAGES => feature is configured. | 955 | * If we can acquire the mmap_sem for read, and vma is VM_LOCKED, |
926 | * if we can acquire the mmap_sem for read, and vma is VM_LOCKED, | ||
927 | * keep the sem while scanning the cluster for mlocking pages. | 956 | * keep the sem while scanning the cluster for mlocking pages. |
928 | */ | 957 | */ |
929 | if (MLOCK_PAGES && down_read_trylock(&vma->vm_mm->mmap_sem)) { | 958 | if (down_read_trylock(&vma->vm_mm->mmap_sem)) { |
930 | locked_vma = (vma->vm_flags & VM_LOCKED); | 959 | locked_vma = (vma->vm_flags & VM_LOCKED); |
931 | if (!locked_vma) | 960 | if (!locked_vma) |
932 | up_read(&vma->vm_mm->mmap_sem); /* don't need it */ | 961 | up_read(&vma->vm_mm->mmap_sem); /* don't need it */ |
@@ -976,29 +1005,11 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
976 | return ret; | 1005 | return ret; |
977 | } | 1006 | } |
978 | 1007 | ||
979 | /* | ||
980 | * common handling for pages mapped in VM_LOCKED vmas | ||
981 | */ | ||
982 | static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma) | ||
983 | { | ||
984 | int mlocked = 0; | ||
985 | |||
986 | if (down_read_trylock(&vma->vm_mm->mmap_sem)) { | ||
987 | if (vma->vm_flags & VM_LOCKED) { | ||
988 | mlock_vma_page(page); | ||
989 | mlocked++; /* really mlocked the page */ | ||
990 | } | ||
991 | up_read(&vma->vm_mm->mmap_sem); | ||
992 | } | ||
993 | return mlocked; | ||
994 | } | ||
995 | |||
996 | /** | 1008 | /** |
997 | * try_to_unmap_anon - unmap or unlock anonymous page using the object-based | 1009 | * try_to_unmap_anon - unmap or unlock anonymous page using the object-based |
998 | * rmap method | 1010 | * rmap method |
999 | * @page: the page to unmap/unlock | 1011 | * @page: the page to unmap/unlock |
1000 | * @unlock: request for unlock rather than unmap [unlikely] | 1012 | * @flags: action and flags |
1001 | * @migration: unmapping for migration - ignored if @unlock | ||
1002 | * | 1013 | * |
1003 | * Find all the mappings of a page using the mapping pointer and the vma chains | 1014 | * Find all the mappings of a page using the mapping pointer and the vma chains |
1004 | * contained in the anon_vma struct it points to. | 1015 | * contained in the anon_vma struct it points to. |
@@ -1014,42 +1025,22 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) | |||
1014 | { | 1025 | { |
1015 | struct anon_vma *anon_vma; | 1026 | struct anon_vma *anon_vma; |
1016 | struct vm_area_struct *vma; | 1027 | struct vm_area_struct *vma; |
1017 | unsigned int mlocked = 0; | ||
1018 | int ret = SWAP_AGAIN; | 1028 | int ret = SWAP_AGAIN; |
1019 | int unlock = TTU_ACTION(flags) == TTU_MUNLOCK; | ||
1020 | |||
1021 | if (MLOCK_PAGES && unlikely(unlock)) | ||
1022 | ret = SWAP_SUCCESS; /* default for try_to_munlock() */ | ||
1023 | 1029 | ||
1024 | anon_vma = page_lock_anon_vma(page); | 1030 | anon_vma = page_lock_anon_vma(page); |
1025 | if (!anon_vma) | 1031 | if (!anon_vma) |
1026 | return ret; | 1032 | return ret; |
1027 | 1033 | ||
1028 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | 1034 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { |
1029 | if (MLOCK_PAGES && unlikely(unlock)) { | 1035 | unsigned long address = vma_address(page, vma); |
1030 | if (!((vma->vm_flags & VM_LOCKED) && | 1036 | if (address == -EFAULT) |
1031 | page_mapped_in_vma(page, vma))) | 1037 | continue; |
1032 | continue; /* must visit all unlocked vmas */ | 1038 | ret = try_to_unmap_one(page, vma, address, flags); |
1033 | ret = SWAP_MLOCK; /* saw at least one mlocked vma */ | 1039 | if (ret != SWAP_AGAIN || !page_mapped(page)) |
1034 | } else { | 1040 | break; |
1035 | ret = try_to_unmap_one(page, vma, flags); | ||
1036 | if (ret == SWAP_FAIL || !page_mapped(page)) | ||
1037 | break; | ||
1038 | } | ||
1039 | if (ret == SWAP_MLOCK) { | ||
1040 | mlocked = try_to_mlock_page(page, vma); | ||
1041 | if (mlocked) | ||
1042 | break; /* stop if actually mlocked page */ | ||
1043 | } | ||
1044 | } | 1041 | } |
1045 | 1042 | ||
1046 | page_unlock_anon_vma(anon_vma); | 1043 | page_unlock_anon_vma(anon_vma); |
1047 | |||
1048 | if (mlocked) | ||
1049 | ret = SWAP_MLOCK; /* actually mlocked the page */ | ||
1050 | else if (ret == SWAP_MLOCK) | ||
1051 | ret = SWAP_AGAIN; /* saw VM_LOCKED vma */ | ||
1052 | |||
1053 | return ret; | 1044 | return ret; |
1054 | } | 1045 | } |
1055 | 1046 | ||
@@ -1079,48 +1070,30 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1079 | unsigned long max_nl_cursor = 0; | 1070 | unsigned long max_nl_cursor = 0; |
1080 | unsigned long max_nl_size = 0; | 1071 | unsigned long max_nl_size = 0; |
1081 | unsigned int mapcount; | 1072 | unsigned int mapcount; |
1082 | unsigned int mlocked = 0; | ||
1083 | int unlock = TTU_ACTION(flags) == TTU_MUNLOCK; | ||
1084 | |||
1085 | if (MLOCK_PAGES && unlikely(unlock)) | ||
1086 | ret = SWAP_SUCCESS; /* default for try_to_munlock() */ | ||
1087 | 1073 | ||
1088 | spin_lock(&mapping->i_mmap_lock); | 1074 | spin_lock(&mapping->i_mmap_lock); |
1089 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 1075 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { |
1090 | if (MLOCK_PAGES && unlikely(unlock)) { | 1076 | unsigned long address = vma_address(page, vma); |
1091 | if (!((vma->vm_flags & VM_LOCKED) && | 1077 | if (address == -EFAULT) |
1092 | page_mapped_in_vma(page, vma))) | 1078 | continue; |
1093 | continue; /* must visit all vmas */ | 1079 | ret = try_to_unmap_one(page, vma, address, flags); |
1094 | ret = SWAP_MLOCK; | 1080 | if (ret != SWAP_AGAIN || !page_mapped(page)) |
1095 | } else { | 1081 | goto out; |
1096 | ret = try_to_unmap_one(page, vma, flags); | ||
1097 | if (ret == SWAP_FAIL || !page_mapped(page)) | ||
1098 | goto out; | ||
1099 | } | ||
1100 | if (ret == SWAP_MLOCK) { | ||
1101 | mlocked = try_to_mlock_page(page, vma); | ||
1102 | if (mlocked) | ||
1103 | break; /* stop if actually mlocked page */ | ||
1104 | } | ||
1105 | } | 1082 | } |
1106 | 1083 | ||
1107 | if (mlocked) | 1084 | if (list_empty(&mapping->i_mmap_nonlinear)) |
1108 | goto out; | 1085 | goto out; |
1109 | 1086 | ||
1110 | if (list_empty(&mapping->i_mmap_nonlinear)) | 1087 | /* |
1088 | * We don't bother to try to find the munlocked page in nonlinears. | ||
1089 | * It's costly. Instead, later, page reclaim logic may call | ||
1090 | * try_to_unmap(TTU_MUNLOCK) and recover PG_mlocked lazily. | ||
1091 | */ | ||
1092 | if (TTU_ACTION(flags) == TTU_MUNLOCK) | ||
1111 | goto out; | 1093 | goto out; |
1112 | 1094 | ||
1113 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, | 1095 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, |
1114 | shared.vm_set.list) { | 1096 | shared.vm_set.list) { |
1115 | if (MLOCK_PAGES && unlikely(unlock)) { | ||
1116 | if (!(vma->vm_flags & VM_LOCKED)) | ||
1117 | continue; /* must visit all vmas */ | ||
1118 | ret = SWAP_MLOCK; /* leave mlocked == 0 */ | ||
1119 | goto out; /* no need to look further */ | ||
1120 | } | ||
1121 | if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) && | ||
1122 | (vma->vm_flags & VM_LOCKED)) | ||
1123 | continue; | ||
1124 | cursor = (unsigned long) vma->vm_private_data; | 1097 | cursor = (unsigned long) vma->vm_private_data; |
1125 | if (cursor > max_nl_cursor) | 1098 | if (cursor > max_nl_cursor) |
1126 | max_nl_cursor = cursor; | 1099 | max_nl_cursor = cursor; |
@@ -1153,16 +1126,12 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1153 | do { | 1126 | do { |
1154 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, | 1127 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, |
1155 | shared.vm_set.list) { | 1128 | shared.vm_set.list) { |
1156 | if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) && | ||
1157 | (vma->vm_flags & VM_LOCKED)) | ||
1158 | continue; | ||
1159 | cursor = (unsigned long) vma->vm_private_data; | 1129 | cursor = (unsigned long) vma->vm_private_data; |
1160 | while ( cursor < max_nl_cursor && | 1130 | while ( cursor < max_nl_cursor && |
1161 | cursor < vma->vm_end - vma->vm_start) { | 1131 | cursor < vma->vm_end - vma->vm_start) { |
1162 | ret = try_to_unmap_cluster(cursor, &mapcount, | 1132 | if (try_to_unmap_cluster(cursor, &mapcount, |
1163 | vma, page); | 1133 | vma, page) == SWAP_MLOCK) |
1164 | if (ret == SWAP_MLOCK) | 1134 | ret = SWAP_MLOCK; |
1165 | mlocked = 2; /* to return below */ | ||
1166 | cursor += CLUSTER_SIZE; | 1135 | cursor += CLUSTER_SIZE; |
1167 | vma->vm_private_data = (void *) cursor; | 1136 | vma->vm_private_data = (void *) cursor; |
1168 | if ((int)mapcount <= 0) | 1137 | if ((int)mapcount <= 0) |
@@ -1183,10 +1152,6 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1183 | vma->vm_private_data = NULL; | 1152 | vma->vm_private_data = NULL; |
1184 | out: | 1153 | out: |
1185 | spin_unlock(&mapping->i_mmap_lock); | 1154 | spin_unlock(&mapping->i_mmap_lock); |
1186 | if (mlocked) | ||
1187 | ret = SWAP_MLOCK; /* actually mlocked the page */ | ||
1188 | else if (ret == SWAP_MLOCK) | ||
1189 | ret = SWAP_AGAIN; /* saw VM_LOCKED vma */ | ||
1190 | return ret; | 1155 | return ret; |
1191 | } | 1156 | } |
1192 | 1157 | ||
@@ -1210,7 +1175,9 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) | |||
1210 | 1175 | ||
1211 | BUG_ON(!PageLocked(page)); | 1176 | BUG_ON(!PageLocked(page)); |
1212 | 1177 | ||
1213 | if (PageAnon(page)) | 1178 | if (unlikely(PageKsm(page))) |
1179 | ret = try_to_unmap_ksm(page, flags); | ||
1180 | else if (PageAnon(page)) | ||
1214 | ret = try_to_unmap_anon(page, flags); | 1181 | ret = try_to_unmap_anon(page, flags); |
1215 | else | 1182 | else |
1216 | ret = try_to_unmap_file(page, flags); | 1183 | ret = try_to_unmap_file(page, flags); |
@@ -1229,17 +1196,98 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) | |||
1229 | * | 1196 | * |
1230 | * Return values are: | 1197 | * Return values are: |
1231 | * | 1198 | * |
1232 | * SWAP_SUCCESS - no vma's holding page mlocked. | 1199 | * SWAP_AGAIN - no vma is holding page mlocked, or, |
1233 | * SWAP_AGAIN - page mapped in mlocked vma -- couldn't acquire mmap sem | 1200 | * SWAP_AGAIN - page mapped in mlocked vma -- couldn't acquire mmap sem |
1201 | * SWAP_FAIL - page cannot be located at present | ||
1234 | * SWAP_MLOCK - page is now mlocked. | 1202 | * SWAP_MLOCK - page is now mlocked. |
1235 | */ | 1203 | */ |
1236 | int try_to_munlock(struct page *page) | 1204 | int try_to_munlock(struct page *page) |
1237 | { | 1205 | { |
1238 | VM_BUG_ON(!PageLocked(page) || PageLRU(page)); | 1206 | VM_BUG_ON(!PageLocked(page) || PageLRU(page)); |
1239 | 1207 | ||
1240 | if (PageAnon(page)) | 1208 | if (unlikely(PageKsm(page))) |
1209 | return try_to_unmap_ksm(page, TTU_MUNLOCK); | ||
1210 | else if (PageAnon(page)) | ||
1241 | return try_to_unmap_anon(page, TTU_MUNLOCK); | 1211 | return try_to_unmap_anon(page, TTU_MUNLOCK); |
1242 | else | 1212 | else |
1243 | return try_to_unmap_file(page, TTU_MUNLOCK); | 1213 | return try_to_unmap_file(page, TTU_MUNLOCK); |
1244 | } | 1214 | } |
1245 | 1215 | ||
1216 | #ifdef CONFIG_MIGRATION | ||
1217 | /* | ||
1218 | * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file(): | ||
1219 | * Called by migrate.c to remove migration ptes, but might be used more later. | ||
1220 | */ | ||
1221 | static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, | ||
1222 | struct vm_area_struct *, unsigned long, void *), void *arg) | ||
1223 | { | ||
1224 | struct anon_vma *anon_vma; | ||
1225 | struct vm_area_struct *vma; | ||
1226 | int ret = SWAP_AGAIN; | ||
1227 | |||
1228 | /* | ||
1229 | * Note: remove_migration_ptes() cannot use page_lock_anon_vma() | ||
1230 | * because that depends on page_mapped(); but not all its usages | ||
1231 | * are holding mmap_sem, which also gave the necessary guarantee | ||
1232 | * (that this anon_vma's slab has not already been destroyed). | ||
1233 | * This needs to be reviewed later: avoiding page_lock_anon_vma() | ||
1234 | * is risky, and currently limits the usefulness of rmap_walk(). | ||
1235 | */ | ||
1236 | anon_vma = page_anon_vma(page); | ||
1237 | if (!anon_vma) | ||
1238 | return ret; | ||
1239 | spin_lock(&anon_vma->lock); | ||
1240 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | ||
1241 | unsigned long address = vma_address(page, vma); | ||
1242 | if (address == -EFAULT) | ||
1243 | continue; | ||
1244 | ret = rmap_one(page, vma, address, arg); | ||
1245 | if (ret != SWAP_AGAIN) | ||
1246 | break; | ||
1247 | } | ||
1248 | spin_unlock(&anon_vma->lock); | ||
1249 | return ret; | ||
1250 | } | ||
1251 | |||
1252 | static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, | ||
1253 | struct vm_area_struct *, unsigned long, void *), void *arg) | ||
1254 | { | ||
1255 | struct address_space *mapping = page->mapping; | ||
1256 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
1257 | struct vm_area_struct *vma; | ||
1258 | struct prio_tree_iter iter; | ||
1259 | int ret = SWAP_AGAIN; | ||
1260 | |||
1261 | if (!mapping) | ||
1262 | return ret; | ||
1263 | spin_lock(&mapping->i_mmap_lock); | ||
1264 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | ||
1265 | unsigned long address = vma_address(page, vma); | ||
1266 | if (address == -EFAULT) | ||
1267 | continue; | ||
1268 | ret = rmap_one(page, vma, address, arg); | ||
1269 | if (ret != SWAP_AGAIN) | ||
1270 | break; | ||
1271 | } | ||
1272 | /* | ||
1273 | * No nonlinear handling: being always shared, nonlinear vmas | ||
1274 | * never contain migration ptes. Decide what to do about this | ||
1275 | * limitation to linear when we need rmap_walk() on nonlinear. | ||
1276 | */ | ||
1277 | spin_unlock(&mapping->i_mmap_lock); | ||
1278 | return ret; | ||
1279 | } | ||
1280 | |||
1281 | int rmap_walk(struct page *page, int (*rmap_one)(struct page *, | ||
1282 | struct vm_area_struct *, unsigned long, void *), void *arg) | ||
1283 | { | ||
1284 | VM_BUG_ON(!PageLocked(page)); | ||
1285 | |||
1286 | if (unlikely(PageKsm(page))) | ||
1287 | return rmap_walk_ksm(page, rmap_one, arg); | ||
1288 | else if (PageAnon(page)) | ||
1289 | return rmap_walk_anon(page, rmap_one, arg); | ||
1290 | else | ||
1291 | return rmap_walk_file(page, rmap_one, arg); | ||
1292 | } | ||
1293 | #endif /* CONFIG_MIGRATION */ | ||
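
A hypothetical rmap_walk() user (assuming CONFIG_MIGRATION, since the function is only built there). The callback signature matches the rmap_one type introduced above, and SWAP_AGAIN from the callback means "keep walking"; migrate.c's migration-pte removal is the intended first user per the comment, while the counter below is purely for illustration:

static int count_one_mapping(struct page *page, struct vm_area_struct *vma,
			     unsigned long address, void *arg)
{
	int *nr = arg;

	(*nr)++;
	return SWAP_AGAIN;	/* continue to the next vma */
}

static int count_mappings(struct page *page)
{
	int nr = 0;

	/* page must be locked, as asserted by rmap_walk() */
	rmap_walk(page, count_one_mapping, &nr);
	return nr;
}
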
diff --git a/mm/shmem.c b/mm/shmem.c index 356dd99566ec..eef4ebea5158 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -29,7 +29,6 @@ | |||
29 | #include <linux/mm.h> | 29 | #include <linux/mm.h> |
30 | #include <linux/module.h> | 30 | #include <linux/module.h> |
31 | #include <linux/swap.h> | 31 | #include <linux/swap.h> |
32 | #include <linux/ima.h> | ||
33 | 32 | ||
34 | static struct vfsmount *shm_mnt; | 33 | static struct vfsmount *shm_mnt; |
35 | 34 | ||
@@ -42,6 +41,7 @@ static struct vfsmount *shm_mnt; | |||
42 | 41 | ||
43 | #include <linux/xattr.h> | 42 | #include <linux/xattr.h> |
44 | #include <linux/exportfs.h> | 43 | #include <linux/exportfs.h> |
44 | #include <linux/posix_acl.h> | ||
45 | #include <linux/generic_acl.h> | 45 | #include <linux/generic_acl.h> |
46 | #include <linux/mman.h> | 46 | #include <linux/mman.h> |
47 | #include <linux/string.h> | 47 | #include <linux/string.h> |
@@ -810,7 +810,7 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) | |||
810 | error = inode_setattr(inode, attr); | 810 | error = inode_setattr(inode, attr); |
811 | #ifdef CONFIG_TMPFS_POSIX_ACL | 811 | #ifdef CONFIG_TMPFS_POSIX_ACL |
812 | if (!error && (attr->ia_valid & ATTR_MODE)) | 812 | if (!error && (attr->ia_valid & ATTR_MODE)) |
813 | error = generic_acl_chmod(inode, &shmem_acl_ops); | 813 | error = generic_acl_chmod(inode); |
814 | #endif | 814 | #endif |
815 | if (page) | 815 | if (page) |
816 | page_cache_release(page); | 816 | page_cache_release(page); |
@@ -1017,7 +1017,14 @@ int shmem_unuse(swp_entry_t entry, struct page *page) | |||
1017 | goto out; | 1017 | goto out; |
1018 | } | 1018 | } |
1019 | mutex_unlock(&shmem_swaplist_mutex); | 1019 | mutex_unlock(&shmem_swaplist_mutex); |
1020 | out: return found; /* 0 or 1 or -ENOMEM */ | 1020 | /* |
1021 | * Can some race bring us here? We've been holding page lock, | ||
1022 | * so I think not; but would rather try again later than BUG() | ||
1023 | */ | ||
1024 | unlock_page(page); | ||
1025 | page_cache_release(page); | ||
1026 | out: | ||
1027 | return (found < 0) ? found : 0; | ||
1021 | } | 1028 | } |
1022 | 1029 | ||
1023 | /* | 1030 | /* |
@@ -1080,7 +1087,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) | |||
1080 | else | 1087 | else |
1081 | inode = NULL; | 1088 | inode = NULL; |
1082 | spin_unlock(&info->lock); | 1089 | spin_unlock(&info->lock); |
1083 | swap_duplicate(swap); | 1090 | swap_shmem_alloc(swap); |
1084 | BUG_ON(page_mapped(page)); | 1091 | BUG_ON(page_mapped(page)); |
1085 | page_cache_release(page); /* pagecache ref */ | 1092 | page_cache_release(page); /* pagecache ref */ |
1086 | swap_writepage(page, wbc); | 1093 | swap_writepage(page, wbc); |
@@ -1817,11 +1824,15 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) | |||
1817 | return error; | 1824 | return error; |
1818 | } | 1825 | } |
1819 | } | 1826 | } |
1820 | error = shmem_acl_init(inode, dir); | 1827 | #ifdef CONFIG_TMPFS_POSIX_ACL |
1828 | error = generic_acl_init(inode, dir); | ||
1821 | if (error) { | 1829 | if (error) { |
1822 | iput(inode); | 1830 | iput(inode); |
1823 | return error; | 1831 | return error; |
1824 | } | 1832 | } |
1833 | #else | ||
1834 | error = 0; | ||
1835 | #endif | ||
1825 | if (dir->i_mode & S_ISGID) { | 1836 | if (dir->i_mode & S_ISGID) { |
1826 | inode->i_gid = dir->i_gid; | 1837 | inode->i_gid = dir->i_gid; |
1827 | if (S_ISDIR(mode)) | 1838 | if (S_ISDIR(mode)) |
@@ -2036,27 +2047,28 @@ static const struct inode_operations shmem_symlink_inode_operations = { | |||
2036 | * filesystem level, though. | 2047 | * filesystem level, though. |
2037 | */ | 2048 | */ |
2038 | 2049 | ||
2039 | static size_t shmem_xattr_security_list(struct inode *inode, char *list, | 2050 | static size_t shmem_xattr_security_list(struct dentry *dentry, char *list, |
2040 | size_t list_len, const char *name, | 2051 | size_t list_len, const char *name, |
2041 | size_t name_len) | 2052 | size_t name_len, int handler_flags) |
2042 | { | 2053 | { |
2043 | return security_inode_listsecurity(inode, list, list_len); | 2054 | return security_inode_listsecurity(dentry->d_inode, list, list_len); |
2044 | } | 2055 | } |
2045 | 2056 | ||
2046 | static int shmem_xattr_security_get(struct inode *inode, const char *name, | 2057 | static int shmem_xattr_security_get(struct dentry *dentry, const char *name, |
2047 | void *buffer, size_t size) | 2058 | void *buffer, size_t size, int handler_flags) |
2048 | { | 2059 | { |
2049 | if (strcmp(name, "") == 0) | 2060 | if (strcmp(name, "") == 0) |
2050 | return -EINVAL; | 2061 | return -EINVAL; |
2051 | return xattr_getsecurity(inode, name, buffer, size); | 2062 | return xattr_getsecurity(dentry->d_inode, name, buffer, size); |
2052 | } | 2063 | } |
2053 | 2064 | ||
2054 | static int shmem_xattr_security_set(struct inode *inode, const char *name, | 2065 | static int shmem_xattr_security_set(struct dentry *dentry, const char *name, |
2055 | const void *value, size_t size, int flags) | 2066 | const void *value, size_t size, int flags, int handler_flags) |
2056 | { | 2067 | { |
2057 | if (strcmp(name, "") == 0) | 2068 | if (strcmp(name, "") == 0) |
2058 | return -EINVAL; | 2069 | return -EINVAL; |
2059 | return security_inode_setsecurity(inode, name, value, size, flags); | 2070 | return security_inode_setsecurity(dentry->d_inode, name, value, |
2071 | size, flags); | ||
2060 | } | 2072 | } |
2061 | 2073 | ||
2062 | static struct xattr_handler shmem_xattr_security_handler = { | 2074 | static struct xattr_handler shmem_xattr_security_handler = { |
@@ -2067,8 +2079,8 @@ static struct xattr_handler shmem_xattr_security_handler = { | |||
2067 | }; | 2079 | }; |
2068 | 2080 | ||
2069 | static struct xattr_handler *shmem_xattr_handlers[] = { | 2081 | static struct xattr_handler *shmem_xattr_handlers[] = { |
2070 | &shmem_xattr_acl_access_handler, | 2082 | &generic_acl_access_handler, |
2071 | &shmem_xattr_acl_default_handler, | 2083 | &generic_acl_default_handler, |
2072 | &shmem_xattr_security_handler, | 2084 | &shmem_xattr_security_handler, |
2073 | NULL | 2085 | NULL |
2074 | }; | 2086 | }; |
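These hunks follow the tree-wide change to the struct xattr_handler callbacks in this merge: the methods now receive a struct dentry plus a handler_flags argument instead of a bare inode, and shmem simply reaches the inode through dentry->d_inode. The sketch below mocks that callback shape in user space; the structs and the handler table are simplified stand-ins, not the kernel xattr API.

```c
/* User-space mock of the handler-signature change: callbacks now get a
 * dentry (plus flags) and reach the inode through it.  These structs
 * are simplified stand-ins, not the kernel's struct xattr_handler. */
#include <errno.h>
#include <stdio.h>
#include <string.h>

struct inode  { const char *security_label; };
struct dentry { struct inode *d_inode; };

struct xattr_handler {
    const char *prefix;
    int (*get)(struct dentry *dentry, const char *name,
               void *buffer, size_t size, int handler_flags);
};

static int security_get(struct dentry *dentry, const char *name,
                        void *buffer, size_t size, int handler_flags)
{
    const char *label = dentry->d_inode->security_label;

    if (strcmp(name, "") == 0)
        return -EINVAL;                  /* no attribute suffix given */
    snprintf(buffer, size, "%s", label); /* copy out the "xattr" value */
    return (int)strlen(label);
}

static const struct xattr_handler security_handler = {
    .prefix = "security.",
    .get    = security_get,
};

int main(void)
{
    struct inode  inode  = { "unconfined_u:object_r:tmpfs_t" };
    struct dentry dentry = { &inode };
    char buf[64];

    security_handler.get(&dentry, "selinux", buf, sizeof(buf), 0);
    printf("security.selinux = %s\n", buf);
    return 0;
}
```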
@@ -2447,7 +2459,7 @@ static const struct inode_operations shmem_inode_operations = { | |||
2447 | .getxattr = generic_getxattr, | 2459 | .getxattr = generic_getxattr, |
2448 | .listxattr = generic_listxattr, | 2460 | .listxattr = generic_listxattr, |
2449 | .removexattr = generic_removexattr, | 2461 | .removexattr = generic_removexattr, |
2450 | .check_acl = shmem_check_acl, | 2462 | .check_acl = generic_check_acl, |
2451 | #endif | 2463 | #endif |
2452 | 2464 | ||
2453 | }; | 2465 | }; |
@@ -2470,7 +2482,7 @@ static const struct inode_operations shmem_dir_inode_operations = { | |||
2470 | .getxattr = generic_getxattr, | 2482 | .getxattr = generic_getxattr, |
2471 | .listxattr = generic_listxattr, | 2483 | .listxattr = generic_listxattr, |
2472 | .removexattr = generic_removexattr, | 2484 | .removexattr = generic_removexattr, |
2473 | .check_acl = shmem_check_acl, | 2485 | .check_acl = generic_check_acl, |
2474 | #endif | 2486 | #endif |
2475 | }; | 2487 | }; |
2476 | 2488 | ||
@@ -2481,7 +2493,7 @@ static const struct inode_operations shmem_special_inode_operations = { | |||
2481 | .getxattr = generic_getxattr, | 2493 | .getxattr = generic_getxattr, |
2482 | .listxattr = generic_listxattr, | 2494 | .listxattr = generic_listxattr, |
2483 | .removexattr = generic_removexattr, | 2495 | .removexattr = generic_removexattr, |
2484 | .check_acl = shmem_check_acl, | 2496 | .check_acl = generic_check_acl, |
2485 | #endif | 2497 | #endif |
2486 | }; | 2498 | }; |
2487 | 2499 | ||
@@ -2619,7 +2631,8 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags | |||
2619 | int error; | 2631 | int error; |
2620 | struct file *file; | 2632 | struct file *file; |
2621 | struct inode *inode; | 2633 | struct inode *inode; |
2622 | struct dentry *dentry, *root; | 2634 | struct path path; |
2635 | struct dentry *root; | ||
2623 | struct qstr this; | 2636 | struct qstr this; |
2624 | 2637 | ||
2625 | if (IS_ERR(shm_mnt)) | 2638 | if (IS_ERR(shm_mnt)) |
@@ -2636,38 +2649,35 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags | |||
2636 | this.len = strlen(name); | 2649 | this.len = strlen(name); |
2637 | this.hash = 0; /* will go */ | 2650 | this.hash = 0; /* will go */ |
2638 | root = shm_mnt->mnt_root; | 2651 | root = shm_mnt->mnt_root; |
2639 | dentry = d_alloc(root, &this); | 2652 | path.dentry = d_alloc(root, &this); |
2640 | if (!dentry) | 2653 | if (!path.dentry) |
2641 | goto put_memory; | 2654 | goto put_memory; |
2642 | 2655 | path.mnt = mntget(shm_mnt); | |
2643 | error = -ENFILE; | ||
2644 | file = get_empty_filp(); | ||
2645 | if (!file) | ||
2646 | goto put_dentry; | ||
2647 | 2656 | ||
2648 | error = -ENOSPC; | 2657 | error = -ENOSPC; |
2649 | inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0, flags); | 2658 | inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0, flags); |
2650 | if (!inode) | 2659 | if (!inode) |
2651 | goto close_file; | 2660 | goto put_dentry; |
2652 | 2661 | ||
2653 | d_instantiate(dentry, inode); | 2662 | d_instantiate(path.dentry, inode); |
2654 | inode->i_size = size; | 2663 | inode->i_size = size; |
2655 | inode->i_nlink = 0; /* It is unlinked */ | 2664 | inode->i_nlink = 0; /* It is unlinked */ |
2656 | init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ, | ||
2657 | &shmem_file_operations); | ||
2658 | |||
2659 | #ifndef CONFIG_MMU | 2665 | #ifndef CONFIG_MMU |
2660 | error = ramfs_nommu_expand_for_mapping(inode, size); | 2666 | error = ramfs_nommu_expand_for_mapping(inode, size); |
2661 | if (error) | 2667 | if (error) |
2662 | goto close_file; | 2668 | goto put_dentry; |
2663 | #endif | 2669 | #endif |
2664 | ima_counts_get(file); | 2670 | |
2671 | error = -ENFILE; | ||
2672 | file = alloc_file(&path, FMODE_WRITE | FMODE_READ, | ||
2673 | &shmem_file_operations); | ||
2674 | if (!file) | ||
2675 | goto put_dentry; | ||
2676 | |||
2665 | return file; | 2677 | return file; |
2666 | 2678 | ||
2667 | close_file: | ||
2668 | put_filp(file); | ||
2669 | put_dentry: | 2679 | put_dentry: |
2670 | dput(dentry); | 2680 | path_put(&path); |
2671 | put_memory: | 2681 | put_memory: |
2672 | shmem_unacct_size(flags, size); | 2682 | shmem_unacct_size(flags, size); |
2673 | return ERR_PTR(error); | 2683 | return ERR_PTR(error); |
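The rewritten shmem_file_setup() gathers the dentry and vfsmount into a struct path, creates the inode before the struct file, and funnels every failure through labels so that path_put() drops both path references at once. A stand-alone model of that acquire-in-order / unwind-with-goto shape follows; all names and the allocations are stand-ins, and the real function returns ERR_PTR(error) rather than NULL.

```c
/* Stand-alone model of the error-unwinding order used above: take the
 * path (dentry + mount) first, then the inode, then the file, and make
 * each failure label release exactly what has been taken so far. */
#include <stdio.h>
#include <stdlib.h>

struct path { void *dentry, *mnt; };
struct file { struct path path; void *inode; };

static struct file *setup(size_t size, int fail_at)
{
    struct path path;
    struct file *file;
    void *inode;

    path.dentry = malloc(16);
    if (!path.dentry)
        return NULL;
    path.mnt = malloc(16);          /* both references now live in 'path' */

    inode = (fail_at == 1) ? NULL : malloc(size);
    if (!inode)
        goto put_path;              /* -ENOSPC in the real function */

    file = (fail_at == 2) ? NULL : calloc(1, sizeof(*file));
    if (!file)
        goto put_inode;             /* -ENFILE in the real function */

    file->path = path;              /* alloc_file() analogue */
    file->inode = inode;
    return file;

put_inode:
    free(inode);
put_path:                           /* path_put() analogue: both refs at once */
    free(path.mnt);
    free(path.dentry);
    return NULL;
}

int main(void)
{
    struct file *f = setup(64, 0);

    printf("ok=%d enospc=%d enfile=%d\n",
           f != NULL, setup(64, 1) != NULL, setup(64, 2) != NULL);
    if (f) {
        free(f->inode);
        free(f->path.mnt);
        free(f->path.dentry);
        free(f);
    }
    return 0;
}
```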
diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c deleted file mode 100644 index df2c87fdae50..000000000000 --- a/mm/shmem_acl.c +++ /dev/null | |||
@@ -1,171 +0,0 @@ | |||
1 | /* | ||
2 | * mm/shmem_acl.c | ||
3 | * | ||
4 | * (C) 2005 Andreas Gruenbacher <agruen@suse.de> | ||
5 | * | ||
6 | * This file is released under the GPL. | ||
7 | */ | ||
8 | |||
9 | #include <linux/fs.h> | ||
10 | #include <linux/shmem_fs.h> | ||
11 | #include <linux/xattr.h> | ||
12 | #include <linux/generic_acl.h> | ||
13 | |||
14 | /** | ||
15 | * shmem_get_acl - generic_acl_operations->getacl() operation | ||
16 | */ | ||
17 | static struct posix_acl * | ||
18 | shmem_get_acl(struct inode *inode, int type) | ||
19 | { | ||
20 | struct posix_acl *acl = NULL; | ||
21 | |||
22 | spin_lock(&inode->i_lock); | ||
23 | switch(type) { | ||
24 | case ACL_TYPE_ACCESS: | ||
25 | acl = posix_acl_dup(inode->i_acl); | ||
26 | break; | ||
27 | |||
28 | case ACL_TYPE_DEFAULT: | ||
29 | acl = posix_acl_dup(inode->i_default_acl); | ||
30 | break; | ||
31 | } | ||
32 | spin_unlock(&inode->i_lock); | ||
33 | |||
34 | return acl; | ||
35 | } | ||
36 | |||
37 | /** | ||
38 | * shmem_set_acl - generic_acl_operations->setacl() operation | ||
39 | */ | ||
40 | static void | ||
41 | shmem_set_acl(struct inode *inode, int type, struct posix_acl *acl) | ||
42 | { | ||
43 | struct posix_acl *free = NULL; | ||
44 | |||
45 | spin_lock(&inode->i_lock); | ||
46 | switch(type) { | ||
47 | case ACL_TYPE_ACCESS: | ||
48 | free = inode->i_acl; | ||
49 | inode->i_acl = posix_acl_dup(acl); | ||
50 | break; | ||
51 | |||
52 | case ACL_TYPE_DEFAULT: | ||
53 | free = inode->i_default_acl; | ||
54 | inode->i_default_acl = posix_acl_dup(acl); | ||
55 | break; | ||
56 | } | ||
57 | spin_unlock(&inode->i_lock); | ||
58 | posix_acl_release(free); | ||
59 | } | ||
60 | |||
61 | struct generic_acl_operations shmem_acl_ops = { | ||
62 | .getacl = shmem_get_acl, | ||
63 | .setacl = shmem_set_acl, | ||
64 | }; | ||
65 | |||
66 | /** | ||
67 | * shmem_list_acl_access, shmem_get_acl_access, shmem_set_acl_access, | ||
68 | * shmem_xattr_acl_access_handler - plumbing code to implement the | ||
69 | * system.posix_acl_access xattr using the generic acl functions. | ||
70 | */ | ||
71 | |||
72 | static size_t | ||
73 | shmem_list_acl_access(struct inode *inode, char *list, size_t list_size, | ||
74 | const char *name, size_t name_len) | ||
75 | { | ||
76 | return generic_acl_list(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, | ||
77 | list, list_size); | ||
78 | } | ||
79 | |||
80 | static int | ||
81 | shmem_get_acl_access(struct inode *inode, const char *name, void *buffer, | ||
82 | size_t size) | ||
83 | { | ||
84 | if (strcmp(name, "") != 0) | ||
85 | return -EINVAL; | ||
86 | return generic_acl_get(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, buffer, | ||
87 | size); | ||
88 | } | ||
89 | |||
90 | static int | ||
91 | shmem_set_acl_access(struct inode *inode, const char *name, const void *value, | ||
92 | size_t size, int flags) | ||
93 | { | ||
94 | if (strcmp(name, "") != 0) | ||
95 | return -EINVAL; | ||
96 | return generic_acl_set(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, value, | ||
97 | size); | ||
98 | } | ||
99 | |||
100 | struct xattr_handler shmem_xattr_acl_access_handler = { | ||
101 | .prefix = POSIX_ACL_XATTR_ACCESS, | ||
102 | .list = shmem_list_acl_access, | ||
103 | .get = shmem_get_acl_access, | ||
104 | .set = shmem_set_acl_access, | ||
105 | }; | ||
106 | |||
107 | /** | ||
108 | * shmem_list_acl_default, shmem_get_acl_default, shmem_set_acl_default, | ||
109 | * shmem_xattr_acl_default_handler - plumbing code to implement the | ||
110 | * system.posix_acl_default xattr using the generic acl functions. | ||
111 | */ | ||
112 | |||
113 | static size_t | ||
114 | shmem_list_acl_default(struct inode *inode, char *list, size_t list_size, | ||
115 | const char *name, size_t name_len) | ||
116 | { | ||
117 | return generic_acl_list(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, | ||
118 | list, list_size); | ||
119 | } | ||
120 | |||
121 | static int | ||
122 | shmem_get_acl_default(struct inode *inode, const char *name, void *buffer, | ||
123 | size_t size) | ||
124 | { | ||
125 | if (strcmp(name, "") != 0) | ||
126 | return -EINVAL; | ||
127 | return generic_acl_get(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, buffer, | ||
128 | size); | ||
129 | } | ||
130 | |||
131 | static int | ||
132 | shmem_set_acl_default(struct inode *inode, const char *name, const void *value, | ||
133 | size_t size, int flags) | ||
134 | { | ||
135 | if (strcmp(name, "") != 0) | ||
136 | return -EINVAL; | ||
137 | return generic_acl_set(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, value, | ||
138 | size); | ||
139 | } | ||
140 | |||
141 | struct xattr_handler shmem_xattr_acl_default_handler = { | ||
142 | .prefix = POSIX_ACL_XATTR_DEFAULT, | ||
143 | .list = shmem_list_acl_default, | ||
144 | .get = shmem_get_acl_default, | ||
145 | .set = shmem_set_acl_default, | ||
146 | }; | ||
147 | |||
148 | /** | ||
149 | * shmem_acl_init - Initialize the acl(s) of a new inode | ||
149 | * shmem_acl_init - Initialize the acl(s) of a new inode | ||
150 | */ | ||
151 | int | ||
152 | shmem_acl_init(struct inode *inode, struct inode *dir) | ||
153 | { | ||
154 | return generic_acl_init(inode, dir, &shmem_acl_ops); | ||
155 | } | ||
156 | |||
157 | /** | ||
158 | * shmem_check_acl - check_acl() callback for generic_permission() | ||
159 | */ | ||
160 | int | ||
161 | shmem_check_acl(struct inode *inode, int mask) | ||
162 | { | ||
163 | struct posix_acl *acl = shmem_get_acl(inode, ACL_TYPE_ACCESS); | ||
164 | |||
165 | if (acl) { | ||
166 | int error = posix_acl_permission(inode, acl, mask); | ||
167 | posix_acl_release(acl); | ||
168 | return error; | ||
169 | } | ||
170 | return -EAGAIN; | ||
171 | } | ||
@@ -490,7 +490,7 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) | |||
490 | 490 | ||
491 | #endif | 491 | #endif |
492 | 492 | ||
493 | #ifdef CONFIG_KMEMTRACE | 493 | #ifdef CONFIG_TRACING |
494 | size_t slab_buffer_size(struct kmem_cache *cachep) | 494 | size_t slab_buffer_size(struct kmem_cache *cachep) |
495 | { | 495 | { |
496 | return cachep->buffer_size; | 496 | return cachep->buffer_size; |
@@ -604,6 +604,26 @@ static struct kmem_cache cache_cache = { | |||
604 | 604 | ||
605 | #define BAD_ALIEN_MAGIC 0x01020304ul | 605 | #define BAD_ALIEN_MAGIC 0x01020304ul |
606 | 606 | ||
607 | /* | ||
608 | * chicken and egg problem: delay the per-cpu array allocation | ||
609 | * until the general caches are up. | ||
610 | */ | ||
611 | static enum { | ||
612 | NONE, | ||
613 | PARTIAL_AC, | ||
614 | PARTIAL_L3, | ||
615 | EARLY, | ||
616 | FULL | ||
617 | } g_cpucache_up; | ||
618 | |||
619 | /* | ||
620 | * used by boot code to determine if it can use slab based allocator | ||
621 | */ | ||
622 | int slab_is_available(void) | ||
623 | { | ||
624 | return g_cpucache_up >= EARLY; | ||
625 | } | ||
626 | |||
607 | #ifdef CONFIG_LOCKDEP | 627 | #ifdef CONFIG_LOCKDEP |
608 | 628 | ||
609 | /* | 629 | /* |
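This hunk moves the g_cpucache_up state and slab_is_available() above the lockdep code so that init_node_lock_keys() can test whether the caches are FULL before annotating them; the helper itself exists so early boot code can decide whether the slab allocator is usable yet. A stand-alone model of that staged bring-up flag (the enum values and the fake allocators are stand-ins):

```c
/* Stand-alone model of the staged bring-up flag: callers may only use
 * the "real" allocator once the state has reached EARLY.  The enum and
 * the fake allocators below are stand-ins for the kernel's versions. */
#include <stdio.h>
#include <stdlib.h>

static enum { NONE, PARTIAL_AC, PARTIAL_L3, EARLY, FULL } g_cpucache_up;

static int slab_is_available(void)
{
    return g_cpucache_up >= EARLY;
}

static void *early_alloc(size_t size)   /* bootmem stand-in */
{
    printf("bootmem alloc of %zu bytes\n", size);
    return calloc(1, size);
}

static void *alloc(size_t size)
{
    if (!slab_is_available())
        return early_alloc(size);
    printf("slab alloc of %zu bytes\n", size);
    return malloc(size);
}

int main(void)
{
    free(alloc(32));            /* boot-time path */
    g_cpucache_up = EARLY;      /* kmem_cache_init() has run far enough */
    free(alloc(32));            /* normal path */
    return 0;
}
```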
@@ -620,40 +640,52 @@ static struct kmem_cache cache_cache = { | |||
620 | static struct lock_class_key on_slab_l3_key; | 640 | static struct lock_class_key on_slab_l3_key; |
621 | static struct lock_class_key on_slab_alc_key; | 641 | static struct lock_class_key on_slab_alc_key; |
622 | 642 | ||
623 | static inline void init_lock_keys(void) | 643 | static void init_node_lock_keys(int q) |
624 | |||
625 | { | 644 | { |
626 | int q; | ||
627 | struct cache_sizes *s = malloc_sizes; | 645 | struct cache_sizes *s = malloc_sizes; |
628 | 646 | ||
629 | while (s->cs_size != ULONG_MAX) { | 647 | if (g_cpucache_up != FULL) |
630 | for_each_node(q) { | 648 | return; |
631 | struct array_cache **alc; | 649 | |
632 | int r; | 650 | for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) { |
633 | struct kmem_list3 *l3 = s->cs_cachep->nodelists[q]; | 651 | struct array_cache **alc; |
634 | if (!l3 || OFF_SLAB(s->cs_cachep)) | 652 | struct kmem_list3 *l3; |
635 | continue; | 653 | int r; |
636 | lockdep_set_class(&l3->list_lock, &on_slab_l3_key); | 654 | |
637 | alc = l3->alien; | 655 | l3 = s->cs_cachep->nodelists[q]; |
638 | /* | 656 | if (!l3 || OFF_SLAB(s->cs_cachep)) |
639 | * FIXME: This check for BAD_ALIEN_MAGIC | 657 | continue; |
640 | * should go away when common slab code is taught to | 658 | lockdep_set_class(&l3->list_lock, &on_slab_l3_key); |
641 | * work even without alien caches. | 659 | alc = l3->alien; |
642 | * Currently, non NUMA code returns BAD_ALIEN_MAGIC | 660 | /* |
643 | * for alloc_alien_cache, | 661 | * FIXME: This check for BAD_ALIEN_MAGIC |
644 | */ | 662 | * should go away when common slab code is taught to |
645 | if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC) | 663 | * work even without alien caches. |
646 | continue; | 664 | * Currently, non NUMA code returns BAD_ALIEN_MAGIC |
647 | for_each_node(r) { | 665 | * for alloc_alien_cache, |
648 | if (alc[r]) | 666 | */ |
649 | lockdep_set_class(&alc[r]->lock, | 667 | if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC) |
650 | &on_slab_alc_key); | 668 | continue; |
651 | } | 669 | for_each_node(r) { |
670 | if (alc[r]) | ||
671 | lockdep_set_class(&alc[r]->lock, | ||
672 | &on_slab_alc_key); | ||
652 | } | 673 | } |
653 | s++; | ||
654 | } | 674 | } |
655 | } | 675 | } |
676 | |||
677 | static inline void init_lock_keys(void) | ||
678 | { | ||
679 | int node; | ||
680 | |||
681 | for_each_node(node) | ||
682 | init_node_lock_keys(node); | ||
683 | } | ||
656 | #else | 684 | #else |
685 | static void init_node_lock_keys(int q) | ||
686 | { | ||
687 | } | ||
688 | |||
657 | static inline void init_lock_keys(void) | 689 | static inline void init_lock_keys(void) |
658 | { | 690 | { |
659 | } | 691 | } |
@@ -665,26 +697,6 @@ static inline void init_lock_keys(void) | |||
665 | static DEFINE_MUTEX(cache_chain_mutex); | 697 | static DEFINE_MUTEX(cache_chain_mutex); |
666 | static struct list_head cache_chain; | 698 | static struct list_head cache_chain; |
667 | 699 | ||
668 | /* | ||
669 | * chicken and egg problem: delay the per-cpu array allocation | ||
670 | * until the general caches are up. | ||
671 | */ | ||
672 | static enum { | ||
673 | NONE, | ||
674 | PARTIAL_AC, | ||
675 | PARTIAL_L3, | ||
676 | EARLY, | ||
677 | FULL | ||
678 | } g_cpucache_up; | ||
679 | |||
680 | /* | ||
681 | * used by boot code to determine if it can use slab based allocator | ||
682 | */ | ||
683 | int slab_is_available(void) | ||
684 | { | ||
685 | return g_cpucache_up >= EARLY; | ||
686 | } | ||
687 | |||
688 | static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); | 700 | static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); |
689 | 701 | ||
690 | static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) | 702 | static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) |
@@ -1120,7 +1132,7 @@ static void __cpuinit cpuup_canceled(long cpu) | |||
1120 | if (nc) | 1132 | if (nc) |
1121 | free_block(cachep, nc->entry, nc->avail, node); | 1133 | free_block(cachep, nc->entry, nc->avail, node); |
1122 | 1134 | ||
1123 | if (!cpus_empty(*mask)) { | 1135 | if (!cpumask_empty(mask)) { |
1124 | spin_unlock_irq(&l3->list_lock); | 1136 | spin_unlock_irq(&l3->list_lock); |
1125 | goto free_array_cache; | 1137 | goto free_array_cache; |
1126 | } | 1138 | } |
@@ -1254,6 +1266,8 @@ static int __cpuinit cpuup_prepare(long cpu) | |||
1254 | kfree(shared); | 1266 | kfree(shared); |
1255 | free_alien_cache(alien); | 1267 | free_alien_cache(alien); |
1256 | } | 1268 | } |
1269 | init_node_lock_keys(node); | ||
1270 | |||
1257 | return 0; | 1271 | return 0; |
1258 | bad: | 1272 | bad: |
1259 | cpuup_canceled(cpu); | 1273 | cpuup_canceled(cpu); |
@@ -2261,9 +2275,11 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2261 | /* | 2275 | /* |
2262 | * Determine if the slab management is 'on' or 'off' slab. | 2276 | * Determine if the slab management is 'on' or 'off' slab. |
2263 | * (bootstrapping cannot cope with offslab caches so don't do | 2277 | * (bootstrapping cannot cope with offslab caches so don't do |
2264 | * it too early on.) | 2278 | * it too early on. Always use on-slab management when |
2279 | * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak) | ||
2265 | */ | 2280 | */ |
2266 | if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init) | 2281 | if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init && |
2282 | !(flags & SLAB_NOLEAKTRACE)) | ||
2267 | /* | 2283 | /* |
2268 | * Size is large, assume best to place the slab management obj | 2284 | * Size is large, assume best to place the slab management obj |
2269 | * off-slab (should allow better packing of objs). | 2285 | * off-slab (should allow better packing of objs). |
@@ -2582,8 +2598,8 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, | |||
2582 | * kmemleak does not treat the ->s_mem pointer as a reference | 2598 | * kmemleak does not treat the ->s_mem pointer as a reference |
2583 | * to the object. Otherwise we will not report the leak. | 2599 | * to the object. Otherwise we will not report the leak. |
2584 | */ | 2600 | */ |
2585 | kmemleak_scan_area(slabp, offsetof(struct slab, list), | 2601 | kmemleak_scan_area(&slabp->list, sizeof(struct list_head), |
2586 | sizeof(struct list_head), local_flags); | 2602 | local_flags); |
2587 | if (!slabp) | 2603 | if (!slabp) |
2588 | return NULL; | 2604 | return NULL; |
2589 | } else { | 2605 | } else { |
@@ -3103,13 +3119,19 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
3103 | } else { | 3119 | } else { |
3104 | STATS_INC_ALLOCMISS(cachep); | 3120 | STATS_INC_ALLOCMISS(cachep); |
3105 | objp = cache_alloc_refill(cachep, flags); | 3121 | objp = cache_alloc_refill(cachep, flags); |
3122 | /* | ||
3123 | * the 'ac' may be updated by cache_alloc_refill(), | ||
3124 | * and kmemleak_erase() requires its correct value. | ||
3125 | */ | ||
3126 | ac = cpu_cache_get(cachep); | ||
3106 | } | 3127 | } |
3107 | /* | 3128 | /* |
3108 | * To avoid a false negative, if an object that is in one of the | 3129 | * To avoid a false negative, if an object that is in one of the |
3109 | * per-CPU caches is leaked, we need to make sure kmemleak doesn't | 3130 | * per-CPU caches is leaked, we need to make sure kmemleak doesn't |
3110 | * treat the array pointers as a reference to the object. | 3131 | * treat the array pointers as a reference to the object. |
3111 | */ | 3132 | */ |
3112 | kmemleak_erase(&ac->entry[ac->avail]); | 3133 | if (objp) |
3134 | kmemleak_erase(&ac->entry[ac->avail]); | ||
3113 | return objp; | 3135 | return objp; |
3114 | } | 3136 | } |
3115 | 3137 | ||
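The comment added in this hunk is the key point: cache_alloc_refill() can replace the per-cpu array_cache, so the ac pointer captured before the call may be stale by the time kmemleak_erase() runs, and objp may be NULL on failure. Re-reading ac and checking objp avoids poking freed memory. A stand-alone model of the stale-cached-pointer hazard (all names are stand-ins):

```c
/* Stand-alone model of the hazard fixed above: a helper may replace the
 * per-CPU structure, so any pointer cached before the call must be
 * re-read afterwards.  All names here are stand-ins. */
#include <stdio.h>
#include <stdlib.h>

struct array_cache { int avail; int entry[8]; };

static struct array_cache *percpu_ac;

static int refill(void)                 /* may swap out percpu_ac */
{
    struct array_cache *bigger = calloc(1, sizeof(*bigger));

    bigger->avail = 1;
    bigger->entry[0] = 42;
    free(percpu_ac);                    /* old ac is gone... */
    percpu_ac = bigger;                 /* ...replaced by a new one */
    return percpu_ac->entry[--percpu_ac->avail];
}

int main(void)
{
    struct array_cache *ac;
    int objp;

    percpu_ac = calloc(1, sizeof(*percpu_ac));
    ac = percpu_ac;                     /* cached before the refill */
    objp = refill();

    ac = percpu_ac;                     /* must re-read: old ac was freed */
    if (objp)
        ac->entry[ac->avail] = 0;       /* kmemleak_erase() analogue */
    printf("allocated %d\n", objp);
    free(percpu_ac);
    return 0;
}
```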
@@ -3306,7 +3328,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, | |||
3306 | cache_alloc_debugcheck_before(cachep, flags); | 3328 | cache_alloc_debugcheck_before(cachep, flags); |
3307 | local_irq_save(save_flags); | 3329 | local_irq_save(save_flags); |
3308 | 3330 | ||
3309 | if (unlikely(nodeid == -1)) | 3331 | if (nodeid == -1) |
3310 | nodeid = numa_node_id(); | 3332 | nodeid = numa_node_id(); |
3311 | 3333 | ||
3312 | if (unlikely(!cachep->nodelists[nodeid])) { | 3334 | if (unlikely(!cachep->nodelists[nodeid])) { |
@@ -3558,7 +3580,7 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
3558 | } | 3580 | } |
3559 | EXPORT_SYMBOL(kmem_cache_alloc); | 3581 | EXPORT_SYMBOL(kmem_cache_alloc); |
3560 | 3582 | ||
3561 | #ifdef CONFIG_KMEMTRACE | 3583 | #ifdef CONFIG_TRACING |
3562 | void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags) | 3584 | void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags) |
3563 | { | 3585 | { |
3564 | return __cache_alloc(cachep, flags, __builtin_return_address(0)); | 3586 | return __cache_alloc(cachep, flags, __builtin_return_address(0)); |
@@ -3621,7 +3643,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
3621 | } | 3643 | } |
3622 | EXPORT_SYMBOL(kmem_cache_alloc_node); | 3644 | EXPORT_SYMBOL(kmem_cache_alloc_node); |
3623 | 3645 | ||
3624 | #ifdef CONFIG_KMEMTRACE | 3646 | #ifdef CONFIG_TRACING |
3625 | void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep, | 3647 | void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep, |
3626 | gfp_t flags, | 3648 | gfp_t flags, |
3627 | int nodeid) | 3649 | int nodeid) |
@@ -3649,7 +3671,7 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) | |||
3649 | return ret; | 3671 | return ret; |
3650 | } | 3672 | } |
3651 | 3673 | ||
3652 | #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE) | 3674 | #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) |
3653 | void *__kmalloc_node(size_t size, gfp_t flags, int node) | 3675 | void *__kmalloc_node(size_t size, gfp_t flags, int node) |
3654 | { | 3676 | { |
3655 | return __do_kmalloc_node(size, flags, node, | 3677 | return __do_kmalloc_node(size, flags, node, |
@@ -3669,7 +3691,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) | |||
3669 | return __do_kmalloc_node(size, flags, node, NULL); | 3691 | return __do_kmalloc_node(size, flags, node, NULL); |
3670 | } | 3692 | } |
3671 | EXPORT_SYMBOL(__kmalloc_node); | 3693 | EXPORT_SYMBOL(__kmalloc_node); |
3672 | #endif /* CONFIG_DEBUG_SLAB */ | 3694 | #endif /* CONFIG_DEBUG_SLAB || CONFIG_TRACING */ |
3673 | #endif /* CONFIG_NUMA */ | 3695 | #endif /* CONFIG_NUMA */ |
3674 | 3696 | ||
3675 | /** | 3697 | /** |
@@ -3701,7 +3723,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, | |||
3701 | } | 3723 | } |
3702 | 3724 | ||
3703 | 3725 | ||
3704 | #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE) | 3726 | #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) |
3705 | void *__kmalloc(size_t size, gfp_t flags) | 3727 | void *__kmalloc(size_t size, gfp_t flags) |
3706 | { | 3728 | { |
3707 | return __do_kmalloc(size, flags, __builtin_return_address(0)); | 3729 | return __do_kmalloc(size, flags, __builtin_return_address(0)); |
@@ -1735,7 +1735,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, | |||
1735 | } | 1735 | } |
1736 | local_irq_restore(flags); | 1736 | local_irq_restore(flags); |
1737 | 1737 | ||
1738 | if (unlikely((gfpflags & __GFP_ZERO) && object)) | 1738 | if (unlikely(gfpflags & __GFP_ZERO) && object) |
1739 | memset(object, 0, objsize); | 1739 | memset(object, 0, objsize); |
1740 | 1740 | ||
1741 | kmemcheck_slab_alloc(s, gfpflags, object, c->objsize); | 1741 | kmemcheck_slab_alloc(s, gfpflags, object, c->objsize); |
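This hunk only narrows the unlikely() hint; the behaviour is unchanged: if the caller passed __GFP_ZERO and an object was obtained, it is zeroed before being returned, which is what allows kzalloc() to be a thin wrapper around kmalloc(). A stand-alone model of that flag handling (the flag value and function names are stand-ins, not kernel definitions):

```c
/* Stand-alone model of the __GFP_ZERO handling: the allocator zeroes
 * the object only when the flag is set and the allocation succeeded.
 * The flag value and names are stand-ins, not kernel definitions. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define GFP_ZERO 0x8000u

static void *slab_alloc(size_t size, unsigned int gfpflags)
{
    void *object = malloc(size);

    if ((gfpflags & GFP_ZERO) && object)
        memset(object, 0, size);        /* zero only successful allocations */
    return object;
}

static void *kzalloc_model(size_t size, unsigned int gfpflags)
{
    return slab_alloc(size, gfpflags | GFP_ZERO);
}

int main(void)
{
    unsigned char *p = kzalloc_model(8, 0);

    printf("first byte: %u\n", p ? p[0] : 0);   /* always 0 */
    free(p);
    return 0;
}
```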
@@ -1754,7 +1754,7 @@ void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) | |||
1754 | } | 1754 | } |
1755 | EXPORT_SYMBOL(kmem_cache_alloc); | 1755 | EXPORT_SYMBOL(kmem_cache_alloc); |
1756 | 1756 | ||
1757 | #ifdef CONFIG_KMEMTRACE | 1757 | #ifdef CONFIG_TRACING |
1758 | void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags) | 1758 | void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags) |
1759 | { | 1759 | { |
1760 | return slab_alloc(s, gfpflags, -1, _RET_IP_); | 1760 | return slab_alloc(s, gfpflags, -1, _RET_IP_); |
@@ -1775,7 +1775,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) | |||
1775 | EXPORT_SYMBOL(kmem_cache_alloc_node); | 1775 | EXPORT_SYMBOL(kmem_cache_alloc_node); |
1776 | #endif | 1776 | #endif |
1777 | 1777 | ||
1778 | #ifdef CONFIG_KMEMTRACE | 1778 | #ifdef CONFIG_TRACING |
1779 | void *kmem_cache_alloc_node_notrace(struct kmem_cache *s, | 1779 | void *kmem_cache_alloc_node_notrace(struct kmem_cache *s, |
1780 | gfp_t gfpflags, | 1780 | gfp_t gfpflags, |
1781 | int node) | 1781 | int node) |
@@ -4371,12 +4371,28 @@ static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si) | |||
4371 | return len + sprintf(buf + len, "\n"); | 4371 | return len + sprintf(buf + len, "\n"); |
4372 | } | 4372 | } |
4373 | 4373 | ||
4374 | static void clear_stat(struct kmem_cache *s, enum stat_item si) | ||
4375 | { | ||
4376 | int cpu; | ||
4377 | |||
4378 | for_each_online_cpu(cpu) | ||
4379 | get_cpu_slab(s, cpu)->stat[si] = 0; | ||
4380 | } | ||
4381 | |||
4374 | #define STAT_ATTR(si, text) \ | 4382 | #define STAT_ATTR(si, text) \ |
4375 | static ssize_t text##_show(struct kmem_cache *s, char *buf) \ | 4383 | static ssize_t text##_show(struct kmem_cache *s, char *buf) \ |
4376 | { \ | 4384 | { \ |
4377 | return show_stat(s, buf, si); \ | 4385 | return show_stat(s, buf, si); \ |
4378 | } \ | 4386 | } \ |
4379 | SLAB_ATTR_RO(text); \ | 4387 | static ssize_t text##_store(struct kmem_cache *s, \ |
4388 | const char *buf, size_t length) \ | ||
4389 | { \ | ||
4390 | if (buf[0] != '0') \ | ||
4391 | return -EINVAL; \ | ||
4392 | clear_stat(s, si); \ | ||
4393 | return length; \ | ||
4394 | } \ | ||
4395 | SLAB_ATTR(text); \ | ||
4380 | 4396 | ||
4381 | STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath); | 4397 | STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath); |
4382 | STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath); | 4398 | STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath); |
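The SLUB hunk makes each statistics attribute writable: writing "0" to the sysfs file clears that counter on every online CPU via clear_stat(), and anything else is rejected. The macro generates both the _show and the _store method per counter. Below is a stand-alone sketch of the same token-pasting pattern; the cache structure, the CPU loop and the return codes are simplified stand-ins.

```c
/* Stand-alone sketch of the STAT_ATTR() pattern: one macro emits both a
 * show and a "write 0 to clear" store function per counter.  The cache
 * structure and the CPU loop are simplified stand-ins. */
#include <errno.h>
#include <stdio.h>
#include <string.h>

#define NR_CPUS 2
enum stat_item { ALLOC_FASTPATH, ALLOC_SLOWPATH, NR_STATS };

struct kmem_cache { unsigned long stat[NR_CPUS][NR_STATS]; };

static void clear_stat(struct kmem_cache *s, enum stat_item si)
{
    for (int cpu = 0; cpu < NR_CPUS; cpu++)
        s->stat[cpu][si] = 0;
}

#define STAT_ATTR(si, text)                                          \
static int text##_show(struct kmem_cache *s, char *buf)              \
{                                                                     \
    unsigned long sum = 0;                                            \
    for (int cpu = 0; cpu < NR_CPUS; cpu++)                           \
        sum += s->stat[cpu][si];                                      \
    return sprintf(buf, "%lu\n", sum);                                \
}                                                                     \
static int text##_store(struct kmem_cache *s, const char *buf)       \
{                                                                     \
    if (buf[0] != '0')                                                \
        return -EINVAL;           /* only "0" (clear) is accepted */  \
    clear_stat(s, si);                                                \
    return (int)strlen(buf);                                          \
}

STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath)

int main(void)
{
    struct kmem_cache s = { .stat = { { 5, 0 }, { 7, 0 } } };
    char buf[32];

    alloc_fastpath_show(&s, buf);  printf("before: %s", buf);
    alloc_fastpath_store(&s, "0");
    alloc_fastpath_show(&s, buf);  printf("after:  %s", buf);
    return 0;
}
```

With the real attributes this corresponds to something like `echo 0 > /sys/kernel/slab/<cache>/alloc_fastpath`, resetting a counter between benchmark runs.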
diff --git a/mm/swapfile.c b/mm/swapfile.c index 4de7f02f820b..6c0585b16418 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -22,6 +22,7 @@ | |||
22 | #include <linux/seq_file.h> | 22 | #include <linux/seq_file.h> |
23 | #include <linux/init.h> | 23 | #include <linux/init.h> |
24 | #include <linux/module.h> | 24 | #include <linux/module.h> |
25 | #include <linux/ksm.h> | ||
25 | #include <linux/rmap.h> | 26 | #include <linux/rmap.h> |
26 | #include <linux/security.h> | 27 | #include <linux/security.h> |
27 | #include <linux/backing-dev.h> | 28 | #include <linux/backing-dev.h> |
@@ -35,11 +36,15 @@ | |||
35 | #include <linux/swapops.h> | 36 | #include <linux/swapops.h> |
36 | #include <linux/page_cgroup.h> | 37 | #include <linux/page_cgroup.h> |
37 | 38 | ||
39 | static bool swap_count_continued(struct swap_info_struct *, pgoff_t, | ||
40 | unsigned char); | ||
41 | static void free_swap_count_continuations(struct swap_info_struct *); | ||
42 | static sector_t map_swap_entry(swp_entry_t, struct block_device**); | ||
43 | |||
38 | static DEFINE_SPINLOCK(swap_lock); | 44 | static DEFINE_SPINLOCK(swap_lock); |
39 | static unsigned int nr_swapfiles; | 45 | static unsigned int nr_swapfiles; |
40 | long nr_swap_pages; | 46 | long nr_swap_pages; |
41 | long total_swap_pages; | 47 | long total_swap_pages; |
42 | static int swap_overflow; | ||
43 | static int least_priority; | 48 | static int least_priority; |
44 | 49 | ||
45 | static const char Bad_file[] = "Bad swap file entry "; | 50 | static const char Bad_file[] = "Bad swap file entry "; |
@@ -49,42 +54,20 @@ static const char Unused_offset[] = "Unused swap offset entry "; | |||
49 | 54 | ||
50 | static struct swap_list_t swap_list = {-1, -1}; | 55 | static struct swap_list_t swap_list = {-1, -1}; |
51 | 56 | ||
52 | static struct swap_info_struct swap_info[MAX_SWAPFILES]; | 57 | static struct swap_info_struct *swap_info[MAX_SWAPFILES]; |
53 | 58 | ||
54 | static DEFINE_MUTEX(swapon_mutex); | 59 | static DEFINE_MUTEX(swapon_mutex); |
55 | 60 | ||
56 | /* For reference count accounting in swap_map */ | 61 | static inline unsigned char swap_count(unsigned char ent) |
57 | /* enum for swap_map[] handling. internal use only */ | ||
58 | enum { | ||
59 | SWAP_MAP = 0, /* ops for reference from swap users */ | ||
60 | SWAP_CACHE, /* ops for reference from swap cache */ | ||
61 | }; | ||
62 | |||
63 | static inline int swap_count(unsigned short ent) | ||
64 | { | ||
65 | return ent & SWAP_COUNT_MASK; | ||
66 | } | ||
67 | |||
68 | static inline bool swap_has_cache(unsigned short ent) | ||
69 | { | 62 | { |
70 | return !!(ent & SWAP_HAS_CACHE); | 63 | return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */ |
71 | } | 64 | } |
72 | 65 | ||
73 | static inline unsigned short encode_swapmap(int count, bool has_cache) | 66 | /* returns 1 if swap entry is freed */ |
74 | { | ||
75 | unsigned short ret = count; | ||
76 | |||
77 | if (has_cache) | ||
78 | return SWAP_HAS_CACHE | ret; | ||
79 | return ret; | ||
80 | } | ||
81 | |||
82 | /* returnes 1 if swap entry is freed */ | ||
83 | static int | 67 | static int |
84 | __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) | 68 | __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) |
85 | { | 69 | { |
86 | int type = si - swap_info; | 70 | swp_entry_t entry = swp_entry(si->type, offset); |
87 | swp_entry_t entry = swp_entry(type, offset); | ||
88 | struct page *page; | 71 | struct page *page; |
89 | int ret = 0; | 72 | int ret = 0; |
90 | 73 | ||
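This part of the merge shrinks swap_map[] entries from unsigned short to unsigned char: the low bits hold the reference count (with SWAP_MAP_SHMEM and COUNT_CONTINUED as special values) and one bit is SWAP_HAS_CACHE, so swap_count() just masks that flag off. The sketch below models the encoding; the constant values mirror what I believe the post-merge <linux/swap.h> defines and should be treated as illustrative rather than authoritative.

```c
/* Stand-alone model of the one-byte swap_map encoding: count in the low
 * bits, SWAP_HAS_CACHE as a flag bit.  The constants are assumed values
 * mirroring the post-merge headers; treat them as illustrative. */
#include <stdio.h>

#define SWAP_HAS_CACHE  0x40    /* page also sits in the swap cache */
#define COUNT_CONTINUED 0x80    /* count overflows into a continuation page */
#define SWAP_MAP_MAX    0x3e
#define SWAP_MAP_SHMEM  0xbf    /* entry owned by shmem/tmpfs */

static unsigned char swap_count(unsigned char ent)
{
    return ent & ~SWAP_HAS_CACHE;   /* may still include COUNT_CONTINUED */
}

int main(void)
{
    unsigned char ent = 2 | SWAP_HAS_CACHE;  /* two users + swap cache */

    printf("count=%u cached=%d\n",
           (unsigned)swap_count(ent), !!(ent & SWAP_HAS_CACHE));

    ent = SWAP_MAP_SHMEM;                    /* shmem-owned entry */
    printf("shmem count=%#x cached=%d\n",
           (unsigned)swap_count(ent), !!(ent & SWAP_HAS_CACHE));
    return 0;
}
```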
@@ -120,7 +103,7 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) | |||
120 | down_read(&swap_unplug_sem); | 103 | down_read(&swap_unplug_sem); |
121 | entry.val = page_private(page); | 104 | entry.val = page_private(page); |
122 | if (PageSwapCache(page)) { | 105 | if (PageSwapCache(page)) { |
123 | struct block_device *bdev = swap_info[swp_type(entry)].bdev; | 106 | struct block_device *bdev = swap_info[swp_type(entry)]->bdev; |
124 | struct backing_dev_info *bdi; | 107 | struct backing_dev_info *bdi; |
125 | 108 | ||
126 | /* | 109 | /* |
@@ -146,23 +129,28 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) | |||
146 | static int discard_swap(struct swap_info_struct *si) | 129 | static int discard_swap(struct swap_info_struct *si) |
147 | { | 130 | { |
148 | struct swap_extent *se; | 131 | struct swap_extent *se; |
132 | sector_t start_block; | ||
133 | sector_t nr_blocks; | ||
149 | int err = 0; | 134 | int err = 0; |
150 | 135 | ||
151 | list_for_each_entry(se, &si->extent_list, list) { | 136 | /* Do not discard the swap header page! */ |
152 | sector_t start_block = se->start_block << (PAGE_SHIFT - 9); | 137 | se = &si->first_swap_extent; |
153 | sector_t nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); | 138 | start_block = (se->start_block + 1) << (PAGE_SHIFT - 9); |
139 | nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9); | ||
140 | if (nr_blocks) { | ||
141 | err = blkdev_issue_discard(si->bdev, start_block, | ||
142 | nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER); | ||
143 | if (err) | ||
144 | return err; | ||
145 | cond_resched(); | ||
146 | } | ||
154 | 147 | ||
155 | if (se->start_page == 0) { | 148 | list_for_each_entry(se, &si->first_swap_extent.list, list) { |
156 | /* Do not discard the swap header page! */ | 149 | start_block = se->start_block << (PAGE_SHIFT - 9); |
157 | start_block += 1 << (PAGE_SHIFT - 9); | 150 | nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); |
158 | nr_blocks -= 1 << (PAGE_SHIFT - 9); | ||
159 | if (!nr_blocks) | ||
160 | continue; | ||
161 | } | ||
162 | 151 | ||
163 | err = blkdev_issue_discard(si->bdev, start_block, | 152 | err = blkdev_issue_discard(si->bdev, start_block, |
164 | nr_blocks, GFP_KERNEL, | 153 | nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER); |
165 | DISCARD_FL_BARRIER); | ||
166 | if (err) | 154 | if (err) |
167 | break; | 155 | break; |
168 | 156 | ||
@@ -201,14 +189,11 @@ static void discard_swap_cluster(struct swap_info_struct *si, | |||
201 | start_block <<= PAGE_SHIFT - 9; | 189 | start_block <<= PAGE_SHIFT - 9; |
202 | nr_blocks <<= PAGE_SHIFT - 9; | 190 | nr_blocks <<= PAGE_SHIFT - 9; |
203 | if (blkdev_issue_discard(si->bdev, start_block, | 191 | if (blkdev_issue_discard(si->bdev, start_block, |
204 | nr_blocks, GFP_NOIO, | 192 | nr_blocks, GFP_NOIO, DISCARD_FL_BARRIER)) |
205 | DISCARD_FL_BARRIER)) | ||
206 | break; | 193 | break; |
207 | } | 194 | } |
208 | 195 | ||
209 | lh = se->list.next; | 196 | lh = se->list.next; |
210 | if (lh == &si->extent_list) | ||
211 | lh = lh->next; | ||
212 | se = list_entry(lh, struct swap_extent, list); | 197 | se = list_entry(lh, struct swap_extent, list); |
213 | } | 198 | } |
214 | } | 199 | } |
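The rewritten discard_swap() handles the first extent separately because its first page is the swap header, which must never be discarded; the remaining extents are discarded whole, and page counts are converted to sectors with the usual PAGE_SHIFT - 9 shift. A stand-alone model of that arithmetic (the extent table, the sizes and the discard "callback" are made up):

```c
/* Stand-alone model of the discard arithmetic above: the first extent
 * starts one page late (the swap header survives), later extents are
 * discarded whole.  Numbers and the discard callback are made up. */
#include <stdio.h>

#define PAGE_SHIFT 12   /* 4K pages: one page = 1 << (12 - 9) = 8 sectors */

struct swap_extent { unsigned long long start_block; unsigned long nr_pages; };

static void discard(unsigned long long start, unsigned long long nr)
{
    if (nr)
        printf("discard sectors [%llu, %llu)\n", start, start + nr);
}

int main(void)
{
    struct swap_extent ext[] = { { 1000, 4 }, { 5000, 16 } };

    /* first extent: skip the header page */
    discard((ext[0].start_block + 1) << (PAGE_SHIFT - 9),
            ((unsigned long long)ext[0].nr_pages - 1) << (PAGE_SHIFT - 9));

    /* remaining extents: discard everything */
    for (int i = 1; i < 2; i++)
        discard(ext[i].start_block << (PAGE_SHIFT - 9),
                (unsigned long long)ext[i].nr_pages << (PAGE_SHIFT - 9));
    return 0;
}
```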
@@ -223,7 +208,7 @@ static int wait_for_discard(void *word) | |||
223 | #define LATENCY_LIMIT 256 | 208 | #define LATENCY_LIMIT 256 |
224 | 209 | ||
225 | static inline unsigned long scan_swap_map(struct swap_info_struct *si, | 210 | static inline unsigned long scan_swap_map(struct swap_info_struct *si, |
226 | int cache) | 211 | unsigned char usage) |
227 | { | 212 | { |
228 | unsigned long offset; | 213 | unsigned long offset; |
229 | unsigned long scan_base; | 214 | unsigned long scan_base; |
@@ -354,10 +339,7 @@ checks: | |||
354 | si->lowest_bit = si->max; | 339 | si->lowest_bit = si->max; |
355 | si->highest_bit = 0; | 340 | si->highest_bit = 0; |
356 | } | 341 | } |
357 | if (cache == SWAP_CACHE) /* at usual swap-out via vmscan.c */ | 342 | si->swap_map[offset] = usage; |
358 | si->swap_map[offset] = encode_swapmap(0, true); | ||
359 | else /* at suspend */ | ||
360 | si->swap_map[offset] = encode_swapmap(1, false); | ||
361 | si->cluster_next = offset + 1; | 343 | si->cluster_next = offset + 1; |
362 | si->flags -= SWP_SCANNING; | 344 | si->flags -= SWP_SCANNING; |
363 | 345 | ||
@@ -467,10 +449,10 @@ swp_entry_t get_swap_page(void) | |||
467 | nr_swap_pages--; | 449 | nr_swap_pages--; |
468 | 450 | ||
469 | for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { | 451 | for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { |
470 | si = swap_info + type; | 452 | si = swap_info[type]; |
471 | next = si->next; | 453 | next = si->next; |
472 | if (next < 0 || | 454 | if (next < 0 || |
473 | (!wrapped && si->prio != swap_info[next].prio)) { | 455 | (!wrapped && si->prio != swap_info[next]->prio)) { |
474 | next = swap_list.head; | 456 | next = swap_list.head; |
475 | wrapped++; | 457 | wrapped++; |
476 | } | 458 | } |
@@ -482,7 +464,7 @@ swp_entry_t get_swap_page(void) | |||
482 | 464 | ||
483 | swap_list.next = next; | 465 | swap_list.next = next; |
484 | /* This is called for allocating swap entry for cache */ | 466 | /* This is called for allocating swap entry for cache */ |
485 | offset = scan_swap_map(si, SWAP_CACHE); | 467 | offset = scan_swap_map(si, SWAP_HAS_CACHE); |
486 | if (offset) { | 468 | if (offset) { |
487 | spin_unlock(&swap_lock); | 469 | spin_unlock(&swap_lock); |
488 | return swp_entry(type, offset); | 470 | return swp_entry(type, offset); |
@@ -503,11 +485,11 @@ swp_entry_t get_swap_page_of_type(int type) | |||
503 | pgoff_t offset; | 485 | pgoff_t offset; |
504 | 486 | ||
505 | spin_lock(&swap_lock); | 487 | spin_lock(&swap_lock); |
506 | si = swap_info + type; | 488 | si = swap_info[type]; |
507 | if (si->flags & SWP_WRITEOK) { | 489 | if (si && (si->flags & SWP_WRITEOK)) { |
508 | nr_swap_pages--; | 490 | nr_swap_pages--; |
509 | /* This is called for allocating swap entry, not cache */ | 491 | /* This is called for allocating swap entry, not cache */ |
510 | offset = scan_swap_map(si, SWAP_MAP); | 492 | offset = scan_swap_map(si, 1); |
511 | if (offset) { | 493 | if (offset) { |
512 | spin_unlock(&swap_lock); | 494 | spin_unlock(&swap_lock); |
513 | return swp_entry(type, offset); | 495 | return swp_entry(type, offset); |
@@ -518,9 +500,9 @@ swp_entry_t get_swap_page_of_type(int type) | |||
518 | return (swp_entry_t) {0}; | 500 | return (swp_entry_t) {0}; |
519 | } | 501 | } |
520 | 502 | ||
521 | static struct swap_info_struct * swap_info_get(swp_entry_t entry) | 503 | static struct swap_info_struct *swap_info_get(swp_entry_t entry) |
522 | { | 504 | { |
523 | struct swap_info_struct * p; | 505 | struct swap_info_struct *p; |
524 | unsigned long offset, type; | 506 | unsigned long offset, type; |
525 | 507 | ||
526 | if (!entry.val) | 508 | if (!entry.val) |
@@ -528,7 +510,7 @@ static struct swap_info_struct * swap_info_get(swp_entry_t entry) | |||
528 | type = swp_type(entry); | 510 | type = swp_type(entry); |
529 | if (type >= nr_swapfiles) | 511 | if (type >= nr_swapfiles) |
530 | goto bad_nofile; | 512 | goto bad_nofile; |
531 | p = & swap_info[type]; | 513 | p = swap_info[type]; |
532 | if (!(p->flags & SWP_USED)) | 514 | if (!(p->flags & SWP_USED)) |
533 | goto bad_device; | 515 | goto bad_device; |
534 | offset = swp_offset(entry); | 516 | offset = swp_offset(entry); |
@@ -554,41 +536,56 @@ out: | |||
554 | return NULL; | 536 | return NULL; |
555 | } | 537 | } |
556 | 538 | ||
557 | static int swap_entry_free(struct swap_info_struct *p, | 539 | static unsigned char swap_entry_free(struct swap_info_struct *p, |
558 | swp_entry_t ent, int cache) | 540 | swp_entry_t entry, unsigned char usage) |
559 | { | 541 | { |
560 | unsigned long offset = swp_offset(ent); | 542 | unsigned long offset = swp_offset(entry); |
561 | int count = swap_count(p->swap_map[offset]); | 543 | unsigned char count; |
562 | bool has_cache; | 544 | unsigned char has_cache; |
563 | 545 | ||
564 | has_cache = swap_has_cache(p->swap_map[offset]); | 546 | count = p->swap_map[offset]; |
547 | has_cache = count & SWAP_HAS_CACHE; | ||
548 | count &= ~SWAP_HAS_CACHE; | ||
565 | 549 | ||
566 | if (cache == SWAP_MAP) { /* dropping usage count of swap */ | 550 | if (usage == SWAP_HAS_CACHE) { |
567 | if (count < SWAP_MAP_MAX) { | ||
568 | count--; | ||
569 | p->swap_map[offset] = encode_swapmap(count, has_cache); | ||
570 | } | ||
571 | } else { /* dropping swap cache flag */ | ||
572 | VM_BUG_ON(!has_cache); | 551 | VM_BUG_ON(!has_cache); |
573 | p->swap_map[offset] = encode_swapmap(count, false); | 552 | has_cache = 0; |
574 | 553 | } else if (count == SWAP_MAP_SHMEM) { | |
554 | /* | ||
555 | * Or we could insist on shmem.c using a special | ||
556 | * swap_shmem_free() and free_shmem_swap_and_cache()... | ||
557 | */ | ||
558 | count = 0; | ||
559 | } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) { | ||
560 | if (count == COUNT_CONTINUED) { | ||
561 | if (swap_count_continued(p, offset, count)) | ||
562 | count = SWAP_MAP_MAX | COUNT_CONTINUED; | ||
563 | else | ||
564 | count = SWAP_MAP_MAX; | ||
565 | } else | ||
566 | count--; | ||
575 | } | 567 | } |
576 | /* return code. */ | 568 | |
577 | count = p->swap_map[offset]; | 569 | if (!count) |
570 | mem_cgroup_uncharge_swap(entry); | ||
571 | |||
572 | usage = count | has_cache; | ||
573 | p->swap_map[offset] = usage; | ||
574 | |||
578 | /* free if no reference */ | 575 | /* free if no reference */ |
579 | if (!count) { | 576 | if (!usage) { |
580 | if (offset < p->lowest_bit) | 577 | if (offset < p->lowest_bit) |
581 | p->lowest_bit = offset; | 578 | p->lowest_bit = offset; |
582 | if (offset > p->highest_bit) | 579 | if (offset > p->highest_bit) |
583 | p->highest_bit = offset; | 580 | p->highest_bit = offset; |
584 | if (p->prio > swap_info[swap_list.next].prio) | 581 | if (swap_list.next >= 0 && |
585 | swap_list.next = p - swap_info; | 582 | p->prio > swap_info[swap_list.next]->prio) |
583 | swap_list.next = p->type; | ||
586 | nr_swap_pages++; | 584 | nr_swap_pages++; |
587 | p->inuse_pages--; | 585 | p->inuse_pages--; |
588 | } | 586 | } |
589 | if (!swap_count(count)) | 587 | |
590 | mem_cgroup_uncharge_swap(ent); | 588 | return usage; |
591 | return count; | ||
592 | } | 589 | } |
593 | 590 | ||
594 | /* | 591 | /* |
@@ -597,11 +594,11 @@ static int swap_entry_free(struct swap_info_struct *p, | |||
597 | */ | 594 | */ |
598 | void swap_free(swp_entry_t entry) | 595 | void swap_free(swp_entry_t entry) |
599 | { | 596 | { |
600 | struct swap_info_struct * p; | 597 | struct swap_info_struct *p; |
601 | 598 | ||
602 | p = swap_info_get(entry); | 599 | p = swap_info_get(entry); |
603 | if (p) { | 600 | if (p) { |
604 | swap_entry_free(p, entry, SWAP_MAP); | 601 | swap_entry_free(p, entry, 1); |
605 | spin_unlock(&swap_lock); | 602 | spin_unlock(&swap_lock); |
606 | } | 603 | } |
607 | } | 604 | } |
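swap_entry_free() now takes the same usage byte that scan_swap_map() stores: swap_free() above passes 1 to drop a map reference, swapcache_free() passes SWAP_HAS_CACHE, and SWAP_MAP_SHMEM owners are released in one step. The return value is the recombined byte, which is why free_swap_and_cache() later compares it against SWAP_HAS_CACHE. A stand-alone model of that split/drop/recombine step follows; continuation pages are out of scope and the constants are the same assumed values as in the earlier sketch.

```c
/* Stand-alone model of the drop logic above: split the byte into count
 * and cache flag, drop whichever reference the caller holds, recombine.
 * Continuation pages are omitted; constants are assumed/illustrative. */
#include <stdio.h>

#define SWAP_HAS_CACHE  0x40
#define SWAP_MAP_SHMEM  0xbf

static unsigned char swap_entry_free(unsigned char *map, unsigned char usage)
{
    unsigned char count = *map & ~SWAP_HAS_CACHE;
    unsigned char has_cache = *map & SWAP_HAS_CACHE;

    if (usage == SWAP_HAS_CACHE)
        has_cache = 0;              /* swap cache drops its reference */
    else if (count == SWAP_MAP_SHMEM)
        count = 0;                  /* shmem owner: released in one go */
    else if (count)
        count--;                    /* ordinary map reference */

    *map = count | has_cache;
    return *map;                    /* 0 means the slot is free again */
}

int main(void)
{
    unsigned char map = 1 | SWAP_HAS_CACHE;   /* one pte + swap cache */

    printf("%#x\n", (unsigned)swap_entry_free(&map, 1));              /* 0x40 */
    printf("%#x\n", (unsigned)swap_entry_free(&map, SWAP_HAS_CACHE)); /* 0x0  */
    return 0;
}
```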
@@ -612,26 +609,21 @@ void swap_free(swp_entry_t entry) | |||
612 | void swapcache_free(swp_entry_t entry, struct page *page) | 609 | void swapcache_free(swp_entry_t entry, struct page *page) |
613 | { | 610 | { |
614 | struct swap_info_struct *p; | 611 | struct swap_info_struct *p; |
615 | int ret; | 612 | unsigned char count; |
616 | 613 | ||
617 | p = swap_info_get(entry); | 614 | p = swap_info_get(entry); |
618 | if (p) { | 615 | if (p) { |
619 | ret = swap_entry_free(p, entry, SWAP_CACHE); | 616 | count = swap_entry_free(p, entry, SWAP_HAS_CACHE); |
620 | if (page) { | 617 | if (page) |
621 | bool swapout; | 618 | mem_cgroup_uncharge_swapcache(page, entry, count != 0); |
622 | if (ret) | ||
623 | swapout = true; /* the end of swap out */ | ||
624 | else | ||
625 | swapout = false; /* no more swap users! */ | ||
626 | mem_cgroup_uncharge_swapcache(page, entry, swapout); | ||
627 | } | ||
628 | spin_unlock(&swap_lock); | 619 | spin_unlock(&swap_lock); |
629 | } | 620 | } |
630 | return; | ||
631 | } | 621 | } |
632 | 622 | ||
633 | /* | 623 | /* |
634 | * How many references to page are currently swapped out? | 624 | * How many references to page are currently swapped out? |
625 | * This does not give an exact answer when swap count is continued, | ||
626 | * but does include the high COUNT_CONTINUED flag to allow for that. | ||
635 | */ | 627 | */ |
636 | static inline int page_swapcount(struct page *page) | 628 | static inline int page_swapcount(struct page *page) |
637 | { | 629 | { |
@@ -659,6 +651,8 @@ int reuse_swap_page(struct page *page) | |||
659 | int count; | 651 | int count; |
660 | 652 | ||
661 | VM_BUG_ON(!PageLocked(page)); | 653 | VM_BUG_ON(!PageLocked(page)); |
654 | if (unlikely(PageKsm(page))) | ||
655 | return 0; | ||
662 | count = page_mapcount(page); | 656 | count = page_mapcount(page); |
663 | if (count <= 1 && PageSwapCache(page)) { | 657 | if (count <= 1 && PageSwapCache(page)) { |
664 | count += page_swapcount(page); | 658 | count += page_swapcount(page); |
@@ -667,7 +661,7 @@ int reuse_swap_page(struct page *page) | |||
667 | SetPageDirty(page); | 661 | SetPageDirty(page); |
668 | } | 662 | } |
669 | } | 663 | } |
670 | return count == 1; | 664 | return count <= 1; |
671 | } | 665 | } |
672 | 666 | ||
673 | /* | 667 | /* |
@@ -704,7 +698,7 @@ int free_swap_and_cache(swp_entry_t entry) | |||
704 | 698 | ||
705 | p = swap_info_get(entry); | 699 | p = swap_info_get(entry); |
706 | if (p) { | 700 | if (p) { |
707 | if (swap_entry_free(p, entry, SWAP_MAP) == SWAP_HAS_CACHE) { | 701 | if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) { |
708 | page = find_get_page(&swapper_space, entry.val); | 702 | page = find_get_page(&swapper_space, entry.val); |
709 | if (page && !trylock_page(page)) { | 703 | if (page && !trylock_page(page)) { |
710 | page_cache_release(page); | 704 | page_cache_release(page); |
@@ -741,14 +735,14 @@ int free_swap_and_cache(swp_entry_t entry) | |||
741 | int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) | 735 | int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) |
742 | { | 736 | { |
743 | struct block_device *bdev = NULL; | 737 | struct block_device *bdev = NULL; |
744 | int i; | 738 | int type; |
745 | 739 | ||
746 | if (device) | 740 | if (device) |
747 | bdev = bdget(device); | 741 | bdev = bdget(device); |
748 | 742 | ||
749 | spin_lock(&swap_lock); | 743 | spin_lock(&swap_lock); |
750 | for (i = 0; i < nr_swapfiles; i++) { | 744 | for (type = 0; type < nr_swapfiles; type++) { |
751 | struct swap_info_struct *sis = swap_info + i; | 745 | struct swap_info_struct *sis = swap_info[type]; |
752 | 746 | ||
753 | if (!(sis->flags & SWP_WRITEOK)) | 747 | if (!(sis->flags & SWP_WRITEOK)) |
754 | continue; | 748 | continue; |
@@ -758,20 +752,18 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) | |||
758 | *bdev_p = bdgrab(sis->bdev); | 752 | *bdev_p = bdgrab(sis->bdev); |
759 | 753 | ||
760 | spin_unlock(&swap_lock); | 754 | spin_unlock(&swap_lock); |
761 | return i; | 755 | return type; |
762 | } | 756 | } |
763 | if (bdev == sis->bdev) { | 757 | if (bdev == sis->bdev) { |
764 | struct swap_extent *se; | 758 | struct swap_extent *se = &sis->first_swap_extent; |
765 | 759 | ||
766 | se = list_entry(sis->extent_list.next, | ||
767 | struct swap_extent, list); | ||
768 | if (se->start_block == offset) { | 760 | if (se->start_block == offset) { |
769 | if (bdev_p) | 761 | if (bdev_p) |
770 | *bdev_p = bdgrab(sis->bdev); | 762 | *bdev_p = bdgrab(sis->bdev); |
771 | 763 | ||
772 | spin_unlock(&swap_lock); | 764 | spin_unlock(&swap_lock); |
773 | bdput(bdev); | 765 | bdput(bdev); |
774 | return i; | 766 | return type; |
775 | } | 767 | } |
776 | } | 768 | } |
777 | } | 769 | } |
@@ -783,6 +775,21 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) | |||
783 | } | 775 | } |
784 | 776 | ||
785 | /* | 777 | /* |
778 | * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev | ||
779 | * corresponding to given index in swap_info (swap type). | ||
780 | */ | ||
781 | sector_t swapdev_block(int type, pgoff_t offset) | ||
782 | { | ||
783 | struct block_device *bdev; | ||
784 | |||
785 | if ((unsigned int)type >= nr_swapfiles) | ||
786 | return 0; | ||
787 | if (!(swap_info[type]->flags & SWP_WRITEOK)) | ||
788 | return 0; | ||
789 | return map_swap_entry(swp_entry(type, offset), &bdev); | ||
790 | } | ||
791 | |||
792 | /* | ||
786 | * Return either the total number of swap pages of given type, or the number | 793 | * Return either the total number of swap pages of given type, or the number |
787 | * of free pages of that type (depending on @free) | 794 | * of free pages of that type (depending on @free) |
788 | * | 795 | * |
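swapdev_block() is added for the hibernation code: given a swap type and a page offset it validates the swap device and translates the offset to a disk sector through map_swap_entry(), which resolves the offset against the extent list. A stand-alone model of that offset-to-block translation (the extent table and the numbers are made up for illustration):

```c
/* Stand-alone model of the offset-to-block translation that
 * swapdev_block() relies on: walk the swap extents until one covers
 * the page offset.  The extent table and sizes here are made up. */
#include <stdio.h>

struct swap_extent {
    unsigned long start_page;   /* first page offset covered */
    unsigned long nr_pages;
    unsigned long long start_block;
};

static unsigned long long map_offset(const struct swap_extent *ext, int n,
                                     unsigned long offset)
{
    for (int i = 0; i < n; i++) {
        const struct swap_extent *se = &ext[i];

        if (offset >= se->start_page &&
            offset < se->start_page + se->nr_pages)
            return se->start_block + (offset - se->start_page);
    }
    return 0;   /* not mapped */
}

int main(void)
{
    /* two discontiguous on-disk extents backing one swap area */
    struct swap_extent ext[] = {
        { .start_page = 0,   .nr_pages = 100, .start_block = 5000 },
        { .start_page = 100, .nr_pages = 50,  .start_block = 9000 },
    };

    printf("offset 10  -> block %llu\n", map_offset(ext, 2, 10));   /* 5010 */
    printf("offset 120 -> block %llu\n", map_offset(ext, 2, 120));  /* 9020 */
    return 0;
}
```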
@@ -792,18 +799,20 @@ unsigned int count_swap_pages(int type, int free) | |||
792 | { | 799 | { |
793 | unsigned int n = 0; | 800 | unsigned int n = 0; |
794 | 801 | ||
795 | if (type < nr_swapfiles) { | 802 | spin_lock(&swap_lock); |
796 | spin_lock(&swap_lock); | 803 | if ((unsigned int)type < nr_swapfiles) { |
797 | if (swap_info[type].flags & SWP_WRITEOK) { | 804 | struct swap_info_struct *sis = swap_info[type]; |
798 | n = swap_info[type].pages; | 805 | |
806 | if (sis->flags & SWP_WRITEOK) { | ||
807 | n = sis->pages; | ||
799 | if (free) | 808 | if (free) |
800 | n -= swap_info[type].inuse_pages; | 809 | n -= sis->inuse_pages; |
801 | } | 810 | } |
802 | spin_unlock(&swap_lock); | ||
803 | } | 811 | } |
812 | spin_unlock(&swap_lock); | ||
804 | return n; | 813 | return n; |
805 | } | 814 | } |
806 | #endif | 815 | #endif /* CONFIG_HIBERNATION */ |
807 | 816 | ||
808 | /* | 817 | /* |
809 | * No need to decide whether this PTE shares the swap entry with others, | 818 | * No need to decide whether this PTE shares the swap entry with others, |
@@ -932,7 +941,7 @@ static int unuse_vma(struct vm_area_struct *vma, | |||
932 | unsigned long addr, end, next; | 941 | unsigned long addr, end, next; |
933 | int ret; | 942 | int ret; |
934 | 943 | ||
935 | if (page->mapping) { | 944 | if (page_anon_vma(page)) { |
936 | addr = page_address_in_vma(page, vma); | 945 | addr = page_address_in_vma(page, vma); |
937 | if (addr == -EFAULT) | 946 | if (addr == -EFAULT) |
938 | return 0; | 947 | return 0; |
@@ -988,7 +997,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, | |||
988 | { | 997 | { |
989 | unsigned int max = si->max; | 998 | unsigned int max = si->max; |
990 | unsigned int i = prev; | 999 | unsigned int i = prev; |
991 | int count; | 1000 | unsigned char count; |
992 | 1001 | ||
993 | /* | 1002 | /* |
994 | * No need for swap_lock here: we're just looking | 1003 | * No need for swap_lock here: we're just looking |
@@ -1024,16 +1033,14 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, | |||
1024 | */ | 1033 | */ |
1025 | static int try_to_unuse(unsigned int type) | 1034 | static int try_to_unuse(unsigned int type) |
1026 | { | 1035 | { |
1027 | struct swap_info_struct * si = &swap_info[type]; | 1036 | struct swap_info_struct *si = swap_info[type]; |
1028 | struct mm_struct *start_mm; | 1037 | struct mm_struct *start_mm; |
1029 | unsigned short *swap_map; | 1038 | unsigned char *swap_map; |
1030 | unsigned short swcount; | 1039 | unsigned char swcount; |
1031 | struct page *page; | 1040 | struct page *page; |
1032 | swp_entry_t entry; | 1041 | swp_entry_t entry; |
1033 | unsigned int i = 0; | 1042 | unsigned int i = 0; |
1034 | int retval = 0; | 1043 | int retval = 0; |
1035 | int reset_overflow = 0; | ||
1036 | int shmem; | ||
1037 | 1044 | ||
1038 | /* | 1045 | /* |
1039 | * When searching mms for an entry, a good strategy is to | 1046 | * When searching mms for an entry, a good strategy is to |
@@ -1047,8 +1054,7 @@ static int try_to_unuse(unsigned int type) | |||
1047 | * together, child after parent. If we race with dup_mmap(), we | 1054 | * together, child after parent. If we race with dup_mmap(), we |
1048 | * prefer to resolve parent before child, lest we miss entries | 1055 | * prefer to resolve parent before child, lest we miss entries |
1049 | * duplicated after we scanned child: using last mm would invert | 1056 | * duplicated after we scanned child: using last mm would invert |
1050 | * that. Though it's only a serious concern when an overflowed | 1057 | * that. |
1051 | * swap count is reset from SWAP_MAP_MAX, preventing a rescan. | ||
1052 | */ | 1058 | */ |
1053 | start_mm = &init_mm; | 1059 | start_mm = &init_mm; |
1054 | atomic_inc(&init_mm.mm_users); | 1060 | atomic_inc(&init_mm.mm_users); |
@@ -1110,17 +1116,18 @@ static int try_to_unuse(unsigned int type) | |||
1110 | 1116 | ||
1111 | /* | 1117 | /* |
1112 | * Remove all references to entry. | 1118 | * Remove all references to entry. |
1113 | * Whenever we reach init_mm, there's no address space | ||
1114 | * to search, but use it as a reminder to search shmem. | ||
1115 | */ | 1119 | */ |
1116 | shmem = 0; | ||
1117 | swcount = *swap_map; | 1120 | swcount = *swap_map; |
1118 | if (swap_count(swcount)) { | 1121 | if (swap_count(swcount) == SWAP_MAP_SHMEM) { |
1119 | if (start_mm == &init_mm) | 1122 | retval = shmem_unuse(entry, page); |
1120 | shmem = shmem_unuse(entry, page); | 1123 | /* page has already been unlocked and released */ |
1121 | else | 1124 | if (retval < 0) |
1122 | retval = unuse_mm(start_mm, entry, page); | 1125 | break; |
1126 | continue; | ||
1123 | } | 1127 | } |
1128 | if (swap_count(swcount) && start_mm != &init_mm) | ||
1129 | retval = unuse_mm(start_mm, entry, page); | ||
1130 | |||
1124 | if (swap_count(*swap_map)) { | 1131 | if (swap_count(*swap_map)) { |
1125 | int set_start_mm = (*swap_map >= swcount); | 1132 | int set_start_mm = (*swap_map >= swcount); |
1126 | struct list_head *p = &start_mm->mmlist; | 1133 | struct list_head *p = &start_mm->mmlist; |
@@ -1131,7 +1138,7 @@ static int try_to_unuse(unsigned int type) | |||
1131 | atomic_inc(&new_start_mm->mm_users); | 1138 | atomic_inc(&new_start_mm->mm_users); |
1132 | atomic_inc(&prev_mm->mm_users); | 1139 | atomic_inc(&prev_mm->mm_users); |
1133 | spin_lock(&mmlist_lock); | 1140 | spin_lock(&mmlist_lock); |
1134 | while (swap_count(*swap_map) && !retval && !shmem && | 1141 | while (swap_count(*swap_map) && !retval && |
1135 | (p = p->next) != &start_mm->mmlist) { | 1142 | (p = p->next) != &start_mm->mmlist) { |
1136 | mm = list_entry(p, struct mm_struct, mmlist); | 1143 | mm = list_entry(p, struct mm_struct, mmlist); |
1137 | if (!atomic_inc_not_zero(&mm->mm_users)) | 1144 | if (!atomic_inc_not_zero(&mm->mm_users)) |
@@ -1145,14 +1152,12 @@ static int try_to_unuse(unsigned int type) | |||
1145 | swcount = *swap_map; | 1152 | swcount = *swap_map; |
1146 | if (!swap_count(swcount)) /* any usage ? */ | 1153 | if (!swap_count(swcount)) /* any usage ? */ |
1147 | ; | 1154 | ; |
1148 | else if (mm == &init_mm) { | 1155 | else if (mm == &init_mm) |
1149 | set_start_mm = 1; | 1156 | set_start_mm = 1; |
1150 | shmem = shmem_unuse(entry, page); | 1157 | else |
1151 | } else | ||
1152 | retval = unuse_mm(mm, entry, page); | 1158 | retval = unuse_mm(mm, entry, page); |
1153 | 1159 | ||
1154 | if (set_start_mm && | 1160 | if (set_start_mm && *swap_map < swcount) { |
1155 | swap_count(*swap_map) < swcount) { | ||
1156 | mmput(new_start_mm); | 1161 | mmput(new_start_mm); |
1157 | atomic_inc(&mm->mm_users); | 1162 | atomic_inc(&mm->mm_users); |
1158 | new_start_mm = mm; | 1163 | new_start_mm = mm; |
@@ -1165,13 +1170,6 @@ static int try_to_unuse(unsigned int type) | |||
1165 | mmput(start_mm); | 1170 | mmput(start_mm); |
1166 | start_mm = new_start_mm; | 1171 | start_mm = new_start_mm; |
1167 | } | 1172 | } |
1168 | if (shmem) { | ||
1169 | /* page has already been unlocked and released */ | ||
1170 | if (shmem > 0) | ||
1171 | continue; | ||
1172 | retval = shmem; | ||
1173 | break; | ||
1174 | } | ||
1175 | if (retval) { | 1173 | if (retval) { |
1176 | unlock_page(page); | 1174 | unlock_page(page); |
1177 | page_cache_release(page); | 1175 | page_cache_release(page); |
@@ -1179,30 +1177,6 @@ static int try_to_unuse(unsigned int type) | |||
1179 | } | 1177 | } |
1180 | 1178 | ||
1181 | /* | 1179 | /* |
1182 | * How could swap count reach 0x7ffe ? | ||
1183 | * There's no way to repeat a swap page within an mm | ||
1184 | * (except in shmem, where it's the shared object which takes | ||
1185 | * the reference count)? | ||
1186 | * We believe SWAP_MAP_MAX cannot occur.(if occur, unsigned | ||
1187 | * short is too small....) | ||
1188 | * If that's wrong, then we should worry more about | ||
1189 | * exit_mmap() and do_munmap() cases described above: | ||
1190 | * we might be resetting SWAP_MAP_MAX too early here. | ||
1191 | * We know "Undead"s can happen, they're okay, so don't | ||
1192 | * report them; but do report if we reset SWAP_MAP_MAX. | ||
1193 | */ | ||
1194 | /* We might release the lock_page() in unuse_mm(). */ | ||
1195 | if (!PageSwapCache(page) || page_private(page) != entry.val) | ||
1196 | goto retry; | ||
1197 | |||
1198 | if (swap_count(*swap_map) == SWAP_MAP_MAX) { | ||
1199 | spin_lock(&swap_lock); | ||
1200 | *swap_map = encode_swapmap(0, true); | ||
1201 | spin_unlock(&swap_lock); | ||
1202 | reset_overflow = 1; | ||
1203 | } | ||
1204 | |||
1205 | /* | ||
1206 | * If a reference remains (rare), we would like to leave | 1180 | * If a reference remains (rare), we would like to leave |
1207 | * the page in the swap cache; but try_to_unmap could | 1181 | * the page in the swap cache; but try_to_unmap could |
1208 | * then re-duplicate the entry once we drop page lock, | 1182 | * then re-duplicate the entry once we drop page lock, |
@@ -1214,6 +1188,12 @@ static int try_to_unuse(unsigned int type) | |||
1214 | * read from disk into another page. Splitting into two | 1188 | * read from disk into another page. Splitting into two |
1215 | * pages would be incorrect if swap supported "shared | 1189 | * pages would be incorrect if swap supported "shared |
1216 | * private" pages, but they are handled by tmpfs files. | 1190 | * private" pages, but they are handled by tmpfs files. |
1191 | * | ||
1192 | * Given how unuse_vma() targets one particular offset | ||
1193 | * in an anon_vma, once the anon_vma has been determined, | ||
1194 | * this splitting happens to be just what is needed to | ||
1195 | * handle where KSM pages have been swapped out: re-reading | ||
1196 | * is unnecessarily slow, but we can fix that later on. | ||
1217 | */ | 1197 | */ |
1218 | if (swap_count(*swap_map) && | 1198 | if (swap_count(*swap_map) && |
1219 | PageDirty(page) && PageSwapCache(page)) { | 1199 | PageDirty(page) && PageSwapCache(page)) { |
@@ -1243,7 +1223,6 @@ static int try_to_unuse(unsigned int type) | |||
1243 | * mark page dirty so shrink_page_list will preserve it. | 1223 | * mark page dirty so shrink_page_list will preserve it. |
1244 | */ | 1224 | */ |
1245 | SetPageDirty(page); | 1225 | SetPageDirty(page); |
1246 | retry: | ||
1247 | unlock_page(page); | 1226 | unlock_page(page); |
1248 | page_cache_release(page); | 1227 | page_cache_release(page); |
1249 | 1228 | ||
@@ -1255,10 +1234,6 @@ retry: | |||
1255 | } | 1234 | } |
1256 | 1235 | ||
1257 | mmput(start_mm); | 1236 | mmput(start_mm); |
1258 | if (reset_overflow) { | ||
1259 | printk(KERN_WARNING "swapoff: cleared swap entry overflow\n"); | ||
1260 | swap_overflow = 0; | ||
1261 | } | ||
1262 | return retval; | 1237 | return retval; |
1263 | } | 1238 | } |
1264 | 1239 | ||
@@ -1271,10 +1246,10 @@ retry: | |||
1271 | static void drain_mmlist(void) | 1246 | static void drain_mmlist(void) |
1272 | { | 1247 | { |
1273 | struct list_head *p, *next; | 1248 | struct list_head *p, *next; |
1274 | unsigned int i; | 1249 | unsigned int type; |
1275 | 1250 | ||
1276 | for (i = 0; i < nr_swapfiles; i++) | 1251 | for (type = 0; type < nr_swapfiles; type++) |
1277 | if (swap_info[i].inuse_pages) | 1252 | if (swap_info[type]->inuse_pages) |
1278 | return; | 1253 | return; |
1279 | spin_lock(&mmlist_lock); | 1254 | spin_lock(&mmlist_lock); |
1280 | list_for_each_safe(p, next, &init_mm.mmlist) | 1255 | list_for_each_safe(p, next, &init_mm.mmlist) |
@@ -1284,12 +1259,23 @@ static void drain_mmlist(void) | |||
1284 | 1259 | ||
1285 | /* | 1260 | /* |
1286 | * Use this swapdev's extent info to locate the (PAGE_SIZE) block which | 1261 | * Use this swapdev's extent info to locate the (PAGE_SIZE) block which |
1287 | * corresponds to page offset `offset'. | 1262 | * corresponds to page offset for the specified swap entry. |
1263 | * Note that the type of this function is sector_t, but it returns page offset | ||
1264 | * into the bdev, not sector offset. | ||
1288 | */ | 1265 | */ |
1289 | sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset) | 1266 | static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev) |
1290 | { | 1267 | { |
1291 | struct swap_extent *se = sis->curr_swap_extent; | 1268 | struct swap_info_struct *sis; |
1292 | struct swap_extent *start_se = se; | 1269 | struct swap_extent *start_se; |
1270 | struct swap_extent *se; | ||
1271 | pgoff_t offset; | ||
1272 | |||
1273 | sis = swap_info[swp_type(entry)]; | ||
1274 | *bdev = sis->bdev; | ||
1275 | |||
1276 | offset = swp_offset(entry); | ||
1277 | start_se = sis->curr_swap_extent; | ||
1278 | se = start_se; | ||
1293 | 1279 | ||
1294 | for ( ; ; ) { | 1280 | for ( ; ; ) { |
1295 | struct list_head *lh; | 1281 | struct list_head *lh; |
@@ -1299,40 +1285,31 @@ sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset) | |||
1299 | return se->start_block + (offset - se->start_page); | 1285 | return se->start_block + (offset - se->start_page); |
1300 | } | 1286 | } |
1301 | lh = se->list.next; | 1287 | lh = se->list.next; |
1302 | if (lh == &sis->extent_list) | ||
1303 | lh = lh->next; | ||
1304 | se = list_entry(lh, struct swap_extent, list); | 1288 | se = list_entry(lh, struct swap_extent, list); |
1305 | sis->curr_swap_extent = se; | 1289 | sis->curr_swap_extent = se; |
1306 | BUG_ON(se == start_se); /* It *must* be present */ | 1290 | BUG_ON(se == start_se); /* It *must* be present */ |
1307 | } | 1291 | } |
1308 | } | 1292 | } |
1309 | 1293 | ||
1310 | #ifdef CONFIG_HIBERNATION | ||
1311 | /* | 1294 | /* |
1312 | * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev | 1295 | * Returns the page offset into bdev for the specified page's swap entry. |
1313 | * corresponding to given index in swap_info (swap type). | ||
1314 | */ | 1296 | */ |
1315 | sector_t swapdev_block(int swap_type, pgoff_t offset) | 1297 | sector_t map_swap_page(struct page *page, struct block_device **bdev) |
1316 | { | 1298 | { |
1317 | struct swap_info_struct *sis; | 1299 | swp_entry_t entry; |
1318 | 1300 | entry.val = page_private(page); | |
1319 | if (swap_type >= nr_swapfiles) | 1301 | return map_swap_entry(entry, bdev); |
1320 | return 0; | ||
1321 | |||
1322 | sis = swap_info + swap_type; | ||
1323 | return (sis->flags & SWP_WRITEOK) ? map_swap_page(sis, offset) : 0; | ||
1324 | } | 1302 | } |
1325 | #endif /* CONFIG_HIBERNATION */ | ||
1326 | 1303 | ||
1327 | /* | 1304 | /* |
1328 | * Free all of a swapdev's extent information | 1305 | * Free all of a swapdev's extent information |
1329 | */ | 1306 | */ |
1330 | static void destroy_swap_extents(struct swap_info_struct *sis) | 1307 | static void destroy_swap_extents(struct swap_info_struct *sis) |
1331 | { | 1308 | { |
1332 | while (!list_empty(&sis->extent_list)) { | 1309 | while (!list_empty(&sis->first_swap_extent.list)) { |
1333 | struct swap_extent *se; | 1310 | struct swap_extent *se; |
1334 | 1311 | ||
1335 | se = list_entry(sis->extent_list.next, | 1312 | se = list_entry(sis->first_swap_extent.list.next, |
1336 | struct swap_extent, list); | 1313 | struct swap_extent, list); |
1337 | list_del(&se->list); | 1314 | list_del(&se->list); |
1338 | kfree(se); | 1315 | kfree(se); |
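The rewritten map_swap_page()/map_swap_entry() pair above takes a swp_entry_t, looks up swap_info[swp_type(entry)], and walks that device's extent list (caching the hit in curr_swap_extent) to turn a swap page offset into a page offset on the backing block device. A minimal userspace sketch of that extent lookup; struct extent and lookup_block() are invented names for the sketch, not kernel API.

#include <stdio.h>

/* Illustrative only: a swap extent maps a run of swap page offsets onto a
 * run of blocks in the backing device, like the kernel's struct swap_extent. */
struct extent {
        unsigned long start_page;       /* first swap offset covered */
        unsigned long nr_pages;         /* length of the run */
        unsigned long start_block;      /* first device block of the run */
};

/* Map a swap page offset to its device block, as map_swap_entry() does by
 * walking the extent list from sis->curr_swap_extent. */
static long lookup_block(const struct extent *ext, int n, unsigned long offset)
{
        for (int i = 0; i < n; i++) {
                if (offset >= ext[i].start_page &&
                    offset < ext[i].start_page + ext[i].nr_pages)
                        return ext[i].start_block + (offset - ext[i].start_page);
        }
        return -1;      /* the kernel BUG()s instead: the extent must exist */
}

int main(void)
{
        struct extent map[] = {
                { 0,   100, 5000 },     /* offsets 0..99    -> blocks 5000..5099 */
                { 100,  50, 9000 },     /* offsets 100..149 -> blocks 9000..9049 */
        };

        printf("offset 42  -> block %ld\n", lookup_block(map, 2, 42));  /* 5042 */
        printf("offset 120 -> block %ld\n", lookup_block(map, 2, 120)); /* 9020 */
        return 0;
}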
@@ -1353,8 +1330,15 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, | |||
1353 | struct swap_extent *new_se; | 1330 | struct swap_extent *new_se; |
1354 | struct list_head *lh; | 1331 | struct list_head *lh; |
1355 | 1332 | ||
1356 | lh = sis->extent_list.prev; /* The highest page extent */ | 1333 | if (start_page == 0) { |
1357 | if (lh != &sis->extent_list) { | 1334 | se = &sis->first_swap_extent; |
1335 | sis->curr_swap_extent = se; | ||
1336 | se->start_page = 0; | ||
1337 | se->nr_pages = nr_pages; | ||
1338 | se->start_block = start_block; | ||
1339 | return 1; | ||
1340 | } else { | ||
1341 | lh = sis->first_swap_extent.list.prev; /* Highest extent */ | ||
1358 | se = list_entry(lh, struct swap_extent, list); | 1342 | se = list_entry(lh, struct swap_extent, list); |
1359 | BUG_ON(se->start_page + se->nr_pages != start_page); | 1343 | BUG_ON(se->start_page + se->nr_pages != start_page); |
1360 | if (se->start_block + se->nr_pages == start_block) { | 1344 | if (se->start_block + se->nr_pages == start_block) { |
@@ -1374,7 +1358,7 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, | |||
1374 | new_se->nr_pages = nr_pages; | 1358 | new_se->nr_pages = nr_pages; |
1375 | new_se->start_block = start_block; | 1359 | new_se->start_block = start_block; |
1376 | 1360 | ||
1377 | list_add_tail(&new_se->list, &sis->extent_list); | 1361 | list_add_tail(&new_se->list, &sis->first_swap_extent.list); |
1378 | return 1; | 1362 | return 1; |
1379 | } | 1363 | } |
1380 | 1364 | ||
@@ -1426,7 +1410,7 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) | |||
1426 | if (S_ISBLK(inode->i_mode)) { | 1410 | if (S_ISBLK(inode->i_mode)) { |
1427 | ret = add_swap_extent(sis, 0, sis->max, 0); | 1411 | ret = add_swap_extent(sis, 0, sis->max, 0); |
1428 | *span = sis->pages; | 1412 | *span = sis->pages; |
1429 | goto done; | 1413 | goto out; |
1430 | } | 1414 | } |
1431 | 1415 | ||
1432 | blkbits = inode->i_blkbits; | 1416 | blkbits = inode->i_blkbits; |
@@ -1497,25 +1481,22 @@ reprobe: | |||
1497 | sis->max = page_no; | 1481 | sis->max = page_no; |
1498 | sis->pages = page_no - 1; | 1482 | sis->pages = page_no - 1; |
1499 | sis->highest_bit = page_no - 1; | 1483 | sis->highest_bit = page_no - 1; |
1500 | done: | 1484 | out: |
1501 | sis->curr_swap_extent = list_entry(sis->extent_list.prev, | 1485 | return ret; |
1502 | struct swap_extent, list); | ||
1503 | goto out; | ||
1504 | bad_bmap: | 1486 | bad_bmap: |
1505 | printk(KERN_ERR "swapon: swapfile has holes\n"); | 1487 | printk(KERN_ERR "swapon: swapfile has holes\n"); |
1506 | ret = -EINVAL; | 1488 | ret = -EINVAL; |
1507 | out: | 1489 | goto out; |
1508 | return ret; | ||
1509 | } | 1490 | } |
1510 | 1491 | ||
1511 | SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | 1492 | SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) |
1512 | { | 1493 | { |
1513 | struct swap_info_struct * p = NULL; | 1494 | struct swap_info_struct *p = NULL; |
1514 | unsigned short *swap_map; | 1495 | unsigned char *swap_map; |
1515 | struct file *swap_file, *victim; | 1496 | struct file *swap_file, *victim; |
1516 | struct address_space *mapping; | 1497 | struct address_space *mapping; |
1517 | struct inode *inode; | 1498 | struct inode *inode; |
1518 | char * pathname; | 1499 | char *pathname; |
1519 | int i, type, prev; | 1500 | int i, type, prev; |
1520 | int err; | 1501 | int err; |
1521 | 1502 | ||
@@ -1536,8 +1517,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1536 | mapping = victim->f_mapping; | 1517 | mapping = victim->f_mapping; |
1537 | prev = -1; | 1518 | prev = -1; |
1538 | spin_lock(&swap_lock); | 1519 | spin_lock(&swap_lock); |
1539 | for (type = swap_list.head; type >= 0; type = swap_info[type].next) { | 1520 | for (type = swap_list.head; type >= 0; type = swap_info[type]->next) { |
1540 | p = swap_info + type; | 1521 | p = swap_info[type]; |
1541 | if (p->flags & SWP_WRITEOK) { | 1522 | if (p->flags & SWP_WRITEOK) { |
1542 | if (p->swap_file->f_mapping == mapping) | 1523 | if (p->swap_file->f_mapping == mapping) |
1543 | break; | 1524 | break; |
@@ -1556,18 +1537,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1556 | spin_unlock(&swap_lock); | 1537 | spin_unlock(&swap_lock); |
1557 | goto out_dput; | 1538 | goto out_dput; |
1558 | } | 1539 | } |
1559 | if (prev < 0) { | 1540 | if (prev < 0) |
1560 | swap_list.head = p->next; | 1541 | swap_list.head = p->next; |
1561 | } else { | 1542 | else |
1562 | swap_info[prev].next = p->next; | 1543 | swap_info[prev]->next = p->next; |
1563 | } | ||
1564 | if (type == swap_list.next) { | 1544 | if (type == swap_list.next) { |
1565 | /* just pick something that's safe... */ | 1545 | /* just pick something that's safe... */ |
1566 | swap_list.next = swap_list.head; | 1546 | swap_list.next = swap_list.head; |
1567 | } | 1547 | } |
1568 | if (p->prio < 0) { | 1548 | if (p->prio < 0) { |
1569 | for (i = p->next; i >= 0; i = swap_info[i].next) | 1549 | for (i = p->next; i >= 0; i = swap_info[i]->next) |
1570 | swap_info[i].prio = p->prio--; | 1550 | swap_info[i]->prio = p->prio--; |
1571 | least_priority++; | 1551 | least_priority++; |
1572 | } | 1552 | } |
1573 | nr_swap_pages -= p->pages; | 1553 | nr_swap_pages -= p->pages; |
@@ -1585,16 +1565,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1585 | if (p->prio < 0) | 1565 | if (p->prio < 0) |
1586 | p->prio = --least_priority; | 1566 | p->prio = --least_priority; |
1587 | prev = -1; | 1567 | prev = -1; |
1588 | for (i = swap_list.head; i >= 0; i = swap_info[i].next) { | 1568 | for (i = swap_list.head; i >= 0; i = swap_info[i]->next) { |
1589 | if (p->prio >= swap_info[i].prio) | 1569 | if (p->prio >= swap_info[i]->prio) |
1590 | break; | 1570 | break; |
1591 | prev = i; | 1571 | prev = i; |
1592 | } | 1572 | } |
1593 | p->next = i; | 1573 | p->next = i; |
1594 | if (prev < 0) | 1574 | if (prev < 0) |
1595 | swap_list.head = swap_list.next = p - swap_info; | 1575 | swap_list.head = swap_list.next = type; |
1596 | else | 1576 | else |
1597 | swap_info[prev].next = p - swap_info; | 1577 | swap_info[prev]->next = type; |
1598 | nr_swap_pages += p->pages; | 1578 | nr_swap_pages += p->pages; |
1599 | total_swap_pages += p->pages; | 1579 | total_swap_pages += p->pages; |
1600 | p->flags |= SWP_WRITEOK; | 1580 | p->flags |= SWP_WRITEOK; |
@@ -1607,6 +1587,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1607 | up_write(&swap_unplug_sem); | 1587 | up_write(&swap_unplug_sem); |
1608 | 1588 | ||
1609 | destroy_swap_extents(p); | 1589 | destroy_swap_extents(p); |
1590 | if (p->flags & SWP_CONTINUED) | ||
1591 | free_swap_count_continuations(p); | ||
1592 | |||
1610 | mutex_lock(&swapon_mutex); | 1593 | mutex_lock(&swapon_mutex); |
1611 | spin_lock(&swap_lock); | 1594 | spin_lock(&swap_lock); |
1612 | drain_mmlist(); | 1595 | drain_mmlist(); |
@@ -1654,8 +1637,8 @@ out: | |||
1654 | /* iterator */ | 1637 | /* iterator */ |
1655 | static void *swap_start(struct seq_file *swap, loff_t *pos) | 1638 | static void *swap_start(struct seq_file *swap, loff_t *pos) |
1656 | { | 1639 | { |
1657 | struct swap_info_struct *ptr = swap_info; | 1640 | struct swap_info_struct *si; |
1658 | int i; | 1641 | int type; |
1659 | loff_t l = *pos; | 1642 | loff_t l = *pos; |
1660 | 1643 | ||
1661 | mutex_lock(&swapon_mutex); | 1644 | mutex_lock(&swapon_mutex); |
@@ -1663,11 +1646,13 @@ static void *swap_start(struct seq_file *swap, loff_t *pos) | |||
1663 | if (!l) | 1646 | if (!l) |
1664 | return SEQ_START_TOKEN; | 1647 | return SEQ_START_TOKEN; |
1665 | 1648 | ||
1666 | for (i = 0; i < nr_swapfiles; i++, ptr++) { | 1649 | for (type = 0; type < nr_swapfiles; type++) { |
1667 | if (!(ptr->flags & SWP_USED) || !ptr->swap_map) | 1650 | smp_rmb(); /* read nr_swapfiles before swap_info[type] */ |
1651 | si = swap_info[type]; | ||
1652 | if (!(si->flags & SWP_USED) || !si->swap_map) | ||
1668 | continue; | 1653 | continue; |
1669 | if (!--l) | 1654 | if (!--l) |
1670 | return ptr; | 1655 | return si; |
1671 | } | 1656 | } |
1672 | 1657 | ||
1673 | return NULL; | 1658 | return NULL; |
@@ -1675,21 +1660,21 @@ static void *swap_start(struct seq_file *swap, loff_t *pos) | |||
1675 | 1660 | ||
1676 | static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) | 1661 | static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) |
1677 | { | 1662 | { |
1678 | struct swap_info_struct *ptr; | 1663 | struct swap_info_struct *si = v; |
1679 | struct swap_info_struct *endptr = swap_info + nr_swapfiles; | 1664 | int type; |
1680 | 1665 | ||
1681 | if (v == SEQ_START_TOKEN) | 1666 | if (v == SEQ_START_TOKEN) |
1682 | ptr = swap_info; | 1667 | type = 0; |
1683 | else { | 1668 | else |
1684 | ptr = v; | 1669 | type = si->type + 1; |
1685 | ptr++; | ||
1686 | } | ||
1687 | 1670 | ||
1688 | for (; ptr < endptr; ptr++) { | 1671 | for (; type < nr_swapfiles; type++) { |
1689 | if (!(ptr->flags & SWP_USED) || !ptr->swap_map) | 1672 | smp_rmb(); /* read nr_swapfiles before swap_info[type] */ |
1673 | si = swap_info[type]; | ||
1674 | if (!(si->flags & SWP_USED) || !si->swap_map) | ||
1690 | continue; | 1675 | continue; |
1691 | ++*pos; | 1676 | ++*pos; |
1692 | return ptr; | 1677 | return si; |
1693 | } | 1678 | } |
1694 | 1679 | ||
1695 | return NULL; | 1680 | return NULL; |
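swap_start() and swap_next() above now iterate /proc/swaps by type index into an array of swap_info_struct pointers, resuming from si->type + 1 instead of doing pointer arithmetic over a flat array; the smp_rmb() pairs with the smp_wmb() added in sys_swapon() further down. A small sketch of that resume-by-index iteration over a sparse slot table (the slot names are invented):

#include <stdio.h>

#define MAX_SLOTS 8

/* Illustrative stand-in for swap_info[]: a sparse table of pointers whose
 * positions stay valid even when a slot is currently unused. */
struct slot { int in_use; const char *name; };
static struct slot *slots[MAX_SLOTS];
static int nr_slots;    /* like nr_swapfiles: only ever grows */

/* Resume iteration by index, as swap_next() now does with si->type + 1. */
static struct slot *next_slot(int *type)
{
        for (; *type < nr_slots; (*type)++) {
                struct slot *s = slots[*type];

                if (s && s->in_use)
                        return s;
        }
        return NULL;
}

int main(void)
{
        static struct slot a = { 1, "sda2" }, b = { 0, "retired" }, c = { 1, "swapfile" };

        slots[0] = &a; slots[1] = &b; slots[2] = &c;
        nr_slots = 3;

        for (int type = 0; ; type++) {
                struct slot *s = next_slot(&type);

                if (!s)
                        break;
                printf("slot %d: %s\n", type, s->name);
        }
        return 0;
}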
@@ -1702,24 +1687,24 @@ static void swap_stop(struct seq_file *swap, void *v) | |||
1702 | 1687 | ||
1703 | static int swap_show(struct seq_file *swap, void *v) | 1688 | static int swap_show(struct seq_file *swap, void *v) |
1704 | { | 1689 | { |
1705 | struct swap_info_struct *ptr = v; | 1690 | struct swap_info_struct *si = v; |
1706 | struct file *file; | 1691 | struct file *file; |
1707 | int len; | 1692 | int len; |
1708 | 1693 | ||
1709 | if (ptr == SEQ_START_TOKEN) { | 1694 | if (si == SEQ_START_TOKEN) { |
1710 | seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); | 1695 | seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); |
1711 | return 0; | 1696 | return 0; |
1712 | } | 1697 | } |
1713 | 1698 | ||
1714 | file = ptr->swap_file; | 1699 | file = si->swap_file; |
1715 | len = seq_path(swap, &file->f_path, " \t\n\\"); | 1700 | len = seq_path(swap, &file->f_path, " \t\n\\"); |
1716 | seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", | 1701 | seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", |
1717 | len < 40 ? 40 - len : 1, " ", | 1702 | len < 40 ? 40 - len : 1, " ", |
1718 | S_ISBLK(file->f_path.dentry->d_inode->i_mode) ? | 1703 | S_ISBLK(file->f_path.dentry->d_inode->i_mode) ? |
1719 | "partition" : "file\t", | 1704 | "partition" : "file\t", |
1720 | ptr->pages << (PAGE_SHIFT - 10), | 1705 | si->pages << (PAGE_SHIFT - 10), |
1721 | ptr->inuse_pages << (PAGE_SHIFT - 10), | 1706 | si->inuse_pages << (PAGE_SHIFT - 10), |
1722 | ptr->prio); | 1707 | si->prio); |
1723 | return 0; | 1708 | return 0; |
1724 | } | 1709 | } |
1725 | 1710 | ||
@@ -1766,7 +1751,7 @@ late_initcall(max_swapfiles_check); | |||
1766 | */ | 1751 | */ |
1767 | SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | 1752 | SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) |
1768 | { | 1753 | { |
1769 | struct swap_info_struct * p; | 1754 | struct swap_info_struct *p; |
1770 | char *name = NULL; | 1755 | char *name = NULL; |
1771 | struct block_device *bdev = NULL; | 1756 | struct block_device *bdev = NULL; |
1772 | struct file *swap_file = NULL; | 1757 | struct file *swap_file = NULL; |
@@ -1780,30 +1765,52 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
1780 | sector_t span; | 1765 | sector_t span; |
1781 | unsigned long maxpages = 1; | 1766 | unsigned long maxpages = 1; |
1782 | unsigned long swapfilepages; | 1767 | unsigned long swapfilepages; |
1783 | unsigned short *swap_map = NULL; | 1768 | unsigned char *swap_map = NULL; |
1784 | struct page *page = NULL; | 1769 | struct page *page = NULL; |
1785 | struct inode *inode = NULL; | 1770 | struct inode *inode = NULL; |
1786 | int did_down = 0; | 1771 | int did_down = 0; |
1787 | 1772 | ||
1788 | if (!capable(CAP_SYS_ADMIN)) | 1773 | if (!capable(CAP_SYS_ADMIN)) |
1789 | return -EPERM; | 1774 | return -EPERM; |
1775 | |||
1776 | p = kzalloc(sizeof(*p), GFP_KERNEL); | ||
1777 | if (!p) | ||
1778 | return -ENOMEM; | ||
1779 | |||
1790 | spin_lock(&swap_lock); | 1780 | spin_lock(&swap_lock); |
1791 | p = swap_info; | 1781 | for (type = 0; type < nr_swapfiles; type++) { |
1792 | for (type = 0 ; type < nr_swapfiles ; type++,p++) | 1782 | if (!(swap_info[type]->flags & SWP_USED)) |
1793 | if (!(p->flags & SWP_USED)) | ||
1794 | break; | 1783 | break; |
1784 | } | ||
1795 | error = -EPERM; | 1785 | error = -EPERM; |
1796 | if (type >= MAX_SWAPFILES) { | 1786 | if (type >= MAX_SWAPFILES) { |
1797 | spin_unlock(&swap_lock); | 1787 | spin_unlock(&swap_lock); |
1788 | kfree(p); | ||
1798 | goto out; | 1789 | goto out; |
1799 | } | 1790 | } |
1800 | if (type >= nr_swapfiles) | 1791 | if (type >= nr_swapfiles) { |
1801 | nr_swapfiles = type+1; | 1792 | p->type = type; |
1802 | memset(p, 0, sizeof(*p)); | 1793 | swap_info[type] = p; |
1803 | INIT_LIST_HEAD(&p->extent_list); | 1794 | /* |
1795 | * Write swap_info[type] before nr_swapfiles, in case a | ||
1796 | * racing procfs swap_start() or swap_next() is reading them. | ||
1797 | * (We never shrink nr_swapfiles, we never free this entry.) | ||
1798 | */ | ||
1799 | smp_wmb(); | ||
1800 | nr_swapfiles++; | ||
1801 | } else { | ||
1802 | kfree(p); | ||
1803 | p = swap_info[type]; | ||
1804 | /* | ||
1805 | * Do not memset this entry: a racing procfs swap_next() | ||
1806 | * would be relying on p->type to remain valid. | ||
1807 | */ | ||
1808 | } | ||
1809 | INIT_LIST_HEAD(&p->first_swap_extent.list); | ||
1804 | p->flags = SWP_USED; | 1810 | p->flags = SWP_USED; |
1805 | p->next = -1; | 1811 | p->next = -1; |
1806 | spin_unlock(&swap_lock); | 1812 | spin_unlock(&swap_lock); |
1813 | |||
1807 | name = getname(specialfile); | 1814 | name = getname(specialfile); |
1808 | error = PTR_ERR(name); | 1815 | error = PTR_ERR(name); |
1809 | if (IS_ERR(name)) { | 1816 | if (IS_ERR(name)) { |
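The hunk above publishes the newly allocated entry with swap_info[type] = p, then smp_wmb(), then nr_swapfiles++, so the /proc/swaps iterators that read nr_swapfiles before swap_info[type] (with smp_rmb() in between) can never see the count raised without the pointer being visible. A userspace analogue of that publish order, sketched with C11 release/acquire atomics rather than the kernel barriers, and run single-threaded here only to show the idiom:

#include <stdatomic.h>
#include <stdio.h>

#define MAX_ENTRIES 8

struct info { int id; };

static struct info *table[MAX_ENTRIES];
static atomic_int nr_entries;   /* plays the role of nr_swapfiles */

/* Writer: make the slot contents visible before advertising the new count
 * (the kernel does table[type] = p; smp_wmb(); nr_swapfiles++). */
static void publish(int type, struct info *p)
{
        table[type] = p;
        atomic_store_explicit(&nr_entries, type + 1, memory_order_release);
}

/* Reader: load the count first, then only the slots it covers
 * (the kernel reads nr_swapfiles, then smp_rmb(), then swap_info[type]). */
static void dump(void)
{
        int n = atomic_load_explicit(&nr_entries, memory_order_acquire);

        for (int type = 0; type < n; type++)
                printf("entry %d: id %d\n", type, table[type]->id);
}

int main(void)
{
        static struct info a = { .id = 42 };

        publish(0, &a);
        dump();
        return 0;
}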
@@ -1823,7 +1830,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
1823 | 1830 | ||
1824 | error = -EBUSY; | 1831 | error = -EBUSY; |
1825 | for (i = 0; i < nr_swapfiles; i++) { | 1832 | for (i = 0; i < nr_swapfiles; i++) { |
1826 | struct swap_info_struct *q = &swap_info[i]; | 1833 | struct swap_info_struct *q = swap_info[i]; |
1827 | 1834 | ||
1828 | if (i == type || !q->swap_file) | 1835 | if (i == type || !q->swap_file) |
1829 | continue; | 1836 | continue; |
@@ -1898,6 +1905,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
1898 | 1905 | ||
1899 | p->lowest_bit = 1; | 1906 | p->lowest_bit = 1; |
1900 | p->cluster_next = 1; | 1907 | p->cluster_next = 1; |
1908 | p->cluster_nr = 0; | ||
1901 | 1909 | ||
1902 | /* | 1910 | /* |
1903 | * Find out how many pages are allowed for a single swap | 1911 | * Find out how many pages are allowed for a single swap |
@@ -1933,13 +1941,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
1933 | goto bad_swap; | 1941 | goto bad_swap; |
1934 | 1942 | ||
1935 | /* OK, set up the swap map and apply the bad block list */ | 1943 | /* OK, set up the swap map and apply the bad block list */ |
1936 | swap_map = vmalloc(maxpages * sizeof(short)); | 1944 | swap_map = vmalloc(maxpages); |
1937 | if (!swap_map) { | 1945 | if (!swap_map) { |
1938 | error = -ENOMEM; | 1946 | error = -ENOMEM; |
1939 | goto bad_swap; | 1947 | goto bad_swap; |
1940 | } | 1948 | } |
1941 | 1949 | ||
1942 | memset(swap_map, 0, maxpages * sizeof(short)); | 1950 | memset(swap_map, 0, maxpages); |
1943 | for (i = 0; i < swap_header->info.nr_badpages; i++) { | 1951 | for (i = 0; i < swap_header->info.nr_badpages; i++) { |
1944 | int page_nr = swap_header->info.badpages[i]; | 1952 | int page_nr = swap_header->info.badpages[i]; |
1945 | if (page_nr <= 0 || page_nr >= swap_header->info.last_page) { | 1953 | if (page_nr <= 0 || page_nr >= swap_header->info.last_page) { |
@@ -1974,12 +1982,14 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
1974 | goto bad_swap; | 1982 | goto bad_swap; |
1975 | } | 1983 | } |
1976 | 1984 | ||
1977 | if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { | 1985 | if (p->bdev) { |
1978 | p->flags |= SWP_SOLIDSTATE; | 1986 | if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { |
1979 | p->cluster_next = 1 + (random32() % p->highest_bit); | 1987 | p->flags |= SWP_SOLIDSTATE; |
1988 | p->cluster_next = 1 + (random32() % p->highest_bit); | ||
1989 | } | ||
1990 | if (discard_swap(p) == 0) | ||
1991 | p->flags |= SWP_DISCARDABLE; | ||
1980 | } | 1992 | } |
1981 | if (discard_swap(p) == 0) | ||
1982 | p->flags |= SWP_DISCARDABLE; | ||
1983 | 1993 | ||
1984 | mutex_lock(&swapon_mutex); | 1994 | mutex_lock(&swapon_mutex); |
1985 | spin_lock(&swap_lock); | 1995 | spin_lock(&swap_lock); |
@@ -2002,18 +2012,16 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
2002 | 2012 | ||
2003 | /* insert swap space into swap_list: */ | 2013 | /* insert swap space into swap_list: */ |
2004 | prev = -1; | 2014 | prev = -1; |
2005 | for (i = swap_list.head; i >= 0; i = swap_info[i].next) { | 2015 | for (i = swap_list.head; i >= 0; i = swap_info[i]->next) { |
2006 | if (p->prio >= swap_info[i].prio) { | 2016 | if (p->prio >= swap_info[i]->prio) |
2007 | break; | 2017 | break; |
2008 | } | ||
2009 | prev = i; | 2018 | prev = i; |
2010 | } | 2019 | } |
2011 | p->next = i; | 2020 | p->next = i; |
2012 | if (prev < 0) { | 2021 | if (prev < 0) |
2013 | swap_list.head = swap_list.next = p - swap_info; | 2022 | swap_list.head = swap_list.next = type; |
2014 | } else { | 2023 | else |
2015 | swap_info[prev].next = p - swap_info; | 2024 | swap_info[prev]->next = type; |
2016 | } | ||
2017 | spin_unlock(&swap_lock); | 2025 | spin_unlock(&swap_lock); |
2018 | mutex_unlock(&swapon_mutex); | 2026 | mutex_unlock(&swapon_mutex); |
2019 | error = 0; | 2027 | error = 0; |
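With swap_info[] now an array of pointers, both the insertion above and the swapoff error path link entries through swap_info[i]->next using the type index itself rather than p - swap_info, keeping the list in descending priority order. A compact sketch of that index-linked priority insert (field names only modelled on the kernel's):

#include <stdio.h>

struct swapdev { int prio; int next; }; /* next is an index; -1 ends the list */

static struct swapdev dev[4];
static int head = -1;

/* Insert device 'type' keeping the list sorted by descending priority,
 * mirroring the swapon()/swapoff() loops over swap_info[i]->next. */
static void insert(int type)
{
        int i, prev = -1;

        for (i = head; i >= 0; i = dev[i].next) {
                if (dev[type].prio >= dev[i].prio)
                        break;
                prev = i;
        }
        dev[type].next = i;
        if (prev < 0)
                head = type;
        else
                dev[prev].next = type;
}

int main(void)
{
        dev[0].prio = -1;  insert(0);
        dev[1].prio = 10;  insert(1);
        dev[2].prio = 5;   insert(2);

        for (int i = head; i >= 0; i = dev[i].next)
                printf("type %d (prio %d)\n", i, dev[i].prio);
        return 0;
}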
@@ -2050,15 +2058,15 @@ out: | |||
2050 | 2058 | ||
2051 | void si_swapinfo(struct sysinfo *val) | 2059 | void si_swapinfo(struct sysinfo *val) |
2052 | { | 2060 | { |
2053 | unsigned int i; | 2061 | unsigned int type; |
2054 | unsigned long nr_to_be_unused = 0; | 2062 | unsigned long nr_to_be_unused = 0; |
2055 | 2063 | ||
2056 | spin_lock(&swap_lock); | 2064 | spin_lock(&swap_lock); |
2057 | for (i = 0; i < nr_swapfiles; i++) { | 2065 | for (type = 0; type < nr_swapfiles; type++) { |
2058 | if (!(swap_info[i].flags & SWP_USED) || | 2066 | struct swap_info_struct *si = swap_info[type]; |
2059 | (swap_info[i].flags & SWP_WRITEOK)) | 2067 | |
2060 | continue; | 2068 | if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) |
2061 | nr_to_be_unused += swap_info[i].inuse_pages; | 2069 | nr_to_be_unused += si->inuse_pages; |
2062 | } | 2070 | } |
2063 | val->freeswap = nr_swap_pages + nr_to_be_unused; | 2071 | val->freeswap = nr_swap_pages + nr_to_be_unused; |
2064 | val->totalswap = total_swap_pages + nr_to_be_unused; | 2072 | val->totalswap = total_swap_pages + nr_to_be_unused; |
@@ -2068,101 +2076,107 @@ void si_swapinfo(struct sysinfo *val) | |||
2068 | /* | 2076 | /* |
2069 | * Verify that a swap entry is valid and increment its swap map count. | 2077 | * Verify that a swap entry is valid and increment its swap map count. |
2070 | * | 2078 | * |
2071 | * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as | ||
2072 | * "permanent", but will be reclaimed by the next swapoff. | ||
2073 | * Returns error code in following case. | 2079 | * Returns error code in following case. |
2074 | * - success -> 0 | 2080 | * - success -> 0 |
2075 | * - swp_entry is invalid -> EINVAL | 2081 | * - swp_entry is invalid -> EINVAL |
2076 | * - swp_entry is migration entry -> EINVAL | 2082 | * - swp_entry is migration entry -> EINVAL |
2077 | * - swap-cache reference is requested but there is already one. -> EEXIST | 2083 | * - swap-cache reference is requested but there is already one. -> EEXIST |
2078 | * - swap-cache reference is requested but the entry is not used. -> ENOENT | 2084 | * - swap-cache reference is requested but the entry is not used. -> ENOENT |
2085 | * - swap-mapped reference requested but needs continued swap count. -> ENOMEM | ||
2079 | */ | 2086 | */ |
2080 | static int __swap_duplicate(swp_entry_t entry, bool cache) | 2087 | static int __swap_duplicate(swp_entry_t entry, unsigned char usage) |
2081 | { | 2088 | { |
2082 | struct swap_info_struct * p; | 2089 | struct swap_info_struct *p; |
2083 | unsigned long offset, type; | 2090 | unsigned long offset, type; |
2084 | int result = -EINVAL; | 2091 | unsigned char count; |
2085 | int count; | 2092 | unsigned char has_cache; |
2086 | bool has_cache; | 2093 | int err = -EINVAL; |
2087 | 2094 | ||
2088 | if (non_swap_entry(entry)) | 2095 | if (non_swap_entry(entry)) |
2089 | return -EINVAL; | 2096 | goto out; |
2090 | 2097 | ||
2091 | type = swp_type(entry); | 2098 | type = swp_type(entry); |
2092 | if (type >= nr_swapfiles) | 2099 | if (type >= nr_swapfiles) |
2093 | goto bad_file; | 2100 | goto bad_file; |
2094 | p = type + swap_info; | 2101 | p = swap_info[type]; |
2095 | offset = swp_offset(entry); | 2102 | offset = swp_offset(entry); |
2096 | 2103 | ||
2097 | spin_lock(&swap_lock); | 2104 | spin_lock(&swap_lock); |
2098 | |||
2099 | if (unlikely(offset >= p->max)) | 2105 | if (unlikely(offset >= p->max)) |
2100 | goto unlock_out; | 2106 | goto unlock_out; |
2101 | 2107 | ||
2102 | count = swap_count(p->swap_map[offset]); | 2108 | count = p->swap_map[offset]; |
2103 | has_cache = swap_has_cache(p->swap_map[offset]); | 2109 | has_cache = count & SWAP_HAS_CACHE; |
2110 | count &= ~SWAP_HAS_CACHE; | ||
2111 | err = 0; | ||
2104 | 2112 | ||
2105 | if (cache == SWAP_CACHE) { /* called for swapcache/swapin-readahead */ | 2113 | if (usage == SWAP_HAS_CACHE) { |
2106 | 2114 | ||
2107 | /* set SWAP_HAS_CACHE if there is no cache and entry is used */ | 2115 | /* set SWAP_HAS_CACHE if there is no cache and entry is used */ |
2108 | if (!has_cache && count) { | 2116 | if (!has_cache && count) |
2109 | p->swap_map[offset] = encode_swapmap(count, true); | 2117 | has_cache = SWAP_HAS_CACHE; |
2110 | result = 0; | 2118 | else if (has_cache) /* someone else added cache */ |
2111 | } else if (has_cache) /* someone added cache */ | 2119 | err = -EEXIST; |
2112 | result = -EEXIST; | 2120 | else /* no users remaining */ |
2113 | else if (!count) /* no users */ | 2121 | err = -ENOENT; |
2114 | result = -ENOENT; | ||
2115 | 2122 | ||
2116 | } else if (count || has_cache) { | 2123 | } else if (count || has_cache) { |
2117 | if (count < SWAP_MAP_MAX - 1) { | 2124 | |
2118 | p->swap_map[offset] = encode_swapmap(count + 1, | 2125 | if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX) |
2119 | has_cache); | 2126 | count += usage; |
2120 | result = 0; | 2127 | else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX) |
2121 | } else if (count <= SWAP_MAP_MAX) { | 2128 | err = -EINVAL; |
2122 | if (swap_overflow++ < 5) | 2129 | else if (swap_count_continued(p, offset, count)) |
2123 | printk(KERN_WARNING | 2130 | count = COUNT_CONTINUED; |
2124 | "swap_dup: swap entry overflow\n"); | 2131 | else |
2125 | p->swap_map[offset] = encode_swapmap(SWAP_MAP_MAX, | 2132 | err = -ENOMEM; |
2126 | has_cache); | ||
2127 | result = 0; | ||
2128 | } | ||
2129 | } else | 2133 | } else |
2130 | result = -ENOENT; /* unused swap entry */ | 2134 | err = -ENOENT; /* unused swap entry */ |
2135 | |||
2136 | p->swap_map[offset] = count | has_cache; | ||
2137 | |||
2131 | unlock_out: | 2138 | unlock_out: |
2132 | spin_unlock(&swap_lock); | 2139 | spin_unlock(&swap_lock); |
2133 | out: | 2140 | out: |
2134 | return result; | 2141 | return err; |
2135 | 2142 | ||
2136 | bad_file: | 2143 | bad_file: |
2137 | printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); | 2144 | printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); |
2138 | goto out; | 2145 | goto out; |
2139 | } | 2146 | } |
2147 | |||
2148 | /* | ||
2149 | * Help swapoff by noting that swap entry belongs to shmem/tmpfs | ||
2150 | * (in which case its reference count is never incremented). | ||
2151 | */ | ||
2152 | void swap_shmem_alloc(swp_entry_t entry) | ||
2153 | { | ||
2154 | __swap_duplicate(entry, SWAP_MAP_SHMEM); | ||
2155 | } | ||
2156 | |||
2140 | /* | 2157 | /* |
2141 | * increase reference count of swap entry by 1. | 2158 | * increase reference count of swap entry by 1. |
2142 | */ | 2159 | */ |
2143 | void swap_duplicate(swp_entry_t entry) | 2160 | int swap_duplicate(swp_entry_t entry) |
2144 | { | 2161 | { |
2145 | __swap_duplicate(entry, SWAP_MAP); | 2162 | int err = 0; |
2163 | |||
2164 | while (!err && __swap_duplicate(entry, 1) == -ENOMEM) | ||
2165 | err = add_swap_count_continuation(entry, GFP_ATOMIC); | ||
2166 | return err; | ||
2146 | } | 2167 | } |
2147 | 2168 | ||
2148 | /* | 2169 | /* |
2149 | * @entry: swap entry for which we allocate swap cache. | 2170 | * @entry: swap entry for which we allocate swap cache. |
2150 | * | 2171 | * |
2151 | * Called when allocating swap cache for exising swap entry, | 2172 | * Called when allocating swap cache for existing swap entry, |
2152 | * This can return error codes. Returns 0 at success. | 2173 | * This can return error codes. Returns 0 at success. |
2153 | * -EBUSY means there is a swap cache. | 2174 | * -EBUSY means there is a swap cache. |
2154 | * Note: return code is different from swap_duplicate(). | 2175 | * Note: return code is different from swap_duplicate(). |
2155 | */ | 2176 | */ |
2156 | int swapcache_prepare(swp_entry_t entry) | 2177 | int swapcache_prepare(swp_entry_t entry) |
2157 | { | 2178 | { |
2158 | return __swap_duplicate(entry, SWAP_CACHE); | 2179 | return __swap_duplicate(entry, SWAP_HAS_CACHE); |
2159 | } | ||
2160 | |||
2161 | |||
2162 | struct swap_info_struct * | ||
2163 | get_swap_info_struct(unsigned type) | ||
2164 | { | ||
2165 | return &swap_info[type]; | ||
2166 | } | 2180 | } |
2167 | 2181 | ||
2168 | /* | 2182 | /* |
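With swap_map entries shrunk to unsigned char, __swap_duplicate() above decodes each byte into a SWAP_HAS_CACHE flag plus a map count, with SWAP_MAP_MAX, SWAP_MAP_SHMEM and COUNT_CONTINUED as reserved values. A userspace sketch of that decode step; the constants are modelled on this kernel's swap.h but should be treated as illustrative:

#include <stdio.h>

/* Assumed bit layout of an unsigned char swap_map entry (values modelled
 * on this kernel's swap.h; treat them as illustrative): */
#define SWAP_HAS_CACHE  0x40    /* a swapcache page references the entry */
#define COUNT_CONTINUED 0x80    /* count overflows into a continuation page */
#define SWAP_MAP_MAX    0x3e    /* highest count held in the byte itself */

/* The decode step at the top of __swap_duplicate(): split the byte into
 * its cache flag and its map count. */
static void decode(unsigned char entry)
{
        unsigned has_cache = entry & SWAP_HAS_CACHE;
        unsigned count = entry & ~SWAP_HAS_CACHE;

        printf("0x%02x -> count %u%s%s\n", (unsigned)entry,
               count & ~COUNT_CONTINUED,
               (count & COUNT_CONTINUED) ? " (+continuation)" : "",
               has_cache ? ", cached" : "");
}

int main(void)
{
        decode(0x01);                           /* one pte reference */
        decode(0x01 | SWAP_HAS_CACHE);          /* one reference plus swapcache */
        decode(SWAP_MAP_MAX | COUNT_CONTINUED); /* count spilled into continuation */
        return 0;
}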
@@ -2180,7 +2194,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset) | |||
2180 | if (!our_page_cluster) /* no readahead */ | 2194 | if (!our_page_cluster) /* no readahead */ |
2181 | return 0; | 2195 | return 0; |
2182 | 2196 | ||
2183 | si = &swap_info[swp_type(entry)]; | 2197 | si = swap_info[swp_type(entry)]; |
2184 | target = swp_offset(entry); | 2198 | target = swp_offset(entry); |
2185 | base = (target >> our_page_cluster) << our_page_cluster; | 2199 | base = (target >> our_page_cluster) << our_page_cluster; |
2186 | end = base + (1 << our_page_cluster); | 2200 | end = base + (1 << our_page_cluster); |
@@ -2216,3 +2230,219 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset) | |||
2216 | *offset = ++toff; | 2230 | *offset = ++toff; |
2217 | return nr_pages? ++nr_pages: 0; | 2231 | return nr_pages? ++nr_pages: 0; |
2218 | } | 2232 | } |
2233 | |||
2234 | /* | ||
2235 | * add_swap_count_continuation - called when a swap count is duplicated | ||
2236 | * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's | ||
2237 | * page of the original vmalloc'ed swap_map, to hold the continuation count | ||
2238 | * (for that entry and for its neighbouring PAGE_SIZE swap entries). Called | ||
2239 | * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc. | ||
2240 | * | ||
2241 | * These continuation pages are seldom referenced: the common paths all work | ||
2242 | * on the original swap_map, only referring to a continuation page when the | ||
2243 | * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX. | ||
2244 | * | ||
2245 | * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding | ||
2246 | * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL) | ||
2247 | * can be called after dropping locks. | ||
2248 | */ | ||
2249 | int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) | ||
2250 | { | ||
2251 | struct swap_info_struct *si; | ||
2252 | struct page *head; | ||
2253 | struct page *page; | ||
2254 | struct page *list_page; | ||
2255 | pgoff_t offset; | ||
2256 | unsigned char count; | ||
2257 | |||
2258 | /* | ||
2259 | * When debugging, it's easier to use __GFP_ZERO here; but it's better | ||
2260 | * for latency not to zero a page while GFP_ATOMIC and holding locks. | ||
2261 | */ | ||
2262 | page = alloc_page(gfp_mask | __GFP_HIGHMEM); | ||
2263 | |||
2264 | si = swap_info_get(entry); | ||
2265 | if (!si) { | ||
2266 | /* | ||
2267 | * An acceptable race has occurred since the failing | ||
2268 | * __swap_duplicate(): the swap entry has been freed, | ||
2269 | * perhaps even the whole swap_map cleared for swapoff. | ||
2270 | */ | ||
2271 | goto outer; | ||
2272 | } | ||
2273 | |||
2274 | offset = swp_offset(entry); | ||
2275 | count = si->swap_map[offset] & ~SWAP_HAS_CACHE; | ||
2276 | |||
2277 | if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) { | ||
2278 | /* | ||
2279 | * The higher the swap count, the more likely it is that tasks | ||
2280 | * will race to add swap count continuation: we need to avoid | ||
2281 | * over-provisioning. | ||
2282 | */ | ||
2283 | goto out; | ||
2284 | } | ||
2285 | |||
2286 | if (!page) { | ||
2287 | spin_unlock(&swap_lock); | ||
2288 | return -ENOMEM; | ||
2289 | } | ||
2290 | |||
2291 | /* | ||
2292 | * We are fortunate that although vmalloc_to_page uses pte_offset_map, | ||
2293 | * no architecture is using highmem pages for kernel pagetables: so it | ||
2294 | * will not corrupt the GFP_ATOMIC caller's atomic pagetable kmaps. | ||
2295 | */ | ||
2296 | head = vmalloc_to_page(si->swap_map + offset); | ||
2297 | offset &= ~PAGE_MASK; | ||
2298 | |||
2299 | /* | ||
2300 | * Page allocation does not initialize the page's lru field, | ||
2301 | * but it does always reset its private field. | ||
2302 | */ | ||
2303 | if (!page_private(head)) { | ||
2304 | BUG_ON(count & COUNT_CONTINUED); | ||
2305 | INIT_LIST_HEAD(&head->lru); | ||
2306 | set_page_private(head, SWP_CONTINUED); | ||
2307 | si->flags |= SWP_CONTINUED; | ||
2308 | } | ||
2309 | |||
2310 | list_for_each_entry(list_page, &head->lru, lru) { | ||
2311 | unsigned char *map; | ||
2312 | |||
2313 | /* | ||
2314 | * If the previous map said no continuation, but we've found | ||
2315 | * a continuation page, free our allocation and use this one. | ||
2316 | */ | ||
2317 | if (!(count & COUNT_CONTINUED)) | ||
2318 | goto out; | ||
2319 | |||
2320 | map = kmap_atomic(list_page, KM_USER0) + offset; | ||
2321 | count = *map; | ||
2322 | kunmap_atomic(map, KM_USER0); | ||
2323 | |||
2324 | /* | ||
2325 | * If this continuation count now has some space in it, | ||
2326 | * free our allocation and use this one. | ||
2327 | */ | ||
2328 | if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX) | ||
2329 | goto out; | ||
2330 | } | ||
2331 | |||
2332 | list_add_tail(&page->lru, &head->lru); | ||
2333 | page = NULL; /* now it's attached, don't free it */ | ||
2334 | out: | ||
2335 | spin_unlock(&swap_lock); | ||
2336 | outer: | ||
2337 | if (page) | ||
2338 | __free_page(page); | ||
2339 | return 0; | ||
2340 | } | ||
2341 | |||
2342 | /* | ||
2343 | * swap_count_continued - when the original swap_map count is incremented | ||
2344 | * from SWAP_MAP_MAX, check if there is already a continuation page to carry | ||
2345 | * into, carry if so, or else fail until a new continuation page is allocated; | ||
2346 | * when the original swap_map count is decremented from 0 with continuation, | ||
2347 | * borrow from the continuation and report whether it still holds more. | ||
2348 | * Called while __swap_duplicate() or swap_entry_free() holds swap_lock. | ||
2349 | */ | ||
2350 | static bool swap_count_continued(struct swap_info_struct *si, | ||
2351 | pgoff_t offset, unsigned char count) | ||
2352 | { | ||
2353 | struct page *head; | ||
2354 | struct page *page; | ||
2355 | unsigned char *map; | ||
2356 | |||
2357 | head = vmalloc_to_page(si->swap_map + offset); | ||
2358 | if (page_private(head) != SWP_CONTINUED) { | ||
2359 | BUG_ON(count & COUNT_CONTINUED); | ||
2360 | return false; /* need to add count continuation */ | ||
2361 | } | ||
2362 | |||
2363 | offset &= ~PAGE_MASK; | ||
2364 | page = list_entry(head->lru.next, struct page, lru); | ||
2365 | map = kmap_atomic(page, KM_USER0) + offset; | ||
2366 | |||
2367 | if (count == SWAP_MAP_MAX) /* initial increment from swap_map */ | ||
2368 | goto init_map; /* jump over SWAP_CONT_MAX checks */ | ||
2369 | |||
2370 | if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */ | ||
2371 | /* | ||
2372 | * Think of how you add 1 to 999 | ||
2373 | */ | ||
2374 | while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) { | ||
2375 | kunmap_atomic(map, KM_USER0); | ||
2376 | page = list_entry(page->lru.next, struct page, lru); | ||
2377 | BUG_ON(page == head); | ||
2378 | map = kmap_atomic(page, KM_USER0) + offset; | ||
2379 | } | ||
2380 | if (*map == SWAP_CONT_MAX) { | ||
2381 | kunmap_atomic(map, KM_USER0); | ||
2382 | page = list_entry(page->lru.next, struct page, lru); | ||
2383 | if (page == head) | ||
2384 | return false; /* add count continuation */ | ||
2385 | map = kmap_atomic(page, KM_USER0) + offset; | ||
2386 | init_map: *map = 0; /* we didn't zero the page */ | ||
2387 | } | ||
2388 | *map += 1; | ||
2389 | kunmap_atomic(map, KM_USER0); | ||
2390 | page = list_entry(page->lru.prev, struct page, lru); | ||
2391 | while (page != head) { | ||
2392 | map = kmap_atomic(page, KM_USER0) + offset; | ||
2393 | *map = COUNT_CONTINUED; | ||
2394 | kunmap_atomic(map, KM_USER0); | ||
2395 | page = list_entry(page->lru.prev, struct page, lru); | ||
2396 | } | ||
2397 | return true; /* incremented */ | ||
2398 | |||
2399 | } else { /* decrementing */ | ||
2400 | /* | ||
2401 | * Think of how you subtract 1 from 1000 | ||
2402 | */ | ||
2403 | BUG_ON(count != COUNT_CONTINUED); | ||
2404 | while (*map == COUNT_CONTINUED) { | ||
2405 | kunmap_atomic(map, KM_USER0); | ||
2406 | page = list_entry(page->lru.next, struct page, lru); | ||
2407 | BUG_ON(page == head); | ||
2408 | map = kmap_atomic(page, KM_USER0) + offset; | ||
2409 | } | ||
2410 | BUG_ON(*map == 0); | ||
2411 | *map -= 1; | ||
2412 | if (*map == 0) | ||
2413 | count = 0; | ||
2414 | kunmap_atomic(map, KM_USER0); | ||
2415 | page = list_entry(page->lru.prev, struct page, lru); | ||
2416 | while (page != head) { | ||
2417 | map = kmap_atomic(page, KM_USER0) + offset; | ||
2418 | *map = SWAP_CONT_MAX | count; | ||
2419 | count = COUNT_CONTINUED; | ||
2420 | kunmap_atomic(map, KM_USER0); | ||
2421 | page = list_entry(page->lru.prev, struct page, lru); | ||
2422 | } | ||
2423 | return count == COUNT_CONTINUED; | ||
2424 | } | ||
2425 | } | ||
2426 | |||
2427 | /* | ||
2428 | * free_swap_count_continuations - swapoff free all the continuation pages | ||
2429 | * appended to the swap_map, after swap_map is quiesced, before vfree'ing it. | ||
2430 | */ | ||
2431 | static void free_swap_count_continuations(struct swap_info_struct *si) | ||
2432 | { | ||
2433 | pgoff_t offset; | ||
2434 | |||
2435 | for (offset = 0; offset < si->max; offset += PAGE_SIZE) { | ||
2436 | struct page *head; | ||
2437 | head = vmalloc_to_page(si->swap_map + offset); | ||
2438 | if (page_private(head)) { | ||
2439 | struct list_head *this, *next; | ||
2440 | list_for_each_safe(this, next, &head->lru) { | ||
2441 | struct page *page; | ||
2442 | page = list_entry(this, struct page, lru); | ||
2443 | list_del(this); | ||
2444 | __free_page(page); | ||
2445 | } | ||
2446 | } | ||
2447 | } | ||
2448 | } | ||
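swap_count_continued() above treats the swap count as a little-endian multi-digit number: the swap_map byte is the low digit (up to SWAP_MAP_MAX) and each continuation page contributes one higher digit (up to SWAP_CONT_MAX) at the same offset, with COUNT_CONTINUED meaning "more digits follow". The carry and borrow logic ("think of how you add 1 to 999") reduces to positional arithmetic, sketched below over a plain array of digits; this is a simplification with no page list and no kmap, and the digit capacities assume SWAP_MAP_MAX is 0x3e and SWAP_CONT_MAX is 0x7f.

#include <stdio.h>
#include <stdbool.h>

#define LOW_BASE  (0x3e + 1)    /* swap_map digit: 0..SWAP_MAP_MAX (assumed 0x3e) */
#define CONT_BASE (0x7f + 1)    /* continuation digit: 0..SWAP_CONT_MAX (assumed 0x7f) */
#define NDIGITS   4

/* digit[0] models the swap_map byte's count field; digit[1..] model the
 * bytes at the same offset in successive continuation pages. */
static unsigned digit[NDIGITS];

/* Increment with carry ("add 1 to 999"): roll full digits over to zero and
 * bump the first digit that still has room. */
static bool inc(void)
{
        for (int i = 0; i < NDIGITS; i++) {
                unsigned base = i ? CONT_BASE : LOW_BASE;

                if (digit[i] + 1 < base) {
                        digit[i]++;
                        return true;
                }
                digit[i] = 0;   /* digit full: carry into the next one */
        }
        return false;   /* every digit full: a new continuation page is needed */
}

/* Decrement with borrow ("subtract 1 from 1000"); assumes the count is
 * nonzero, as when a real swap reference is being dropped. */
static void dec(void)
{
        for (int i = 0; i < NDIGITS; i++) {
                unsigned base = i ? CONT_BASE : LOW_BASE;

                if (digit[i]) {
                        digit[i]--;
                        return;
                }
                digit[i] = base - 1;    /* zero digit borrows from above */
        }
}

int main(void)
{
        for (int i = 0; i < 200; i++)   /* duplicate one entry 200 times */
                inc();
        /* 200 = 3 * 63 + 11, so expect continuation digit 3, low digit 11 */
        printf("low %u, continuation %u\n", digit[0], digit[1]);

        dec();
        printf("low %u, continuation %u\n", digit[0], digit[1]);
        return 0;
}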
diff --git a/mm/truncate.c b/mm/truncate.c index 450cebdabfc0..342deee22684 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -272,6 +272,7 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
272 | pagevec_release(&pvec); | 272 | pagevec_release(&pvec); |
273 | break; | 273 | break; |
274 | } | 274 | } |
275 | mem_cgroup_uncharge_start(); | ||
275 | for (i = 0; i < pagevec_count(&pvec); i++) { | 276 | for (i = 0; i < pagevec_count(&pvec); i++) { |
276 | struct page *page = pvec.pages[i]; | 277 | struct page *page = pvec.pages[i]; |
277 | 278 | ||
@@ -286,6 +287,7 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
286 | unlock_page(page); | 287 | unlock_page(page); |
287 | } | 288 | } |
288 | pagevec_release(&pvec); | 289 | pagevec_release(&pvec); |
290 | mem_cgroup_uncharge_end(); | ||
289 | } | 291 | } |
290 | } | 292 | } |
291 | EXPORT_SYMBOL(truncate_inode_pages_range); | 293 | EXPORT_SYMBOL(truncate_inode_pages_range); |
@@ -327,6 +329,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, | |||
327 | pagevec_init(&pvec, 0); | 329 | pagevec_init(&pvec, 0); |
328 | while (next <= end && | 330 | while (next <= end && |
329 | pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { | 331 | pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { |
332 | mem_cgroup_uncharge_start(); | ||
330 | for (i = 0; i < pagevec_count(&pvec); i++) { | 333 | for (i = 0; i < pagevec_count(&pvec); i++) { |
331 | struct page *page = pvec.pages[i]; | 334 | struct page *page = pvec.pages[i]; |
332 | pgoff_t index; | 335 | pgoff_t index; |
@@ -354,6 +357,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, | |||
354 | break; | 357 | break; |
355 | } | 358 | } |
356 | pagevec_release(&pvec); | 359 | pagevec_release(&pvec); |
360 | mem_cgroup_uncharge_end(); | ||
357 | cond_resched(); | 361 | cond_resched(); |
358 | } | 362 | } |
359 | return ret; | 363 | return ret; |
@@ -428,6 +432,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, | |||
428 | while (next <= end && !wrapped && | 432 | while (next <= end && !wrapped && |
429 | pagevec_lookup(&pvec, mapping, next, | 433 | pagevec_lookup(&pvec, mapping, next, |
430 | min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { | 434 | min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { |
435 | mem_cgroup_uncharge_start(); | ||
431 | for (i = 0; i < pagevec_count(&pvec); i++) { | 436 | for (i = 0; i < pagevec_count(&pvec); i++) { |
432 | struct page *page = pvec.pages[i]; | 437 | struct page *page = pvec.pages[i]; |
433 | pgoff_t page_index; | 438 | pgoff_t page_index; |
@@ -477,6 +482,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, | |||
477 | unlock_page(page); | 482 | unlock_page(page); |
478 | } | 483 | } |
479 | pagevec_release(&pvec); | 484 | pagevec_release(&pvec); |
485 | mem_cgroup_uncharge_end(); | ||
480 | cond_resched(); | 486 | cond_resched(); |
481 | } | 487 | } |
482 | return ret; | 488 | return ret; |
@@ -490,7 +496,7 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range); | |||
490 | * Any pages which are found to be mapped into pagetables are unmapped prior to | 496 | * Any pages which are found to be mapped into pagetables are unmapped prior to |
491 | * invalidation. | 497 | * invalidation. |
492 | * | 498 | * |
493 | * Returns -EIO if any pages could not be invalidated. | 499 | * Returns -EBUSY if any pages could not be invalidated. |
494 | */ | 500 | */ |
495 | int invalidate_inode_pages2(struct address_space *mapping) | 501 | int invalidate_inode_pages2(struct address_space *mapping) |
496 | { | 502 | { |
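The truncate.c hunks above bracket each pagevec loop with mem_cgroup_uncharge_start()/mem_cgroup_uncharge_end(), giving the memory controller a window in which the per-page uncharges inside the loop can be coalesced into a single accounting update. A generic userspace sketch of that begin/accumulate/flush idiom; the names below are invented for illustration, not the memcg API:

#include <stdio.h>

/* Illustrative batching: inside a begin/end window, per-item uncharges only
 * accumulate locally and are applied to the shared counter once at the end. */
static long global_charge = 1000;
static long pending;
static int batching;

static void uncharge_start(void) { batching = 1; }

static void uncharge(long nr)
{
        if (batching)
                pending += nr;          /* defer: just count locally */
        else
                global_charge -= nr;    /* unbatched: hit the counter now */
}

static void uncharge_end(void)
{
        global_charge -= pending;       /* one update for the whole pagevec */
        pending = 0;
        batching = 0;
}

int main(void)
{
        uncharge_start();
        for (int i = 0; i < 14; i++)    /* e.g. one pagevec worth of pages */
                uncharge(1);
        uncharge_end();

        printf("charge now %ld\n", global_charge);      /* 986 */
        return 0;
}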
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index b65cfe44a562..37e69295f250 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -12,6 +12,7 @@ | |||
12 | #include <linux/mm.h> | 12 | #include <linux/mm.h> |
13 | #include <linux/module.h> | 13 | #include <linux/module.h> |
14 | #include <linux/highmem.h> | 14 | #include <linux/highmem.h> |
15 | #include <linux/sched.h> | ||
15 | #include <linux/slab.h> | 16 | #include <linux/slab.h> |
16 | #include <linux/spinlock.h> | 17 | #include <linux/spinlock.h> |
17 | #include <linux/interrupt.h> | 18 | #include <linux/interrupt.h> |
@@ -25,10 +26,10 @@ | |||
25 | #include <linux/rcupdate.h> | 26 | #include <linux/rcupdate.h> |
26 | #include <linux/pfn.h> | 27 | #include <linux/pfn.h> |
27 | #include <linux/kmemleak.h> | 28 | #include <linux/kmemleak.h> |
28 | #include <linux/highmem.h> | ||
29 | #include <asm/atomic.h> | 29 | #include <asm/atomic.h> |
30 | #include <asm/uaccess.h> | 30 | #include <asm/uaccess.h> |
31 | #include <asm/tlbflush.h> | 31 | #include <asm/tlbflush.h> |
32 | #include <asm/shmparam.h> | ||
32 | 33 | ||
33 | 34 | ||
34 | /*** Page table manipulation functions ***/ | 35 | /*** Page table manipulation functions ***/ |
@@ -1156,12 +1157,11 @@ static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, | |||
1156 | } | 1157 | } |
1157 | 1158 | ||
1158 | static struct vm_struct *__get_vm_area_node(unsigned long size, | 1159 | static struct vm_struct *__get_vm_area_node(unsigned long size, |
1159 | unsigned long flags, unsigned long start, unsigned long end, | 1160 | unsigned long align, unsigned long flags, unsigned long start, |
1160 | int node, gfp_t gfp_mask, void *caller) | 1161 | unsigned long end, int node, gfp_t gfp_mask, void *caller) |
1161 | { | 1162 | { |
1162 | static struct vmap_area *va; | 1163 | static struct vmap_area *va; |
1163 | struct vm_struct *area; | 1164 | struct vm_struct *area; |
1164 | unsigned long align = 1; | ||
1165 | 1165 | ||
1166 | BUG_ON(in_interrupt()); | 1166 | BUG_ON(in_interrupt()); |
1167 | if (flags & VM_IOREMAP) { | 1167 | if (flags & VM_IOREMAP) { |
@@ -1201,7 +1201,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, | |||
1201 | struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, | 1201 | struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, |
1202 | unsigned long start, unsigned long end) | 1202 | unsigned long start, unsigned long end) |
1203 | { | 1203 | { |
1204 | return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL, | 1204 | return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL, |
1205 | __builtin_return_address(0)); | 1205 | __builtin_return_address(0)); |
1206 | } | 1206 | } |
1207 | EXPORT_SYMBOL_GPL(__get_vm_area); | 1207 | EXPORT_SYMBOL_GPL(__get_vm_area); |
@@ -1210,7 +1210,7 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, | |||
1210 | unsigned long start, unsigned long end, | 1210 | unsigned long start, unsigned long end, |
1211 | void *caller) | 1211 | void *caller) |
1212 | { | 1212 | { |
1213 | return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL, | 1213 | return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL, |
1214 | caller); | 1214 | caller); |
1215 | } | 1215 | } |
1216 | 1216 | ||
@@ -1225,22 +1225,22 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, | |||
1225 | */ | 1225 | */ |
1226 | struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) | 1226 | struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) |
1227 | { | 1227 | { |
1228 | return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, | 1228 | return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, |
1229 | -1, GFP_KERNEL, __builtin_return_address(0)); | 1229 | -1, GFP_KERNEL, __builtin_return_address(0)); |
1230 | } | 1230 | } |
1231 | 1231 | ||
1232 | struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, | 1232 | struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, |
1233 | void *caller) | 1233 | void *caller) |
1234 | { | 1234 | { |
1235 | return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, | 1235 | return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, |
1236 | -1, GFP_KERNEL, caller); | 1236 | -1, GFP_KERNEL, caller); |
1237 | } | 1237 | } |
1238 | 1238 | ||
1239 | struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, | 1239 | struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, |
1240 | int node, gfp_t gfp_mask) | 1240 | int node, gfp_t gfp_mask) |
1241 | { | 1241 | { |
1242 | return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node, | 1242 | return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, |
1243 | gfp_mask, __builtin_return_address(0)); | 1243 | node, gfp_mask, __builtin_return_address(0)); |
1244 | } | 1244 | } |
1245 | 1245 | ||
1246 | static struct vm_struct *find_vm_area(const void *addr) | 1246 | static struct vm_struct *find_vm_area(const void *addr) |
@@ -1403,13 +1403,15 @@ void *vmap(struct page **pages, unsigned int count, | |||
1403 | } | 1403 | } |
1404 | EXPORT_SYMBOL(vmap); | 1404 | EXPORT_SYMBOL(vmap); |
1405 | 1405 | ||
1406 | static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, | 1406 | static void *__vmalloc_node(unsigned long size, unsigned long align, |
1407 | gfp_t gfp_mask, pgprot_t prot, | ||
1407 | int node, void *caller); | 1408 | int node, void *caller); |
1408 | static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, | 1409 | static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, |
1409 | pgprot_t prot, int node, void *caller) | 1410 | pgprot_t prot, int node, void *caller) |
1410 | { | 1411 | { |
1411 | struct page **pages; | 1412 | struct page **pages; |
1412 | unsigned int nr_pages, array_size, i; | 1413 | unsigned int nr_pages, array_size, i; |
1414 | gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; | ||
1413 | 1415 | ||
1414 | nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT; | 1416 | nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT; |
1415 | array_size = (nr_pages * sizeof(struct page *)); | 1417 | array_size = (nr_pages * sizeof(struct page *)); |
@@ -1417,13 +1419,11 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, | |||
1417 | area->nr_pages = nr_pages; | 1419 | area->nr_pages = nr_pages; |
1418 | /* Please note that the recursion is strictly bounded. */ | 1420 | /* Please note that the recursion is strictly bounded. */ |
1419 | if (array_size > PAGE_SIZE) { | 1421 | if (array_size > PAGE_SIZE) { |
1420 | pages = __vmalloc_node(array_size, gfp_mask | __GFP_ZERO, | 1422 | pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM, |
1421 | PAGE_KERNEL, node, caller); | 1423 | PAGE_KERNEL, node, caller); |
1422 | area->flags |= VM_VPAGES; | 1424 | area->flags |= VM_VPAGES; |
1423 | } else { | 1425 | } else { |
1424 | pages = kmalloc_node(array_size, | 1426 | pages = kmalloc_node(array_size, nested_gfp, node); |
1425 | (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO, | ||
1426 | node); | ||
1427 | } | 1427 | } |
1428 | area->pages = pages; | 1428 | area->pages = pages; |
1429 | area->caller = caller; | 1429 | area->caller = caller; |
@@ -1476,6 +1476,7 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) | |||
1476 | /** | 1476 | /** |
1477 | * __vmalloc_node - allocate virtually contiguous memory | 1477 | * __vmalloc_node - allocate virtually contiguous memory |
1478 | * @size: allocation size | 1478 | * @size: allocation size |
1479 | * @align: desired alignment | ||
1479 | * @gfp_mask: flags for the page level allocator | 1480 | * @gfp_mask: flags for the page level allocator |
1480 | * @prot: protection mask for the allocated pages | 1481 | * @prot: protection mask for the allocated pages |
1481 | * @node: node to use for allocation or -1 | 1482 | * @node: node to use for allocation or -1 |
@@ -1485,8 +1486,9 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) | |||
1485 | * allocator with @gfp_mask flags. Map them into contiguous | 1486 | * allocator with @gfp_mask flags. Map them into contiguous |
1486 | * kernel virtual space, using a pagetable protection of @prot. | 1487 | * kernel virtual space, using a pagetable protection of @prot. |
1487 | */ | 1488 | */ |
1488 | static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, | 1489 | static void *__vmalloc_node(unsigned long size, unsigned long align, |
1489 | int node, void *caller) | 1490 | gfp_t gfp_mask, pgprot_t prot, |
1491 | int node, void *caller) | ||
1490 | { | 1492 | { |
1491 | struct vm_struct *area; | 1493 | struct vm_struct *area; |
1492 | void *addr; | 1494 | void *addr; |
@@ -1496,8 +1498,8 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, | |||
1496 | if (!size || (size >> PAGE_SHIFT) > totalram_pages) | 1498 | if (!size || (size >> PAGE_SHIFT) > totalram_pages) |
1497 | return NULL; | 1499 | return NULL; |
1498 | 1500 | ||
1499 | area = __get_vm_area_node(size, VM_ALLOC, VMALLOC_START, VMALLOC_END, | 1501 | area = __get_vm_area_node(size, align, VM_ALLOC, VMALLOC_START, |
1500 | node, gfp_mask, caller); | 1502 | VMALLOC_END, node, gfp_mask, caller); |
1501 | 1503 | ||
1502 | if (!area) | 1504 | if (!area) |
1503 | return NULL; | 1505 | return NULL; |
@@ -1516,7 +1518,7 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, | |||
1516 | 1518 | ||
1517 | void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) | 1519 | void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) |
1518 | { | 1520 | { |
1519 | return __vmalloc_node(size, gfp_mask, prot, -1, | 1521 | return __vmalloc_node(size, 1, gfp_mask, prot, -1, |
1520 | __builtin_return_address(0)); | 1522 | __builtin_return_address(0)); |
1521 | } | 1523 | } |
1522 | EXPORT_SYMBOL(__vmalloc); | 1524 | EXPORT_SYMBOL(__vmalloc); |
@@ -1532,7 +1534,7 @@ EXPORT_SYMBOL(__vmalloc); | |||
1532 | */ | 1534 | */ |
1533 | void *vmalloc(unsigned long size) | 1535 | void *vmalloc(unsigned long size) |
1534 | { | 1536 | { |
1535 | return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, | 1537 | return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, |
1536 | -1, __builtin_return_address(0)); | 1538 | -1, __builtin_return_address(0)); |
1537 | } | 1539 | } |
1538 | EXPORT_SYMBOL(vmalloc); | 1540 | EXPORT_SYMBOL(vmalloc); |
@@ -1549,7 +1551,8 @@ void *vmalloc_user(unsigned long size) | |||
1549 | struct vm_struct *area; | 1551 | struct vm_struct *area; |
1550 | void *ret; | 1552 | void *ret; |
1551 | 1553 | ||
1552 | ret = __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, | 1554 | ret = __vmalloc_node(size, SHMLBA, |
1555 | GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, | ||
1553 | PAGE_KERNEL, -1, __builtin_return_address(0)); | 1556 | PAGE_KERNEL, -1, __builtin_return_address(0)); |
1554 | if (ret) { | 1557 | if (ret) { |
1555 | area = find_vm_area(ret); | 1558 | area = find_vm_area(ret); |
@@ -1572,7 +1575,7 @@ EXPORT_SYMBOL(vmalloc_user); | |||
1572 | */ | 1575 | */ |
1573 | void *vmalloc_node(unsigned long size, int node) | 1576 | void *vmalloc_node(unsigned long size, int node) |
1574 | { | 1577 | { |
1575 | return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, | 1578 | return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, |
1576 | node, __builtin_return_address(0)); | 1579 | node, __builtin_return_address(0)); |
1577 | } | 1580 | } |
1578 | EXPORT_SYMBOL(vmalloc_node); | 1581 | EXPORT_SYMBOL(vmalloc_node); |
@@ -1595,7 +1598,7 @@ EXPORT_SYMBOL(vmalloc_node); | |||
1595 | 1598 | ||
1596 | void *vmalloc_exec(unsigned long size) | 1599 | void *vmalloc_exec(unsigned long size) |
1597 | { | 1600 | { |
1598 | return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, | 1601 | return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, |
1599 | -1, __builtin_return_address(0)); | 1602 | -1, __builtin_return_address(0)); |
1600 | } | 1603 | } |
1601 | 1604 | ||
@@ -1616,7 +1619,7 @@ void *vmalloc_exec(unsigned long size) | |||
1616 | */ | 1619 | */ |
1617 | void *vmalloc_32(unsigned long size) | 1620 | void *vmalloc_32(unsigned long size) |
1618 | { | 1621 | { |
1619 | return __vmalloc_node(size, GFP_VMALLOC32, PAGE_KERNEL, | 1622 | return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL, |
1620 | -1, __builtin_return_address(0)); | 1623 | -1, __builtin_return_address(0)); |
1621 | } | 1624 | } |
1622 | EXPORT_SYMBOL(vmalloc_32); | 1625 | EXPORT_SYMBOL(vmalloc_32); |
@@ -1633,7 +1636,7 @@ void *vmalloc_32_user(unsigned long size) | |||
1633 | struct vm_struct *area; | 1636 | struct vm_struct *area; |
1634 | void *ret; | 1637 | void *ret; |
1635 | 1638 | ||
1636 | ret = __vmalloc_node(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, | 1639 | ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, |
1637 | -1, __builtin_return_address(0)); | 1640 | -1, __builtin_return_address(0)); |
1638 | if (ret) { | 1641 | if (ret) { |
1639 | area = find_vm_area(ret); | 1642 | area = find_vm_area(ret); |
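For orientation, a brief editorial sketch (not part of the patch; the wrapper names are hypothetical and __vmalloc_node() is static to mm/vmalloc.c) of how callers pick the new align argument: ordinary kernel buffers keep align == 1, while user-mappable buffers ask for SHMLBA so the kernel and user mappings share a cache color on VIPT caches.

/* Hypothetical wrappers, mirroring the calls shown in the hunks above. */
static void *alloc_kernel_buffer(unsigned long size)
{
	/* align == 1: no constraint beyond normal page granularity */
	return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM,
			      PAGE_KERNEL, -1, __builtin_return_address(0));
}

static void *alloc_user_mappable_buffer(unsigned long size)
{
	/* SHMLBA alignment plus zeroed pages: the same recipe as vmalloc_user() */
	return __vmalloc_node(size, SHMLBA,
			      GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
			      PAGE_KERNEL, -1, __builtin_return_address(0));
}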
diff --git a/mm/vmscan.c b/mm/vmscan.c index 64e438898832..885207a6b6b7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -55,6 +55,11 @@ struct scan_control { | |||
55 | /* Number of pages freed so far during a call to shrink_zones() */ | 55 | /* Number of pages freed so far during a call to shrink_zones() */ |
56 | unsigned long nr_reclaimed; | 56 | unsigned long nr_reclaimed; |
57 | 57 | ||
58 | /* How many pages shrink_list() should reclaim */ | ||
59 | unsigned long nr_to_reclaim; | ||
60 | |||
61 | unsigned long hibernation_mode; | ||
62 | |||
58 | /* This context's GFP mask */ | 63 | /* This context's GFP mask */ |
59 | gfp_t gfp_mask; | 64 | gfp_t gfp_mask; |
60 | 65 | ||
@@ -66,12 +71,6 @@ struct scan_control { | |||
66 | /* Can pages be swapped as part of reclaim? */ | 71 | /* Can pages be swapped as part of reclaim? */ |
67 | int may_swap; | 72 | int may_swap; |
68 | 73 | ||
69 | /* This context's SWAP_CLUSTER_MAX. If freeing memory for | ||
70 | * suspend, we effectively ignore SWAP_CLUSTER_MAX. | ||
71 | * In this context, it doesn't matter that we scan the | ||
72 | * whole list at once. */ | ||
73 | int swap_cluster_max; | ||
74 | |||
75 | int swappiness; | 74 | int swappiness; |
76 | 75 | ||
77 | int all_unreclaimable; | 76 | int all_unreclaimable; |
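For reference, an abridged editorial sketch of struct scan_control after the two hunks above (members not touched here, and their exact ordering, are elided):

struct scan_control {				/* abridged sketch */
	unsigned long nr_scanned;		/* pages scanned so far */
	unsigned long nr_reclaimed;		/* pages freed so far in shrink_zones() */
	unsigned long nr_to_reclaim;		/* how many pages shrink_list() should reclaim */
	unsigned long hibernation_mode;		/* set by shrink_all_memory(): skip congestion naps */
	gfp_t gfp_mask;				/* this context's GFP mask */
	int may_writepage;
	int may_unmap;
	int may_swap;				/* can pages be swapped as part of reclaim? */
	int swappiness;
	/* swap_cluster_max is gone: scan batching is now fixed at SWAP_CLUSTER_MAX */
};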
@@ -358,7 +357,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, | |||
358 | * stalls if we need to run get_block(). We could test | 357 | * stalls if we need to run get_block(). We could test |
359 | * PagePrivate for that. | 358 | * PagePrivate for that. |
360 | * | 359 | * |
361 | * If this process is currently in generic_file_write() against | 360 | * If this process is currently in __generic_file_aio_write() against |
362 | * this page's queue, we can perform writeback even if that | 361 | * this page's queue, we can perform writeback even if that |
363 | * will block. | 362 | * will block. |
364 | * | 363 | * |
@@ -544,6 +543,16 @@ redo: | |||
544 | */ | 543 | */ |
545 | lru = LRU_UNEVICTABLE; | 544 | lru = LRU_UNEVICTABLE; |
546 | add_page_to_unevictable_list(page); | 545 | add_page_to_unevictable_list(page); |
546 | /* | ||
547 | * When racing with an mlock clearing (page is | ||
548 | * unlocked), make sure that if the other thread does | ||
549 | * not observe our setting of PG_lru and fails | ||
550 | * isolation, we see PG_mlocked cleared below and move | ||
551 | * the page back to the evictable list. | ||
552 | * | ||
553 | * The other side is TestClearPageMlocked(). | ||
554 | */ | ||
555 | smp_mb(); | ||
547 | } | 556 | } |
548 | 557 | ||
549 | /* | 558 | /* |
@@ -1088,7 +1097,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
1088 | int lumpy_reclaim = 0; | 1097 | int lumpy_reclaim = 0; |
1089 | 1098 | ||
1090 | while (unlikely(too_many_isolated(zone, file, sc))) { | 1099 | while (unlikely(too_many_isolated(zone, file, sc))) { |
1091 | congestion_wait(WRITE, HZ/10); | 1100 | congestion_wait(BLK_RW_ASYNC, HZ/10); |
1092 | 1101 | ||
1093 | /* We are about to die and free our memory. Return now. */ | 1102 | /* We are about to die and free our memory. Return now. */ |
1094 | if (fatal_signal_pending(current)) | 1103 | if (fatal_signal_pending(current)) |
@@ -1122,7 +1131,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
1122 | unsigned long nr_anon; | 1131 | unsigned long nr_anon; |
1123 | unsigned long nr_file; | 1132 | unsigned long nr_file; |
1124 | 1133 | ||
1125 | nr_taken = sc->isolate_pages(sc->swap_cluster_max, | 1134 | nr_taken = sc->isolate_pages(SWAP_CLUSTER_MAX, |
1126 | &page_list, &nr_scan, sc->order, mode, | 1135 | &page_list, &nr_scan, sc->order, mode, |
1127 | zone, sc->mem_cgroup, 0, file); | 1136 | zone, sc->mem_cgroup, 0, file); |
1128 | 1137 | ||
@@ -1156,10 +1165,8 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
1156 | __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon); | 1165 | __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon); |
1157 | __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file); | 1166 | __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file); |
1158 | 1167 | ||
1159 | reclaim_stat->recent_scanned[0] += count[LRU_INACTIVE_ANON]; | 1168 | reclaim_stat->recent_scanned[0] += nr_anon; |
1160 | reclaim_stat->recent_scanned[0] += count[LRU_ACTIVE_ANON]; | 1169 | reclaim_stat->recent_scanned[1] += nr_file; |
1161 | reclaim_stat->recent_scanned[1] += count[LRU_INACTIVE_FILE]; | ||
1162 | reclaim_stat->recent_scanned[1] += count[LRU_ACTIVE_FILE]; | ||
1163 | 1170 | ||
1164 | spin_unlock_irq(&zone->lru_lock); | 1171 | spin_unlock_irq(&zone->lru_lock); |
1165 | 1172 | ||
@@ -1356,7 +1363,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
1356 | * IO, plus JVM can create lots of anon VM_EXEC pages, | 1363 | * IO, plus JVM can create lots of anon VM_EXEC pages, |
1357 | * so we ignore them here. | 1364 | * so we ignore them here. |
1358 | */ | 1365 | */ |
1359 | if ((vm_flags & VM_EXEC) && !PageAnon(page)) { | 1366 | if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) { |
1360 | list_add(&page->lru, &l_active); | 1367 | list_add(&page->lru, &l_active); |
1361 | continue; | 1368 | continue; |
1362 | } | 1369 | } |
@@ -1454,20 +1461,26 @@ static int inactive_file_is_low(struct zone *zone, struct scan_control *sc) | |||
1454 | return low; | 1461 | return low; |
1455 | } | 1462 | } |
1456 | 1463 | ||
1464 | static int inactive_list_is_low(struct zone *zone, struct scan_control *sc, | ||
1465 | int file) | ||
1466 | { | ||
1467 | if (file) | ||
1468 | return inactive_file_is_low(zone, sc); | ||
1469 | else | ||
1470 | return inactive_anon_is_low(zone, sc); | ||
1471 | } | ||
1472 | |||
1457 | static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, | 1473 | static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, |
1458 | struct zone *zone, struct scan_control *sc, int priority) | 1474 | struct zone *zone, struct scan_control *sc, int priority) |
1459 | { | 1475 | { |
1460 | int file = is_file_lru(lru); | 1476 | int file = is_file_lru(lru); |
1461 | 1477 | ||
1462 | if (lru == LRU_ACTIVE_FILE && inactive_file_is_low(zone, sc)) { | 1478 | if (is_active_lru(lru)) { |
1463 | shrink_active_list(nr_to_scan, zone, sc, priority, file); | 1479 | if (inactive_list_is_low(zone, sc, file)) |
1480 | shrink_active_list(nr_to_scan, zone, sc, priority, file); | ||
1464 | return 0; | 1481 | return 0; |
1465 | } | 1482 | } |
1466 | 1483 | ||
1467 | if (lru == LRU_ACTIVE_ANON && inactive_anon_is_low(zone, sc)) { | ||
1468 | shrink_active_list(nr_to_scan, zone, sc, priority, file); | ||
1469 | return 0; | ||
1470 | } | ||
1471 | return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); | 1484 | return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); |
1472 | } | 1485 | } |
1473 | 1486 | ||
@@ -1557,15 +1570,14 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, | |||
1557 | * until we collected @swap_cluster_max pages to scan. | 1570 | * until we collected @swap_cluster_max pages to scan. |
1558 | */ | 1571 | */ |
1559 | static unsigned long nr_scan_try_batch(unsigned long nr_to_scan, | 1572 | static unsigned long nr_scan_try_batch(unsigned long nr_to_scan, |
1560 | unsigned long *nr_saved_scan, | 1573 | unsigned long *nr_saved_scan) |
1561 | unsigned long swap_cluster_max) | ||
1562 | { | 1574 | { |
1563 | unsigned long nr; | 1575 | unsigned long nr; |
1564 | 1576 | ||
1565 | *nr_saved_scan += nr_to_scan; | 1577 | *nr_saved_scan += nr_to_scan; |
1566 | nr = *nr_saved_scan; | 1578 | nr = *nr_saved_scan; |
1567 | 1579 | ||
1568 | if (nr >= swap_cluster_max) | 1580 | if (nr >= SWAP_CLUSTER_MAX) |
1569 | *nr_saved_scan = 0; | 1581 | *nr_saved_scan = 0; |
1570 | else | 1582 | else |
1571 | nr = 0; | 1583 | nr = 0; |
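A short worked trace of the batching above (editorial sketch; try_batch is a local copy of the logic, and SWAP_CLUSTER_MAX is 32 in this kernel):

#define SWAP_CLUSTER_MAX 32UL			/* matches include/linux/swap.h */

static unsigned long try_batch(unsigned long nr_to_scan,
			       unsigned long *nr_saved_scan)
{
	unsigned long nr;

	*nr_saved_scan += nr_to_scan;		/* accumulate small requests */
	nr = *nr_saved_scan;
	if (nr >= SWAP_CLUSTER_MAX)
		*nr_saved_scan = 0;		/* release one full batch */
	else
		nr = 0;				/* not enough yet: scan nothing */
	return nr;
}

/*
 * try_batch(10, &saved) -> 0    (saved = 10)
 * try_batch(10, &saved) -> 0    (saved = 20)
 * try_batch(15, &saved) -> 35   (saved reset; one batch of 35 pages is scanned)
 */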
@@ -1584,7 +1596,7 @@ static void shrink_zone(int priority, struct zone *zone, | |||
1584 | unsigned long percent[2]; /* anon @ 0; file @ 1 */ | 1596 | unsigned long percent[2]; /* anon @ 0; file @ 1 */ |
1585 | enum lru_list l; | 1597 | enum lru_list l; |
1586 | unsigned long nr_reclaimed = sc->nr_reclaimed; | 1598 | unsigned long nr_reclaimed = sc->nr_reclaimed; |
1587 | unsigned long swap_cluster_max = sc->swap_cluster_max; | 1599 | unsigned long nr_to_reclaim = sc->nr_to_reclaim; |
1588 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); | 1600 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); |
1589 | int noswap = 0; | 1601 | int noswap = 0; |
1590 | 1602 | ||
@@ -1606,15 +1618,15 @@ static void shrink_zone(int priority, struct zone *zone, | |||
1606 | scan = (scan * percent[file]) / 100; | 1618 | scan = (scan * percent[file]) / 100; |
1607 | } | 1619 | } |
1608 | nr[l] = nr_scan_try_batch(scan, | 1620 | nr[l] = nr_scan_try_batch(scan, |
1609 | &reclaim_stat->nr_saved_scan[l], | 1621 | &reclaim_stat->nr_saved_scan[l]); |
1610 | swap_cluster_max); | ||
1611 | } | 1622 | } |
1612 | 1623 | ||
1613 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || | 1624 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || |
1614 | nr[LRU_INACTIVE_FILE]) { | 1625 | nr[LRU_INACTIVE_FILE]) { |
1615 | for_each_evictable_lru(l) { | 1626 | for_each_evictable_lru(l) { |
1616 | if (nr[l]) { | 1627 | if (nr[l]) { |
1617 | nr_to_scan = min(nr[l], swap_cluster_max); | 1628 | nr_to_scan = min_t(unsigned long, |
1629 | nr[l], SWAP_CLUSTER_MAX); | ||
1618 | nr[l] -= nr_to_scan; | 1630 | nr[l] -= nr_to_scan; |
1619 | 1631 | ||
1620 | nr_reclaimed += shrink_list(l, nr_to_scan, | 1632 | nr_reclaimed += shrink_list(l, nr_to_scan, |
@@ -1629,8 +1641,7 @@ static void shrink_zone(int priority, struct zone *zone, | |||
1629 | * with multiple processes reclaiming pages, the total | 1641 | * with multiple processes reclaiming pages, the total |
1630 | * freeing target can get unreasonably large. | 1642 | * freeing target can get unreasonably large. |
1631 | */ | 1643 | */ |
1632 | if (nr_reclaimed > swap_cluster_max && | 1644 | if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) |
1633 | priority < DEF_PRIORITY && !current_is_kswapd()) | ||
1634 | break; | 1645 | break; |
1635 | } | 1646 | } |
1636 | 1647 | ||
@@ -1728,6 +1739,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
1728 | struct zoneref *z; | 1739 | struct zoneref *z; |
1729 | struct zone *zone; | 1740 | struct zone *zone; |
1730 | enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); | 1741 | enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); |
1742 | unsigned long writeback_threshold; | ||
1731 | 1743 | ||
1732 | delayacct_freepages_start(); | 1744 | delayacct_freepages_start(); |
1733 | 1745 | ||
@@ -1763,7 +1775,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
1763 | } | 1775 | } |
1764 | } | 1776 | } |
1765 | total_scanned += sc->nr_scanned; | 1777 | total_scanned += sc->nr_scanned; |
1766 | if (sc->nr_reclaimed >= sc->swap_cluster_max) { | 1778 | if (sc->nr_reclaimed >= sc->nr_to_reclaim) { |
1767 | ret = sc->nr_reclaimed; | 1779 | ret = sc->nr_reclaimed; |
1768 | goto out; | 1780 | goto out; |
1769 | } | 1781 | } |
@@ -1775,14 +1787,15 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
1775 | * that's undesirable in laptop mode, where we *want* lumpy | 1787 | * that's undesirable in laptop mode, where we *want* lumpy |
1776 | * writeout. So in laptop mode, write out the whole world. | 1788 | * writeout. So in laptop mode, write out the whole world. |
1777 | */ | 1789 | */ |
1778 | if (total_scanned > sc->swap_cluster_max + | 1790 | writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2; |
1779 | sc->swap_cluster_max / 2) { | 1791 | if (total_scanned > writeback_threshold) { |
1780 | wakeup_flusher_threads(laptop_mode ? 0 : total_scanned); | 1792 | wakeup_flusher_threads(laptop_mode ? 0 : total_scanned); |
1781 | sc->may_writepage = 1; | 1793 | sc->may_writepage = 1; |
1782 | } | 1794 | } |
1783 | 1795 | ||
1784 | /* Take a nap, wait for some writeback to complete */ | 1796 | /* Take a nap, wait for some writeback to complete */ |
1785 | if (sc->nr_scanned && priority < DEF_PRIORITY - 2) | 1797 | if (!sc->hibernation_mode && sc->nr_scanned && |
1798 | priority < DEF_PRIORITY - 2) | ||
1786 | congestion_wait(BLK_RW_ASYNC, HZ/10); | 1799 | congestion_wait(BLK_RW_ASYNC, HZ/10); |
1787 | } | 1800 | } |
1788 | /* top priority shrink_zones still had more to do? don't OOM, then */ | 1801 | /* top priority shrink_zones still had more to do? don't OOM, then */ |
@@ -1821,7 +1834,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | |||
1821 | struct scan_control sc = { | 1834 | struct scan_control sc = { |
1822 | .gfp_mask = gfp_mask, | 1835 | .gfp_mask = gfp_mask, |
1823 | .may_writepage = !laptop_mode, | 1836 | .may_writepage = !laptop_mode, |
1824 | .swap_cluster_max = SWAP_CLUSTER_MAX, | 1837 | .nr_to_reclaim = SWAP_CLUSTER_MAX, |
1825 | .may_unmap = 1, | 1838 | .may_unmap = 1, |
1826 | .may_swap = 1, | 1839 | .may_swap = 1, |
1827 | .swappiness = vm_swappiness, | 1840 | .swappiness = vm_swappiness, |
@@ -1845,7 +1858,6 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, | |||
1845 | .may_writepage = !laptop_mode, | 1858 | .may_writepage = !laptop_mode, |
1846 | .may_unmap = 1, | 1859 | .may_unmap = 1, |
1847 | .may_swap = !noswap, | 1860 | .may_swap = !noswap, |
1848 | .swap_cluster_max = SWAP_CLUSTER_MAX, | ||
1849 | .swappiness = swappiness, | 1861 | .swappiness = swappiness, |
1850 | .order = 0, | 1862 | .order = 0, |
1851 | .mem_cgroup = mem, | 1863 | .mem_cgroup = mem, |
@@ -1879,7 +1891,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | |||
1879 | .may_writepage = !laptop_mode, | 1891 | .may_writepage = !laptop_mode, |
1880 | .may_unmap = 1, | 1892 | .may_unmap = 1, |
1881 | .may_swap = !noswap, | 1893 | .may_swap = !noswap, |
1882 | .swap_cluster_max = SWAP_CLUSTER_MAX, | 1894 | .nr_to_reclaim = SWAP_CLUSTER_MAX, |
1883 | .swappiness = swappiness, | 1895 | .swappiness = swappiness, |
1884 | .order = 0, | 1896 | .order = 0, |
1885 | .mem_cgroup = mem_cont, | 1897 | .mem_cgroup = mem_cont, |
@@ -1894,6 +1906,30 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | |||
1894 | } | 1906 | } |
1895 | #endif | 1907 | #endif |
1896 | 1908 | ||
1909 | /* is kswapd sleeping prematurely? */ | ||
1910 | static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) | ||
1911 | { | ||
1912 | int i; | ||
1913 | |||
1914 | /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ | ||
1915 | if (remaining) | ||
1916 | return 1; | ||
1917 | |||
1918 | /* If after HZ/10, a zone is below the high mark, it's premature */ | ||
1919 | for (i = 0; i < pgdat->nr_zones; i++) { | ||
1920 | struct zone *zone = pgdat->node_zones + i; | ||
1921 | |||
1922 | if (!populated_zone(zone)) | ||
1923 | continue; | ||
1924 | |||
1925 | if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), | ||
1926 | 0, 0)) | ||
1927 | return 1; | ||
1928 | } | ||
1929 | |||
1930 | return 0; | ||
1931 | } | ||
1932 | |||
1897 | /* | 1933 | /* |
1898 | * For kswapd, balance_pgdat() will work across all this node's zones until | 1934 | * For kswapd, balance_pgdat() will work across all this node's zones until |
1899 | * they are all at high_wmark_pages(zone). | 1935 | * they are all at high_wmark_pages(zone). |
@@ -1926,7 +1962,11 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) | |||
1926 | .gfp_mask = GFP_KERNEL, | 1962 | .gfp_mask = GFP_KERNEL, |
1927 | .may_unmap = 1, | 1963 | .may_unmap = 1, |
1928 | .may_swap = 1, | 1964 | .may_swap = 1, |
1929 | .swap_cluster_max = SWAP_CLUSTER_MAX, | 1965 | /* |
1966 | * kswapd shouldn't bail out of reclaim early, because we want | ||
1967 | * to put equal scanning pressure on each zone. | ||
1968 | */ | ||
1969 | .nr_to_reclaim = ULONG_MAX, | ||
1930 | .swappiness = vm_swappiness, | 1970 | .swappiness = vm_swappiness, |
1931 | .order = order, | 1971 | .order = order, |
1932 | .mem_cgroup = NULL, | 1972 | .mem_cgroup = NULL, |
@@ -1951,6 +1991,7 @@ loop_again: | |||
1951 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { | 1991 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { |
1952 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ | 1992 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ |
1953 | unsigned long lru_pages = 0; | 1993 | unsigned long lru_pages = 0; |
1994 | int has_under_min_watermark_zone = 0; | ||
1954 | 1995 | ||
1955 | /* The swap token gets in the way of swapout... */ | 1996 | /* The swap token gets in the way of swapout... */ |
1956 | if (!priority) | 1997 | if (!priority) |
@@ -2057,6 +2098,15 @@ loop_again: | |||
2057 | if (total_scanned > SWAP_CLUSTER_MAX * 2 && | 2098 | if (total_scanned > SWAP_CLUSTER_MAX * 2 && |
2058 | total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) | 2099 | total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) |
2059 | sc.may_writepage = 1; | 2100 | sc.may_writepage = 1; |
2101 | |||
2102 | /* | ||
2103 | * We are still under the min watermark, which means we risk | ||
2104 | * GFP_ATOMIC allocation failures. Hurry up! | ||
2105 | */ | ||
2106 | if (!zone_watermark_ok(zone, order, min_wmark_pages(zone), | ||
2107 | end_zone, 0)) | ||
2108 | has_under_min_watermark_zone = 1; | ||
2109 | |||
2060 | } | 2110 | } |
2061 | if (all_zones_ok) | 2111 | if (all_zones_ok) |
2062 | break; /* kswapd: all done */ | 2112 | break; /* kswapd: all done */ |
@@ -2064,8 +2114,12 @@ loop_again: | |||
2064 | * OK, kswapd is getting into trouble. Take a nap, then take | 2114 | * OK, kswapd is getting into trouble. Take a nap, then take |
2065 | * another pass across the zones. | 2115 | * another pass across the zones. |
2066 | */ | 2116 | */ |
2067 | if (total_scanned && priority < DEF_PRIORITY - 2) | 2117 | if (total_scanned && (priority < DEF_PRIORITY - 2)) { |
2068 | congestion_wait(BLK_RW_ASYNC, HZ/10); | 2118 | if (has_under_min_watermark_zone) |
2119 | count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT); | ||
2120 | else | ||
2121 | congestion_wait(BLK_RW_ASYNC, HZ/10); | ||
2122 | } | ||
2069 | 2123 | ||
2070 | /* | 2124 | /* |
2071 | * We do this so kswapd doesn't build up large priorities for | 2125 | * We do this so kswapd doesn't build up large priorities for |
@@ -2163,6 +2217,7 @@ static int kswapd(void *p) | |||
2163 | order = 0; | 2217 | order = 0; |
2164 | for ( ; ; ) { | 2218 | for ( ; ; ) { |
2165 | unsigned long new_order; | 2219 | unsigned long new_order; |
2220 | int ret; | ||
2166 | 2221 | ||
2167 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); | 2222 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); |
2168 | new_order = pgdat->kswapd_max_order; | 2223 | new_order = pgdat->kswapd_max_order; |
@@ -2174,19 +2229,45 @@ static int kswapd(void *p) | |||
2174 | */ | 2229 | */ |
2175 | order = new_order; | 2230 | order = new_order; |
2176 | } else { | 2231 | } else { |
2177 | if (!freezing(current)) | 2232 | if (!freezing(current) && !kthread_should_stop()) { |
2178 | schedule(); | 2233 | long remaining = 0; |
2234 | |||
2235 | /* Try to sleep for a short interval */ | ||
2236 | if (!sleeping_prematurely(pgdat, order, remaining)) { | ||
2237 | remaining = schedule_timeout(HZ/10); | ||
2238 | finish_wait(&pgdat->kswapd_wait, &wait); | ||
2239 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); | ||
2240 | } | ||
2241 | |||
2242 | /* | ||
2243 | * After a short sleep, check if it was a | ||
2244 | * premature sleep. If not, then go fully | ||
2245 | * to sleep until explicitly woken up | ||
2246 | */ | ||
2247 | if (!sleeping_prematurely(pgdat, order, remaining)) | ||
2248 | schedule(); | ||
2249 | else { | ||
2250 | if (remaining) | ||
2251 | count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); | ||
2252 | else | ||
2253 | count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY); | ||
2254 | } | ||
2255 | } | ||
2179 | 2256 | ||
2180 | order = pgdat->kswapd_max_order; | 2257 | order = pgdat->kswapd_max_order; |
2181 | } | 2258 | } |
2182 | finish_wait(&pgdat->kswapd_wait, &wait); | 2259 | finish_wait(&pgdat->kswapd_wait, &wait); |
2183 | 2260 | ||
2184 | if (!try_to_freeze()) { | 2261 | ret = try_to_freeze(); |
2185 | /* We can speed up thawing tasks if we don't call | 2262 | if (kthread_should_stop()) |
2186 | * balance_pgdat after returning from the refrigerator | 2263 | break; |
2187 | */ | 2264 | |
2265 | /* | ||
2266 | * We can speed up thawing tasks if we don't call balance_pgdat | ||
2267 | * after returning from the refrigerator | ||
2268 | */ | ||
2269 | if (!ret) | ||
2188 | balance_pgdat(pgdat, order); | 2270 | balance_pgdat(pgdat, order); |
2189 | } | ||
2190 | } | 2271 | } |
2191 | return 0; | 2272 | return 0; |
2192 | } | 2273 | } |
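Stripped of the waitqueue plumbing (finish_wait()/prepare_to_wait()), the kswapd nap added above reduces to the following control flow (editorial sketch, not additional patch content):

if (!freezing(current) && !kthread_should_stop()) {
	long remaining = 0;

	/* Stage 1: a short HZ/10 nap, but only if no zone is below its high wmark */
	if (!sleeping_prematurely(pgdat, order, remaining))
		remaining = schedule_timeout(HZ / 10);

	/* Stage 2: still nothing to do after the nap? Sleep until explicitly woken. */
	if (!sleeping_prematurely(pgdat, order, remaining))
		schedule();
	else if (remaining)
		count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);	/* woken again within HZ/10 */
	else
		count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);	/* a zone dipped below its high wmark */
}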
@@ -2250,148 +2331,43 @@ unsigned long zone_reclaimable_pages(struct zone *zone) | |||
2250 | 2331 | ||
2251 | #ifdef CONFIG_HIBERNATION | 2332 | #ifdef CONFIG_HIBERNATION |
2252 | /* | 2333 | /* |
2253 | * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages | 2334 | * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of |
2254 | * from LRU lists system-wide, for given pass and priority. | ||
2255 | * | ||
2256 | * For pass > 3 we also try to shrink the LRU lists that contain a few pages | ||
2257 | */ | ||
2258 | static void shrink_all_zones(unsigned long nr_pages, int prio, | ||
2259 | int pass, struct scan_control *sc) | ||
2260 | { | ||
2261 | struct zone *zone; | ||
2262 | unsigned long nr_reclaimed = 0; | ||
2263 | struct zone_reclaim_stat *reclaim_stat; | ||
2264 | |||
2265 | for_each_populated_zone(zone) { | ||
2266 | enum lru_list l; | ||
2267 | |||
2268 | if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY) | ||
2269 | continue; | ||
2270 | |||
2271 | for_each_evictable_lru(l) { | ||
2272 | enum zone_stat_item ls = NR_LRU_BASE + l; | ||
2273 | unsigned long lru_pages = zone_page_state(zone, ls); | ||
2274 | |||
2275 | /* For pass = 0, we don't shrink the active list */ | ||
2276 | if (pass == 0 && (l == LRU_ACTIVE_ANON || | ||
2277 | l == LRU_ACTIVE_FILE)) | ||
2278 | continue; | ||
2279 | |||
2280 | reclaim_stat = get_reclaim_stat(zone, sc); | ||
2281 | reclaim_stat->nr_saved_scan[l] += | ||
2282 | (lru_pages >> prio) + 1; | ||
2283 | if (reclaim_stat->nr_saved_scan[l] | ||
2284 | >= nr_pages || pass > 3) { | ||
2285 | unsigned long nr_to_scan; | ||
2286 | |||
2287 | reclaim_stat->nr_saved_scan[l] = 0; | ||
2288 | nr_to_scan = min(nr_pages, lru_pages); | ||
2289 | nr_reclaimed += shrink_list(l, nr_to_scan, zone, | ||
2290 | sc, prio); | ||
2291 | if (nr_reclaimed >= nr_pages) { | ||
2292 | sc->nr_reclaimed += nr_reclaimed; | ||
2293 | return; | ||
2294 | } | ||
2295 | } | ||
2296 | } | ||
2297 | } | ||
2298 | sc->nr_reclaimed += nr_reclaimed; | ||
2299 | } | ||
2300 | |||
2301 | /* | ||
2302 | * Try to free `nr_pages' of memory, system-wide, and return the number of | ||
2303 | * freed pages. | 2335 | * freed pages. |
2304 | * | 2336 | * |
2305 | * Rather than trying to age LRUs the aim is to preserve the overall | 2337 | * Rather than trying to age LRUs the aim is to preserve the overall |
2306 | * LRU order by reclaiming preferentially | 2338 | * LRU order by reclaiming preferentially |
2307 | * inactive > active > active referenced > active mapped | 2339 | * inactive > active > active referenced > active mapped |
2308 | */ | 2340 | */ |
2309 | unsigned long shrink_all_memory(unsigned long nr_pages) | 2341 | unsigned long shrink_all_memory(unsigned long nr_to_reclaim) |
2310 | { | 2342 | { |
2311 | unsigned long lru_pages, nr_slab; | ||
2312 | int pass; | ||
2313 | struct reclaim_state reclaim_state; | 2343 | struct reclaim_state reclaim_state; |
2314 | struct scan_control sc = { | 2344 | struct scan_control sc = { |
2315 | .gfp_mask = GFP_KERNEL, | 2345 | .gfp_mask = GFP_HIGHUSER_MOVABLE, |
2316 | .may_unmap = 0, | 2346 | .may_swap = 1, |
2347 | .may_unmap = 1, | ||
2317 | .may_writepage = 1, | 2348 | .may_writepage = 1, |
2349 | .nr_to_reclaim = nr_to_reclaim, | ||
2350 | .hibernation_mode = 1, | ||
2351 | .swappiness = vm_swappiness, | ||
2352 | .order = 0, | ||
2318 | .isolate_pages = isolate_pages_global, | 2353 | .isolate_pages = isolate_pages_global, |
2319 | .nr_reclaimed = 0, | ||
2320 | }; | 2354 | }; |
2355 | struct zonelist * zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); | ||
2356 | struct task_struct *p = current; | ||
2357 | unsigned long nr_reclaimed; | ||
2321 | 2358 | ||
2322 | current->reclaim_state = &reclaim_state; | 2359 | p->flags |= PF_MEMALLOC; |
2323 | 2360 | lockdep_set_current_reclaim_state(sc.gfp_mask); | |
2324 | lru_pages = global_reclaimable_pages(); | 2361 | reclaim_state.reclaimed_slab = 0; |
2325 | nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); | 2362 | p->reclaim_state = &reclaim_state; |
2326 | /* If slab caches are huge, it's better to hit them first */ | ||
2327 | while (nr_slab >= lru_pages) { | ||
2328 | reclaim_state.reclaimed_slab = 0; | ||
2329 | shrink_slab(nr_pages, sc.gfp_mask, lru_pages); | ||
2330 | if (!reclaim_state.reclaimed_slab) | ||
2331 | break; | ||
2332 | |||
2333 | sc.nr_reclaimed += reclaim_state.reclaimed_slab; | ||
2334 | if (sc.nr_reclaimed >= nr_pages) | ||
2335 | goto out; | ||
2336 | |||
2337 | nr_slab -= reclaim_state.reclaimed_slab; | ||
2338 | } | ||
2339 | |||
2340 | /* | ||
2341 | * We try to shrink LRUs in 5 passes: | ||
2342 | * 0 = Reclaim from inactive_list only | ||
2343 | * 1 = Reclaim from active list but don't reclaim mapped | ||
2344 | * 2 = 2nd pass of type 1 | ||
2345 | * 3 = Reclaim mapped (normal reclaim) | ||
2346 | * 4 = 2nd pass of type 3 | ||
2347 | */ | ||
2348 | for (pass = 0; pass < 5; pass++) { | ||
2349 | int prio; | ||
2350 | |||
2351 | /* Force reclaiming mapped pages in the passes #3 and #4 */ | ||
2352 | if (pass > 2) | ||
2353 | sc.may_unmap = 1; | ||
2354 | |||
2355 | for (prio = DEF_PRIORITY; prio >= 0; prio--) { | ||
2356 | unsigned long nr_to_scan = nr_pages - sc.nr_reclaimed; | ||
2357 | |||
2358 | sc.nr_scanned = 0; | ||
2359 | sc.swap_cluster_max = nr_to_scan; | ||
2360 | shrink_all_zones(nr_to_scan, prio, pass, &sc); | ||
2361 | if (sc.nr_reclaimed >= nr_pages) | ||
2362 | goto out; | ||
2363 | |||
2364 | reclaim_state.reclaimed_slab = 0; | ||
2365 | shrink_slab(sc.nr_scanned, sc.gfp_mask, | ||
2366 | global_reclaimable_pages()); | ||
2367 | sc.nr_reclaimed += reclaim_state.reclaimed_slab; | ||
2368 | if (sc.nr_reclaimed >= nr_pages) | ||
2369 | goto out; | ||
2370 | |||
2371 | if (sc.nr_scanned && prio < DEF_PRIORITY - 2) | ||
2372 | congestion_wait(BLK_RW_ASYNC, HZ / 10); | ||
2373 | } | ||
2374 | } | ||
2375 | |||
2376 | /* | ||
2377 | * If sc.nr_reclaimed = 0, we could not shrink LRUs, but there may be | ||
2378 | * something in slab caches | ||
2379 | */ | ||
2380 | if (!sc.nr_reclaimed) { | ||
2381 | do { | ||
2382 | reclaim_state.reclaimed_slab = 0; | ||
2383 | shrink_slab(nr_pages, sc.gfp_mask, | ||
2384 | global_reclaimable_pages()); | ||
2385 | sc.nr_reclaimed += reclaim_state.reclaimed_slab; | ||
2386 | } while (sc.nr_reclaimed < nr_pages && | ||
2387 | reclaim_state.reclaimed_slab > 0); | ||
2388 | } | ||
2389 | 2363 | ||
2364 | nr_reclaimed = do_try_to_free_pages(zonelist, &sc); | ||
2390 | 2365 | ||
2391 | out: | 2366 | p->reclaim_state = NULL; |
2392 | current->reclaim_state = NULL; | 2367 | lockdep_clear_current_reclaim_state(); |
2368 | p->flags &= ~PF_MEMALLOC; | ||
2393 | 2369 | ||
2394 | return sc.nr_reclaimed; | 2370 | return nr_reclaimed; |
2395 | } | 2371 | } |
2396 | #endif /* CONFIG_HIBERNATION */ | 2372 | #endif /* CONFIG_HIBERNATION */ |
2397 | 2373 | ||
@@ -2441,6 +2417,17 @@ int kswapd_run(int nid) | |||
2441 | return ret; | 2417 | return ret; |
2442 | } | 2418 | } |
2443 | 2419 | ||
2420 | /* | ||
2421 | * Called by memory hotplug when all memory in a node is offlined. | ||
2422 | */ | ||
2423 | void kswapd_stop(int nid) | ||
2424 | { | ||
2425 | struct task_struct *kswapd = NODE_DATA(nid)->kswapd; | ||
2426 | |||
2427 | if (kswapd) | ||
2428 | kthread_stop(kswapd); | ||
2429 | } | ||
2430 | |||
2444 | static int __init kswapd_init(void) | 2431 | static int __init kswapd_init(void) |
2445 | { | 2432 | { |
2446 | int nid; | 2433 | int nid; |
@@ -2543,8 +2530,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
2543 | .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), | 2530 | .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), |
2544 | .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), | 2531 | .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), |
2545 | .may_swap = 1, | 2532 | .may_swap = 1, |
2546 | .swap_cluster_max = max_t(unsigned long, nr_pages, | 2533 | .nr_to_reclaim = max_t(unsigned long, nr_pages, |
2547 | SWAP_CLUSTER_MAX), | 2534 | SWAP_CLUSTER_MAX), |
2548 | .gfp_mask = gfp_mask, | 2535 | .gfp_mask = gfp_mask, |
2549 | .swappiness = vm_swappiness, | 2536 | .swappiness = vm_swappiness, |
2550 | .order = order, | 2537 | .order = order, |
diff --git a/mm/vmstat.c b/mm/vmstat.c index dad2327e4580..6051fbab67ba 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -683,6 +683,9 @@ static const char * const vmstat_text[] = { | |||
683 | "slabs_scanned", | 683 | "slabs_scanned", |
684 | "kswapd_steal", | 684 | "kswapd_steal", |
685 | "kswapd_inodesteal", | 685 | "kswapd_inodesteal", |
686 | "kswapd_low_wmark_hit_quickly", | ||
687 | "kswapd_high_wmark_hit_quickly", | ||
688 | "kswapd_skip_congestion_wait", | ||
686 | "pageoutrun", | 689 | "pageoutrun", |
687 | "allocstall", | 690 | "allocstall", |
688 | 691 | ||
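The three new events above land in /proc/vmstat (assuming CONFIG_VM_EVENT_COUNTERS is enabled); a minimal userspace check, as a sketch:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[128];
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		if (strncmp(line, "kswapd_", 7) == 0)
			fputs(line, stdout);	/* prints the *_hit_quickly and skip_congestion_wait counters too */
	fclose(f);
	return 0;
}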