Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig          |   37
-rw-r--r--  mm/Kconfig.debug    |    1
-rw-r--r--  mm/Makefile         |    4
-rw-r--r--  mm/bootmem.c        |   12
-rw-r--r--  mm/bounce.c         |   10
-rw-r--r--  mm/fadvise.c        |    2
-rw-r--r--  mm/filemap.c        |  175
-rw-r--r--  mm/highmem.c        |    1
-rw-r--r--  mm/hugetlb.c        |  132
-rw-r--r--  mm/init-mm.c        |   20
-rw-r--r--  mm/internal.h       |   33
-rw-r--r--  mm/kmemcheck.c      |  122
-rw-r--r--  mm/kmemleak-test.c  |  111
-rw-r--r--  mm/kmemleak.c       | 1498
-rw-r--r--  mm/maccess.c        |    2
-rw-r--r--  mm/madvise.c        |   26
-rw-r--r--  mm/memcontrol.c     |   25
-rw-r--r--  mm/memory.c         |  128
-rw-r--r--  mm/memory_hotplug.c |    6
-rw-r--r--  mm/mempolicy.c      |  145
-rw-r--r--  mm/migrate.c        |    6
-rw-r--r--  mm/mlock.c          |   73
-rw-r--r--  mm/mmap.c           |    8
-rw-r--r--  mm/mprotect.c       |    2
-rw-r--r--  mm/nommu.c          |    3
-rw-r--r--  mm/oom_kill.c       |   84
-rw-r--r--  mm/page-writeback.c |   19
-rw-r--r--  mm/page_alloc.c     |  852
-rw-r--r--  mm/page_cgroup.c    |   17
-rw-r--r--  mm/page_io.c        |    2
-rw-r--r--  mm/percpu.c         |  141
-rw-r--r--  mm/readahead.c      |  145
-rw-r--r--  mm/rmap.c           |   40
-rw-r--r--  mm/shmem.c          |    6
-rw-r--r--  mm/slab.c           |  280
-rw-r--r--  mm/slob.c           |   19
-rw-r--r--  mm/slub.c           |   80
-rw-r--r--  mm/swap_state.c     |   19
-rw-r--r--  mm/swapfile.c       |  276
-rw-r--r--  mm/truncate.c       |   40
-rw-r--r--  mm/util.c           |   31
-rw-r--r--  mm/vmalloc.c        |   33
-rw-r--r--  mm/vmscan.c         |  376
-rw-r--r--  mm/vmstat.c         |   19
44 files changed, 3768 insertions, 1293 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index c2b57d81e153..c948d4ca8bde 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -128,11 +128,11 @@ config SPARSEMEM_VMEMMAP
128 | config MEMORY_HOTPLUG | 128 | config MEMORY_HOTPLUG |
129 | bool "Allow for memory hot-add" | 129 | bool "Allow for memory hot-add" |
130 | depends on SPARSEMEM || X86_64_ACPI_NUMA | 130 | depends on SPARSEMEM || X86_64_ACPI_NUMA |
131 | depends on HOTPLUG && !HIBERNATION && ARCH_ENABLE_MEMORY_HOTPLUG | 131 | depends on HOTPLUG && !(HIBERNATION && !S390) && ARCH_ENABLE_MEMORY_HOTPLUG |
132 | depends on (IA64 || X86 || PPC64 || SUPERH || S390) | 132 | depends on (IA64 || X86 || PPC64 || SUPERH || S390) |
133 | 133 | ||
134 | comment "Memory hotplug is currently incompatible with Software Suspend" | 134 | comment "Memory hotplug is currently incompatible with Software Suspend" |
135 | depends on SPARSEMEM && HOTPLUG && HIBERNATION | 135 | depends on SPARSEMEM && HOTPLUG && HIBERNATION && !S390 |
136 | 136 | ||
137 | config MEMORY_HOTPLUG_SPARSE | 137 | config MEMORY_HOTPLUG_SPARSE |
138 | def_bool y | 138 | def_bool y |
@@ -203,29 +203,36 @@ config VIRT_TO_BUS
203 | def_bool y | 203 | def_bool y |
204 | depends on !ARCH_NO_VIRT_TO_BUS | 204 | depends on !ARCH_NO_VIRT_TO_BUS |
205 | 205 | ||
206 | config UNEVICTABLE_LRU | ||
207 | bool "Add LRU list to track non-evictable pages" | ||
208 | default y | ||
209 | help | ||
210 | Keeps unevictable pages off of the active and inactive pageout | ||
211 | lists, so kswapd will not waste CPU time or have its balancing | ||
212 | algorithms thrown off by scanning these pages. Selecting this | ||
213 | will use one page flag and increase the code size a little, | ||
214 | say Y unless you know what you are doing. | ||
215 | |||
216 | See Documentation/vm/unevictable-lru.txt for more information. | ||
217 | |||
218 | config HAVE_MLOCK | 206 | config HAVE_MLOCK |
219 | bool | 207 | bool |
220 | default y if MMU=y | 208 | default y if MMU=y |
221 | 209 | ||
222 | config HAVE_MLOCKED_PAGE_BIT | 210 | config HAVE_MLOCKED_PAGE_BIT |
223 | bool | 211 | bool |
224 | default y if HAVE_MLOCK=y && UNEVICTABLE_LRU=y | 212 | default y if HAVE_MLOCK=y |
225 | 213 | ||
226 | config MMU_NOTIFIER | 214 | config MMU_NOTIFIER |
227 | bool | 215 | bool |
228 | 216 | ||
217 | config DEFAULT_MMAP_MIN_ADDR | ||
218 | int "Low address space to protect from user allocation" | ||
219 | default 4096 | ||
220 | help | ||
221 | This is the portion of low virtual memory which should be protected | ||
222 | from userspace allocation. Keeping a user from writing to low pages | ||
223 | can help reduce the impact of kernel NULL pointer bugs. | ||
224 | |||
225 | For most ia64, ppc64 and x86 users with lots of address space | ||
226 | a value of 65536 is reasonable and should cause no problems. | ||
227 | On arm and other archs it should not be higher than 32768. | ||
228 | Programs which use vm86 functionality would either need additional | ||
229 | permissions from either the LSM or the capabilities module or have | ||
230 | this protection disabled. | ||
231 | |||
232 | This value can be changed after boot using the | ||
233 | /proc/sys/vm/mmap_min_addr tunable. | ||
234 | |||
235 | |||
229 | config NOMMU_INITIAL_TRIM_EXCESS | 236 | config NOMMU_INITIAL_TRIM_EXCESS |
230 | int "Turn on mmap() excess space trimming before booting" | 237 | int "Turn on mmap() excess space trimming before booting" |
231 | depends on !MMU | 238 | depends on !MMU |
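Note: the DEFAULT_MMAP_MIN_ADDR help text above describes keeping low virtual addresses away from userspace, tunable at runtime through /proc/sys/vm/mmap_min_addr. A minimal userspace sketch of the effect, assuming a non-zero mmap_min_addr and no special capabilities (the exact errno, EPERM or EACCES, depends on the security module in use):

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <sys/mman.h>

int main(void)
{
	/* Ask for one page at virtual address 0, below mmap_min_addr. */
	void *p = mmap((void *)0, 4096, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);

	if (p == MAP_FAILED)
		printf("mmap at address 0 refused: %s\n", strerror(errno));
	else
		printf("mmap at address 0 succeeded; mmap_min_addr is likely 0\n");
	return 0;
}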
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index bb01e298f260..aa99fd1f7109 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -2,6 +2,7 @@ config DEBUG_PAGEALLOC
2 | bool "Debug page memory allocations" | 2 | bool "Debug page memory allocations" |
3 | depends on DEBUG_KERNEL && ARCH_SUPPORTS_DEBUG_PAGEALLOC | 3 | depends on DEBUG_KERNEL && ARCH_SUPPORTS_DEBUG_PAGEALLOC |
4 | depends on !HIBERNATION || !PPC && !SPARC | 4 | depends on !HIBERNATION || !PPC && !SPARC |
5 | depends on !KMEMCHECK | ||
5 | ---help--- | 6 | ---help--- |
6 | Unmap pages from the kernel linear mapping after free_pages(). | 7 | Unmap pages from the kernel linear mapping after free_pages(). |
7 | This results in a large slowdown, but helps to find certain types | 8 | This results in a large slowdown, but helps to find certain types |
diff --git a/mm/Makefile b/mm/Makefile
index ec73c68b6015..5e0bd6426693 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -12,6 +12,7 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
12 | readahead.o swap.o truncate.o vmscan.o shmem.o \ | 12 | readahead.o swap.o truncate.o vmscan.o shmem.o \ |
13 | prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ | 13 | prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ |
14 | page_isolation.o mm_init.o $(mmu-y) | 14 | page_isolation.o mm_init.o $(mmu-y) |
15 | obj-y += init-mm.o | ||
15 | 16 | ||
16 | obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o | 17 | obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o |
17 | obj-$(CONFIG_BOUNCE) += bounce.o | 18 | obj-$(CONFIG_BOUNCE) += bounce.o |
@@ -27,6 +28,7 @@ obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
27 | obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o | 28 | obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o |
28 | obj-$(CONFIG_SLAB) += slab.o | 29 | obj-$(CONFIG_SLAB) += slab.o |
29 | obj-$(CONFIG_SLUB) += slub.o | 30 | obj-$(CONFIG_SLUB) += slub.o |
31 | obj-$(CONFIG_KMEMCHECK) += kmemcheck.o | ||
30 | obj-$(CONFIG_FAILSLAB) += failslab.o | 32 | obj-$(CONFIG_FAILSLAB) += failslab.o |
31 | obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o | 33 | obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o |
32 | obj-$(CONFIG_FS_XIP) += filemap_xip.o | 34 | obj-$(CONFIG_FS_XIP) += filemap_xip.o |
@@ -38,3 +40,5 @@ obj-$(CONFIG_SMP) += allocpercpu.o
38 | endif | 40 | endif |
39 | obj-$(CONFIG_QUICKLIST) += quicklist.o | 41 | obj-$(CONFIG_QUICKLIST) += quicklist.o |
40 | obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o | 42 | obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o |
43 | obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o | ||
44 | obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o | ||
diff --git a/mm/bootmem.c b/mm/bootmem.c
index daf92713f7de..282df0a09e6f 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -532,6 +532,9 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata,
532 | unsigned long size, unsigned long align, | 532 | unsigned long size, unsigned long align, |
533 | unsigned long goal, unsigned long limit) | 533 | unsigned long goal, unsigned long limit) |
534 | { | 534 | { |
535 | if (WARN_ON_ONCE(slab_is_available())) | ||
536 | return kzalloc(size, GFP_NOWAIT); | ||
537 | |||
535 | #ifdef CONFIG_HAVE_ARCH_BOOTMEM | 538 | #ifdef CONFIG_HAVE_ARCH_BOOTMEM |
536 | bootmem_data_t *p_bdata; | 539 | bootmem_data_t *p_bdata; |
537 | 540 | ||
@@ -662,6 +665,9 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
662 | void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, | 665 | void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, |
663 | unsigned long align, unsigned long goal) | 666 | unsigned long align, unsigned long goal) |
664 | { | 667 | { |
668 | if (WARN_ON_ONCE(slab_is_available())) | ||
669 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | ||
670 | |||
665 | return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); | 671 | return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); |
666 | } | 672 | } |
667 | 673 | ||
@@ -693,6 +699,9 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
693 | { | 699 | { |
694 | void *ptr; | 700 | void *ptr; |
695 | 701 | ||
702 | if (WARN_ON_ONCE(slab_is_available())) | ||
703 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | ||
704 | |||
696 | ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0); | 705 | ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0); |
697 | if (ptr) | 706 | if (ptr) |
698 | return ptr; | 707 | return ptr; |
@@ -745,6 +754,9 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
745 | void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, | 754 | void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, |
746 | unsigned long align, unsigned long goal) | 755 | unsigned long align, unsigned long goal) |
747 | { | 756 | { |
757 | if (WARN_ON_ONCE(slab_is_available())) | ||
758 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | ||
759 | |||
748 | return ___alloc_bootmem_node(pgdat->bdata, size, align, | 760 | return ___alloc_bootmem_node(pgdat->bdata, size, align, |
749 | goal, ARCH_LOW_ADDRESS_LIMIT); | 761 | goal, ARCH_LOW_ADDRESS_LIMIT); |
750 | } | 762 | } |
diff --git a/mm/bounce.c b/mm/bounce.c
index e590272fe7a8..a2b76a588e34 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -13,17 +13,15 @@
13 | #include <linux/init.h> | 13 | #include <linux/init.h> |
14 | #include <linux/hash.h> | 14 | #include <linux/hash.h> |
15 | #include <linux/highmem.h> | 15 | #include <linux/highmem.h> |
16 | #include <linux/blktrace_api.h> | ||
17 | #include <trace/block.h> | ||
18 | #include <asm/tlbflush.h> | 16 | #include <asm/tlbflush.h> |
19 | 17 | ||
18 | #include <trace/events/block.h> | ||
19 | |||
20 | #define POOL_SIZE 64 | 20 | #define POOL_SIZE 64 |
21 | #define ISA_POOL_SIZE 16 | 21 | #define ISA_POOL_SIZE 16 |
22 | 22 | ||
23 | static mempool_t *page_pool, *isa_page_pool; | 23 | static mempool_t *page_pool, *isa_page_pool; |
24 | 24 | ||
25 | DEFINE_TRACE(block_bio_bounce); | ||
26 | |||
27 | #ifdef CONFIG_HIGHMEM | 25 | #ifdef CONFIG_HIGHMEM |
28 | static __init int init_emergency_pool(void) | 26 | static __init int init_emergency_pool(void) |
29 | { | 27 | { |
@@ -192,7 +190,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
192 | /* | 190 | /* |
193 | * is destination page below bounce pfn? | 191 | * is destination page below bounce pfn? |
194 | */ | 192 | */ |
195 | if (page_to_pfn(page) <= q->bounce_pfn) | 193 | if (page_to_pfn(page) <= queue_bounce_pfn(q)) |
196 | continue; | 194 | continue; |
197 | 195 | ||
198 | /* | 196 | /* |
@@ -284,7 +282,7 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
284 | * don't waste time iterating over bio segments | 282 | * don't waste time iterating over bio segments |
285 | */ | 283 | */ |
286 | if (!(q->bounce_gfp & GFP_DMA)) { | 284 | if (!(q->bounce_gfp & GFP_DMA)) { |
287 | if (q->bounce_pfn >= blk_max_pfn) | 285 | if (queue_bounce_pfn(q) >= blk_max_pfn) |
288 | return; | 286 | return; |
289 | pool = page_pool; | 287 | pool = page_pool; |
290 | } else { | 288 | } else { |
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 54a0f8040afa..e43359214f6f 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -101,7 +101,7 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
101 | 101 | ||
102 | ret = force_page_cache_readahead(mapping, file, | 102 | ret = force_page_cache_readahead(mapping, file, |
103 | start_index, | 103 | start_index, |
104 | max_sane_readahead(nrpages)); | 104 | nrpages); |
105 | if (ret > 0) | 105 | if (ret > 0) |
106 | ret = 0; | 106 | ret = 0; |
107 | break; | 107 | break; |
diff --git a/mm/filemap.c b/mm/filemap.c
index 379ff0bcbf6e..22396713feb9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -121,7 +121,6 @@ void __remove_from_page_cache(struct page *page)
121 | mapping->nrpages--; | 121 | mapping->nrpages--; |
122 | __dec_zone_page_state(page, NR_FILE_PAGES); | 122 | __dec_zone_page_state(page, NR_FILE_PAGES); |
123 | BUG_ON(page_mapped(page)); | 123 | BUG_ON(page_mapped(page)); |
124 | mem_cgroup_uncharge_cache_page(page); | ||
125 | 124 | ||
126 | /* | 125 | /* |
127 | * Some filesystems seem to re-dirty the page even after | 126 | * Some filesystems seem to re-dirty the page even after |
@@ -145,6 +144,7 @@ void remove_from_page_cache(struct page *page)
145 | spin_lock_irq(&mapping->tree_lock); | 144 | spin_lock_irq(&mapping->tree_lock); |
146 | __remove_from_page_cache(page); | 145 | __remove_from_page_cache(page); |
147 | spin_unlock_irq(&mapping->tree_lock); | 146 | spin_unlock_irq(&mapping->tree_lock); |
147 | mem_cgroup_uncharge_cache_page(page); | ||
148 | } | 148 | } |
149 | 149 | ||
150 | static int sync_page(void *word) | 150 | static int sync_page(void *word) |
@@ -476,13 +476,13 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
476 | if (likely(!error)) { | 476 | if (likely(!error)) { |
477 | mapping->nrpages++; | 477 | mapping->nrpages++; |
478 | __inc_zone_page_state(page, NR_FILE_PAGES); | 478 | __inc_zone_page_state(page, NR_FILE_PAGES); |
479 | spin_unlock_irq(&mapping->tree_lock); | ||
479 | } else { | 480 | } else { |
480 | page->mapping = NULL; | 481 | page->mapping = NULL; |
482 | spin_unlock_irq(&mapping->tree_lock); | ||
481 | mem_cgroup_uncharge_cache_page(page); | 483 | mem_cgroup_uncharge_cache_page(page); |
482 | page_cache_release(page); | 484 | page_cache_release(page); |
483 | } | 485 | } |
484 | |||
485 | spin_unlock_irq(&mapping->tree_lock); | ||
486 | radix_tree_preload_end(); | 486 | radix_tree_preload_end(); |
487 | } else | 487 | } else |
488 | mem_cgroup_uncharge_cache_page(page); | 488 | mem_cgroup_uncharge_cache_page(page); |
@@ -521,7 +521,7 @@ struct page *__page_cache_alloc(gfp_t gfp)
521 | { | 521 | { |
522 | if (cpuset_do_page_mem_spread()) { | 522 | if (cpuset_do_page_mem_spread()) { |
523 | int n = cpuset_mem_spread_node(); | 523 | int n = cpuset_mem_spread_node(); |
524 | return alloc_pages_node(n, gfp, 0); | 524 | return alloc_pages_exact_node(n, gfp, 0); |
525 | } | 525 | } |
526 | return alloc_pages(gfp, 0); | 526 | return alloc_pages(gfp, 0); |
527 | } | 527 | } |
@@ -1004,9 +1004,6 @@ EXPORT_SYMBOL(grab_cache_page_nowait);
1004 | static void shrink_readahead_size_eio(struct file *filp, | 1004 | static void shrink_readahead_size_eio(struct file *filp, |
1005 | struct file_ra_state *ra) | 1005 | struct file_ra_state *ra) |
1006 | { | 1006 | { |
1007 | if (!ra->ra_pages) | ||
1008 | return; | ||
1009 | |||
1010 | ra->ra_pages /= 4; | 1007 | ra->ra_pages /= 4; |
1011 | } | 1008 | } |
1012 | 1009 | ||
@@ -1390,8 +1387,7 @@ do_readahead(struct address_space *mapping, struct file *filp,
1390 | if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) | 1387 | if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) |
1391 | return -EINVAL; | 1388 | return -EINVAL; |
1392 | 1389 | ||
1393 | force_page_cache_readahead(mapping, filp, index, | 1390 | force_page_cache_readahead(mapping, filp, index, nr); |
1394 | max_sane_readahead(nr)); | ||
1395 | return 0; | 1391 | return 0; |
1396 | } | 1392 | } |
1397 | 1393 | ||
@@ -1457,6 +1453,73 @@ static int page_cache_read(struct file *file, pgoff_t offset)
1457 | 1453 | ||
1458 | #define MMAP_LOTSAMISS (100) | 1454 | #define MMAP_LOTSAMISS (100) |
1459 | 1455 | ||
1456 | /* | ||
1457 | * Synchronous readahead happens when we don't even find | ||
1458 | * a page in the page cache at all. | ||
1459 | */ | ||
1460 | static void do_sync_mmap_readahead(struct vm_area_struct *vma, | ||
1461 | struct file_ra_state *ra, | ||
1462 | struct file *file, | ||
1463 | pgoff_t offset) | ||
1464 | { | ||
1465 | unsigned long ra_pages; | ||
1466 | struct address_space *mapping = file->f_mapping; | ||
1467 | |||
1468 | /* If we don't want any read-ahead, don't bother */ | ||
1469 | if (VM_RandomReadHint(vma)) | ||
1470 | return; | ||
1471 | |||
1472 | if (VM_SequentialReadHint(vma) || | ||
1473 | offset - 1 == (ra->prev_pos >> PAGE_CACHE_SHIFT)) { | ||
1474 | page_cache_sync_readahead(mapping, ra, file, offset, | ||
1475 | ra->ra_pages); | ||
1476 | return; | ||
1477 | } | ||
1478 | |||
1479 | if (ra->mmap_miss < INT_MAX) | ||
1480 | ra->mmap_miss++; | ||
1481 | |||
1482 | /* | ||
1483 | * Do we miss much more than hit in this file? If so, | ||
1484 | * stop bothering with read-ahead. It will only hurt. | ||
1485 | */ | ||
1486 | if (ra->mmap_miss > MMAP_LOTSAMISS) | ||
1487 | return; | ||
1488 | |||
1489 | /* | ||
1490 | * mmap read-around | ||
1491 | */ | ||
1492 | ra_pages = max_sane_readahead(ra->ra_pages); | ||
1493 | if (ra_pages) { | ||
1494 | ra->start = max_t(long, 0, offset - ra_pages/2); | ||
1495 | ra->size = ra_pages; | ||
1496 | ra->async_size = 0; | ||
1497 | ra_submit(ra, mapping, file); | ||
1498 | } | ||
1499 | } | ||
1500 | |||
1501 | /* | ||
1502 | * Asynchronous readahead happens when we find the page and PG_readahead, | ||
1503 | * so we want to possibly extend the readahead further.. | ||
1504 | */ | ||
1505 | static void do_async_mmap_readahead(struct vm_area_struct *vma, | ||
1506 | struct file_ra_state *ra, | ||
1507 | struct file *file, | ||
1508 | struct page *page, | ||
1509 | pgoff_t offset) | ||
1510 | { | ||
1511 | struct address_space *mapping = file->f_mapping; | ||
1512 | |||
1513 | /* If we don't want any read-ahead, don't bother */ | ||
1514 | if (VM_RandomReadHint(vma)) | ||
1515 | return; | ||
1516 | if (ra->mmap_miss > 0) | ||
1517 | ra->mmap_miss--; | ||
1518 | if (PageReadahead(page)) | ||
1519 | page_cache_async_readahead(mapping, ra, file, | ||
1520 | page, offset, ra->ra_pages); | ||
1521 | } | ||
1522 | |||
1460 | /** | 1523 | /** |
1461 | * filemap_fault - read in file data for page fault handling | 1524 | * filemap_fault - read in file data for page fault handling |
1462 | * @vma: vma in which the fault was taken | 1525 | * @vma: vma in which the fault was taken |
@@ -1476,78 +1539,44 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1476 | struct address_space *mapping = file->f_mapping; | 1539 | struct address_space *mapping = file->f_mapping; |
1477 | struct file_ra_state *ra = &file->f_ra; | 1540 | struct file_ra_state *ra = &file->f_ra; |
1478 | struct inode *inode = mapping->host; | 1541 | struct inode *inode = mapping->host; |
1542 | pgoff_t offset = vmf->pgoff; | ||
1479 | struct page *page; | 1543 | struct page *page; |
1480 | pgoff_t size; | 1544 | pgoff_t size; |
1481 | int did_readaround = 0; | ||
1482 | int ret = 0; | 1545 | int ret = 0; |
1483 | 1546 | ||
1484 | size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | 1547 | size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; |
1485 | if (vmf->pgoff >= size) | 1548 | if (offset >= size) |
1486 | return VM_FAULT_SIGBUS; | 1549 | return VM_FAULT_SIGBUS; |
1487 | 1550 | ||
1488 | /* If we don't want any read-ahead, don't bother */ | ||
1489 | if (VM_RandomReadHint(vma)) | ||
1490 | goto no_cached_page; | ||
1491 | |||
1492 | /* | 1551 | /* |
1493 | * Do we have something in the page cache already? | 1552 | * Do we have something in the page cache already? |
1494 | */ | 1553 | */ |
1495 | retry_find: | 1554 | page = find_get_page(mapping, offset); |
1496 | page = find_lock_page(mapping, vmf->pgoff); | 1555 | if (likely(page)) { |
1497 | /* | ||
1498 | * For sequential accesses, we use the generic readahead logic. | ||
1499 | */ | ||
1500 | if (VM_SequentialReadHint(vma)) { | ||
1501 | if (!page) { | ||
1502 | page_cache_sync_readahead(mapping, ra, file, | ||
1503 | vmf->pgoff, 1); | ||
1504 | page = find_lock_page(mapping, vmf->pgoff); | ||
1505 | if (!page) | ||
1506 | goto no_cached_page; | ||
1507 | } | ||
1508 | if (PageReadahead(page)) { | ||
1509 | page_cache_async_readahead(mapping, ra, file, page, | ||
1510 | vmf->pgoff, 1); | ||
1511 | } | ||
1512 | } | ||
1513 | |||
1514 | if (!page) { | ||
1515 | unsigned long ra_pages; | ||
1516 | |||
1517 | ra->mmap_miss++; | ||
1518 | |||
1519 | /* | 1556 | /* |
1520 | * Do we miss much more than hit in this file? If so, | 1557 | * We found the page, so try async readahead before |
1521 | * stop bothering with read-ahead. It will only hurt. | 1558 | * waiting for the lock. |
1522 | */ | 1559 | */ |
1523 | if (ra->mmap_miss > MMAP_LOTSAMISS) | 1560 | do_async_mmap_readahead(vma, ra, file, page, offset); |
1524 | goto no_cached_page; | 1561 | lock_page(page); |
1525 | 1562 | ||
1526 | /* | 1563 | /* Did it get truncated? */ |
1527 | * To keep the pgmajfault counter straight, we need to | 1564 | if (unlikely(page->mapping != mapping)) { |
1528 | * check did_readaround, as this is an inner loop. | 1565 | unlock_page(page); |
1529 | */ | 1566 | put_page(page); |
1530 | if (!did_readaround) { | 1567 | goto no_cached_page; |
1531 | ret = VM_FAULT_MAJOR; | ||
1532 | count_vm_event(PGMAJFAULT); | ||
1533 | } | ||
1534 | did_readaround = 1; | ||
1535 | ra_pages = max_sane_readahead(file->f_ra.ra_pages); | ||
1536 | if (ra_pages) { | ||
1537 | pgoff_t start = 0; | ||
1538 | |||
1539 | if (vmf->pgoff > ra_pages / 2) | ||
1540 | start = vmf->pgoff - ra_pages / 2; | ||
1541 | do_page_cache_readahead(mapping, file, start, ra_pages); | ||
1542 | } | 1568 | } |
1543 | page = find_lock_page(mapping, vmf->pgoff); | 1569 | } else { |
1570 | /* No page in the page cache at all */ | ||
1571 | do_sync_mmap_readahead(vma, ra, file, offset); | ||
1572 | count_vm_event(PGMAJFAULT); | ||
1573 | ret = VM_FAULT_MAJOR; | ||
1574 | retry_find: | ||
1575 | page = find_lock_page(mapping, offset); | ||
1544 | if (!page) | 1576 | if (!page) |
1545 | goto no_cached_page; | 1577 | goto no_cached_page; |
1546 | } | 1578 | } |
1547 | 1579 | ||
1548 | if (!did_readaround) | ||
1549 | ra->mmap_miss--; | ||
1550 | |||
1551 | /* | 1580 | /* |
1552 | * We have a locked page in the page cache, now we need to check | 1581 | * We have a locked page in the page cache, now we need to check |
1553 | * that it's up-to-date. If not, it is going to be due to an error. | 1582 | * that it's up-to-date. If not, it is going to be due to an error. |
@@ -1555,18 +1584,18 @@ retry_find:
1555 | if (unlikely(!PageUptodate(page))) | 1584 | if (unlikely(!PageUptodate(page))) |
1556 | goto page_not_uptodate; | 1585 | goto page_not_uptodate; |
1557 | 1586 | ||
1558 | /* Must recheck i_size under page lock */ | 1587 | /* |
1588 | * Found the page and have a reference on it. | ||
1589 | * We must recheck i_size under page lock. | ||
1590 | */ | ||
1559 | size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | 1591 | size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; |
1560 | if (unlikely(vmf->pgoff >= size)) { | 1592 | if (unlikely(offset >= size)) { |
1561 | unlock_page(page); | 1593 | unlock_page(page); |
1562 | page_cache_release(page); | 1594 | page_cache_release(page); |
1563 | return VM_FAULT_SIGBUS; | 1595 | return VM_FAULT_SIGBUS; |
1564 | } | 1596 | } |
1565 | 1597 | ||
1566 | /* | 1598 | ra->prev_pos = (loff_t)offset << PAGE_CACHE_SHIFT; |
1567 | * Found the page and have a reference on it. | ||
1568 | */ | ||
1569 | ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT; | ||
1570 | vmf->page = page; | 1599 | vmf->page = page; |
1571 | return ret | VM_FAULT_LOCKED; | 1600 | return ret | VM_FAULT_LOCKED; |
1572 | 1601 | ||
@@ -1575,7 +1604,7 @@ no_cached_page:
1575 | * We're only likely to ever get here if MADV_RANDOM is in | 1604 | * We're only likely to ever get here if MADV_RANDOM is in |
1576 | * effect. | 1605 | * effect. |
1577 | */ | 1606 | */ |
1578 | error = page_cache_read(file, vmf->pgoff); | 1607 | error = page_cache_read(file, offset); |
1579 | 1608 | ||
1580 | /* | 1609 | /* |
1581 | * The page we want has now been added to the page cache. | 1610 | * The page we want has now been added to the page cache. |
@@ -1595,12 +1624,6 @@ no_cached_page:
1595 | return VM_FAULT_SIGBUS; | 1624 | return VM_FAULT_SIGBUS; |
1596 | 1625 | ||
1597 | page_not_uptodate: | 1626 | page_not_uptodate: |
1598 | /* IO error path */ | ||
1599 | if (!did_readaround) { | ||
1600 | ret = VM_FAULT_MAJOR; | ||
1601 | count_vm_event(PGMAJFAULT); | ||
1602 | } | ||
1603 | |||
1604 | /* | 1627 | /* |
1605 | * Umm, take care of errors if the page isn't up-to-date. | 1628 | * Umm, take care of errors if the page isn't up-to-date. |
1606 | * Try to re-read it _once_. We do this synchronously, | 1629 | * Try to re-read it _once_. We do this synchronously, |
diff --git a/mm/highmem.c b/mm/highmem.c
index 68eb1d9b63fa..25878cc49daa 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -26,7 +26,6 @@
26 | #include <linux/init.h> | 26 | #include <linux/init.h> |
27 | #include <linux/hash.h> | 27 | #include <linux/hash.h> |
28 | #include <linux/highmem.h> | 28 | #include <linux/highmem.h> |
29 | #include <linux/blktrace_api.h> | ||
30 | #include <asm/tlbflush.h> | 29 | #include <asm/tlbflush.h> |
31 | 30 | ||
32 | /* | 31 | /* |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 28c655ba9353..a56e6f3ce979 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -316,7 +316,7 @@ static void resv_map_release(struct kref *ref)
316 | static struct resv_map *vma_resv_map(struct vm_area_struct *vma) | 316 | static struct resv_map *vma_resv_map(struct vm_area_struct *vma) |
317 | { | 317 | { |
318 | VM_BUG_ON(!is_vm_hugetlb_page(vma)); | 318 | VM_BUG_ON(!is_vm_hugetlb_page(vma)); |
319 | if (!(vma->vm_flags & VM_SHARED)) | 319 | if (!(vma->vm_flags & VM_MAYSHARE)) |
320 | return (struct resv_map *)(get_vma_private_data(vma) & | 320 | return (struct resv_map *)(get_vma_private_data(vma) & |
321 | ~HPAGE_RESV_MASK); | 321 | ~HPAGE_RESV_MASK); |
322 | return NULL; | 322 | return NULL; |
@@ -325,7 +325,7 @@ static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
325 | static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) | 325 | static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) |
326 | { | 326 | { |
327 | VM_BUG_ON(!is_vm_hugetlb_page(vma)); | 327 | VM_BUG_ON(!is_vm_hugetlb_page(vma)); |
328 | VM_BUG_ON(vma->vm_flags & VM_SHARED); | 328 | VM_BUG_ON(vma->vm_flags & VM_MAYSHARE); |
329 | 329 | ||
330 | set_vma_private_data(vma, (get_vma_private_data(vma) & | 330 | set_vma_private_data(vma, (get_vma_private_data(vma) & |
331 | HPAGE_RESV_MASK) | (unsigned long)map); | 331 | HPAGE_RESV_MASK) | (unsigned long)map); |
@@ -334,7 +334,7 @@ static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
334 | static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) | 334 | static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) |
335 | { | 335 | { |
336 | VM_BUG_ON(!is_vm_hugetlb_page(vma)); | 336 | VM_BUG_ON(!is_vm_hugetlb_page(vma)); |
337 | VM_BUG_ON(vma->vm_flags & VM_SHARED); | 337 | VM_BUG_ON(vma->vm_flags & VM_MAYSHARE); |
338 | 338 | ||
339 | set_vma_private_data(vma, get_vma_private_data(vma) | flags); | 339 | set_vma_private_data(vma, get_vma_private_data(vma) | flags); |
340 | } | 340 | } |
@@ -353,7 +353,7 @@ static void decrement_hugepage_resv_vma(struct hstate *h,
353 | if (vma->vm_flags & VM_NORESERVE) | 353 | if (vma->vm_flags & VM_NORESERVE) |
354 | return; | 354 | return; |
355 | 355 | ||
356 | if (vma->vm_flags & VM_SHARED) { | 356 | if (vma->vm_flags & VM_MAYSHARE) { |
357 | /* Shared mappings always use reserves */ | 357 | /* Shared mappings always use reserves */ |
358 | h->resv_huge_pages--; | 358 | h->resv_huge_pages--; |
359 | } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { | 359 | } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { |
@@ -369,14 +369,14 @@ static void decrement_hugepage_resv_vma(struct hstate *h,
369 | void reset_vma_resv_huge_pages(struct vm_area_struct *vma) | 369 | void reset_vma_resv_huge_pages(struct vm_area_struct *vma) |
370 | { | 370 | { |
371 | VM_BUG_ON(!is_vm_hugetlb_page(vma)); | 371 | VM_BUG_ON(!is_vm_hugetlb_page(vma)); |
372 | if (!(vma->vm_flags & VM_SHARED)) | 372 | if (!(vma->vm_flags & VM_MAYSHARE)) |
373 | vma->vm_private_data = (void *)0; | 373 | vma->vm_private_data = (void *)0; |
374 | } | 374 | } |
375 | 375 | ||
376 | /* Returns true if the VMA has associated reserve pages */ | 376 | /* Returns true if the VMA has associated reserve pages */ |
377 | static int vma_has_reserves(struct vm_area_struct *vma) | 377 | static int vma_has_reserves(struct vm_area_struct *vma) |
378 | { | 378 | { |
379 | if (vma->vm_flags & VM_SHARED) | 379 | if (vma->vm_flags & VM_MAYSHARE) |
380 | return 1; | 380 | return 1; |
381 | if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) | 381 | if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) |
382 | return 1; | 382 | return 1; |
@@ -578,41 +578,6 @@ static void free_huge_page(struct page *page)
578 | hugetlb_put_quota(mapping, 1); | 578 | hugetlb_put_quota(mapping, 1); |
579 | } | 579 | } |
580 | 580 | ||
581 | /* | ||
582 | * Increment or decrement surplus_huge_pages. Keep node-specific counters | ||
583 | * balanced by operating on them in a round-robin fashion. | ||
584 | * Returns 1 if an adjustment was made. | ||
585 | */ | ||
586 | static int adjust_pool_surplus(struct hstate *h, int delta) | ||
587 | { | ||
588 | static int prev_nid; | ||
589 | int nid = prev_nid; | ||
590 | int ret = 0; | ||
591 | |||
592 | VM_BUG_ON(delta != -1 && delta != 1); | ||
593 | do { | ||
594 | nid = next_node(nid, node_online_map); | ||
595 | if (nid == MAX_NUMNODES) | ||
596 | nid = first_node(node_online_map); | ||
597 | |||
598 | /* To shrink on this node, there must be a surplus page */ | ||
599 | if (delta < 0 && !h->surplus_huge_pages_node[nid]) | ||
600 | continue; | ||
601 | /* Surplus cannot exceed the total number of pages */ | ||
602 | if (delta > 0 && h->surplus_huge_pages_node[nid] >= | ||
603 | h->nr_huge_pages_node[nid]) | ||
604 | continue; | ||
605 | |||
606 | h->surplus_huge_pages += delta; | ||
607 | h->surplus_huge_pages_node[nid] += delta; | ||
608 | ret = 1; | ||
609 | break; | ||
610 | } while (nid != prev_nid); | ||
611 | |||
612 | prev_nid = nid; | ||
613 | return ret; | ||
614 | } | ||
615 | |||
616 | static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) | 581 | static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) |
617 | { | 582 | { |
618 | set_compound_page_dtor(page, free_huge_page); | 583 | set_compound_page_dtor(page, free_huge_page); |
@@ -623,6 +588,34 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
623 | put_page(page); /* free it into the hugepage allocator */ | 588 | put_page(page); /* free it into the hugepage allocator */ |
624 | } | 589 | } |
625 | 590 | ||
591 | static void prep_compound_gigantic_page(struct page *page, unsigned long order) | ||
592 | { | ||
593 | int i; | ||
594 | int nr_pages = 1 << order; | ||
595 | struct page *p = page + 1; | ||
596 | |||
597 | /* we rely on prep_new_huge_page to set the destructor */ | ||
598 | set_compound_order(page, order); | ||
599 | __SetPageHead(page); | ||
600 | for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { | ||
601 | __SetPageTail(p); | ||
602 | p->first_page = page; | ||
603 | } | ||
604 | } | ||
605 | |||
606 | int PageHuge(struct page *page) | ||
607 | { | ||
608 | compound_page_dtor *dtor; | ||
609 | |||
610 | if (!PageCompound(page)) | ||
611 | return 0; | ||
612 | |||
613 | page = compound_head(page); | ||
614 | dtor = get_compound_page_dtor(page); | ||
615 | |||
616 | return dtor == free_huge_page; | ||
617 | } | ||
618 | |||
626 | static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) | 619 | static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) |
627 | { | 620 | { |
628 | struct page *page; | 621 | struct page *page; |
@@ -630,7 +623,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
630 | if (h->order >= MAX_ORDER) | 623 | if (h->order >= MAX_ORDER) |
631 | return NULL; | 624 | return NULL; |
632 | 625 | ||
633 | page = alloc_pages_node(nid, | 626 | page = alloc_pages_exact_node(nid, |
634 | htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| | 627 | htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| |
635 | __GFP_REPEAT|__GFP_NOWARN, | 628 | __GFP_REPEAT|__GFP_NOWARN, |
636 | huge_page_order(h)); | 629 | huge_page_order(h)); |
@@ -649,7 +642,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
649 | * Use a helper variable to find the next node and then | 642 | * Use a helper variable to find the next node and then |
650 | * copy it back to hugetlb_next_nid afterwards: | 643 | * copy it back to hugetlb_next_nid afterwards: |
651 | * otherwise there's a window in which a racer might | 644 | * otherwise there's a window in which a racer might |
652 | * pass invalid nid MAX_NUMNODES to alloc_pages_node. | 645 | * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node. |
653 | * But we don't need to use a spin_lock here: it really | 646 | * But we don't need to use a spin_lock here: it really |
654 | * doesn't matter if occasionally a racer chooses the | 647 | * doesn't matter if occasionally a racer chooses the |
655 | * same nid as we do. Move nid forward in the mask even | 648 | * same nid as we do. Move nid forward in the mask even |
@@ -875,7 +868,7 @@ static void return_unused_surplus_pages(struct hstate *h,
875 | * can no longer free unreserved surplus pages. This occurs when | 868 | * can no longer free unreserved surplus pages. This occurs when |
876 | * the nodes with surplus pages have no free pages. | 869 | * the nodes with surplus pages have no free pages. |
877 | */ | 870 | */ |
878 | unsigned long remaining_iterations = num_online_nodes(); | 871 | unsigned long remaining_iterations = nr_online_nodes; |
879 | 872 | ||
880 | /* Uncommit the reservation */ | 873 | /* Uncommit the reservation */ |
881 | h->resv_huge_pages -= unused_resv_pages; | 874 | h->resv_huge_pages -= unused_resv_pages; |
@@ -904,7 +897,7 @@ static void return_unused_surplus_pages(struct hstate *h,
904 | h->surplus_huge_pages--; | 897 | h->surplus_huge_pages--; |
905 | h->surplus_huge_pages_node[nid]--; | 898 | h->surplus_huge_pages_node[nid]--; |
906 | nr_pages--; | 899 | nr_pages--; |
907 | remaining_iterations = num_online_nodes(); | 900 | remaining_iterations = nr_online_nodes; |
908 | } | 901 | } |
909 | } | 902 | } |
910 | } | 903 | } |
@@ -924,7 +917,7 @@ static long vma_needs_reservation(struct hstate *h,
924 | struct address_space *mapping = vma->vm_file->f_mapping; | 917 | struct address_space *mapping = vma->vm_file->f_mapping; |
925 | struct inode *inode = mapping->host; | 918 | struct inode *inode = mapping->host; |
926 | 919 | ||
927 | if (vma->vm_flags & VM_SHARED) { | 920 | if (vma->vm_flags & VM_MAYSHARE) { |
928 | pgoff_t idx = vma_hugecache_offset(h, vma, addr); | 921 | pgoff_t idx = vma_hugecache_offset(h, vma, addr); |
929 | return region_chg(&inode->i_mapping->private_list, | 922 | return region_chg(&inode->i_mapping->private_list, |
930 | idx, idx + 1); | 923 | idx, idx + 1); |
@@ -949,7 +942,7 @@ static void vma_commit_reservation(struct hstate *h,
949 | struct address_space *mapping = vma->vm_file->f_mapping; | 942 | struct address_space *mapping = vma->vm_file->f_mapping; |
950 | struct inode *inode = mapping->host; | 943 | struct inode *inode = mapping->host; |
951 | 944 | ||
952 | if (vma->vm_flags & VM_SHARED) { | 945 | if (vma->vm_flags & VM_MAYSHARE) { |
953 | pgoff_t idx = vma_hugecache_offset(h, vma, addr); | 946 | pgoff_t idx = vma_hugecache_offset(h, vma, addr); |
954 | region_add(&inode->i_mapping->private_list, idx, idx + 1); | 947 | region_add(&inode->i_mapping->private_list, idx, idx + 1); |
955 | 948 | ||
@@ -1140,6 +1133,41 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count)
1140 | } | 1133 | } |
1141 | #endif | 1134 | #endif |
1142 | 1135 | ||
1136 | /* | ||
1137 | * Increment or decrement surplus_huge_pages. Keep node-specific counters | ||
1138 | * balanced by operating on them in a round-robin fashion. | ||
1139 | * Returns 1 if an adjustment was made. | ||
1140 | */ | ||
1141 | static int adjust_pool_surplus(struct hstate *h, int delta) | ||
1142 | { | ||
1143 | static int prev_nid; | ||
1144 | int nid = prev_nid; | ||
1145 | int ret = 0; | ||
1146 | |||
1147 | VM_BUG_ON(delta != -1 && delta != 1); | ||
1148 | do { | ||
1149 | nid = next_node(nid, node_online_map); | ||
1150 | if (nid == MAX_NUMNODES) | ||
1151 | nid = first_node(node_online_map); | ||
1152 | |||
1153 | /* To shrink on this node, there must be a surplus page */ | ||
1154 | if (delta < 0 && !h->surplus_huge_pages_node[nid]) | ||
1155 | continue; | ||
1156 | /* Surplus cannot exceed the total number of pages */ | ||
1157 | if (delta > 0 && h->surplus_huge_pages_node[nid] >= | ||
1158 | h->nr_huge_pages_node[nid]) | ||
1159 | continue; | ||
1160 | |||
1161 | h->surplus_huge_pages += delta; | ||
1162 | h->surplus_huge_pages_node[nid] += delta; | ||
1163 | ret = 1; | ||
1164 | break; | ||
1165 | } while (nid != prev_nid); | ||
1166 | |||
1167 | prev_nid = nid; | ||
1168 | return ret; | ||
1169 | } | ||
1170 | |||
1143 | #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) | 1171 | #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) |
1144 | static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) | 1172 | static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) |
1145 | { | 1173 | { |
@@ -1893,7 +1921,7 @@ retry_avoidcopy:
1893 | * at the time of fork() could consume its reserves on COW instead | 1921 | * at the time of fork() could consume its reserves on COW instead |
1894 | * of the full address range. | 1922 | * of the full address range. |
1895 | */ | 1923 | */ |
1896 | if (!(vma->vm_flags & VM_SHARED) && | 1924 | if (!(vma->vm_flags & VM_MAYSHARE) && |
1897 | is_vma_resv_set(vma, HPAGE_RESV_OWNER) && | 1925 | is_vma_resv_set(vma, HPAGE_RESV_OWNER) && |
1898 | old_page != pagecache_page) | 1926 | old_page != pagecache_page) |
1899 | outside_reserve = 1; | 1927 | outside_reserve = 1; |
@@ -2000,7 +2028,7 @@ retry:
2000 | clear_huge_page(page, address, huge_page_size(h)); | 2028 | clear_huge_page(page, address, huge_page_size(h)); |
2001 | __SetPageUptodate(page); | 2029 | __SetPageUptodate(page); |
2002 | 2030 | ||
2003 | if (vma->vm_flags & VM_SHARED) { | 2031 | if (vma->vm_flags & VM_MAYSHARE) { |
2004 | int err; | 2032 | int err; |
2005 | struct inode *inode = mapping->host; | 2033 | struct inode *inode = mapping->host; |
2006 | 2034 | ||
@@ -2104,7 +2132,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2104 | goto out_mutex; | 2132 | goto out_mutex; |
2105 | } | 2133 | } |
2106 | 2134 | ||
2107 | if (!(vma->vm_flags & VM_SHARED)) | 2135 | if (!(vma->vm_flags & VM_MAYSHARE)) |
2108 | pagecache_page = hugetlbfs_pagecache_page(h, | 2136 | pagecache_page = hugetlbfs_pagecache_page(h, |
2109 | vma, address); | 2137 | vma, address); |
2110 | } | 2138 | } |
@@ -2289,7 +2317,7 @@ int hugetlb_reserve_pages(struct inode *inode,
2289 | * to reserve the full area even if read-only as mprotect() may be | 2317 | * to reserve the full area even if read-only as mprotect() may be |
2290 | * called to make the mapping read-write. Assume !vma is a shm mapping | 2318 | * called to make the mapping read-write. Assume !vma is a shm mapping |
2291 | */ | 2319 | */ |
2292 | if (!vma || vma->vm_flags & VM_SHARED) | 2320 | if (!vma || vma->vm_flags & VM_MAYSHARE) |
2293 | chg = region_chg(&inode->i_mapping->private_list, from, to); | 2321 | chg = region_chg(&inode->i_mapping->private_list, from, to); |
2294 | else { | 2322 | else { |
2295 | struct resv_map *resv_map = resv_map_alloc(); | 2323 | struct resv_map *resv_map = resv_map_alloc(); |
@@ -2330,7 +2358,7 @@ int hugetlb_reserve_pages(struct inode *inode,
2330 | * consumed reservations are stored in the map. Hence, nothing | 2358 | * consumed reservations are stored in the map. Hence, nothing |
2331 | * else has to be done for private mappings here | 2359 | * else has to be done for private mappings here |
2332 | */ | 2360 | */ |
2333 | if (!vma || vma->vm_flags & VM_SHARED) | 2361 | if (!vma || vma->vm_flags & VM_MAYSHARE) |
2334 | region_add(&inode->i_mapping->private_list, from, to); | 2362 | region_add(&inode->i_mapping->private_list, from, to); |
2335 | return 0; | 2363 | return 0; |
2336 | } | 2364 | } |
diff --git a/mm/init-mm.c b/mm/init-mm.c
new file mode 100644
index 000000000000..57aba0da9668
--- /dev/null
+++ b/mm/init-mm.c
@@ -0,0 +1,20 @@
1 | #include <linux/mm_types.h> | ||
2 | #include <linux/rbtree.h> | ||
3 | #include <linux/rwsem.h> | ||
4 | #include <linux/spinlock.h> | ||
5 | #include <linux/list.h> | ||
6 | #include <linux/cpumask.h> | ||
7 | |||
8 | #include <asm/atomic.h> | ||
9 | #include <asm/pgtable.h> | ||
10 | |||
11 | struct mm_struct init_mm = { | ||
12 | .mm_rb = RB_ROOT, | ||
13 | .pgd = swapper_pg_dir, | ||
14 | .mm_users = ATOMIC_INIT(2), | ||
15 | .mm_count = ATOMIC_INIT(1), | ||
16 | .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), | ||
17 | .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), | ||
18 | .mmlist = LIST_HEAD_INIT(init_mm.mmlist), | ||
19 | .cpu_vm_mask = CPU_MASK_ALL, | ||
20 | }; | ||
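Note: the new init-mm.c above statically initializes init_mm with designated initializers that refer back to the object being defined (for example __RWSEM_INITIALIZER(init_mm.mmap_sem)). A standalone illustration of that self-referencing initializer idiom, using a made-up struct node rather than any kernel type:

#include <stdio.h>

struct node {
	struct node *self;	/* points back at the object itself */
	int users;
};

/* Valid C: the name init_node is in scope inside its own initializer,
 * so its address is usable as a constant expression, just as init_mm
 * refers to init_mm.mmap_sem and init_mm.page_table_lock above. */
static struct node init_node = {
	.self  = &init_node,
	.users = 2,
};

int main(void)
{
	printf("self-reference intact: %d\n", init_node.self == &init_node);
	printf("users = %d\n", init_node.users);
	return 0;
}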
diff --git a/mm/internal.h b/mm/internal.h
index 987bb03fbdd8..f290c4db528b 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -16,9 +16,6 @@
16 | void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, | 16 | void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, |
17 | unsigned long floor, unsigned long ceiling); | 17 | unsigned long floor, unsigned long ceiling); |
18 | 18 | ||
19 | extern void prep_compound_page(struct page *page, unsigned long order); | ||
20 | extern void prep_compound_gigantic_page(struct page *page, unsigned long order); | ||
21 | |||
22 | static inline void set_page_count(struct page *page, int v) | 19 | static inline void set_page_count(struct page *page, int v) |
23 | { | 20 | { |
24 | atomic_set(&page->_count, v); | 21 | atomic_set(&page->_count, v); |
@@ -51,6 +48,8 @@ extern void putback_lru_page(struct page *page);
51 | */ | 48 | */ |
52 | extern unsigned long highest_memmap_pfn; | 49 | extern unsigned long highest_memmap_pfn; |
53 | extern void __free_pages_bootmem(struct page *page, unsigned int order); | 50 | extern void __free_pages_bootmem(struct page *page, unsigned int order); |
51 | extern void prep_compound_page(struct page *page, unsigned long order); | ||
52 | |||
54 | 53 | ||
55 | /* | 54 | /* |
56 | * function for dealing with page's order in buddy system. | 55 | * function for dealing with page's order in buddy system. |
@@ -74,7 +73,6 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
74 | } | 73 | } |
75 | #endif | 74 | #endif |
76 | 75 | ||
77 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
78 | /* | 76 | /* |
79 | * unevictable_migrate_page() called only from migrate_page_copy() to | 77 | * unevictable_migrate_page() called only from migrate_page_copy() to |
80 | * migrate unevictable flag to new page. | 78 | * migrate unevictable flag to new page. |
@@ -86,11 +84,6 @@ static inline void unevictable_migrate_page(struct page *new, struct page *old)
86 | if (TestClearPageUnevictable(old)) | 84 | if (TestClearPageUnevictable(old)) |
87 | SetPageUnevictable(new); | 85 | SetPageUnevictable(new); |
88 | } | 86 | } |
89 | #else | ||
90 | static inline void unevictable_migrate_page(struct page *new, struct page *old) | ||
91 | { | ||
92 | } | ||
93 | #endif | ||
94 | 87 | ||
95 | #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT | 88 | #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT |
96 | /* | 89 | /* |
@@ -150,23 +143,6 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
150 | } | 143 | } |
151 | } | 144 | } |
152 | 145 | ||
153 | /* | ||
154 | * free_page_mlock() -- clean up attempts to free and mlocked() page. | ||
155 | * Page should not be on lru, so no need to fix that up. | ||
156 | * free_pages_check() will verify... | ||
157 | */ | ||
158 | static inline void free_page_mlock(struct page *page) | ||
159 | { | ||
160 | if (unlikely(TestClearPageMlocked(page))) { | ||
161 | unsigned long flags; | ||
162 | |||
163 | local_irq_save(flags); | ||
164 | __dec_zone_page_state(page, NR_MLOCK); | ||
165 | __count_vm_event(UNEVICTABLE_MLOCKFREED); | ||
166 | local_irq_restore(flags); | ||
167 | } | ||
168 | } | ||
169 | |||
170 | #else /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ | 146 | #else /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ |
171 | static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) | 147 | static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) |
172 | { | 148 | { |
@@ -175,7 +151,6 @@ static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
175 | static inline void clear_page_mlock(struct page *page) { } | 151 | static inline void clear_page_mlock(struct page *page) { } |
176 | static inline void mlock_vma_page(struct page *page) { } | 152 | static inline void mlock_vma_page(struct page *page) { } |
177 | static inline void mlock_migrate_page(struct page *new, struct page *old) { } | 153 | static inline void mlock_migrate_page(struct page *new, struct page *old) { } |
178 | static inline void free_page_mlock(struct page *page) { } | ||
179 | 154 | ||
180 | #endif /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ | 155 | #endif /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ |
181 | 156 | ||
@@ -284,4 +259,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
284 | unsigned long start, int len, int flags, | 259 | unsigned long start, int len, int flags, |
285 | struct page **pages, struct vm_area_struct **vmas); | 260 | struct page **pages, struct vm_area_struct **vmas); |
286 | 261 | ||
262 | #define ZONE_RECLAIM_NOSCAN -2 | ||
263 | #define ZONE_RECLAIM_FULL -1 | ||
264 | #define ZONE_RECLAIM_SOME 0 | ||
265 | #define ZONE_RECLAIM_SUCCESS 1 | ||
287 | #endif | 266 | #endif |
diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c
new file mode 100644
index 000000000000..fd814fd61319
--- /dev/null
+++ b/mm/kmemcheck.c
@@ -0,0 +1,122 @@
1 | #include <linux/gfp.h> | ||
2 | #include <linux/mm_types.h> | ||
3 | #include <linux/mm.h> | ||
4 | #include <linux/slab.h> | ||
5 | #include <linux/kmemcheck.h> | ||
6 | |||
7 | void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node) | ||
8 | { | ||
9 | struct page *shadow; | ||
10 | int pages; | ||
11 | int i; | ||
12 | |||
13 | pages = 1 << order; | ||
14 | |||
15 | /* | ||
16 | * With kmemcheck enabled, we need to allocate a memory area for the | ||
17 | * shadow bits as well. | ||
18 | */ | ||
19 | shadow = alloc_pages_node(node, flags | __GFP_NOTRACK, order); | ||
20 | if (!shadow) { | ||
21 | if (printk_ratelimit()) | ||
22 | printk(KERN_ERR "kmemcheck: failed to allocate " | ||
23 | "shadow bitmap\n"); | ||
24 | return; | ||
25 | } | ||
26 | |||
27 | for(i = 0; i < pages; ++i) | ||
28 | page[i].shadow = page_address(&shadow[i]); | ||
29 | |||
30 | /* | ||
31 | * Mark it as non-present for the MMU so that our accesses to | ||
32 | * this memory will trigger a page fault and let us analyze | ||
33 | * the memory accesses. | ||
34 | */ | ||
35 | kmemcheck_hide_pages(page, pages); | ||
36 | } | ||
37 | |||
38 | void kmemcheck_free_shadow(struct page *page, int order) | ||
39 | { | ||
40 | struct page *shadow; | ||
41 | int pages; | ||
42 | int i; | ||
43 | |||
44 | if (!kmemcheck_page_is_tracked(page)) | ||
45 | return; | ||
46 | |||
47 | pages = 1 << order; | ||
48 | |||
49 | kmemcheck_show_pages(page, pages); | ||
50 | |||
51 | shadow = virt_to_page(page[0].shadow); | ||
52 | |||
53 | for(i = 0; i < pages; ++i) | ||
54 | page[i].shadow = NULL; | ||
55 | |||
56 | __free_pages(shadow, order); | ||
57 | } | ||
58 | |||
59 | void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object, | ||
60 | size_t size) | ||
61 | { | ||
62 | /* | ||
63 | * Has already been memset(), which initializes the shadow for us | ||
64 | * as well. | ||
65 | */ | ||
66 | if (gfpflags & __GFP_ZERO) | ||
67 | return; | ||
68 | |||
69 | /* No need to initialize the shadow of a non-tracked slab. */ | ||
70 | if (s->flags & SLAB_NOTRACK) | ||
71 | return; | ||
72 | |||
73 | if (!kmemcheck_enabled || gfpflags & __GFP_NOTRACK) { | ||
74 | /* | ||
75 | * Allow notracked objects to be allocated from | ||
76 | * tracked caches. Note however that these objects | ||
77 | * will still get page faults on access, they just | ||
78 | * won't ever be flagged as uninitialized. If page | ||
79 | * faults are not acceptable, the slab cache itself | ||
80 | * should be marked NOTRACK. | ||
81 | */ | ||
82 | kmemcheck_mark_initialized(object, size); | ||
83 | } else if (!s->ctor) { | ||
84 | /* | ||
85 | * New objects should be marked uninitialized before | ||
86 | * they're returned to the called. | ||
87 | */ | ||
88 | kmemcheck_mark_uninitialized(object, size); | ||
89 | } | ||
90 | } | ||
91 | |||
92 | void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size) | ||
93 | { | ||
94 | /* TODO: RCU freeing is unsupported for now; hide false positives. */ | ||
95 | if (!s->ctor && !(s->flags & SLAB_DESTROY_BY_RCU)) | ||
96 | kmemcheck_mark_freed(object, size); | ||
97 | } | ||
98 | |||
99 | void kmemcheck_pagealloc_alloc(struct page *page, unsigned int order, | ||
100 | gfp_t gfpflags) | ||
101 | { | ||
102 | int pages; | ||
103 | |||
104 | if (gfpflags & (__GFP_HIGHMEM | __GFP_NOTRACK)) | ||
105 | return; | ||
106 | |||
107 | pages = 1 << order; | ||
108 | |||
109 | /* | ||
110 | * NOTE: We choose to track GFP_ZERO pages too; in fact, they | ||
111 | * can become uninitialized by copying uninitialized memory | ||
112 | * into them. | ||
113 | */ | ||
114 | |||
115 | /* XXX: Can use zone->node for node? */ | ||
116 | kmemcheck_alloc_shadow(page, order, gfpflags, -1); | ||
117 | |||
118 | if (gfpflags & __GFP_ZERO) | ||
119 | kmemcheck_mark_initialized_pages(page, pages); | ||
120 | else | ||
121 | kmemcheck_mark_uninitialized_pages(page, pages); | ||
122 | } | ||
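Note: kmemcheck_alloc_shadow() above allocates one shadow page per data page and hides the data pages from the MMU, so that every access faults and can be validated against the shadow state. A userspace sketch of the underlying shadow-memory bookkeeping, purely illustrative (the tracked_* helpers are invented for this example; the real kmemcheck tracks initialization through page faults, not explicit accessors):

#include <stdio.h>
#include <stdlib.h>

enum shadow { SHADOW_UNINIT = 0, SHADOW_INIT = 1 };

struct tracked {
	unsigned char *data;
	unsigned char *shadow;	/* one shadow byte per data byte */
	size_t size;
};

static struct tracked tracked_alloc(size_t size)
{
	struct tracked t = {
		.data   = malloc(size),
		.shadow = calloc(size, 1),	/* everything starts uninitialized */
		.size   = size,
	};
	return t;
}

static void tracked_write(struct tracked *t, size_t off, unsigned char val)
{
	t->data[off] = val;
	t->shadow[off] = SHADOW_INIT;	/* writing initializes the byte */
}

static int tracked_read(const struct tracked *t, size_t off, unsigned char *val)
{
	if (t->shadow[off] != SHADOW_INIT) {
		fprintf(stderr, "read of uninitialized byte at offset %zu\n", off);
		return -1;
	}
	*val = t->data[off];
	return 0;
}

int main(void)
{
	struct tracked t = tracked_alloc(16);
	unsigned char v;

	tracked_write(&t, 0, 0xaa);
	tracked_read(&t, 0, &v);	/* fine: byte 0 was written */
	tracked_read(&t, 1, &v);	/* flagged: byte 1 never written */

	free(t.data);
	free(t.shadow);
	return 0;
}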
diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c
new file mode 100644
index 000000000000..d5292fc6f523
--- /dev/null
+++ b/mm/kmemleak-test.c
@@ -0,0 +1,111 @@
1 | /* | ||
2 | * mm/kmemleak-test.c | ||
3 | * | ||
4 | * Copyright (C) 2008 ARM Limited | ||
5 | * Written by Catalin Marinas <catalin.marinas@arm.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
14 | * GNU General Public License for more details. | ||
15 | * | ||
16 | * You should have received a copy of the GNU General Public License | ||
17 | * along with this program; if not, write to the Free Software | ||
18 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
19 | */ | ||
20 | |||
21 | #include <linux/init.h> | ||
22 | #include <linux/kernel.h> | ||
23 | #include <linux/module.h> | ||
24 | #include <linux/slab.h> | ||
25 | #include <linux/vmalloc.h> | ||
26 | #include <linux/list.h> | ||
27 | #include <linux/percpu.h> | ||
28 | #include <linux/fdtable.h> | ||
29 | |||
30 | #include <linux/kmemleak.h> | ||
31 | |||
32 | struct test_node { | ||
33 | long header[25]; | ||
34 | struct list_head list; | ||
35 | long footer[25]; | ||
36 | }; | ||
37 | |||
38 | static LIST_HEAD(test_list); | ||
39 | static DEFINE_PER_CPU(void *, test_pointer); | ||
40 | |||
41 | /* | ||
42 | * Some very simple testing. This function needs to be extended for | ||
43 | * proper testing. | ||
44 | */ | ||
45 | static int __init kmemleak_test_init(void) | ||
46 | { | ||
47 | struct test_node *elem; | ||
48 | int i; | ||
49 | |||
50 | printk(KERN_INFO "Kmemleak testing\n"); | ||
51 | |||
52 | /* make some orphan objects */ | ||
53 | pr_info("kmemleak: kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL)); | ||
54 | pr_info("kmemleak: kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL)); | ||
55 | pr_info("kmemleak: kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL)); | ||
56 | pr_info("kmemleak: kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL)); | ||
57 | pr_info("kmemleak: kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL)); | ||
58 | pr_info("kmemleak: kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL)); | ||
59 | pr_info("kmemleak: kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL)); | ||
60 | pr_info("kmemleak: kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL)); | ||
61 | #ifndef CONFIG_MODULES | ||
62 | pr_info("kmemleak: kmem_cache_alloc(files_cachep) = %p\n", | ||
63 | kmem_cache_alloc(files_cachep, GFP_KERNEL)); | ||
64 | pr_info("kmemleak: kmem_cache_alloc(files_cachep) = %p\n", | ||
65 | kmem_cache_alloc(files_cachep, GFP_KERNEL)); | ||
66 | #endif | ||
67 | pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64)); | ||
68 | pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64)); | ||
69 | pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64)); | ||
70 | pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64)); | ||
71 | pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64)); | ||
72 | |||
73 | /* | ||
74 | * Add elements to a list. They should only appear as orphan | ||
75 | * after the module is removed. | ||
76 | */ | ||
77 | for (i = 0; i < 10; i++) { | ||
78 | elem = kmalloc(sizeof(*elem), GFP_KERNEL); | ||
79 | pr_info("kmemleak: kmalloc(sizeof(*elem)) = %p\n", elem); | ||
80 | if (!elem) | ||
81 | return -ENOMEM; | ||
82 | memset(elem, 0, sizeof(*elem)); | ||
83 | INIT_LIST_HEAD(&elem->list); | ||
84 | |||
85 | list_add_tail(&elem->list, &test_list); | ||
86 | } | ||
87 | |||
88 | for_each_possible_cpu(i) { | ||
89 | per_cpu(test_pointer, i) = kmalloc(129, GFP_KERNEL); | ||
90 | pr_info("kmemleak: kmalloc(129) = %p\n", | ||
91 | per_cpu(test_pointer, i)); | ||
92 | } | ||
93 | |||
94 | return 0; | ||
95 | } | ||
96 | module_init(kmemleak_test_init); | ||
97 | |||
98 | static void __exit kmemleak_test_exit(void) | ||
99 | { | ||
100 | struct test_node *elem, *tmp; | ||
101 | |||
102 | /* | ||
103 | * Remove the list elements without actually freeing the | ||
104 | * memory. | ||
105 | */ | ||
106 | list_for_each_entry_safe(elem, tmp, &test_list, list) | ||
107 | list_del(&elem->list); | ||
108 | } | ||
109 | module_exit(kmemleak_test_exit); | ||
110 | |||
111 | MODULE_LICENSE("GPL"); | ||
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
new file mode 100644
index 000000000000..58ec86c9e58a
--- /dev/null
+++ b/mm/kmemleak.c
@@ -0,0 +1,1498 @@
1 | /* | ||
2 | * mm/kmemleak.c | ||
3 | * | ||
4 | * Copyright (C) 2008 ARM Limited | ||
5 | * Written by Catalin Marinas <catalin.marinas@arm.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
14 | * GNU General Public License for more details. | ||
15 | * | ||
16 | * You should have received a copy of the GNU General Public License | ||
17 | * along with this program; if not, write to the Free Software | ||
18 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
19 | * | ||
20 | * | ||
21 | * For more information on the algorithm and kmemleak usage, please see | ||
22 | * Documentation/kmemleak.txt. | ||
23 | * | ||
24 | * Notes on locking | ||
25 | * ---------------- | ||
26 | * | ||
27 | * The following locks and mutexes are used by kmemleak: | ||
28 | * | ||
29 | * - kmemleak_lock (rwlock): protects the object_list modifications and | ||
30 | * accesses to the object_tree_root. The object_list is the main list | ||
31 | * holding the metadata (struct kmemleak_object) for the allocated memory | ||
33 | * blocks. The object_tree_root is a priority search tree used to look up | ||
33 | * metadata based on a pointer to the corresponding memory block. The | ||
34 | * kmemleak_object structures are added to the object_list and | ||
35 | * object_tree_root in the create_object() function called from the | ||
36 | * kmemleak_alloc() callback and removed in delete_object() called from the | ||
37 | * kmemleak_free() callback | ||
38 | * - kmemleak_object.lock (spinlock): protects a kmemleak_object. Accesses to | ||
39 | * the metadata (e.g. count) are protected by this lock. Note that some | ||
40 | * members of this structure may be protected by other means (atomic or | ||
41 | * kmemleak_lock). This lock is also held when scanning the corresponding | ||
42 | * memory block to avoid the kernel freeing it via the kmemleak_free() | ||
43 | * callback. This is less heavyweight than holding a global lock like | ||
44 | * kmemleak_lock during scanning | ||
45 | * - scan_mutex (mutex): ensures that only one thread may scan the memory for | ||
46 | * unreferenced objects at a time. The gray_list contains the objects which | ||
47 | * are already referenced or marked as false positives and need to be | ||
48 | * scanned. This list is only modified during a scanning episode when the | ||
49 | * scan_mutex is held. At the end of a scan, the gray_list is always empty. | ||
50 | * Note that the kmemleak_object.use_count is incremented when an object is | ||
51 | * added to the gray_list and therefore cannot be freed | ||
52 | * - kmemleak_mutex (mutex): prevents multiple users of the "kmemleak" debugfs | ||
53 | * file together with modifications to the memory scanning parameters | ||
54 | * including the scan_thread pointer | ||
55 | * | ||
56 | * The kmemleak_object structures have a use_count incremented or decremented | ||
57 | * using the get_object()/put_object() functions. When the use_count becomes | ||
58 | * 0, this count can no longer be incremented and put_object() schedules the | ||
59 | * kmemleak_object freeing via an RCU callback. All calls to the get_object() | ||
60 | * function must be protected by rcu_read_lock() to avoid accessing a freed | ||
61 | * structure. | ||
62 | */ | ||
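The reference-counting discipline described in the locking notes above is easiest to follow as a short sketch. The helper below is hypothetical (it is not part of this patch) and only strings together find_and_get_object() and put_object(), which are defined later in this file, to show the lookup/use/release pattern:

	/*
	 * Hypothetical helper illustrating the use_count/RCU pattern from the
	 * locking notes: look the metadata up (use_count incremented), inspect
	 * it under its own lock, then drop the reference (freed via RCU when
	 * the count reaches zero).
	 */
	static void example_inspect_object(unsigned long ptr)
	{
		unsigned long flags;
		struct kmemleak_object *object;

		object = find_and_get_object(ptr, 0);	/* takes rcu_read_lock() internally */
		if (!object)
			return;

		spin_lock_irqsave(&object->lock, flags);
		pr_info("kmemleak: object 0x%08lx, count %d/%d\n",
			object->pointer, object->count, object->min_count);
		spin_unlock_irqrestore(&object->lock, flags);

		put_object(object);	/* may schedule free_object_rcu() */
	}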
63 | |||
64 | #include <linux/init.h> | ||
65 | #include <linux/kernel.h> | ||
66 | #include <linux/list.h> | ||
67 | #include <linux/sched.h> | ||
68 | #include <linux/jiffies.h> | ||
69 | #include <linux/delay.h> | ||
70 | #include <linux/module.h> | ||
71 | #include <linux/kthread.h> | ||
72 | #include <linux/prio_tree.h> | ||
73 | #include <linux/gfp.h> | ||
74 | #include <linux/fs.h> | ||
75 | #include <linux/debugfs.h> | ||
76 | #include <linux/seq_file.h> | ||
77 | #include <linux/cpumask.h> | ||
78 | #include <linux/spinlock.h> | ||
79 | #include <linux/mutex.h> | ||
80 | #include <linux/rcupdate.h> | ||
81 | #include <linux/stacktrace.h> | ||
82 | #include <linux/cache.h> | ||
83 | #include <linux/percpu.h> | ||
84 | #include <linux/hardirq.h> | ||
85 | #include <linux/mmzone.h> | ||
86 | #include <linux/slab.h> | ||
87 | #include <linux/thread_info.h> | ||
88 | #include <linux/err.h> | ||
89 | #include <linux/uaccess.h> | ||
90 | #include <linux/string.h> | ||
91 | #include <linux/nodemask.h> | ||
92 | #include <linux/mm.h> | ||
93 | |||
94 | #include <asm/sections.h> | ||
95 | #include <asm/processor.h> | ||
96 | #include <asm/atomic.h> | ||
97 | |||
98 | #include <linux/kmemleak.h> | ||
99 | |||
100 | /* | ||
101 | * Kmemleak configuration and common defines. | ||
102 | */ | ||
103 | #define MAX_TRACE 16 /* stack trace length */ | ||
104 | #define REPORTS_NR 50 /* maximum number of reported leaks */ | ||
105 | #define MSECS_MIN_AGE 5000 /* minimum object age for reporting */ | ||
106 | #define MSECS_SCAN_YIELD 10 /* CPU yielding period */ | ||
107 | #define SECS_FIRST_SCAN 60 /* delay before the first scan */ | ||
108 | #define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */ | ||
109 | |||
110 | #define BYTES_PER_POINTER sizeof(void *) | ||
111 | |||
112 | /* scanning area inside a memory block */ | ||
113 | struct kmemleak_scan_area { | ||
114 | struct hlist_node node; | ||
115 | unsigned long offset; | ||
116 | size_t length; | ||
117 | }; | ||
118 | |||
119 | /* | ||
120 | * Structure holding the metadata for each allocated memory block. | ||
121 | * Modifications to such objects should be made while holding the | ||
122 | * object->lock. Insertions or deletions from object_list, gray_list or | ||
123 | * tree_node are already protected by the corresponding locks or mutex (see | ||
124 | * the notes on locking above). These objects are reference-counted | ||
125 | * (use_count) and freed using the RCU mechanism. | ||
126 | */ | ||
127 | struct kmemleak_object { | ||
128 | spinlock_t lock; | ||
129 | unsigned long flags; /* object status flags */ | ||
130 | struct list_head object_list; | ||
131 | struct list_head gray_list; | ||
132 | struct prio_tree_node tree_node; | ||
133 | struct rcu_head rcu; /* object_list lockless traversal */ | ||
134 | /* object usage count; object freed when use_count == 0 */ | ||
135 | atomic_t use_count; | ||
136 | unsigned long pointer; | ||
137 | size_t size; | ||
138 | /* minimum number of pointers found before it is considered a leak */ | ||
139 | int min_count; | ||
140 | /* the total number of pointers found pointing to this object */ | ||
141 | int count; | ||
142 | /* memory ranges to be scanned inside an object (empty for all) */ | ||
143 | struct hlist_head area_list; | ||
144 | unsigned long trace[MAX_TRACE]; | ||
145 | unsigned int trace_len; | ||
146 | unsigned long jiffies; /* creation timestamp */ | ||
147 | pid_t pid; /* pid of the current task */ | ||
148 | char comm[TASK_COMM_LEN]; /* executable name */ | ||
149 | }; | ||
150 | |||
151 | /* flag representing the memory block allocation status */ | ||
152 | #define OBJECT_ALLOCATED (1 << 0) | ||
153 | /* flag set after the first reporting of an unreferenced object */ | ||
154 | #define OBJECT_REPORTED (1 << 1) | ||
155 | /* flag set to not scan the object */ | ||
156 | #define OBJECT_NO_SCAN (1 << 2) | ||
157 | |||
158 | /* the list of all allocated objects */ | ||
159 | static LIST_HEAD(object_list); | ||
160 | /* the list of gray-colored objects (see color_gray comment below) */ | ||
161 | static LIST_HEAD(gray_list); | ||
162 | /* prio search tree for object boundaries */ | ||
163 | static struct prio_tree_root object_tree_root; | ||
164 | /* rw_lock protecting the access to object_list and prio_tree_root */ | ||
165 | static DEFINE_RWLOCK(kmemleak_lock); | ||
166 | |||
167 | /* allocation caches for kmemleak internal data */ | ||
168 | static struct kmem_cache *object_cache; | ||
169 | static struct kmem_cache *scan_area_cache; | ||
170 | |||
171 | /* set if tracing memory operations is enabled */ | ||
172 | static atomic_t kmemleak_enabled = ATOMIC_INIT(0); | ||
173 | /* set in the late_initcall if there were no errors */ | ||
174 | static atomic_t kmemleak_initialized = ATOMIC_INIT(0); | ||
175 | /* enables or disables early logging of the memory operations */ | ||
176 | static atomic_t kmemleak_early_log = ATOMIC_INIT(1); | ||
177 | /* set if a fatal kmemleak error has occurred */ | ||
178 | static atomic_t kmemleak_error = ATOMIC_INIT(0); | ||
179 | |||
180 | /* minimum and maximum address that may be valid pointers */ | ||
181 | static unsigned long min_addr = ULONG_MAX; | ||
182 | static unsigned long max_addr; | ||
183 | |||
184 | /* used for yielding the CPU to other tasks during scanning */ | ||
185 | static unsigned long next_scan_yield; | ||
186 | static struct task_struct *scan_thread; | ||
187 | static unsigned long jiffies_scan_yield; | ||
188 | static unsigned long jiffies_min_age; | ||
189 | /* delay between automatic memory scannings */ | ||
190 | static signed long jiffies_scan_wait; | ||
191 | /* enables or disables the task stacks scanning */ | ||
192 | static int kmemleak_stack_scan; | ||
193 | /* mutex protecting the memory scanning */ | ||
194 | static DEFINE_MUTEX(scan_mutex); | ||
195 | /* mutex protecting the access to the /sys/kernel/debug/kmemleak file */ | ||
196 | static DEFINE_MUTEX(kmemleak_mutex); | ||
197 | |||
198 | /* number of leaks reported (for limitation purposes) */ | ||
199 | static int reported_leaks; | ||
200 | |||
201 | /* | ||
202 | * Early object allocation/freeing logging. Kmemleak is initialized after the | ||
203 | * kernel allocator. However, both the kernel allocator and kmemleak may | ||
204 | * allocate memory blocks which need to be tracked. Kmemleak defines an | ||
205 | * arbitrary buffer to hold the allocation/freeing information before it is | ||
206 | * fully initialized. | ||
207 | */ | ||
208 | |||
209 | /* kmemleak operation type for early logging */ | ||
210 | enum { | ||
211 | KMEMLEAK_ALLOC, | ||
212 | KMEMLEAK_FREE, | ||
213 | KMEMLEAK_NOT_LEAK, | ||
214 | KMEMLEAK_IGNORE, | ||
215 | KMEMLEAK_SCAN_AREA, | ||
216 | KMEMLEAK_NO_SCAN | ||
217 | }; | ||
218 | |||
219 | /* | ||
220 | * Structure holding the information passed to kmemleak callbacks during the | ||
221 | * early logging. | ||
222 | */ | ||
223 | struct early_log { | ||
224 | int op_type; /* kmemleak operation type */ | ||
225 | const void *ptr; /* allocated/freed memory block */ | ||
226 | size_t size; /* memory block size */ | ||
227 | int min_count; /* minimum reference count */ | ||
228 | unsigned long offset; /* scan area offset */ | ||
229 | size_t length; /* scan area length */ | ||
230 | }; | ||
231 | |||
232 | /* early logging buffer and current position */ | ||
233 | static struct early_log early_log[200]; | ||
234 | static int crt_early_log; | ||
235 | |||
236 | static void kmemleak_disable(void); | ||
237 | |||
238 | /* | ||
239 | * Print a warning and dump the stack trace. | ||
240 | */ | ||
241 | #define kmemleak_warn(x...) do { \ | ||
242 | pr_warning(x); \ | ||
243 | dump_stack(); \ | ||
244 | } while (0) | ||
245 | |||
246 | /* | ||
247 | * Macro invoked when a serious kmemleak condition has occurred and cannot be | ||
248 | * recovered from. Kmemleak will be disabled and further allocation/freeing | ||
249 | * tracing is no longer available. | ||
250 | */ | ||
251 | #define kmemleak_panic(x...) do { \ | ||
252 | kmemleak_warn(x); \ | ||
253 | kmemleak_disable(); \ | ||
254 | } while (0) | ||
255 | |||
256 | /* | ||
257 | * Object colors, encoded with count and min_count: | ||
258 | * - white - orphan object, not enough references to it (count < min_count) | ||
259 | * - gray - not orphan, marked as a false positive (min_count == 0) or | ||
260 | * with sufficient references to it (count >= min_count) | ||
261 | * - black - ignore, it doesn't contain references (e.g. text section) | ||
262 | * (min_count == -1). No function defined for this color. | ||
263 | * Newly created objects don't have any color assigned (object->count == -1) | ||
264 | * before the next memory scan when they become white. | ||
265 | */ | ||
266 | static int color_white(const struct kmemleak_object *object) | ||
267 | { | ||
268 | return object->count != -1 && object->count < object->min_count; | ||
269 | } | ||
270 | |||
271 | static int color_gray(const struct kmemleak_object *object) | ||
272 | { | ||
273 | return object->min_count != -1 && object->count >= object->min_count; | ||
274 | } | ||
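As a concrete instance of the colour rules above (purely illustrative, not part of the patch): a block tracked with min_count == 1, as kmalloc() objects typically are, starts colourless (count == -1); after a scan that whitens it and finds no pointers it is white and a leak candidate, and a single reference is enough to turn it gray:

	/* Hypothetical check of the colour predicates above. */
	static void example_color_rules(void)
	{
		struct kmemleak_object obj = { .min_count = 1, .count = -1 };

		WARN_ON(color_white(&obj));	/* newly created: no colour yet */

		obj.count = 0;			/* whitened, no references found */
		WARN_ON(!color_white(&obj));	/* leak candidate */

		obj.count = 1;			/* one pointer found during scanning */
		WARN_ON(!color_gray(&obj));	/* referenced, not reported */
	}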
275 | |||
276 | /* | ||
277 | * Objects are considered referenced if their color is gray and they have not | ||
278 | * been deleted. | ||
279 | */ | ||
280 | static int referenced_object(struct kmemleak_object *object) | ||
281 | { | ||
282 | return (object->flags & OBJECT_ALLOCATED) && color_gray(object); | ||
283 | } | ||
284 | |||
285 | /* | ||
286 | * Objects are considered unreferenced only if their color is white, they have | ||
287 | * not been deleted and have a minimum age to avoid false positives caused by | ||
288 | * pointers temporarily stored in CPU registers. | ||
289 | */ | ||
290 | static int unreferenced_object(struct kmemleak_object *object) | ||
291 | { | ||
292 | return (object->flags & OBJECT_ALLOCATED) && color_white(object) && | ||
293 | time_is_before_eq_jiffies(object->jiffies + jiffies_min_age); | ||
294 | } | ||
295 | |||
296 | /* | ||
297 | * Printing of the (un)referenced objects information, either to the seq file | ||
298 | * or to the kernel log. The print_referenced/print_unreferenced functions | ||
299 | * must be called with the object->lock held. | ||
300 | */ | ||
301 | #define print_helper(seq, x...) do { \ | ||
302 | struct seq_file *s = (seq); \ | ||
303 | if (s) \ | ||
304 | seq_printf(s, x); \ | ||
305 | else \ | ||
306 | pr_info(x); \ | ||
307 | } while (0) | ||
308 | |||
309 | static void print_referenced(struct kmemleak_object *object) | ||
310 | { | ||
311 | pr_info("kmemleak: referenced object 0x%08lx (size %zu)\n", | ||
312 | object->pointer, object->size); | ||
313 | } | ||
314 | |||
315 | static void print_unreferenced(struct seq_file *seq, | ||
316 | struct kmemleak_object *object) | ||
317 | { | ||
318 | int i; | ||
319 | |||
320 | print_helper(seq, "kmemleak: unreferenced object 0x%08lx (size %zu):\n", | ||
321 | object->pointer, object->size); | ||
322 | print_helper(seq, " comm \"%s\", pid %d, jiffies %lu\n", | ||
323 | object->comm, object->pid, object->jiffies); | ||
324 | print_helper(seq, " backtrace:\n"); | ||
325 | |||
326 | for (i = 0; i < object->trace_len; i++) { | ||
327 | void *ptr = (void *)object->trace[i]; | ||
328 | print_helper(seq, " [<%p>] %pS\n", ptr, ptr); | ||
329 | } | ||
330 | } | ||
331 | |||
332 | /* | ||
333 | * Print the kmemleak_object information. This function is used mainly for | ||
334 | * debugging special cases of kmemleak operations. It must be called with | ||
335 | * the object->lock held. | ||
336 | */ | ||
337 | static void dump_object_info(struct kmemleak_object *object) | ||
338 | { | ||
339 | struct stack_trace trace; | ||
340 | |||
341 | trace.nr_entries = object->trace_len; | ||
342 | trace.entries = object->trace; | ||
343 | |||
344 | pr_notice("kmemleak: Object 0x%08lx (size %zu):\n", | ||
345 | object->tree_node.start, object->size); | ||
346 | pr_notice(" comm \"%s\", pid %d, jiffies %lu\n", | ||
347 | object->comm, object->pid, object->jiffies); | ||
348 | pr_notice(" min_count = %d\n", object->min_count); | ||
349 | pr_notice(" count = %d\n", object->count); | ||
350 | pr_notice(" backtrace:\n"); | ||
351 | print_stack_trace(&trace, 4); | ||
352 | } | ||
353 | |||
354 | /* | ||
355 | * Look up a memory block's metadata (kmemleak_object) in the priority search | ||
356 | * tree based on a pointer value. If alias is 0, only values pointing to the | ||
357 | * beginning of the memory block are allowed. The kmemleak_lock must be held | ||
358 | * when calling this function. | ||
359 | */ | ||
360 | static struct kmemleak_object *lookup_object(unsigned long ptr, int alias) | ||
361 | { | ||
362 | struct prio_tree_node *node; | ||
363 | struct prio_tree_iter iter; | ||
364 | struct kmemleak_object *object; | ||
365 | |||
366 | prio_tree_iter_init(&iter, &object_tree_root, ptr, ptr); | ||
367 | node = prio_tree_next(&iter); | ||
368 | if (node) { | ||
369 | object = prio_tree_entry(node, struct kmemleak_object, | ||
370 | tree_node); | ||
371 | if (!alias && object->pointer != ptr) { | ||
372 | kmemleak_warn("kmemleak: Found object by alias"); | ||
373 | object = NULL; | ||
374 | } | ||
375 | } else | ||
376 | object = NULL; | ||
377 | |||
378 | return object; | ||
379 | } | ||
380 | |||
381 | /* | ||
382 | * Increment the object use_count. Return 1 if successful or 0 otherwise. Note | ||
383 | * that once an object's use_count reaches 0, the RCU freeing is already | ||
384 | * registered and the object should no longer be used. This function must be | ||
385 | * called under the protection of rcu_read_lock(). | ||
386 | */ | ||
387 | static int get_object(struct kmemleak_object *object) | ||
388 | { | ||
389 | return atomic_inc_not_zero(&object->use_count); | ||
390 | } | ||
391 | |||
392 | /* | ||
393 | * RCU callback to free a kmemleak_object. | ||
394 | */ | ||
395 | static void free_object_rcu(struct rcu_head *rcu) | ||
396 | { | ||
397 | struct hlist_node *elem, *tmp; | ||
398 | struct kmemleak_scan_area *area; | ||
399 | struct kmemleak_object *object = | ||
400 | container_of(rcu, struct kmemleak_object, rcu); | ||
401 | |||
402 | /* | ||
403 | * Once use_count is 0 (guaranteed by put_object), there is no other | ||
404 | * code accessing this object, hence no need for locking. | ||
405 | */ | ||
406 | hlist_for_each_entry_safe(area, elem, tmp, &object->area_list, node) { | ||
407 | hlist_del(elem); | ||
408 | kmem_cache_free(scan_area_cache, area); | ||
409 | } | ||
410 | kmem_cache_free(object_cache, object); | ||
411 | } | ||
412 | |||
413 | /* | ||
414 | * Decrement the object use_count. Once the count is 0, free the object using | ||
415 | * an RCU callback. Since put_object() may be called via the kmemleak_free() -> | ||
416 | * delete_object() path, the delayed RCU freeing ensures that there is no | ||
417 | * recursive call to the kernel allocator. Lock-less RCU object_list traversal | ||
418 | * is also possible. | ||
419 | */ | ||
420 | static void put_object(struct kmemleak_object *object) | ||
421 | { | ||
422 | if (!atomic_dec_and_test(&object->use_count)) | ||
423 | return; | ||
424 | |||
425 | /* should only get here after delete_object was called */ | ||
426 | WARN_ON(object->flags & OBJECT_ALLOCATED); | ||
427 | |||
428 | call_rcu(&object->rcu, free_object_rcu); | ||
429 | } | ||
430 | |||
431 | /* | ||
432 | * Look up an object in the prio search tree and increase its use_count. | ||
433 | */ | ||
434 | static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias) | ||
435 | { | ||
436 | unsigned long flags; | ||
437 | struct kmemleak_object *object = NULL; | ||
438 | |||
439 | rcu_read_lock(); | ||
440 | read_lock_irqsave(&kmemleak_lock, flags); | ||
441 | if (ptr >= min_addr && ptr < max_addr) | ||
442 | object = lookup_object(ptr, alias); | ||
443 | read_unlock_irqrestore(&kmemleak_lock, flags); | ||
444 | |||
445 | /* check whether the object is still available */ | ||
446 | if (object && !get_object(object)) | ||
447 | object = NULL; | ||
448 | rcu_read_unlock(); | ||
449 | |||
450 | return object; | ||
451 | } | ||
452 | |||
453 | /* | ||
454 | * Create the metadata (struct kmemleak_object) corresponding to an allocated | ||
455 | * memory block and add it to the object_list and object_tree_root. | ||
456 | */ | ||
457 | static void create_object(unsigned long ptr, size_t size, int min_count, | ||
458 | gfp_t gfp) | ||
459 | { | ||
460 | unsigned long flags; | ||
461 | struct kmemleak_object *object; | ||
462 | struct prio_tree_node *node; | ||
463 | struct stack_trace trace; | ||
464 | |||
465 | object = kmem_cache_alloc(object_cache, gfp & ~GFP_SLAB_BUG_MASK); | ||
466 | if (!object) { | ||
467 | kmemleak_panic("kmemleak: Cannot allocate a kmemleak_object " | ||
468 | "structure\n"); | ||
469 | return; | ||
470 | } | ||
471 | |||
472 | INIT_LIST_HEAD(&object->object_list); | ||
473 | INIT_LIST_HEAD(&object->gray_list); | ||
474 | INIT_HLIST_HEAD(&object->area_list); | ||
475 | spin_lock_init(&object->lock); | ||
476 | atomic_set(&object->use_count, 1); | ||
477 | object->flags = OBJECT_ALLOCATED; | ||
478 | object->pointer = ptr; | ||
479 | object->size = size; | ||
480 | object->min_count = min_count; | ||
481 | object->count = -1; /* no color initially */ | ||
482 | object->jiffies = jiffies; | ||
483 | |||
484 | /* task information */ | ||
485 | if (in_irq()) { | ||
486 | object->pid = 0; | ||
487 | strncpy(object->comm, "hardirq", sizeof(object->comm)); | ||
488 | } else if (in_softirq()) { | ||
489 | object->pid = 0; | ||
490 | strncpy(object->comm, "softirq", sizeof(object->comm)); | ||
491 | } else { | ||
492 | object->pid = current->pid; | ||
493 | /* | ||
494 | * There is a small chance of a race with set_task_comm(), | ||
495 | * however using get_task_comm() here may cause locking | ||
496 | * dependency issues with current->alloc_lock. In the worst | ||
497 | * case, the command line is not correct. | ||
498 | */ | ||
499 | strncpy(object->comm, current->comm, sizeof(object->comm)); | ||
500 | } | ||
501 | |||
502 | /* kernel backtrace */ | ||
503 | trace.max_entries = MAX_TRACE; | ||
504 | trace.nr_entries = 0; | ||
505 | trace.entries = object->trace; | ||
506 | trace.skip = 1; | ||
507 | save_stack_trace(&trace); | ||
508 | object->trace_len = trace.nr_entries; | ||
509 | |||
510 | INIT_PRIO_TREE_NODE(&object->tree_node); | ||
511 | object->tree_node.start = ptr; | ||
512 | object->tree_node.last = ptr + size - 1; | ||
513 | |||
514 | write_lock_irqsave(&kmemleak_lock, flags); | ||
515 | min_addr = min(min_addr, ptr); | ||
516 | max_addr = max(max_addr, ptr + size); | ||
517 | node = prio_tree_insert(&object_tree_root, &object->tree_node); | ||
518 | /* | ||
519 | * The code calling the kernel does not yet have the pointer to the | ||
520 | * memory block to be able to free it. However, we still hold the | ||
521 | * kmemleak_lock here in case parts of the kernel started freeing | ||
522 | * random memory blocks. | ||
523 | */ | ||
524 | if (node != &object->tree_node) { | ||
525 | unsigned long flags; | ||
526 | |||
527 | kmemleak_panic("kmemleak: Cannot insert 0x%lx into the object " | ||
528 | "search tree (already existing)\n", ptr); | ||
529 | object = lookup_object(ptr, 1); | ||
530 | spin_lock_irqsave(&object->lock, flags); | ||
531 | dump_object_info(object); | ||
532 | spin_unlock_irqrestore(&object->lock, flags); | ||
533 | |||
534 | goto out; | ||
535 | } | ||
536 | list_add_tail_rcu(&object->object_list, &object_list); | ||
537 | out: | ||
538 | write_unlock_irqrestore(&kmemleak_lock, flags); | ||
539 | } | ||
540 | |||
541 | /* | ||
542 | * Remove the metadata (struct kmemleak_object) for a memory block from the | ||
543 | * object_list and object_tree_root and decrement its use_count. | ||
544 | */ | ||
545 | static void delete_object(unsigned long ptr) | ||
546 | { | ||
547 | unsigned long flags; | ||
548 | struct kmemleak_object *object; | ||
549 | |||
550 | write_lock_irqsave(&kmemleak_lock, flags); | ||
551 | object = lookup_object(ptr, 0); | ||
552 | if (!object) { | ||
553 | kmemleak_warn("kmemleak: Freeing unknown object at 0x%08lx\n", | ||
554 | ptr); | ||
555 | write_unlock_irqrestore(&kmemleak_lock, flags); | ||
556 | return; | ||
557 | } | ||
558 | prio_tree_remove(&object_tree_root, &object->tree_node); | ||
559 | list_del_rcu(&object->object_list); | ||
560 | write_unlock_irqrestore(&kmemleak_lock, flags); | ||
561 | |||
562 | WARN_ON(!(object->flags & OBJECT_ALLOCATED)); | ||
563 | WARN_ON(atomic_read(&object->use_count) < 1); | ||
564 | |||
565 | /* | ||
566 | * Locking here also ensures that the corresponding memory block | ||
567 | * cannot be freed when it is being scanned. | ||
568 | */ | ||
569 | spin_lock_irqsave(&object->lock, flags); | ||
570 | if (object->flags & OBJECT_REPORTED) | ||
571 | print_referenced(object); | ||
572 | object->flags &= ~OBJECT_ALLOCATED; | ||
573 | spin_unlock_irqrestore(&object->lock, flags); | ||
574 | put_object(object); | ||
575 | } | ||
576 | |||
577 | /* | ||
578 | * Mark an object permanently as gray-colored so that it can no longer be | ||
579 | * reported as a leak. This is used in general to mark a false positive. | ||
580 | */ | ||
581 | static void make_gray_object(unsigned long ptr) | ||
582 | { | ||
583 | unsigned long flags; | ||
584 | struct kmemleak_object *object; | ||
585 | |||
586 | object = find_and_get_object(ptr, 0); | ||
587 | if (!object) { | ||
588 | kmemleak_warn("kmemleak: Graying unknown object at 0x%08lx\n", | ||
589 | ptr); | ||
590 | return; | ||
591 | } | ||
592 | |||
593 | spin_lock_irqsave(&object->lock, flags); | ||
594 | object->min_count = 0; | ||
595 | spin_unlock_irqrestore(&object->lock, flags); | ||
596 | put_object(object); | ||
597 | } | ||
598 | |||
599 | /* | ||
600 | * Mark the object as black-colored so that it is ignored from scans and | ||
601 | * reporting. | ||
602 | */ | ||
603 | static void make_black_object(unsigned long ptr) | ||
604 | { | ||
605 | unsigned long flags; | ||
606 | struct kmemleak_object *object; | ||
607 | |||
608 | object = find_and_get_object(ptr, 0); | ||
609 | if (!object) { | ||
610 | kmemleak_warn("kmemleak: Blacking unknown object at 0x%08lx\n", | ||
611 | ptr); | ||
612 | return; | ||
613 | } | ||
614 | |||
615 | spin_lock_irqsave(&object->lock, flags); | ||
616 | object->min_count = -1; | ||
617 | spin_unlock_irqrestore(&object->lock, flags); | ||
618 | put_object(object); | ||
619 | } | ||
620 | |||
621 | /* | ||
622 | * Add a scanning area to the object. If at least one such area is added, | ||
623 | * kmemleak will only scan these ranges rather than the whole memory block. | ||
624 | */ | ||
625 | static void add_scan_area(unsigned long ptr, unsigned long offset, | ||
626 | size_t length, gfp_t gfp) | ||
627 | { | ||
628 | unsigned long flags; | ||
629 | struct kmemleak_object *object; | ||
630 | struct kmemleak_scan_area *area; | ||
631 | |||
632 | object = find_and_get_object(ptr, 0); | ||
633 | if (!object) { | ||
634 | kmemleak_warn("kmemleak: Adding scan area to unknown " | ||
635 | "object at 0x%08lx\n", ptr); | ||
636 | return; | ||
637 | } | ||
638 | |||
639 | area = kmem_cache_alloc(scan_area_cache, gfp & ~GFP_SLAB_BUG_MASK); | ||
640 | if (!area) { | ||
641 | kmemleak_warn("kmemleak: Cannot allocate a scan area\n"); | ||
642 | goto out; | ||
643 | } | ||
644 | |||
645 | spin_lock_irqsave(&object->lock, flags); | ||
646 | if (offset + length > object->size) { | ||
647 | kmemleak_warn("kmemleak: Scan area larger than object " | ||
648 | "0x%08lx\n", ptr); | ||
649 | dump_object_info(object); | ||
650 | kmem_cache_free(scan_area_cache, area); | ||
651 | goto out_unlock; | ||
652 | } | ||
653 | |||
654 | INIT_HLIST_NODE(&area->node); | ||
655 | area->offset = offset; | ||
656 | area->length = length; | ||
657 | |||
658 | hlist_add_head(&area->node, &object->area_list); | ||
659 | out_unlock: | ||
660 | spin_unlock_irqrestore(&object->lock, flags); | ||
661 | out: | ||
662 | put_object(object); | ||
663 | } | ||
664 | |||
665 | /* | ||
666 | * Set the OBJECT_NO_SCAN flag for the object corresponding to the given | ||
667 | * pointer. Such an object will not be scanned by kmemleak but references to it | ||
668 | * are searched. | ||
669 | */ | ||
670 | static void object_no_scan(unsigned long ptr) | ||
671 | { | ||
672 | unsigned long flags; | ||
673 | struct kmemleak_object *object; | ||
674 | |||
675 | object = find_and_get_object(ptr, 0); | ||
676 | if (!object) { | ||
677 | kmemleak_warn("kmemleak: Not scanning unknown object at " | ||
678 | "0x%08lx\n", ptr); | ||
679 | return; | ||
680 | } | ||
681 | |||
682 | spin_lock_irqsave(&object->lock, flags); | ||
683 | object->flags |= OBJECT_NO_SCAN; | ||
684 | spin_unlock_irqrestore(&object->lock, flags); | ||
685 | put_object(object); | ||
686 | } | ||
687 | |||
688 | /* | ||
689 | * Log an early kmemleak_* call to the early_log buffer. These calls will be | ||
690 | * processed later once kmemleak is fully initialized. | ||
691 | */ | ||
692 | static void log_early(int op_type, const void *ptr, size_t size, | ||
693 | int min_count, unsigned long offset, size_t length) | ||
694 | { | ||
695 | unsigned long flags; | ||
696 | struct early_log *log; | ||
697 | |||
698 | if (crt_early_log >= ARRAY_SIZE(early_log)) { | ||
699 | kmemleak_panic("kmemleak: Early log buffer exceeded\n"); | ||
700 | return; | ||
701 | } | ||
702 | |||
703 | /* | ||
704 | * There is no need for locking since the kernel is still in UP mode | ||
705 | * at this stage. Disabling the IRQs is enough. | ||
706 | */ | ||
707 | local_irq_save(flags); | ||
708 | log = &early_log[crt_early_log]; | ||
709 | log->op_type = op_type; | ||
710 | log->ptr = ptr; | ||
711 | log->size = size; | ||
712 | log->min_count = min_count; | ||
713 | log->offset = offset; | ||
714 | log->length = length; | ||
715 | crt_early_log++; | ||
716 | local_irq_restore(flags); | ||
717 | } | ||
718 | |||
719 | /* | ||
720 | * Memory allocation function callback. This function is called from the | ||
721 | * kernel allocators when a new block is allocated (kmem_cache_alloc, kmalloc, | ||
722 | * vmalloc etc.). | ||
723 | */ | ||
724 | void kmemleak_alloc(const void *ptr, size_t size, int min_count, gfp_t gfp) | ||
725 | { | ||
726 | pr_debug("%s(0x%p, %zu, %d)\n", __func__, ptr, size, min_count); | ||
727 | |||
728 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | ||
729 | create_object((unsigned long)ptr, size, min_count, gfp); | ||
730 | else if (atomic_read(&kmemleak_early_log)) | ||
731 | log_early(KMEMLEAK_ALLOC, ptr, size, min_count, 0, 0); | ||
732 | } | ||
733 | EXPORT_SYMBOL_GPL(kmemleak_alloc); | ||
734 | |||
735 | /* | ||
736 | * Memory freeing function callback. This function is called from the kernel | ||
737 | * allocators when a block is freed (kmem_cache_free, kfree, vfree etc.). | ||
738 | */ | ||
739 | void kmemleak_free(const void *ptr) | ||
740 | { | ||
741 | pr_debug("%s(0x%p)\n", __func__, ptr); | ||
742 | |||
743 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | ||
744 | delete_object((unsigned long)ptr); | ||
745 | else if (atomic_read(&kmemleak_early_log)) | ||
746 | log_early(KMEMLEAK_FREE, ptr, 0, 0, 0, 0); | ||
747 | } | ||
748 | EXPORT_SYMBOL_GPL(kmemleak_free); | ||
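The two callbacks above are meant to be called in pairs by the kernel allocators themselves (kmem_cache_alloc, kmalloc, vmalloc etc., as noted in the comments). A minimal, hypothetical sketch of how a private allocator could be instrumented; my_pool_grab() and my_pool_put() are made-up stand-ins for whatever the pool actually does:

	/* assumed pool primitives, declared here only for the sketch */
	extern void *my_pool_grab(size_t size);
	extern void my_pool_put(void *ptr);

	/* Hypothetical pool allocator instrumented with the callbacks above. */
	static void *my_pool_alloc(size_t size, gfp_t gfp)
	{
		void *ptr = my_pool_grab(size);

		if (ptr)
			kmemleak_alloc(ptr, size, 1, gfp);	/* min_count == 1 */
		return ptr;
	}

	static void my_pool_free(void *ptr)
	{
		kmemleak_free(ptr);	/* drop the metadata before freeing */
		my_pool_put(ptr);
	}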
749 | |||
750 | /* | ||
751 | * Mark an already allocated memory block as a false positive. This will cause | ||
752 | * the block to no longer be reported as a leak and to always be scanned. | ||
753 | */ | ||
754 | void kmemleak_not_leak(const void *ptr) | ||
755 | { | ||
756 | pr_debug("%s(0x%p)\n", __func__, ptr); | ||
757 | |||
758 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | ||
759 | make_gray_object((unsigned long)ptr); | ||
760 | else if (atomic_read(&kmemleak_early_log)) | ||
761 | log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0, 0, 0); | ||
762 | } | ||
763 | EXPORT_SYMBOL(kmemleak_not_leak); | ||
764 | |||
765 | /* | ||
766 | * Ignore a memory block. This is usually done when it is known that the | ||
767 | * corresponding block is not a leak and does not contain any references to | ||
768 | * other allocated memory blocks. | ||
769 | */ | ||
770 | void kmemleak_ignore(const void *ptr) | ||
771 | { | ||
772 | pr_debug("%s(0x%p)\n", __func__, ptr); | ||
773 | |||
774 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | ||
775 | make_black_object((unsigned long)ptr); | ||
776 | else if (atomic_read(&kmemleak_early_log)) | ||
777 | log_early(KMEMLEAK_IGNORE, ptr, 0, 0, 0, 0); | ||
778 | } | ||
779 | EXPORT_SYMBOL(kmemleak_ignore); | ||
780 | |||
781 | /* | ||
782 | * Limit the range to be scanned in an allocated memory block. | ||
783 | */ | ||
784 | void kmemleak_scan_area(const void *ptr, unsigned long offset, size_t length, | ||
785 | gfp_t gfp) | ||
786 | { | ||
787 | pr_debug("%s(0x%p)\n", __func__, ptr); | ||
788 | |||
789 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | ||
790 | add_scan_area((unsigned long)ptr, offset, length, gfp); | ||
791 | else if (atomic_read(&kmemleak_early_log)) | ||
792 | log_early(KMEMLEAK_SCAN_AREA, ptr, 0, 0, offset, length); | ||
793 | } | ||
794 | EXPORT_SYMBOL(kmemleak_scan_area); | ||
795 | |||
796 | /* | ||
797 | * Inform kmemleak not to scan the given memory block. | ||
798 | */ | ||
799 | void kmemleak_no_scan(const void *ptr) | ||
800 | { | ||
801 | pr_debug("%s(0x%p)\n", __func__, ptr); | ||
802 | |||
803 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | ||
804 | object_no_scan((unsigned long)ptr); | ||
805 | else if (atomic_read(&kmemleak_early_log)) | ||
806 | log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0, 0, 0); | ||
807 | } | ||
808 | EXPORT_SYMBOL(kmemleak_no_scan); | ||
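Taken together, kmemleak_not_leak(), kmemleak_ignore(), kmemleak_scan_area() and kmemleak_no_scan() let a caller fine-tune how its blocks are treated. A hedged illustration follows: the structure and helper are made up, only the kmemleak calls match the API above (declared in linux/kmemleak.h):

	/* Hypothetical object annotated with the hooks above. */
	struct my_buf {
		void *payload;		/* raw data, never holds pointers */
		struct list_head node;	/* the only member that may reference others */
	};

	static struct my_buf *my_buf_create(gfp_t gfp)
	{
		struct my_buf *buf = kmalloc(sizeof(*buf), gfp);

		if (!buf)
			return NULL;
		buf->payload = kmalloc(PAGE_SIZE, gfp);
		if (!buf->payload) {
			kfree(buf);
			return NULL;
		}
		/* the payload contains no pointers: skip it during scanning */
		kmemleak_no_scan(buf->payload);
		/* only scan the 'node' member of the descriptor itself */
		kmemleak_scan_area(buf, offsetof(struct my_buf, node),
				   sizeof(buf->node), gfp);
		return buf;
	}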
809 | |||
810 | /* | ||
811 | * Yield the CPU so that other tasks get a chance to run. The yielding is | ||
812 | * rate-limited to avoid an excessive number of calls to the schedule() function | ||
813 | * during memory scanning. | ||
814 | */ | ||
815 | static void scan_yield(void) | ||
816 | { | ||
817 | might_sleep(); | ||
818 | |||
819 | if (time_is_before_eq_jiffies(next_scan_yield)) { | ||
820 | schedule(); | ||
821 | next_scan_yield = jiffies + jiffies_scan_yield; | ||
822 | } | ||
823 | } | ||
824 | |||
825 | /* | ||
826 | * Memory scanning is a long process and it needs to be interruptible. This | ||
827 | * function checks whether such an interrupt condition has occurred. | ||
828 | */ | ||
829 | static int scan_should_stop(void) | ||
830 | { | ||
831 | if (!atomic_read(&kmemleak_enabled)) | ||
832 | return 1; | ||
833 | |||
834 | /* | ||
835 | * This function may be called from either process or kthread context, | ||
836 | * hence the need to check for both stop conditions. | ||
837 | */ | ||
838 | if (current->mm) | ||
839 | return signal_pending(current); | ||
840 | else | ||
841 | return kthread_should_stop(); | ||
842 | |||
843 | return 0; | ||
844 | } | ||
845 | |||
846 | /* | ||
847 | * Scan a memory block (exclusive range) for valid pointers and add those | ||
848 | * found to the gray list. | ||
849 | */ | ||
850 | static void scan_block(void *_start, void *_end, | ||
851 | struct kmemleak_object *scanned) | ||
852 | { | ||
853 | unsigned long *ptr; | ||
854 | unsigned long *start = PTR_ALIGN(_start, BYTES_PER_POINTER); | ||
855 | unsigned long *end = _end - (BYTES_PER_POINTER - 1); | ||
856 | |||
857 | for (ptr = start; ptr < end; ptr++) { | ||
858 | unsigned long flags; | ||
859 | unsigned long pointer = *ptr; | ||
860 | struct kmemleak_object *object; | ||
861 | |||
862 | if (scan_should_stop()) | ||
863 | break; | ||
864 | |||
865 | /* | ||
866 | * When scanning a memory block with a corresponding | ||
867 | * kmemleak_object, the CPU yielding is handled in the calling | ||
868 | * code since it holds the object->lock to avoid the block | ||
869 | * freeing. | ||
870 | */ | ||
871 | if (!scanned) | ||
872 | scan_yield(); | ||
873 | |||
874 | object = find_and_get_object(pointer, 1); | ||
875 | if (!object) | ||
876 | continue; | ||
877 | if (object == scanned) { | ||
878 | /* self referenced, ignore */ | ||
879 | put_object(object); | ||
880 | continue; | ||
881 | } | ||
882 | |||
883 | /* | ||
884 | * Avoid the lockdep recursive warning on object->lock being | ||
885 | * previously acquired in scan_object(). These locks are | ||
886 | * enclosed by scan_mutex. | ||
887 | */ | ||
888 | spin_lock_irqsave_nested(&object->lock, flags, | ||
889 | SINGLE_DEPTH_NESTING); | ||
890 | if (!color_white(object)) { | ||
891 | /* non-orphan, ignored or new */ | ||
892 | spin_unlock_irqrestore(&object->lock, flags); | ||
893 | put_object(object); | ||
894 | continue; | ||
895 | } | ||
896 | |||
897 | /* | ||
898 | * Increase the object's reference count (number of pointers | ||
899 | * to the memory block). If this count reaches the required | ||
900 | * minimum, the object's color will become gray and it will be | ||
901 | * added to the gray_list. | ||
902 | */ | ||
903 | object->count++; | ||
904 | if (color_gray(object)) | ||
905 | list_add_tail(&object->gray_list, &gray_list); | ||
906 | else | ||
907 | put_object(object); | ||
908 | spin_unlock_irqrestore(&object->lock, flags); | ||
909 | } | ||
910 | } | ||
911 | |||
912 | /* | ||
913 | * Scan a memory block corresponding to a kmemleak_object. A condition is | ||
914 | * that object->use_count >= 1. | ||
915 | */ | ||
916 | static void scan_object(struct kmemleak_object *object) | ||
917 | { | ||
918 | struct kmemleak_scan_area *area; | ||
919 | struct hlist_node *elem; | ||
920 | unsigned long flags; | ||
921 | |||
922 | /* | ||
923 | * Once the object->lock is acquired, the corresponding memory block | ||
924 | * cannot be freed (the same lock is acquired in delete_object). | ||
925 | */ | ||
926 | spin_lock_irqsave(&object->lock, flags); | ||
927 | if (object->flags & OBJECT_NO_SCAN) | ||
928 | goto out; | ||
929 | if (!(object->flags & OBJECT_ALLOCATED)) | ||
930 | /* already freed object */ | ||
931 | goto out; | ||
932 | if (hlist_empty(&object->area_list)) | ||
933 | scan_block((void *)object->pointer, | ||
934 | (void *)(object->pointer + object->size), object); | ||
935 | else | ||
936 | hlist_for_each_entry(area, elem, &object->area_list, node) | ||
937 | scan_block((void *)(object->pointer + area->offset), | ||
938 | (void *)(object->pointer + area->offset | ||
939 | + area->length), object); | ||
940 | out: | ||
941 | spin_unlock_irqrestore(&object->lock, flags); | ||
942 | } | ||
943 | |||
944 | /* | ||
945 | * Scan data sections and all the referenced memory blocks allocated via the | ||
946 | * kernel's standard allocators. This function must be called with the | ||
947 | * scan_mutex held. | ||
948 | */ | ||
949 | static void kmemleak_scan(void) | ||
950 | { | ||
951 | unsigned long flags; | ||
952 | struct kmemleak_object *object, *tmp; | ||
953 | struct task_struct *task; | ||
954 | int i; | ||
955 | |||
956 | /* prepare the kmemleak_object's */ | ||
957 | rcu_read_lock(); | ||
958 | list_for_each_entry_rcu(object, &object_list, object_list) { | ||
959 | spin_lock_irqsave(&object->lock, flags); | ||
960 | #ifdef DEBUG | ||
961 | /* | ||
962 | * With a few exceptions there should be a maximum of | ||
963 | * 1 reference to any object at this point. | ||
964 | */ | ||
965 | if (atomic_read(&object->use_count) > 1) { | ||
966 | pr_debug("kmemleak: object->use_count = %d\n", | ||
967 | atomic_read(&object->use_count)); | ||
968 | dump_object_info(object); | ||
969 | } | ||
970 | #endif | ||
971 | /* reset the reference count (whiten the object) */ | ||
972 | object->count = 0; | ||
973 | if (color_gray(object) && get_object(object)) | ||
974 | list_add_tail(&object->gray_list, &gray_list); | ||
975 | |||
976 | spin_unlock_irqrestore(&object->lock, flags); | ||
977 | } | ||
978 | rcu_read_unlock(); | ||
979 | |||
980 | /* data/bss scanning */ | ||
981 | scan_block(_sdata, _edata, NULL); | ||
982 | scan_block(__bss_start, __bss_stop, NULL); | ||
983 | |||
984 | #ifdef CONFIG_SMP | ||
985 | /* per-cpu sections scanning */ | ||
986 | for_each_possible_cpu(i) | ||
987 | scan_block(__per_cpu_start + per_cpu_offset(i), | ||
988 | __per_cpu_end + per_cpu_offset(i), NULL); | ||
989 | #endif | ||
990 | |||
991 | /* | ||
992 | * Struct page scanning for each node. The code below is not yet safe | ||
993 | * with MEMORY_HOTPLUG. | ||
994 | */ | ||
995 | for_each_online_node(i) { | ||
996 | pg_data_t *pgdat = NODE_DATA(i); | ||
997 | unsigned long start_pfn = pgdat->node_start_pfn; | ||
998 | unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages; | ||
999 | unsigned long pfn; | ||
1000 | |||
1001 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { | ||
1002 | struct page *page; | ||
1003 | |||
1004 | if (!pfn_valid(pfn)) | ||
1005 | continue; | ||
1006 | page = pfn_to_page(pfn); | ||
1007 | /* only scan if page is in use */ | ||
1008 | if (page_count(page) == 0) | ||
1009 | continue; | ||
1010 | scan_block(page, page + 1, NULL); | ||
1011 | } | ||
1012 | } | ||
1013 | |||
1014 | /* | ||
1015 | * Scanning the task stacks may introduce false negatives and it is | ||
1016 | * not enabled by default. | ||
1017 | */ | ||
1018 | if (kmemleak_stack_scan) { | ||
1019 | read_lock(&tasklist_lock); | ||
1020 | for_each_process(task) | ||
1021 | scan_block(task_stack_page(task), | ||
1022 | task_stack_page(task) + THREAD_SIZE, NULL); | ||
1023 | read_unlock(&tasklist_lock); | ||
1024 | } | ||
1025 | |||
1026 | /* | ||
1027 | * Scan the objects already referenced from the sections scanned | ||
1028 | * above. More objects will be referenced and, if there are no memory | ||
1029 | * leaks, all the objects will be scanned. The list traversal is safe | ||
1030 | * for both tail additions and removals from inside the loop. The | ||
1031 | * kmemleak objects cannot be freed from outside the loop because their | ||
1032 | * use_count was increased. | ||
1033 | */ | ||
1034 | object = list_entry(gray_list.next, typeof(*object), gray_list); | ||
1035 | while (&object->gray_list != &gray_list) { | ||
1036 | scan_yield(); | ||
1037 | |||
1038 | /* may add new objects to the list */ | ||
1039 | if (!scan_should_stop()) | ||
1040 | scan_object(object); | ||
1041 | |||
1042 | tmp = list_entry(object->gray_list.next, typeof(*object), | ||
1043 | gray_list); | ||
1044 | |||
1045 | /* remove the object from the list and release it */ | ||
1046 | list_del(&object->gray_list); | ||
1047 | put_object(object); | ||
1048 | |||
1049 | object = tmp; | ||
1050 | } | ||
1051 | WARN_ON(!list_empty(&gray_list)); | ||
1052 | } | ||
1053 | |||
1054 | /* | ||
1055 | * Thread function performing automatic memory scanning. Unreferenced objects | ||
1056 | * at the end of a memory scan are reported but only the first time. | ||
1057 | */ | ||
1058 | static int kmemleak_scan_thread(void *arg) | ||
1059 | { | ||
1060 | static int first_run = 1; | ||
1061 | |||
1062 | pr_info("kmemleak: Automatic memory scanning thread started\n"); | ||
1063 | |||
1064 | /* | ||
1065 | * Wait before the first scan to allow the system to fully initialize. | ||
1066 | */ | ||
1067 | if (first_run) { | ||
1068 | first_run = 0; | ||
1069 | ssleep(SECS_FIRST_SCAN); | ||
1070 | } | ||
1071 | |||
1072 | while (!kthread_should_stop()) { | ||
1073 | struct kmemleak_object *object; | ||
1074 | signed long timeout = jiffies_scan_wait; | ||
1075 | |||
1076 | mutex_lock(&scan_mutex); | ||
1077 | |||
1078 | kmemleak_scan(); | ||
1079 | reported_leaks = 0; | ||
1080 | |||
1081 | rcu_read_lock(); | ||
1082 | list_for_each_entry_rcu(object, &object_list, object_list) { | ||
1083 | unsigned long flags; | ||
1084 | |||
1085 | if (reported_leaks >= REPORTS_NR) | ||
1086 | break; | ||
1087 | spin_lock_irqsave(&object->lock, flags); | ||
1088 | if (!(object->flags & OBJECT_REPORTED) && | ||
1089 | unreferenced_object(object)) { | ||
1090 | print_unreferenced(NULL, object); | ||
1091 | object->flags |= OBJECT_REPORTED; | ||
1092 | reported_leaks++; | ||
1093 | } else if ((object->flags & OBJECT_REPORTED) && | ||
1094 | referenced_object(object)) { | ||
1095 | print_referenced(object); | ||
1096 | object->flags &= ~OBJECT_REPORTED; | ||
1097 | } | ||
1098 | spin_unlock_irqrestore(&object->lock, flags); | ||
1099 | } | ||
1100 | rcu_read_unlock(); | ||
1101 | |||
1102 | mutex_unlock(&scan_mutex); | ||
1103 | /* wait before the next scan */ | ||
1104 | while (timeout && !kthread_should_stop()) | ||
1105 | timeout = schedule_timeout_interruptible(timeout); | ||
1106 | } | ||
1107 | |||
1108 | pr_info("kmemleak: Automatic memory scanning thread ended\n"); | ||
1109 | |||
1110 | return 0; | ||
1111 | } | ||
1112 | |||
1113 | /* | ||
1114 | * Start the automatic memory scanning thread. This function must be called | ||
1115 | * with the kmemleak_mutex held. | ||
1116 | */ | ||
1117 | void start_scan_thread(void) | ||
1118 | { | ||
1119 | if (scan_thread) | ||
1120 | return; | ||
1121 | scan_thread = kthread_run(kmemleak_scan_thread, NULL, "kmemleak"); | ||
1122 | if (IS_ERR(scan_thread)) { | ||
1123 | pr_warning("kmemleak: Failed to create the scan thread\n"); | ||
1124 | scan_thread = NULL; | ||
1125 | } | ||
1126 | } | ||
1127 | |||
1128 | /* | ||
1129 | * Stop the automatic memory scanning thread. This function must be called | ||
1130 | * with the kmemleak_mutex held. | ||
1131 | */ | ||
1132 | void stop_scan_thread(void) | ||
1133 | { | ||
1134 | if (scan_thread) { | ||
1135 | kthread_stop(scan_thread); | ||
1136 | scan_thread = NULL; | ||
1137 | } | ||
1138 | } | ||
1139 | |||
1140 | /* | ||
1141 | * Iterate over the object_list and return the first valid object at or after | ||
1142 | * the required position with its use_count incremented. The function triggers | ||
1143 | * a memory scan when the pos argument points to the first position. | ||
1144 | */ | ||
1145 | static void *kmemleak_seq_start(struct seq_file *seq, loff_t *pos) | ||
1146 | { | ||
1147 | struct kmemleak_object *object; | ||
1148 | loff_t n = *pos; | ||
1149 | |||
1150 | if (!n) { | ||
1151 | kmemleak_scan(); | ||
1152 | reported_leaks = 0; | ||
1153 | } | ||
1154 | if (reported_leaks >= REPORTS_NR) | ||
1155 | return NULL; | ||
1156 | |||
1157 | rcu_read_lock(); | ||
1158 | list_for_each_entry_rcu(object, &object_list, object_list) { | ||
1159 | if (n-- > 0) | ||
1160 | continue; | ||
1161 | if (get_object(object)) | ||
1162 | goto out; | ||
1163 | } | ||
1164 | object = NULL; | ||
1165 | out: | ||
1166 | rcu_read_unlock(); | ||
1167 | return object; | ||
1168 | } | ||
1169 | |||
1170 | /* | ||
1171 | * Return the next object in the object_list. The function decrements the | ||
1172 | * use_count of the previous object and increases that of the next one. | ||
1173 | */ | ||
1174 | static void *kmemleak_seq_next(struct seq_file *seq, void *v, loff_t *pos) | ||
1175 | { | ||
1176 | struct kmemleak_object *prev_obj = v; | ||
1177 | struct kmemleak_object *next_obj = NULL; | ||
1178 | struct list_head *n = &prev_obj->object_list; | ||
1179 | |||
1180 | ++(*pos); | ||
1181 | if (reported_leaks >= REPORTS_NR) | ||
1182 | goto out; | ||
1183 | |||
1184 | rcu_read_lock(); | ||
1185 | list_for_each_continue_rcu(n, &object_list) { | ||
1186 | next_obj = list_entry(n, struct kmemleak_object, object_list); | ||
1187 | if (get_object(next_obj)) | ||
1188 | break; | ||
1189 | } | ||
1190 | rcu_read_unlock(); | ||
1191 | out: | ||
1192 | put_object(prev_obj); | ||
1193 | return next_obj; | ||
1194 | } | ||
1195 | |||
1196 | /* | ||
1197 | * Decrement the use_count of the last object returned, if any. | ||
1198 | */ | ||
1199 | static void kmemleak_seq_stop(struct seq_file *seq, void *v) | ||
1200 | { | ||
1201 | if (v) | ||
1202 | put_object(v); | ||
1203 | } | ||
1204 | |||
1205 | /* | ||
1206 | * Print the information for an unreferenced object to the seq file. | ||
1207 | */ | ||
1208 | static int kmemleak_seq_show(struct seq_file *seq, void *v) | ||
1209 | { | ||
1210 | struct kmemleak_object *object = v; | ||
1211 | unsigned long flags; | ||
1212 | |||
1213 | spin_lock_irqsave(&object->lock, flags); | ||
1214 | if (!unreferenced_object(object)) | ||
1215 | goto out; | ||
1216 | print_unreferenced(seq, object); | ||
1217 | reported_leaks++; | ||
1218 | out: | ||
1219 | spin_unlock_irqrestore(&object->lock, flags); | ||
1220 | return 0; | ||
1221 | } | ||
1222 | |||
1223 | static const struct seq_operations kmemleak_seq_ops = { | ||
1224 | .start = kmemleak_seq_start, | ||
1225 | .next = kmemleak_seq_next, | ||
1226 | .stop = kmemleak_seq_stop, | ||
1227 | .show = kmemleak_seq_show, | ||
1228 | }; | ||
1229 | |||
1230 | static int kmemleak_open(struct inode *inode, struct file *file) | ||
1231 | { | ||
1232 | int ret = 0; | ||
1233 | |||
1234 | if (!atomic_read(&kmemleak_enabled)) | ||
1235 | return -EBUSY; | ||
1236 | |||
1237 | ret = mutex_lock_interruptible(&kmemleak_mutex); | ||
1238 | if (ret < 0) | ||
1239 | goto out; | ||
1240 | if (file->f_mode & FMODE_READ) { | ||
1241 | ret = mutex_lock_interruptible(&scan_mutex); | ||
1242 | if (ret < 0) | ||
1243 | goto kmemleak_unlock; | ||
1244 | ret = seq_open(file, &kmemleak_seq_ops); | ||
1245 | if (ret < 0) | ||
1246 | goto scan_unlock; | ||
1247 | } | ||
1248 | return ret; | ||
1249 | |||
1250 | scan_unlock: | ||
1251 | mutex_unlock(&scan_mutex); | ||
1252 | kmemleak_unlock: | ||
1253 | mutex_unlock(&kmemleak_mutex); | ||
1254 | out: | ||
1255 | return ret; | ||
1256 | } | ||
1257 | |||
1258 | static int kmemleak_release(struct inode *inode, struct file *file) | ||
1259 | { | ||
1260 | int ret = 0; | ||
1261 | |||
1262 | if (file->f_mode & FMODE_READ) { | ||
1263 | seq_release(inode, file); | ||
1264 | mutex_unlock(&scan_mutex); | ||
1265 | } | ||
1266 | mutex_unlock(&kmemleak_mutex); | ||
1267 | |||
1268 | return ret; | ||
1269 | } | ||
1270 | |||
1271 | /* | ||
1272 | * File write operation to configure kmemleak at run-time. The following | ||
1273 | * commands can be written to the /sys/kernel/debug/kmemleak file: | ||
1274 | * off - disable kmemleak (irreversible) | ||
1275 | * stack=on - enable the task stacks scanning | ||
1276 | * stack=off - disable the task stacks scanning | ||
1277 | * scan=on - start the automatic memory scanning thread | ||
1278 | * scan=off - stop the automatic memory scanning thread | ||
1279 | * scan=... - set the automatic memory scanning period in seconds (0 to | ||
1280 | * disable it) | ||
1281 | */ | ||
1282 | static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, | ||
1283 | size_t size, loff_t *ppos) | ||
1284 | { | ||
1285 | char buf[64]; | ||
1286 | int buf_size; | ||
1287 | |||
1288 | if (!atomic_read(&kmemleak_enabled)) | ||
1289 | return -EBUSY; | ||
1290 | |||
1291 | buf_size = min(size, (sizeof(buf) - 1)); | ||
1292 | if (strncpy_from_user(buf, user_buf, buf_size) < 0) | ||
1293 | return -EFAULT; | ||
1294 | buf[buf_size] = 0; | ||
1295 | |||
1296 | if (strncmp(buf, "off", 3) == 0) | ||
1297 | kmemleak_disable(); | ||
1298 | else if (strncmp(buf, "stack=on", 8) == 0) | ||
1299 | kmemleak_stack_scan = 1; | ||
1300 | else if (strncmp(buf, "stack=off", 9) == 0) | ||
1301 | kmemleak_stack_scan = 0; | ||
1302 | else if (strncmp(buf, "scan=on", 7) == 0) | ||
1303 | start_scan_thread(); | ||
1304 | else if (strncmp(buf, "scan=off", 8) == 0) | ||
1305 | stop_scan_thread(); | ||
1306 | else if (strncmp(buf, "scan=", 5) == 0) { | ||
1307 | unsigned long secs; | ||
1308 | int err; | ||
1309 | |||
1310 | err = strict_strtoul(buf + 5, 0, &secs); | ||
1311 | if (err < 0) | ||
1312 | return err; | ||
1313 | stop_scan_thread(); | ||
1314 | if (secs) { | ||
1315 | jiffies_scan_wait = msecs_to_jiffies(secs * 1000); | ||
1316 | start_scan_thread(); | ||
1317 | } | ||
1318 | } else | ||
1319 | return -EINVAL; | ||
1320 | |||
1321 | /* ignore the rest of the buffer, only one command at a time */ | ||
1322 | *ppos += size; | ||
1323 | return size; | ||
1324 | } | ||
1325 | |||
1326 | static const struct file_operations kmemleak_fops = { | ||
1327 | .owner = THIS_MODULE, | ||
1328 | .open = kmemleak_open, | ||
1329 | .read = seq_read, | ||
1330 | .write = kmemleak_write, | ||
1331 | .llseek = seq_lseek, | ||
1332 | .release = kmemleak_release, | ||
1333 | }; | ||
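From user space the interface above is driven through /sys/kernel/debug/kmemleak: reading the file triggers a scan (see kmemleak_seq_start()) and lists the unreferenced objects, while the commands documented above ("off", "stack=on", "scan=...") are written to the same file. A small, purely illustrative user-space reader:

	/* Hypothetical user-space helper; commands such as "scan=off" can be
	 * written to the same file, e.g. via echo. */
	#include <stdio.h>

	int main(void)
	{
		char line[256];
		FILE *f = fopen("/sys/kernel/debug/kmemleak", "r");

		if (!f) {
			perror("/sys/kernel/debug/kmemleak");
			return 1;
		}
		while (fgets(line, sizeof(line), f))	/* unreferenced object reports */
			fputs(line, stdout);
		fclose(f);
		return 0;
	}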
1334 | |||
1335 | /* | ||
1336 | * Perform the freeing of the kmemleak internal objects after waiting for any | ||
1337 | * current memory scan to complete. | ||
1338 | */ | ||
1339 | static int kmemleak_cleanup_thread(void *arg) | ||
1340 | { | ||
1341 | struct kmemleak_object *object; | ||
1342 | |||
1343 | mutex_lock(&kmemleak_mutex); | ||
1344 | stop_scan_thread(); | ||
1345 | mutex_unlock(&kmemleak_mutex); | ||
1346 | |||
1347 | mutex_lock(&scan_mutex); | ||
1348 | rcu_read_lock(); | ||
1349 | list_for_each_entry_rcu(object, &object_list, object_list) | ||
1350 | delete_object(object->pointer); | ||
1351 | rcu_read_unlock(); | ||
1352 | mutex_unlock(&scan_mutex); | ||
1353 | |||
1354 | return 0; | ||
1355 | } | ||
1356 | |||
1357 | /* | ||
1358 | * Start the clean-up thread. | ||
1359 | */ | ||
1360 | static void kmemleak_cleanup(void) | ||
1361 | { | ||
1362 | struct task_struct *cleanup_thread; | ||
1363 | |||
1364 | cleanup_thread = kthread_run(kmemleak_cleanup_thread, NULL, | ||
1365 | "kmemleak-clean"); | ||
1366 | if (IS_ERR(cleanup_thread)) | ||
1367 | pr_warning("kmemleak: Failed to create the clean-up thread\n"); | ||
1368 | } | ||
1369 | |||
1370 | /* | ||
1371 | * Disable kmemleak. No memory allocation/freeing will be traced once this | ||
1372 | * function is called. Disabling kmemleak is an irreversible operation. | ||
1373 | */ | ||
1374 | static void kmemleak_disable(void) | ||
1375 | { | ||
1376 | /* atomically check whether it was already invoked */ | ||
1377 | if (atomic_cmpxchg(&kmemleak_error, 0, 1)) | ||
1378 | return; | ||
1379 | |||
1380 | /* stop any memory operation tracing */ | ||
1381 | atomic_set(&kmemleak_early_log, 0); | ||
1382 | atomic_set(&kmemleak_enabled, 0); | ||
1383 | |||
1384 | /* check whether it is too early for a kernel thread */ | ||
1385 | if (atomic_read(&kmemleak_initialized)) | ||
1386 | kmemleak_cleanup(); | ||
1387 | |||
1388 | pr_info("Kernel memory leak detector disabled\n"); | ||
1389 | } | ||
1390 | |||
1391 | /* | ||
1392 | * Allow boot-time kmemleak disabling (enabled by default). | ||
1393 | */ | ||
1394 | static int kmemleak_boot_config(char *str) | ||
1395 | { | ||
1396 | if (!str) | ||
1397 | return -EINVAL; | ||
1398 | if (strcmp(str, "off") == 0) | ||
1399 | kmemleak_disable(); | ||
1400 | else if (strcmp(str, "on") != 0) | ||
1401 | return -EINVAL; | ||
1402 | return 0; | ||
1403 | } | ||
1404 | early_param("kmemleak", kmemleak_boot_config); | ||
1405 | |||
1406 | /* | ||
1407 | * Kmemleak initialization. | ||
1408 | */ | ||
1409 | void __init kmemleak_init(void) | ||
1410 | { | ||
1411 | int i; | ||
1412 | unsigned long flags; | ||
1413 | |||
1414 | jiffies_scan_yield = msecs_to_jiffies(MSECS_SCAN_YIELD); | ||
1415 | jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE); | ||
1416 | jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000); | ||
1417 | |||
1418 | object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE); | ||
1419 | scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE); | ||
1420 | INIT_PRIO_TREE_ROOT(&object_tree_root); | ||
1421 | |||
1422 | /* the kernel is still in UP mode, so disabling the IRQs is enough */ | ||
1423 | local_irq_save(flags); | ||
1424 | if (!atomic_read(&kmemleak_error)) { | ||
1425 | atomic_set(&kmemleak_enabled, 1); | ||
1426 | atomic_set(&kmemleak_early_log, 0); | ||
1427 | } | ||
1428 | local_irq_restore(flags); | ||
1429 | |||
1430 | /* | ||
1431 | * This is the point where tracking allocations is safe. Automatic | ||
1432 | * scanning is started during the late initcall. Add the early logged | ||
1433 | * callbacks to the kmemleak infrastructure. | ||
1434 | */ | ||
1435 | for (i = 0; i < crt_early_log; i++) { | ||
1436 | struct early_log *log = &early_log[i]; | ||
1437 | |||
1438 | switch (log->op_type) { | ||
1439 | case KMEMLEAK_ALLOC: | ||
1440 | kmemleak_alloc(log->ptr, log->size, log->min_count, | ||
1441 | GFP_KERNEL); | ||
1442 | break; | ||
1443 | case KMEMLEAK_FREE: | ||
1444 | kmemleak_free(log->ptr); | ||
1445 | break; | ||
1446 | case KMEMLEAK_NOT_LEAK: | ||
1447 | kmemleak_not_leak(log->ptr); | ||
1448 | break; | ||
1449 | case KMEMLEAK_IGNORE: | ||
1450 | kmemleak_ignore(log->ptr); | ||
1451 | break; | ||
1452 | case KMEMLEAK_SCAN_AREA: | ||
1453 | kmemleak_scan_area(log->ptr, log->offset, log->length, | ||
1454 | GFP_KERNEL); | ||
1455 | break; | ||
1456 | case KMEMLEAK_NO_SCAN: | ||
1457 | kmemleak_no_scan(log->ptr); | ||
1458 | break; | ||
1459 | default: | ||
1460 | WARN_ON(1); | ||
1461 | } | ||
1462 | } | ||
1463 | } | ||
1464 | |||
1465 | /* | ||
1466 | * Late initialization function. | ||
1467 | */ | ||
1468 | static int __init kmemleak_late_init(void) | ||
1469 | { | ||
1470 | struct dentry *dentry; | ||
1471 | |||
1472 | atomic_set(&kmemleak_initialized, 1); | ||
1473 | |||
1474 | if (atomic_read(&kmemleak_error)) { | ||
1475 | /* | ||
1476 | * Some error occurred and kmemleak was disabled. There is a | ||
1477 | * small chance that kmemleak_disable() was called immediately | ||
1478 | * after setting kmemleak_initialized and we may end up with | ||
1479 | * two clean-up threads but serialized by scan_mutex. | ||
1480 | */ | ||
1481 | kmemleak_cleanup(); | ||
1482 | return -ENOMEM; | ||
1483 | } | ||
1484 | |||
1485 | dentry = debugfs_create_file("kmemleak", S_IRUGO, NULL, NULL, | ||
1486 | &kmemleak_fops); | ||
1487 | if (!dentry) | ||
1488 | pr_warning("kmemleak: Failed to create the debugfs kmemleak " | ||
1489 | "file\n"); | ||
1490 | mutex_lock(&kmemleak_mutex); | ||
1491 | start_scan_thread(); | ||
1492 | mutex_unlock(&kmemleak_mutex); | ||
1493 | |||
1494 | pr_info("Kernel memory leak detector initialized\n"); | ||
1495 | |||
1496 | return 0; | ||
1497 | } | ||
1498 | late_initcall(kmemleak_late_init); | ||
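With the boot parameter and initcall plumbing above, the detector is used by passing kmemleak=off on the command line to disable it, or by reading /sys/kernel/debug/kmemleak after boot to list suspected leaks. Code that keeps its only reference somewhere kmemleak does not scan can annotate the object; a minimal sketch, with illustrative function and buffer names that are not part of this patch:

#include <linux/slab.h>
#include <linux/kmemleak.h>

static void *example_buf;       /* illustrative name only */

static int example_setup(void)
{
        example_buf = kmalloc(4096, GFP_KERNEL);
        if (!example_buf)
                return -ENOMEM;
        /* the only reference lives in a device register, so tell
         * kmemleak not to report this object as leaked */
        kmemleak_not_leak(example_buf);
        return 0;
}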
diff --git a/mm/maccess.c b/mm/maccess.c index ac40796cfb15..9073695ff25f 100644 --- a/mm/maccess.c +++ b/mm/maccess.c | |||
@@ -39,7 +39,7 @@ EXPORT_SYMBOL_GPL(probe_kernel_read); | |||
39 | * Safely write to address @dst from the buffer at @src. If a kernel fault | 39 | * Safely write to address @dst from the buffer at @src. If a kernel fault |
40 | * happens, handle that and return -EFAULT. | 40 | * happens, handle that and return -EFAULT. |
41 | */ | 41 | */ |
42 | long probe_kernel_write(void *dst, void *src, size_t size) | 42 | long notrace __weak probe_kernel_write(void *dst, void *src, size_t size) |
43 | { | 43 | { |
44 | long ret; | 44 | long ret; |
45 | mm_segment_t old_fs = get_fs(); | 45 | mm_segment_t old_fs = get_fs(); |
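Marking probe_kernel_write() notrace and __weak lets an architecture supply its own implementation while keeping the same contract: a faulting destination yields -EFAULT instead of an oops. A hedged usage sketch, assuming a caller that already knows the destination may be unmapped (the function name is illustrative):

#include <linux/uaccess.h>

static int example_poke_byte(void *dst)
{
        unsigned char val = 0x90;       /* illustrative payload */
        long ret;

        ret = probe_kernel_write(dst, &val, sizeof(val));
        return ret ? -EFAULT : 0;       /* ret is already -EFAULT on a fault */
}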
diff --git a/mm/madvise.c b/mm/madvise.c index b9ce574827c8..76eb4193acdd 100644 --- a/mm/madvise.c +++ b/mm/madvise.c | |||
@@ -123,8 +123,7 @@ static long madvise_willneed(struct vm_area_struct * vma, | |||
123 | end = vma->vm_end; | 123 | end = vma->vm_end; |
124 | end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; | 124 | end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; |
125 | 125 | ||
126 | force_page_cache_readahead(file->f_mapping, | 126 | force_page_cache_readahead(file->f_mapping, file, start, end - start); |
127 | file, start, max_sane_readahead(end - start)); | ||
128 | return 0; | 127 | return 0; |
129 | } | 128 | } |
130 | 129 | ||
@@ -239,12 +238,30 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, | |||
239 | break; | 238 | break; |
240 | 239 | ||
241 | default: | 240 | default: |
242 | error = -EINVAL; | 241 | BUG(); |
243 | break; | 242 | break; |
244 | } | 243 | } |
245 | return error; | 244 | return error; |
246 | } | 245 | } |
247 | 246 | ||
247 | static int | ||
248 | madvise_behavior_valid(int behavior) | ||
249 | { | ||
250 | switch (behavior) { | ||
251 | case MADV_DOFORK: | ||
252 | case MADV_DONTFORK: | ||
253 | case MADV_NORMAL: | ||
254 | case MADV_SEQUENTIAL: | ||
255 | case MADV_RANDOM: | ||
256 | case MADV_REMOVE: | ||
257 | case MADV_WILLNEED: | ||
258 | case MADV_DONTNEED: | ||
259 | return 1; | ||
260 | |||
261 | default: | ||
262 | return 0; | ||
263 | } | ||
264 | } | ||
248 | /* | 265 | /* |
249 | * The madvise(2) system call. | 266 | * The madvise(2) system call. |
250 | * | 267 | * |
@@ -290,6 +307,9 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) | |||
290 | int write; | 307 | int write; |
291 | size_t len; | 308 | size_t len; |
292 | 309 | ||
310 | if (!madvise_behavior_valid(behavior)) | ||
311 | return error; | ||
312 | |||
293 | write = madvise_need_mmap_write(behavior); | 313 | write = madvise_need_mmap_write(behavior); |
294 | if (write) | 314 | if (write) |
295 | down_write(¤t->mm->mmap_sem); | 315 | down_write(¤t->mm->mmap_sem); |
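With madvise_behavior_valid() in place, unknown advice values are rejected with EINVAL before mmap_sem is taken, and a value that reaches the default case in madvise_vma() is now a kernel bug. The user-visible behaviour for valid advice is unchanged; a small userspace sketch, illustrative only:

#include <sys/mman.h>
#include <errno.h>
#include <stdio.h>

int main(void)
{
        size_t len = 1 << 20;
        void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED)
                return 1;
        if (madvise(p, len, MADV_SEQUENTIAL))           /* valid advice */
                perror("madvise");
        if (madvise(p, len, 12345) && errno == EINVAL)  /* bogus advice */
                printf("rejected before mmap_sem is taken\n");
        munmap(p, len);
        return 0;
}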
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 01c2d8f14685..70db6e0a5eec 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -314,14 +314,6 @@ static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) | |||
314 | return mem; | 314 | return mem; |
315 | } | 315 | } |
316 | 316 | ||
317 | static bool mem_cgroup_is_obsolete(struct mem_cgroup *mem) | ||
318 | { | ||
319 | if (!mem) | ||
320 | return true; | ||
321 | return css_is_removed(&mem->css); | ||
322 | } | ||
323 | |||
324 | |||
325 | /* | 317 | /* |
326 | * Call callback function against all cgroup under hierarchy tree. | 318 | * Call callback function against all cgroup under hierarchy tree. |
327 | */ | 319 | */ |
@@ -578,6 +570,17 @@ int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg) | |||
578 | return 0; | 570 | return 0; |
579 | } | 571 | } |
580 | 572 | ||
573 | int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg) | ||
574 | { | ||
575 | unsigned long active; | ||
576 | unsigned long inactive; | ||
577 | |||
578 | inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE); | ||
579 | active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE); | ||
580 | |||
581 | return (active > inactive); | ||
582 | } | ||
583 | |||
581 | unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, | 584 | unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, |
582 | struct zone *zone, | 585 | struct zone *zone, |
583 | enum lru_list lru) | 586 | enum lru_list lru) |
@@ -932,7 +935,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
932 | if (unlikely(!mem)) | 935 | if (unlikely(!mem)) |
933 | return 0; | 936 | return 0; |
934 | 937 | ||
935 | VM_BUG_ON(!mem || mem_cgroup_is_obsolete(mem)); | 938 | VM_BUG_ON(css_is_removed(&mem->css)); |
936 | 939 | ||
937 | while (1) { | 940 | while (1) { |
938 | int ret; | 941 | int ret; |
@@ -1488,8 +1491,9 @@ void mem_cgroup_uncharge_cache_page(struct page *page) | |||
1488 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); | 1491 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); |
1489 | } | 1492 | } |
1490 | 1493 | ||
1494 | #ifdef CONFIG_SWAP | ||
1491 | /* | 1495 | /* |
1492 | * called from __delete_from_swap_cache() and drop "page" account. | 1496 | * called after __delete_from_swap_cache() and drop "page" account. |
1493 | * memcg information is recorded to swap_cgroup of "ent" | 1497 | * memcg information is recorded to swap_cgroup of "ent" |
1494 | */ | 1498 | */ |
1495 | void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent) | 1499 | void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent) |
@@ -1506,6 +1510,7 @@ void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent) | |||
1506 | if (memcg) | 1510 | if (memcg) |
1507 | css_put(&memcg->css); | 1511 | css_put(&memcg->css); |
1508 | } | 1512 | } |
1513 | #endif | ||
1509 | 1514 | ||
1510 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 1515 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP |
1511 | /* | 1516 | /* |
diff --git a/mm/memory.c b/mm/memory.c index 4126dd16778c..d5d1653d60a6 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -1360,6 +1360,56 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1360 | return i; | 1360 | return i; |
1361 | } | 1361 | } |
1362 | 1362 | ||
1363 | /** | ||
1364 | * get_user_pages() - pin user pages in memory | ||
1365 | * @tsk: task_struct of target task | ||
1366 | * @mm: mm_struct of target mm | ||
1367 | * @start: starting user address | ||
1368 | * @len: number of pages from start to pin | ||
1369 | * @write: whether pages will be written to by the caller | ||
1370 | * @force: whether to force write access even if user mapping is | ||
1371 | * readonly. This will result in the page being COWed even | ||
1372 | * in MAP_SHARED mappings. You do not want this. | ||
1373 | * @pages: array that receives pointers to the pages pinned. | ||
1374 | * Should be at least nr_pages long. Or NULL, if caller | ||
1375 | * only intends to ensure the pages are faulted in. | ||
1376 | * @vmas: array of pointers to vmas corresponding to each page. | ||
1377 | * Or NULL if the caller does not require them. | ||
1378 | * | ||
1379 | * Returns number of pages pinned. This may be fewer than the number | ||
1380 | * requested. If len is 0 or negative, returns 0. If no pages | ||
1381 | * were pinned, returns -errno. Each page returned must be released | ||
1382 | * with a put_page() call when it is finished with. vmas will only | ||
1383 | * remain valid while mmap_sem is held. | ||
1384 | * | ||
1385 | * Must be called with mmap_sem held for read or write. | ||
1386 | * | ||
1387 | * get_user_pages walks a process's page tables and takes a reference to | ||
1388 | * each struct page that each user address corresponds to at a given | ||
1389 | * instant. That is, it takes the page that would be accessed if a user | ||
1390 | * thread accesses the given user virtual address at that instant. | ||
1391 | * | ||
1392 | * This does not guarantee that the page exists in the user mappings when | ||
1393 | * get_user_pages returns, and there may even be a completely different | ||
1394 | * page there in some cases (eg. if mmapped pagecache has been invalidated | ||
1395 | * and subsequently re-faulted). However it does guarantee that the page | ||
1396 | * won't be freed completely. And mostly callers simply care that the page | ||
1397 | * contains data that was valid *at some point in time*. Typically, an IO | ||
1398 | * or similar operation cannot guarantee anything stronger anyway because | ||
1399 | * locks can't be held over the syscall boundary. | ||
1400 | * | ||
1401 | * If write=0, the page must not be written to. If the page is written to, | ||
1402 | * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called | ||
1403 | * after the page is finished with, and before put_page is called. | ||
1404 | * | ||
1405 | * get_user_pages is typically used for fewer-copy IO operations, to get a | ||
1406 | * handle on the memory by some means other than accesses via the user virtual | ||
1407 | * addresses. The pages may be submitted for DMA to devices or accessed via | ||
1408 | * their kernel linear mapping (via the kmap APIs). Care should be taken to | ||
1409 | * use the correct cache flushing APIs. | ||
1410 | * | ||
1411 | * See also get_user_pages_fast, for performance critical applications. | ||
1412 | */ | ||
1363 | int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | 1413 | int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, |
1364 | unsigned long start, int len, int write, int force, | 1414 | unsigned long start, int len, int write, int force, |
1365 | struct page **pages, struct vm_area_struct **vmas) | 1415 | struct page **pages, struct vm_area_struct **vmas) |
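The kerneldoc above spells out the contract; here is a minimal kernel-side sketch of a typical caller, assuming the rules it states (mmap_sem held for read, set_page_dirty_lock() before put_page() when the pages were written). The function name is illustrative:

#include <linux/mm.h>
#include <linux/sched.h>

static int example_pin_user_buffer(unsigned long uaddr, int nr_pages,
                                   struct page **pages)
{
        int i, got;

        down_read(&current->mm->mmap_sem);
        got = get_user_pages(current, current->mm, uaddr, nr_pages,
                             1 /* write */, 0 /* force */, pages, NULL);
        up_read(&current->mm->mmap_sem);
        if (got <= 0)
                return got ? got : -EFAULT;

        /* ... access pages[0..got-1] via DMA or kmap() here ... */

        for (i = 0; i < got; i++) {
                set_page_dirty_lock(pages[i]);  /* pages were written */
                put_page(pages[i]);
        }
        return got;
}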
@@ -3053,22 +3103,13 @@ int in_gate_area_no_task(unsigned long addr) | |||
3053 | 3103 | ||
3054 | #endif /* __HAVE_ARCH_GATE_AREA */ | 3104 | #endif /* __HAVE_ARCH_GATE_AREA */ |
3055 | 3105 | ||
3056 | #ifdef CONFIG_HAVE_IOREMAP_PROT | 3106 | static int follow_pte(struct mm_struct *mm, unsigned long address, |
3057 | int follow_phys(struct vm_area_struct *vma, | 3107 | pte_t **ptepp, spinlock_t **ptlp) |
3058 | unsigned long address, unsigned int flags, | ||
3059 | unsigned long *prot, resource_size_t *phys) | ||
3060 | { | 3108 | { |
3061 | pgd_t *pgd; | 3109 | pgd_t *pgd; |
3062 | pud_t *pud; | 3110 | pud_t *pud; |
3063 | pmd_t *pmd; | 3111 | pmd_t *pmd; |
3064 | pte_t *ptep, pte; | 3112 | pte_t *ptep; |
3065 | spinlock_t *ptl; | ||
3066 | resource_size_t phys_addr = 0; | ||
3067 | struct mm_struct *mm = vma->vm_mm; | ||
3068 | int ret = -EINVAL; | ||
3069 | |||
3070 | if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) | ||
3071 | goto out; | ||
3072 | 3113 | ||
3073 | pgd = pgd_offset(mm, address); | 3114 | pgd = pgd_offset(mm, address); |
3074 | if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) | 3115 | if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) |
@@ -3086,22 +3127,71 @@ int follow_phys(struct vm_area_struct *vma, | |||
3086 | if (pmd_huge(*pmd)) | 3127 | if (pmd_huge(*pmd)) |
3087 | goto out; | 3128 | goto out; |
3088 | 3129 | ||
3089 | ptep = pte_offset_map_lock(mm, pmd, address, &ptl); | 3130 | ptep = pte_offset_map_lock(mm, pmd, address, ptlp); |
3090 | if (!ptep) | 3131 | if (!ptep) |
3091 | goto out; | 3132 | goto out; |
3133 | if (!pte_present(*ptep)) | ||
3134 | goto unlock; | ||
3135 | *ptepp = ptep; | ||
3136 | return 0; | ||
3137 | unlock: | ||
3138 | pte_unmap_unlock(ptep, *ptlp); | ||
3139 | out: | ||
3140 | return -EINVAL; | ||
3141 | } | ||
3092 | 3142 | ||
3143 | /** | ||
3144 | * follow_pfn - look up PFN at a user virtual address | ||
3145 | * @vma: memory mapping | ||
3146 | * @address: user virtual address | ||
3147 | * @pfn: location to store found PFN | ||
3148 | * | ||
3149 | * Only IO mappings and raw PFN mappings are allowed. | ||
3150 | * | ||
3151 | * Returns zero and the pfn at @pfn on success, -ve otherwise. | ||
3152 | */ | ||
3153 | int follow_pfn(struct vm_area_struct *vma, unsigned long address, | ||
3154 | unsigned long *pfn) | ||
3155 | { | ||
3156 | int ret = -EINVAL; | ||
3157 | spinlock_t *ptl; | ||
3158 | pte_t *ptep; | ||
3159 | |||
3160 | if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) | ||
3161 | return ret; | ||
3162 | |||
3163 | ret = follow_pte(vma->vm_mm, address, &ptep, &ptl); | ||
3164 | if (ret) | ||
3165 | return ret; | ||
3166 | *pfn = pte_pfn(*ptep); | ||
3167 | pte_unmap_unlock(ptep, ptl); | ||
3168 | return 0; | ||
3169 | } | ||
3170 | EXPORT_SYMBOL(follow_pfn); | ||
3171 | |||
3172 | #ifdef CONFIG_HAVE_IOREMAP_PROT | ||
3173 | int follow_phys(struct vm_area_struct *vma, | ||
3174 | unsigned long address, unsigned int flags, | ||
3175 | unsigned long *prot, resource_size_t *phys) | ||
3176 | { | ||
3177 | int ret = -EINVAL; | ||
3178 | pte_t *ptep, pte; | ||
3179 | spinlock_t *ptl; | ||
3180 | |||
3181 | if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) | ||
3182 | goto out; | ||
3183 | |||
3184 | if (follow_pte(vma->vm_mm, address, &ptep, &ptl)) | ||
3185 | goto out; | ||
3093 | pte = *ptep; | 3186 | pte = *ptep; |
3094 | if (!pte_present(pte)) | 3187 | |
3095 | goto unlock; | ||
3096 | if ((flags & FOLL_WRITE) && !pte_write(pte)) | 3188 | if ((flags & FOLL_WRITE) && !pte_write(pte)) |
3097 | goto unlock; | 3189 | goto unlock; |
3098 | phys_addr = pte_pfn(pte); | ||
3099 | phys_addr <<= PAGE_SHIFT; /* Shift here to avoid overflow on PAE */ | ||
3100 | 3190 | ||
3101 | *prot = pgprot_val(pte_pgprot(pte)); | 3191 | *prot = pgprot_val(pte_pgprot(pte)); |
3102 | *phys = phys_addr; | 3192 | *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT; |
3103 | ret = 0; | ||
3104 | 3193 | ||
3194 | ret = 0; | ||
3105 | unlock: | 3195 | unlock: |
3106 | pte_unmap_unlock(ptep, ptl); | 3196 | pte_unmap_unlock(ptep, ptl); |
3107 | out: | 3197 | out: |
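follow_pfn() gives drivers a supported way to translate a user virtual address in a VM_IO/VM_PFNMAP mapping into a PFN without open-coding the page-table walk that follow_phys() used to carry. A hedged sketch of a caller, with illustrative names and mmap_sem held for read:

#include <linux/mm.h>
#include <linux/sched.h>

static int example_lookup_pfn(unsigned long uaddr, unsigned long *pfn)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        int ret = -EINVAL;

        down_read(&mm->mmap_sem);
        vma = find_vma(mm, uaddr);
        if (vma && uaddr >= vma->vm_start)
                ret = follow_pfn(vma, uaddr, pfn); /* rejects non-PFNMAP vmas itself */
        up_read(&mm->mmap_sem);
        return ret;
}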
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c083cf5fd6df..e4412a676c88 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -422,7 +422,8 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) | |||
422 | zone->present_pages += onlined_pages; | 422 | zone->present_pages += onlined_pages; |
423 | zone->zone_pgdat->node_present_pages += onlined_pages; | 423 | zone->zone_pgdat->node_present_pages += onlined_pages; |
424 | 424 | ||
425 | setup_per_zone_pages_min(); | 425 | setup_per_zone_wmarks(); |
426 | calculate_zone_inactive_ratio(zone); | ||
426 | if (onlined_pages) { | 427 | if (onlined_pages) { |
427 | kswapd_run(zone_to_nid(zone)); | 428 | kswapd_run(zone_to_nid(zone)); |
428 | node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); | 429 | node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); |
@@ -832,6 +833,9 @@ repeat: | |||
832 | totalram_pages -= offlined_pages; | 833 | totalram_pages -= offlined_pages; |
833 | num_physpages -= offlined_pages; | 834 | num_physpages -= offlined_pages; |
834 | 835 | ||
836 | setup_per_zone_wmarks(); | ||
837 | calculate_zone_inactive_ratio(zone); | ||
838 | |||
835 | vm_total_pages = nr_free_pagecache_pages(); | 839 | vm_total_pages = nr_free_pagecache_pages(); |
836 | writeback_set_ratelimit(); | 840 | writeback_set_ratelimit(); |
837 | 841 | ||
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 3eb4a6fdc043..e08e2c4da63a 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -182,13 +182,54 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes) | |||
182 | return 0; | 182 | return 0; |
183 | } | 183 | } |
184 | 184 | ||
185 | /* Create a new policy */ | 185 | /* |
186 | * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if | ||
187 | * any, for the new policy. mpol_new() has already validated the nodes | ||
188 | * parameter with respect to the policy mode and flags. But, we need to | ||
189 | * handle an empty nodemask with MPOL_PREFERRED here. | ||
190 | * | ||
191 | * Must be called holding task's alloc_lock to protect task's mems_allowed | ||
192 | * and mempolicy. May also be called holding the mmap_semaphore for write. | ||
193 | */ | ||
194 | static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes) | ||
195 | { | ||
196 | nodemask_t cpuset_context_nmask; | ||
197 | int ret; | ||
198 | |||
199 | /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */ | ||
200 | if (pol == NULL) | ||
201 | return 0; | ||
202 | |||
203 | VM_BUG_ON(!nodes); | ||
204 | if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes)) | ||
205 | nodes = NULL; /* explicit local allocation */ | ||
206 | else { | ||
207 | if (pol->flags & MPOL_F_RELATIVE_NODES) | ||
208 | mpol_relative_nodemask(&cpuset_context_nmask, nodes, | ||
209 | &cpuset_current_mems_allowed); | ||
210 | else | ||
211 | nodes_and(cpuset_context_nmask, *nodes, | ||
212 | cpuset_current_mems_allowed); | ||
213 | if (mpol_store_user_nodemask(pol)) | ||
214 | pol->w.user_nodemask = *nodes; | ||
215 | else | ||
216 | pol->w.cpuset_mems_allowed = | ||
217 | cpuset_current_mems_allowed; | ||
218 | } | ||
219 | |||
220 | ret = mpol_ops[pol->mode].create(pol, | ||
221 | nodes ? &cpuset_context_nmask : NULL); | ||
222 | return ret; | ||
223 | } | ||
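One user-visible case the comment above calls out is MPOL_PREFERRED with an empty nodemask, which mpol_set_nodemask() now treats as explicit local allocation. A userspace sketch, illustrative only (needs libnuma's numaif.h or a raw syscall):

#include <numaif.h>
#include <stdio.h>

int main(void)
{
        /* NULL/empty nodemask with MPOL_PREFERRED means
         * "prefer the node this task is running on" */
        if (set_mempolicy(MPOL_PREFERRED, NULL, 0))
                perror("set_mempolicy");
        return 0;
}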
224 | |||
225 | /* | ||
226 | * This function just creates a new policy, does some check and simple | ||
227 | * initialization. You must invoke mpol_set_nodemask() to set nodes. | ||
228 | */ | ||
186 | static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, | 229 | static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, |
187 | nodemask_t *nodes) | 230 | nodemask_t *nodes) |
188 | { | 231 | { |
189 | struct mempolicy *policy; | 232 | struct mempolicy *policy; |
190 | nodemask_t cpuset_context_nmask; | ||
191 | int ret; | ||
192 | 233 | ||
193 | pr_debug("setting mode %d flags %d nodes[0] %lx\n", | 234 | pr_debug("setting mode %d flags %d nodes[0] %lx\n", |
194 | mode, flags, nodes ? nodes_addr(*nodes)[0] : -1); | 235 | mode, flags, nodes ? nodes_addr(*nodes)[0] : -1); |
@@ -210,7 +251,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, | |||
210 | if (((flags & MPOL_F_STATIC_NODES) || | 251 | if (((flags & MPOL_F_STATIC_NODES) || |
211 | (flags & MPOL_F_RELATIVE_NODES))) | 252 | (flags & MPOL_F_RELATIVE_NODES))) |
212 | return ERR_PTR(-EINVAL); | 253 | return ERR_PTR(-EINVAL); |
213 | nodes = NULL; /* flag local alloc */ | ||
214 | } | 254 | } |
215 | } else if (nodes_empty(*nodes)) | 255 | } else if (nodes_empty(*nodes)) |
216 | return ERR_PTR(-EINVAL); | 256 | return ERR_PTR(-EINVAL); |
@@ -221,30 +261,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, | |||
221 | policy->mode = mode; | 261 | policy->mode = mode; |
222 | policy->flags = flags; | 262 | policy->flags = flags; |
223 | 263 | ||
224 | if (nodes) { | ||
225 | /* | ||
226 | * cpuset related setup doesn't apply to local allocation | ||
227 | */ | ||
228 | cpuset_update_task_memory_state(); | ||
229 | if (flags & MPOL_F_RELATIVE_NODES) | ||
230 | mpol_relative_nodemask(&cpuset_context_nmask, nodes, | ||
231 | &cpuset_current_mems_allowed); | ||
232 | else | ||
233 | nodes_and(cpuset_context_nmask, *nodes, | ||
234 | cpuset_current_mems_allowed); | ||
235 | if (mpol_store_user_nodemask(policy)) | ||
236 | policy->w.user_nodemask = *nodes; | ||
237 | else | ||
238 | policy->w.cpuset_mems_allowed = | ||
239 | cpuset_mems_allowed(current); | ||
240 | } | ||
241 | |||
242 | ret = mpol_ops[mode].create(policy, | ||
243 | nodes ? &cpuset_context_nmask : NULL); | ||
244 | if (ret < 0) { | ||
245 | kmem_cache_free(policy_cache, policy); | ||
246 | return ERR_PTR(ret); | ||
247 | } | ||
248 | return policy; | 264 | return policy; |
249 | } | 265 | } |
250 | 266 | ||
@@ -324,6 +340,8 @@ static void mpol_rebind_policy(struct mempolicy *pol, | |||
324 | /* | 340 | /* |
325 | * Wrapper for mpol_rebind_policy() that just requires task | 341 | * Wrapper for mpol_rebind_policy() that just requires task |
326 | * pointer, and updates task mempolicy. | 342 | * pointer, and updates task mempolicy. |
343 | * | ||
344 | * Called with task's alloc_lock held. | ||
327 | */ | 345 | */ |
328 | 346 | ||
329 | void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) | 347 | void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) |
@@ -600,8 +618,9 @@ static void mpol_set_task_struct_flag(void) | |||
600 | static long do_set_mempolicy(unsigned short mode, unsigned short flags, | 618 | static long do_set_mempolicy(unsigned short mode, unsigned short flags, |
601 | nodemask_t *nodes) | 619 | nodemask_t *nodes) |
602 | { | 620 | { |
603 | struct mempolicy *new; | 621 | struct mempolicy *new, *old; |
604 | struct mm_struct *mm = current->mm; | 622 | struct mm_struct *mm = current->mm; |
623 | int ret; | ||
605 | 624 | ||
606 | new = mpol_new(mode, flags, nodes); | 625 | new = mpol_new(mode, flags, nodes); |
607 | if (IS_ERR(new)) | 626 | if (IS_ERR(new)) |
@@ -615,20 +634,33 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, | |||
615 | */ | 634 | */ |
616 | if (mm) | 635 | if (mm) |
617 | down_write(&mm->mmap_sem); | 636 | down_write(&mm->mmap_sem); |
618 | mpol_put(current->mempolicy); | 637 | task_lock(current); |
638 | ret = mpol_set_nodemask(new, nodes); | ||
639 | if (ret) { | ||
640 | task_unlock(current); | ||
641 | if (mm) | ||
642 | up_write(&mm->mmap_sem); | ||
643 | mpol_put(new); | ||
644 | return ret; | ||
645 | } | ||
646 | old = current->mempolicy; | ||
619 | current->mempolicy = new; | 647 | current->mempolicy = new; |
620 | mpol_set_task_struct_flag(); | 648 | mpol_set_task_struct_flag(); |
621 | if (new && new->mode == MPOL_INTERLEAVE && | 649 | if (new && new->mode == MPOL_INTERLEAVE && |
622 | nodes_weight(new->v.nodes)) | 650 | nodes_weight(new->v.nodes)) |
623 | current->il_next = first_node(new->v.nodes); | 651 | current->il_next = first_node(new->v.nodes); |
652 | task_unlock(current); | ||
624 | if (mm) | 653 | if (mm) |
625 | up_write(&mm->mmap_sem); | 654 | up_write(&mm->mmap_sem); |
626 | 655 | ||
656 | mpol_put(old); | ||
627 | return 0; | 657 | return 0; |
628 | } | 658 | } |
629 | 659 | ||
630 | /* | 660 | /* |
631 | * Return nodemask for policy for get_mempolicy() query | 661 | * Return nodemask for policy for get_mempolicy() query |
662 | * | ||
663 | * Called with task's alloc_lock held | ||
632 | */ | 664 | */ |
633 | static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) | 665 | static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) |
634 | { | 666 | { |
@@ -674,7 +706,6 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, | |||
674 | struct vm_area_struct *vma = NULL; | 706 | struct vm_area_struct *vma = NULL; |
675 | struct mempolicy *pol = current->mempolicy; | 707 | struct mempolicy *pol = current->mempolicy; |
676 | 708 | ||
677 | cpuset_update_task_memory_state(); | ||
678 | if (flags & | 709 | if (flags & |
679 | ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) | 710 | ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) |
680 | return -EINVAL; | 711 | return -EINVAL; |
@@ -683,7 +714,9 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, | |||
683 | if (flags & (MPOL_F_NODE|MPOL_F_ADDR)) | 714 | if (flags & (MPOL_F_NODE|MPOL_F_ADDR)) |
684 | return -EINVAL; | 715 | return -EINVAL; |
685 | *policy = 0; /* just so it's initialized */ | 716 | *policy = 0; /* just so it's initialized */ |
717 | task_lock(current); | ||
686 | *nmask = cpuset_current_mems_allowed; | 718 | *nmask = cpuset_current_mems_allowed; |
719 | task_unlock(current); | ||
687 | return 0; | 720 | return 0; |
688 | } | 721 | } |
689 | 722 | ||
@@ -738,8 +771,11 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, | |||
738 | } | 771 | } |
739 | 772 | ||
740 | err = 0; | 773 | err = 0; |
741 | if (nmask) | 774 | if (nmask) { |
775 | task_lock(current); | ||
742 | get_policy_nodemask(pol, nmask); | 776 | get_policy_nodemask(pol, nmask); |
777 | task_unlock(current); | ||
778 | } | ||
743 | 779 | ||
744 | out: | 780 | out: |
745 | mpol_cond_put(pol); | 781 | mpol_cond_put(pol); |
@@ -767,7 +803,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, | |||
767 | 803 | ||
768 | static struct page *new_node_page(struct page *page, unsigned long node, int **x) | 804 | static struct page *new_node_page(struct page *page, unsigned long node, int **x) |
769 | { | 805 | { |
770 | return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0); | 806 | return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0); |
771 | } | 807 | } |
772 | 808 | ||
773 | /* | 809 | /* |
@@ -979,6 +1015,14 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
979 | return err; | 1015 | return err; |
980 | } | 1016 | } |
981 | down_write(&mm->mmap_sem); | 1017 | down_write(&mm->mmap_sem); |
1018 | task_lock(current); | ||
1019 | err = mpol_set_nodemask(new, nmask); | ||
1020 | task_unlock(current); | ||
1021 | if (err) { | ||
1022 | up_write(&mm->mmap_sem); | ||
1023 | mpol_put(new); | ||
1024 | return err; | ||
1025 | } | ||
982 | vma = check_range(mm, start, end, nmask, | 1026 | vma = check_range(mm, start, end, nmask, |
983 | flags | MPOL_MF_INVERT, &pagelist); | 1027 | flags | MPOL_MF_INVERT, &pagelist); |
984 | 1028 | ||
@@ -1545,8 +1589,6 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) | |||
1545 | struct mempolicy *pol = get_vma_policy(current, vma, addr); | 1589 | struct mempolicy *pol = get_vma_policy(current, vma, addr); |
1546 | struct zonelist *zl; | 1590 | struct zonelist *zl; |
1547 | 1591 | ||
1548 | cpuset_update_task_memory_state(); | ||
1549 | |||
1550 | if (unlikely(pol->mode == MPOL_INTERLEAVE)) { | 1592 | if (unlikely(pol->mode == MPOL_INTERLEAVE)) { |
1551 | unsigned nid; | 1593 | unsigned nid; |
1552 | 1594 | ||
@@ -1593,8 +1635,6 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) | |||
1593 | { | 1635 | { |
1594 | struct mempolicy *pol = current->mempolicy; | 1636 | struct mempolicy *pol = current->mempolicy; |
1595 | 1637 | ||
1596 | if ((gfp & __GFP_WAIT) && !in_interrupt()) | ||
1597 | cpuset_update_task_memory_state(); | ||
1598 | if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) | 1638 | if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) |
1599 | pol = &default_policy; | 1639 | pol = &default_policy; |
1600 | 1640 | ||
@@ -1854,6 +1894,8 @@ restart: | |||
1854 | */ | 1894 | */ |
1855 | void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) | 1895 | void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) |
1856 | { | 1896 | { |
1897 | int ret; | ||
1898 | |||
1857 | sp->root = RB_ROOT; /* empty tree == default mempolicy */ | 1899 | sp->root = RB_ROOT; /* empty tree == default mempolicy */ |
1858 | spin_lock_init(&sp->lock); | 1900 | spin_lock_init(&sp->lock); |
1859 | 1901 | ||
@@ -1863,9 +1905,19 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) | |||
1863 | 1905 | ||
1864 | /* contextualize the tmpfs mount point mempolicy */ | 1906 | /* contextualize the tmpfs mount point mempolicy */ |
1865 | new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); | 1907 | new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); |
1866 | mpol_put(mpol); /* drop our ref on sb mpol */ | 1908 | if (IS_ERR(new)) { |
1867 | if (IS_ERR(new)) | 1909 | mpol_put(mpol); /* drop our ref on sb mpol */ |
1868 | return; /* no valid nodemask intersection */ | 1910 | return; /* no valid nodemask intersection */ |
1911 | } | ||
1912 | |||
1913 | task_lock(current); | ||
1914 | ret = mpol_set_nodemask(new, &mpol->w.user_nodemask); | ||
1915 | task_unlock(current); | ||
1916 | mpol_put(mpol); /* drop our ref on sb mpol */ | ||
1917 | if (ret) { | ||
1918 | mpol_put(new); | ||
1919 | return; | ||
1920 | } | ||
1869 | 1921 | ||
1870 | /* Create pseudo-vma that contains just the policy */ | 1922 | /* Create pseudo-vma that contains just the policy */ |
1871 | memset(&pvma, 0, sizeof(struct vm_area_struct)); | 1923 | memset(&pvma, 0, sizeof(struct vm_area_struct)); |
@@ -2086,8 +2138,19 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) | |||
2086 | new = mpol_new(mode, mode_flags, &nodes); | 2138 | new = mpol_new(mode, mode_flags, &nodes); |
2087 | if (IS_ERR(new)) | 2139 | if (IS_ERR(new)) |
2088 | err = 1; | 2140 | err = 1; |
2089 | else if (no_context) | 2141 | else { |
2090 | new->w.user_nodemask = nodes; /* save for contextualization */ | 2142 | int ret; |
2143 | |||
2144 | task_lock(current); | ||
2145 | ret = mpol_set_nodemask(new, &nodes); | ||
2146 | task_unlock(current); | ||
2147 | if (ret) | ||
2148 | err = 1; | ||
2149 | else if (no_context) { | ||
2150 | /* save for contextualization */ | ||
2151 | new->w.user_nodemask = nodes; | ||
2152 | } | ||
2153 | } | ||
2091 | 2154 | ||
2092 | out: | 2155 | out: |
2093 | /* Restore string for error message */ | 2156 | /* Restore string for error message */ |
diff --git a/mm/migrate.c b/mm/migrate.c index 068655d8f883..939888f9ddab 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -802,7 +802,7 @@ static struct page *new_page_node(struct page *p, unsigned long private, | |||
802 | 802 | ||
803 | *result = &pm->status; | 803 | *result = &pm->status; |
804 | 804 | ||
805 | return alloc_pages_node(pm->node, | 805 | return alloc_pages_exact_node(pm->node, |
806 | GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0); | 806 | GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0); |
807 | } | 807 | } |
808 | 808 | ||
@@ -820,7 +820,6 @@ static int do_move_page_to_node_array(struct mm_struct *mm, | |||
820 | struct page_to_node *pp; | 820 | struct page_to_node *pp; |
821 | LIST_HEAD(pagelist); | 821 | LIST_HEAD(pagelist); |
822 | 822 | ||
823 | migrate_prep(); | ||
824 | down_read(&mm->mmap_sem); | 823 | down_read(&mm->mmap_sem); |
825 | 824 | ||
826 | /* | 825 | /* |
@@ -907,6 +906,9 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task, | |||
907 | pm = (struct page_to_node *)__get_free_page(GFP_KERNEL); | 906 | pm = (struct page_to_node *)__get_free_page(GFP_KERNEL); |
908 | if (!pm) | 907 | if (!pm) |
909 | goto out; | 908 | goto out; |
909 | |||
910 | migrate_prep(); | ||
911 | |||
910 | /* | 912 | /* |
911 | * Store a chunk of page_to_node array in a page, | 913 | * Store a chunk of page_to_node array in a page, |
912 | * but keep the last one as a marker | 914 | * but keep the last one as a marker |
diff --git a/mm/mlock.c b/mm/mlock.c index cbe9e0581b75..45eb650b9654 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
@@ -31,7 +31,6 @@ int can_do_mlock(void) | |||
31 | } | 31 | } |
32 | EXPORT_SYMBOL(can_do_mlock); | 32 | EXPORT_SYMBOL(can_do_mlock); |
33 | 33 | ||
34 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
35 | /* | 34 | /* |
36 | * Mlocked pages are marked with PageMlocked() flag for efficient testing | 35 | * Mlocked pages are marked with PageMlocked() flag for efficient testing |
37 | * in vmscan and, possibly, the fault path; and to support semi-accurate | 36 | * in vmscan and, possibly, the fault path; and to support semi-accurate |
@@ -261,27 +260,6 @@ static int __mlock_posix_error_return(long retval) | |||
261 | return retval; | 260 | return retval; |
262 | } | 261 | } |
263 | 262 | ||
264 | #else /* CONFIG_UNEVICTABLE_LRU */ | ||
265 | |||
266 | /* | ||
267 | * Just make pages present if VM_LOCKED. No-op if unlocking. | ||
268 | */ | ||
269 | static long __mlock_vma_pages_range(struct vm_area_struct *vma, | ||
270 | unsigned long start, unsigned long end, | ||
271 | int mlock) | ||
272 | { | ||
273 | if (mlock && (vma->vm_flags & VM_LOCKED)) | ||
274 | return make_pages_present(start, end); | ||
275 | return 0; | ||
276 | } | ||
277 | |||
278 | static inline int __mlock_posix_error_return(long retval) | ||
279 | { | ||
280 | return 0; | ||
281 | } | ||
282 | |||
283 | #endif /* CONFIG_UNEVICTABLE_LRU */ | ||
284 | |||
285 | /** | 263 | /** |
286 | * mlock_vma_pages_range() - mlock pages in specified vma range. | 264 | * mlock_vma_pages_range() - mlock pages in specified vma range. |
287 | * @vma - the vma containing the specified address range | 265 | * @vma - the vma containing the specified address range |
@@ -629,52 +607,43 @@ void user_shm_unlock(size_t size, struct user_struct *user) | |||
629 | free_uid(user); | 607 | free_uid(user); |
630 | } | 608 | } |
631 | 609 | ||
632 | void *alloc_locked_buffer(size_t size) | 610 | int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim, |
611 | size_t size) | ||
633 | { | 612 | { |
634 | unsigned long rlim, vm, pgsz; | 613 | unsigned long lim, vm, pgsz; |
635 | void *buffer = NULL; | 614 | int error = -ENOMEM; |
636 | 615 | ||
637 | pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; | 616 | pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; |
638 | 617 | ||
639 | down_write(¤t->mm->mmap_sem); | 618 | down_write(&mm->mmap_sem); |
640 | 619 | ||
641 | rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; | 620 | lim = rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; |
642 | vm = current->mm->total_vm + pgsz; | 621 | vm = mm->total_vm + pgsz; |
643 | if (rlim < vm) | 622 | if (lim < vm) |
644 | goto out; | 623 | goto out; |
645 | 624 | ||
646 | rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; | 625 | lim = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; |
647 | vm = current->mm->locked_vm + pgsz; | 626 | vm = mm->locked_vm + pgsz; |
648 | if (rlim < vm) | 627 | if (lim < vm) |
649 | goto out; | 628 | goto out; |
650 | 629 | ||
651 | buffer = kzalloc(size, GFP_KERNEL); | 630 | mm->total_vm += pgsz; |
652 | if (!buffer) | 631 | mm->locked_vm += pgsz; |
653 | goto out; | ||
654 | |||
655 | current->mm->total_vm += pgsz; | ||
656 | current->mm->locked_vm += pgsz; | ||
657 | 632 | ||
633 | error = 0; | ||
658 | out: | 634 | out: |
659 | up_write(¤t->mm->mmap_sem); | 635 | up_write(&mm->mmap_sem); |
660 | return buffer; | 636 | return error; |
661 | } | 637 | } |
662 | 638 | ||
663 | void release_locked_buffer(void *buffer, size_t size) | 639 | void refund_locked_memory(struct mm_struct *mm, size_t size) |
664 | { | 640 | { |
665 | unsigned long pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; | 641 | unsigned long pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; |
666 | 642 | ||
667 | down_write(¤t->mm->mmap_sem); | 643 | down_write(&mm->mmap_sem); |
668 | |||
669 | current->mm->total_vm -= pgsz; | ||
670 | current->mm->locked_vm -= pgsz; | ||
671 | |||
672 | up_write(¤t->mm->mmap_sem); | ||
673 | } | ||
674 | 644 | ||
675 | void free_locked_buffer(void *buffer, size_t size) | 645 | mm->total_vm -= pgsz; |
676 | { | 646 | mm->locked_vm -= pgsz; |
677 | release_locked_buffer(buffer, size); | ||
678 | 647 | ||
679 | kfree(buffer); | 648 | up_write(&mm->mmap_sem); |
680 | } | 649 | } |
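alloc_locked_buffer()/free_locked_buffer() are replaced by pure accounting helpers, so the caller now owns the allocation and pairs it with an account/refund against its own mm and rlimits. A hedged sketch of the new calling pattern, with illustrative function names:

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/slab.h>

static void *example_alloc_accounted(size_t size)
{
        void *buf = kzalloc(size, GFP_KERNEL);

        if (!buf)
                return NULL;
        if (account_locked_memory(current->mm, current->signal->rlim, size)) {
                kfree(buf);             /* over RLIMIT_AS or RLIMIT_MEMLOCK */
                return NULL;
        }
        return buf;
}

static void example_free_accounted(void *buf, size_t size)
{
        refund_locked_memory(current->mm, size);
        kfree(buf);
}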
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
@@ -28,6 +28,7 @@ | |||
28 | #include <linux/mempolicy.h> | 28 | #include <linux/mempolicy.h> |
29 | #include <linux/rmap.h> | 29 | #include <linux/rmap.h> |
30 | #include <linux/mmu_notifier.h> | 30 | #include <linux/mmu_notifier.h> |
31 | #include <linux/perf_counter.h> | ||
31 | 32 | ||
32 | #include <asm/uaccess.h> | 33 | #include <asm/uaccess.h> |
33 | #include <asm/cacheflush.h> | 34 | #include <asm/cacheflush.h> |
@@ -87,6 +88,9 @@ int sysctl_overcommit_ratio = 50; /* default is 50% */ | |||
87 | int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; | 88 | int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; |
88 | struct percpu_counter vm_committed_as; | 89 | struct percpu_counter vm_committed_as; |
89 | 90 | ||
91 | /* amount of vm to protect from userspace access */ | ||
92 | unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR; | ||
93 | |||
90 | /* | 94 | /* |
91 | * Check that a process has enough memory to allocate a new virtual | 95 | * Check that a process has enough memory to allocate a new virtual |
92 | * mapping. 0 means there is enough memory for the allocation to | 96 | * mapping. 0 means there is enough memory for the allocation to |
@@ -1219,6 +1223,8 @@ munmap_back: | |||
1219 | if (correct_wcount) | 1223 | if (correct_wcount) |
1220 | atomic_inc(&inode->i_writecount); | 1224 | atomic_inc(&inode->i_writecount); |
1221 | out: | 1225 | out: |
1226 | perf_counter_mmap(vma); | ||
1227 | |||
1222 | mm->total_vm += len >> PAGE_SHIFT; | 1228 | mm->total_vm += len >> PAGE_SHIFT; |
1223 | vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); | 1229 | vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); |
1224 | if (vm_flags & VM_LOCKED) { | 1230 | if (vm_flags & VM_LOCKED) { |
@@ -2305,6 +2311,8 @@ int install_special_mapping(struct mm_struct *mm, | |||
2305 | 2311 | ||
2306 | mm->total_vm += len >> PAGE_SHIFT; | 2312 | mm->total_vm += len >> PAGE_SHIFT; |
2307 | 2313 | ||
2314 | perf_counter_mmap(vma); | ||
2315 | |||
2308 | return 0; | 2316 | return 0; |
2309 | } | 2317 | } |
2310 | 2318 | ||
diff --git a/mm/mprotect.c b/mm/mprotect.c index 258197b76fb4..d80311baeb2d 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
@@ -23,6 +23,7 @@ | |||
23 | #include <linux/swapops.h> | 23 | #include <linux/swapops.h> |
24 | #include <linux/mmu_notifier.h> | 24 | #include <linux/mmu_notifier.h> |
25 | #include <linux/migrate.h> | 25 | #include <linux/migrate.h> |
26 | #include <linux/perf_counter.h> | ||
26 | #include <asm/uaccess.h> | 27 | #include <asm/uaccess.h> |
27 | #include <asm/pgtable.h> | 28 | #include <asm/pgtable.h> |
28 | #include <asm/cacheflush.h> | 29 | #include <asm/cacheflush.h> |
@@ -299,6 +300,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, | |||
299 | error = mprotect_fixup(vma, &prev, nstart, tmp, newflags); | 300 | error = mprotect_fixup(vma, &prev, nstart, tmp, newflags); |
300 | if (error) | 301 | if (error) |
301 | goto out; | 302 | goto out; |
303 | perf_counter_mmap(vma); | ||
302 | nstart = tmp; | 304 | nstart = tmp; |
303 | 305 | ||
304 | if (nstart < prev->vm_end) | 306 | if (nstart < prev->vm_end) |
diff --git a/mm/nommu.c b/mm/nommu.c index b571ef707428..2fd2ad5da98e 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -69,6 +69,9 @@ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; | |||
69 | int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; | 69 | int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; |
70 | int heap_stack_gap = 0; | 70 | int heap_stack_gap = 0; |
71 | 71 | ||
72 | /* amount of vm to protect from userspace access */ | ||
73 | unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR; | ||
74 | |||
72 | atomic_long_t mmap_pages_allocated; | 75 | atomic_long_t mmap_pages_allocated; |
73 | 76 | ||
74 | EXPORT_SYMBOL(mem_map); | 77 | EXPORT_SYMBOL(mem_map); |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 92bcf1db16b2..175a67a78a99 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -58,6 +58,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) | |||
58 | unsigned long points, cpu_time, run_time; | 58 | unsigned long points, cpu_time, run_time; |
59 | struct mm_struct *mm; | 59 | struct mm_struct *mm; |
60 | struct task_struct *child; | 60 | struct task_struct *child; |
61 | int oom_adj; | ||
61 | 62 | ||
62 | task_lock(p); | 63 | task_lock(p); |
63 | mm = p->mm; | 64 | mm = p->mm; |
@@ -65,6 +66,11 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) | |||
65 | task_unlock(p); | 66 | task_unlock(p); |
66 | return 0; | 67 | return 0; |
67 | } | 68 | } |
69 | oom_adj = mm->oom_adj; | ||
70 | if (oom_adj == OOM_DISABLE) { | ||
71 | task_unlock(p); | ||
72 | return 0; | ||
73 | } | ||
68 | 74 | ||
69 | /* | 75 | /* |
70 | * The memory size of the process is the basis for the badness. | 76 | * The memory size of the process is the basis for the badness. |
@@ -148,15 +154,15 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) | |||
148 | points /= 8; | 154 | points /= 8; |
149 | 155 | ||
150 | /* | 156 | /* |
151 | * Adjust the score by oomkilladj. | 157 | * Adjust the score by oom_adj. |
152 | */ | 158 | */ |
153 | if (p->oomkilladj) { | 159 | if (oom_adj) { |
154 | if (p->oomkilladj > 0) { | 160 | if (oom_adj > 0) { |
155 | if (!points) | 161 | if (!points) |
156 | points = 1; | 162 | points = 1; |
157 | points <<= p->oomkilladj; | 163 | points <<= oom_adj; |
158 | } else | 164 | } else |
159 | points >>= -(p->oomkilladj); | 165 | points >>= -(oom_adj); |
160 | } | 166 | } |
161 | 167 | ||
162 | #ifdef DEBUG | 168 | #ifdef DEBUG |
@@ -251,11 +257,8 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, | |||
251 | *ppoints = ULONG_MAX; | 257 | *ppoints = ULONG_MAX; |
252 | } | 258 | } |
253 | 259 | ||
254 | if (p->oomkilladj == OOM_DISABLE) | ||
255 | continue; | ||
256 | |||
257 | points = badness(p, uptime.tv_sec); | 260 | points = badness(p, uptime.tv_sec); |
258 | if (points > *ppoints || !chosen) { | 261 | if (points > *ppoints) { |
259 | chosen = p; | 262 | chosen = p; |
260 | *ppoints = points; | 263 | *ppoints = points; |
261 | } | 264 | } |
@@ -284,22 +287,27 @@ static void dump_tasks(const struct mem_cgroup *mem) | |||
284 | printk(KERN_INFO "[ pid ] uid tgid total_vm rss cpu oom_adj " | 287 | printk(KERN_INFO "[ pid ] uid tgid total_vm rss cpu oom_adj " |
285 | "name\n"); | 288 | "name\n"); |
286 | do_each_thread(g, p) { | 289 | do_each_thread(g, p) { |
287 | /* | 290 | struct mm_struct *mm; |
288 | * total_vm and rss sizes do not exist for tasks with a | 291 | |
289 | * detached mm so there's no need to report them. | ||
290 | */ | ||
291 | if (!p->mm) | ||
292 | continue; | ||
293 | if (mem && !task_in_mem_cgroup(p, mem)) | 292 | if (mem && !task_in_mem_cgroup(p, mem)) |
294 | continue; | 293 | continue; |
295 | if (!thread_group_leader(p)) | 294 | if (!thread_group_leader(p)) |
296 | continue; | 295 | continue; |
297 | 296 | ||
298 | task_lock(p); | 297 | task_lock(p); |
298 | mm = p->mm; | ||
299 | if (!mm) { | ||
300 | /* | ||
301 | * total_vm and rss sizes do not exist for tasks with no | ||
302 | * mm so there's no need to report them; they can't be | ||
303 | * oom killed anyway. | ||
304 | */ | ||
305 | task_unlock(p); | ||
306 | continue; | ||
307 | } | ||
299 | printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", | 308 | printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", |
300 | p->pid, __task_cred(p)->uid, p->tgid, | 309 | p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, |
301 | p->mm->total_vm, get_mm_rss(p->mm), (int)task_cpu(p), | 310 | get_mm_rss(mm), (int)task_cpu(p), mm->oom_adj, p->comm); |
302 | p->oomkilladj, p->comm); | ||
303 | task_unlock(p); | 311 | task_unlock(p); |
304 | } while_each_thread(g, p); | 312 | } while_each_thread(g, p); |
305 | } | 313 | } |
@@ -317,11 +325,8 @@ static void __oom_kill_task(struct task_struct *p, int verbose) | |||
317 | return; | 325 | return; |
318 | } | 326 | } |
319 | 327 | ||
320 | if (!p->mm) { | 328 | if (!p->mm) |
321 | WARN_ON(1); | ||
322 | printk(KERN_WARNING "tried to kill an mm-less task!\n"); | ||
323 | return; | 329 | return; |
324 | } | ||
325 | 330 | ||
326 | if (verbose) | 331 | if (verbose) |
327 | printk(KERN_ERR "Killed process %d (%s)\n", | 332 | printk(KERN_ERR "Killed process %d (%s)\n", |
@@ -343,28 +348,13 @@ static int oom_kill_task(struct task_struct *p) | |||
343 | struct mm_struct *mm; | 348 | struct mm_struct *mm; |
344 | struct task_struct *g, *q; | 349 | struct task_struct *g, *q; |
345 | 350 | ||
351 | task_lock(p); | ||
346 | mm = p->mm; | 352 | mm = p->mm; |
347 | 353 | if (!mm || mm->oom_adj == OOM_DISABLE) { | |
348 | /* WARNING: mm may not be dereferenced since we did not obtain its | 354 | task_unlock(p); |
349 | * value from get_task_mm(p). This is OK since all we need to do is | ||
350 | * compare mm to q->mm below. | ||
351 | * | ||
352 | * Furthermore, even if mm contains a non-NULL value, p->mm may | ||
353 | * change to NULL at any time since we do not hold task_lock(p). | ||
354 | * However, this is of no concern to us. | ||
355 | */ | ||
356 | |||
357 | if (mm == NULL) | ||
358 | return 1; | 355 | return 1; |
359 | 356 | } | |
360 | /* | 357 | task_unlock(p); |
361 | * Don't kill the process if any threads are set to OOM_DISABLE | ||
362 | */ | ||
363 | do_each_thread(g, q) { | ||
364 | if (q->mm == mm && q->oomkilladj == OOM_DISABLE) | ||
365 | return 1; | ||
366 | } while_each_thread(g, q); | ||
367 | |||
368 | __oom_kill_task(p, 1); | 358 | __oom_kill_task(p, 1); |
369 | 359 | ||
370 | /* | 360 | /* |
@@ -387,10 +377,11 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
387 | struct task_struct *c; | 377 | struct task_struct *c; |
388 | 378 | ||
389 | if (printk_ratelimit()) { | 379 | if (printk_ratelimit()) { |
390 | printk(KERN_WARNING "%s invoked oom-killer: " | ||
391 | "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", | ||
392 | current->comm, gfp_mask, order, current->oomkilladj); | ||
393 | task_lock(current); | 380 | task_lock(current); |
381 | printk(KERN_WARNING "%s invoked oom-killer: " | ||
382 | "gfp_mask=0x%x, order=%d, oom_adj=%d\n", | ||
383 | current->comm, gfp_mask, order, | ||
384 | current->mm ? current->mm->oom_adj : OOM_DISABLE); | ||
394 | cpuset_print_task_mems_allowed(current); | 385 | cpuset_print_task_mems_allowed(current); |
395 | task_unlock(current); | 386 | task_unlock(current); |
396 | dump_stack(); | 387 | dump_stack(); |
@@ -403,8 +394,9 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
403 | /* | 394 | /* |
404 | * If the task is already exiting, don't alarm the sysadmin or kill | 395 | * If the task is already exiting, don't alarm the sysadmin or kill |
405 | * its children or threads, just set TIF_MEMDIE so it can die quickly | 396 | * its children or threads, just set TIF_MEMDIE so it can die quickly |
397 | * if its mm is still attached. | ||
406 | */ | 398 | */ |
407 | if (p->flags & PF_EXITING) { | 399 | if (p->mm && (p->flags & PF_EXITING)) { |
408 | __oom_kill_task(p, 0); | 400 | __oom_kill_task(p, 0); |
409 | return 0; | 401 | return 0; |
410 | } | 402 | } |
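oom_adj now lives in mm_struct rather than task_struct, so it is shared by every task using that mm and consulted once in badness(). Tuning it from userspace is unchanged; an illustrative sketch:

#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/proc/self/oom_adj", "w");

        if (!f) {
                perror("fopen");
                return 1;
        }
        fprintf(f, "-17\n");    /* -17 == OOM_DISABLE */
        fclose(f);
        return 0;
}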
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index bb553c3e955d..7b0dcea4935b 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -265,18 +265,19 @@ static void bdi_writeout_fraction(struct backing_dev_info *bdi, | |||
265 | * This avoids exceeding the total dirty_limit when the floating averages | 265 | * This avoids exceeding the total dirty_limit when the floating averages |
266 | * fluctuate too quickly. | 266 | * fluctuate too quickly. |
267 | */ | 267 | */ |
268 | static void | 268 | static void clip_bdi_dirty_limit(struct backing_dev_info *bdi, |
269 | clip_bdi_dirty_limit(struct backing_dev_info *bdi, long dirty, long *pbdi_dirty) | 269 | unsigned long dirty, unsigned long *pbdi_dirty) |
270 | { | 270 | { |
271 | long avail_dirty; | 271 | unsigned long avail_dirty; |
272 | 272 | ||
273 | avail_dirty = dirty - | 273 | avail_dirty = global_page_state(NR_FILE_DIRTY) + |
274 | (global_page_state(NR_FILE_DIRTY) + | ||
275 | global_page_state(NR_WRITEBACK) + | 274 | global_page_state(NR_WRITEBACK) + |
276 | global_page_state(NR_UNSTABLE_NFS) + | 275 | global_page_state(NR_UNSTABLE_NFS) + |
277 | global_page_state(NR_WRITEBACK_TEMP)); | 276 | global_page_state(NR_WRITEBACK_TEMP); |
278 | 277 | ||
279 | if (avail_dirty < 0) | 278 | if (avail_dirty < dirty) |
279 | avail_dirty = dirty - avail_dirty; | ||
280 | else | ||
280 | avail_dirty = 0; | 281 | avail_dirty = 0; |
281 | 282 | ||
282 | avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) + | 283 | avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) + |
@@ -299,10 +300,10 @@ static inline void task_dirties_fraction(struct task_struct *tsk, | |||
299 | * | 300 | * |
300 | * dirty -= (dirty/8) * p_{t} | 301 | * dirty -= (dirty/8) * p_{t} |
301 | */ | 302 | */ |
302 | static void task_dirty_limit(struct task_struct *tsk, long *pdirty) | 303 | static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty) |
303 | { | 304 | { |
304 | long numerator, denominator; | 305 | long numerator, denominator; |
305 | long dirty = *pdirty; | 306 | unsigned long dirty = *pdirty; |
306 | u64 inv = dirty >> 3; | 307 | u64 inv = dirty >> 3; |
307 | 308 | ||
308 | task_dirties_fraction(tsk, &numerator, &denominator); | 309 | task_dirties_fraction(tsk, &numerator, &denominator); |
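The switch from long to unsigned long is why the clamp in clip_bdi_dirty_limit() had to be restructured: with unsigned arithmetic the old subtract-then-test-for-negative pattern can never trigger, so the comparison now happens before the subtraction. A standalone illustration of the same idiom (plain userspace C, not kernel code):

#include <stdio.h>

static unsigned long clamp_sub(unsigned long dirty, unsigned long used)
{
        /* old style: compute dirty - used, then test "< 0"; that test is
         * always false for unsigned values, so compare first instead */
        return (used < dirty) ? dirty - used : 0;
}

int main(void)
{
        printf("%lu\n", clamp_sub(100, 30));    /* 70 */
        printf("%lu\n", clamp_sub(100, 300));   /* 0, no wraparound */
        return 0;
}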
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fe753ecf2aa5..a5f3c278c573 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -23,6 +23,7 @@ | |||
23 | #include <linux/bootmem.h> | 23 | #include <linux/bootmem.h> |
24 | #include <linux/compiler.h> | 24 | #include <linux/compiler.h> |
25 | #include <linux/kernel.h> | 25 | #include <linux/kernel.h> |
26 | #include <linux/kmemcheck.h> | ||
26 | #include <linux/module.h> | 27 | #include <linux/module.h> |
27 | #include <linux/suspend.h> | 28 | #include <linux/suspend.h> |
28 | #include <linux/pagevec.h> | 29 | #include <linux/pagevec.h> |
@@ -46,6 +47,7 @@ | |||
46 | #include <linux/page-isolation.h> | 47 | #include <linux/page-isolation.h> |
47 | #include <linux/page_cgroup.h> | 48 | #include <linux/page_cgroup.h> |
48 | #include <linux/debugobjects.h> | 49 | #include <linux/debugobjects.h> |
50 | #include <linux/kmemleak.h> | ||
49 | 51 | ||
50 | #include <asm/tlbflush.h> | 52 | #include <asm/tlbflush.h> |
51 | #include <asm/div64.h> | 53 | #include <asm/div64.h> |
@@ -149,10 +151,6 @@ static unsigned long __meminitdata dma_reserve; | |||
149 | static int __meminitdata nr_nodemap_entries; | 151 | static int __meminitdata nr_nodemap_entries; |
150 | static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; | 152 | static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; |
151 | static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; | 153 | static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; |
152 | #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE | ||
153 | static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES]; | ||
154 | static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES]; | ||
155 | #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ | ||
156 | static unsigned long __initdata required_kernelcore; | 154 | static unsigned long __initdata required_kernelcore; |
157 | static unsigned long __initdata required_movablecore; | 155 | static unsigned long __initdata required_movablecore; |
158 | static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; | 156 | static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; |
@@ -164,17 +162,25 @@ static unsigned long __meminitdata dma_reserve; | |||
164 | 162 | ||
165 | #if MAX_NUMNODES > 1 | 163 | #if MAX_NUMNODES > 1 |
166 | int nr_node_ids __read_mostly = MAX_NUMNODES; | 164 | int nr_node_ids __read_mostly = MAX_NUMNODES; |
165 | int nr_online_nodes __read_mostly = 1; | ||
167 | EXPORT_SYMBOL(nr_node_ids); | 166 | EXPORT_SYMBOL(nr_node_ids); |
167 | EXPORT_SYMBOL(nr_online_nodes); | ||
168 | #endif | 168 | #endif |
169 | 169 | ||
170 | int page_group_by_mobility_disabled __read_mostly; | 170 | int page_group_by_mobility_disabled __read_mostly; |
171 | 171 | ||
172 | static void set_pageblock_migratetype(struct page *page, int migratetype) | 172 | static void set_pageblock_migratetype(struct page *page, int migratetype) |
173 | { | 173 | { |
174 | |||
175 | if (unlikely(page_group_by_mobility_disabled)) | ||
176 | migratetype = MIGRATE_UNMOVABLE; | ||
177 | |||
174 | set_pageblock_flags_group(page, (unsigned long)migratetype, | 178 | set_pageblock_flags_group(page, (unsigned long)migratetype, |
175 | PB_migrate, PB_migrate_end); | 179 | PB_migrate, PB_migrate_end); |
176 | } | 180 | } |
177 | 181 | ||
182 | bool oom_killer_disabled __read_mostly; | ||
183 | |||
178 | #ifdef CONFIG_DEBUG_VM | 184 | #ifdef CONFIG_DEBUG_VM |
179 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) | 185 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) |
180 | { | 186 | { |
@@ -297,23 +303,6 @@ void prep_compound_page(struct page *page, unsigned long order) | |||
297 | } | 303 | } |
298 | } | 304 | } |
299 | 305 | ||
300 | #ifdef CONFIG_HUGETLBFS | ||
301 | void prep_compound_gigantic_page(struct page *page, unsigned long order) | ||
302 | { | ||
303 | int i; | ||
304 | int nr_pages = 1 << order; | ||
305 | struct page *p = page + 1; | ||
306 | |||
307 | set_compound_page_dtor(page, free_compound_page); | ||
308 | set_compound_order(page, order); | ||
309 | __SetPageHead(page); | ||
310 | for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { | ||
311 | __SetPageTail(p); | ||
312 | p->first_page = page; | ||
313 | } | ||
314 | } | ||
315 | #endif | ||
316 | |||
317 | static int destroy_compound_page(struct page *page, unsigned long order) | 306 | static int destroy_compound_page(struct page *page, unsigned long order) |
318 | { | 307 | { |
319 | int i; | 308 | int i; |
@@ -420,7 +409,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, | |||
420 | return 0; | 409 | return 0; |
421 | 410 | ||
422 | if (PageBuddy(buddy) && page_order(buddy) == order) { | 411 | if (PageBuddy(buddy) && page_order(buddy) == order) { |
423 | BUG_ON(page_count(buddy) != 0); | 412 | VM_BUG_ON(page_count(buddy) != 0); |
424 | return 1; | 413 | return 1; |
425 | } | 414 | } |
426 | return 0; | 415 | return 0; |
@@ -451,22 +440,22 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, | |||
451 | */ | 440 | */ |
452 | 441 | ||
453 | static inline void __free_one_page(struct page *page, | 442 | static inline void __free_one_page(struct page *page, |
454 | struct zone *zone, unsigned int order) | 443 | struct zone *zone, unsigned int order, |
444 | int migratetype) | ||
455 | { | 445 | { |
456 | unsigned long page_idx; | 446 | unsigned long page_idx; |
457 | int order_size = 1 << order; | ||
458 | int migratetype = get_pageblock_migratetype(page); | ||
459 | 447 | ||
460 | if (unlikely(PageCompound(page))) | 448 | if (unlikely(PageCompound(page))) |
461 | if (unlikely(destroy_compound_page(page, order))) | 449 | if (unlikely(destroy_compound_page(page, order))) |
462 | return; | 450 | return; |
463 | 451 | ||
452 | VM_BUG_ON(migratetype == -1); | ||
453 | |||
464 | page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); | 454 | page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); |
465 | 455 | ||
466 | VM_BUG_ON(page_idx & (order_size - 1)); | 456 | VM_BUG_ON(page_idx & ((1 << order) - 1)); |
467 | VM_BUG_ON(bad_range(zone, page)); | 457 | VM_BUG_ON(bad_range(zone, page)); |
468 | 458 | ||
469 | __mod_zone_page_state(zone, NR_FREE_PAGES, order_size); | ||
470 | while (order < MAX_ORDER-1) { | 459 | while (order < MAX_ORDER-1) { |
471 | unsigned long combined_idx; | 460 | unsigned long combined_idx; |
472 | struct page *buddy; | 461 | struct page *buddy; |
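__free_one_page() now receives the migratetype from its callers and the NR_FREE_PAGES accounting moves out to free_pages_bulk()/free_one_page() below, but the buddy-merging math inside the loop is untouched. For reference, a standalone sketch of that math (plain userspace C, illustrative only):

#include <stdio.h>

int main(void)
{
        unsigned long page_idx = 12;    /* index within a MAX_ORDER block */
        unsigned int order = 2;

        /* a block's buddy differs only in bit 'order' of the index, and
         * the merged block starts at the lower of the two indices
         * (equivalent to page_idx & ~(1UL << order)) */
        unsigned long buddy_idx    = page_idx ^ (1UL << order);  /* 8 */
        unsigned long combined_idx = page_idx & buddy_idx;       /* 8 */

        printf("buddy=%lu combined=%lu\n", buddy_idx, combined_idx);
        return 0;
}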
@@ -490,12 +479,27 @@ static inline void __free_one_page(struct page *page, | |||
490 | zone->free_area[order].nr_free++; | 479 | zone->free_area[order].nr_free++; |
491 | } | 480 | } |
492 | 481 | ||
482 | #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT | ||
483 | /* | ||
484 | * free_page_mlock() -- clean up attempts to free and mlocked() page. | ||
485 | * Page should not be on lru, so no need to fix that up. | ||
486 | * free_pages_check() will verify... | ||
487 | */ | ||
488 | static inline void free_page_mlock(struct page *page) | ||
489 | { | ||
490 | __ClearPageMlocked(page); | ||
491 | __dec_zone_page_state(page, NR_MLOCK); | ||
492 | __count_vm_event(UNEVICTABLE_MLOCKFREED); | ||
493 | } | ||
494 | #else | ||
495 | static void free_page_mlock(struct page *page) { } | ||
496 | #endif | ||
497 | |||
493 | static inline int free_pages_check(struct page *page) | 498 | static inline int free_pages_check(struct page *page) |
494 | { | 499 | { |
495 | free_page_mlock(page); | ||
496 | if (unlikely(page_mapcount(page) | | 500 | if (unlikely(page_mapcount(page) | |
497 | (page->mapping != NULL) | | 501 | (page->mapping != NULL) | |
498 | (page_count(page) != 0) | | 502 | (atomic_read(&page->_count) != 0) | |
499 | (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) { | 503 | (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) { |
500 | bad_page(page); | 504 | bad_page(page); |
501 | return 1; | 505 | return 1; |
@@ -522,6 +526,8 @@ static void free_pages_bulk(struct zone *zone, int count, | |||
522 | spin_lock(&zone->lock); | 526 | spin_lock(&zone->lock); |
523 | zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); | 527 | zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); |
524 | zone->pages_scanned = 0; | 528 | zone->pages_scanned = 0; |
529 | |||
530 | __mod_zone_page_state(zone, NR_FREE_PAGES, count << order); | ||
525 | while (count--) { | 531 | while (count--) { |
526 | struct page *page; | 532 | struct page *page; |
527 | 533 | ||
@@ -529,17 +535,20 @@ static void free_pages_bulk(struct zone *zone, int count, | |||
529 | page = list_entry(list->prev, struct page, lru); | 535 | page = list_entry(list->prev, struct page, lru); |
530 | /* have to delete it as __free_one_page list manipulates */ | 536 | /* have to delete it as __free_one_page list manipulates */ |
531 | list_del(&page->lru); | 537 | list_del(&page->lru); |
532 | __free_one_page(page, zone, order); | 538 | __free_one_page(page, zone, order, page_private(page)); |
533 | } | 539 | } |
534 | spin_unlock(&zone->lock); | 540 | spin_unlock(&zone->lock); |
535 | } | 541 | } |
536 | 542 | ||
537 | static void free_one_page(struct zone *zone, struct page *page, int order) | 543 | static void free_one_page(struct zone *zone, struct page *page, int order, |
544 | int migratetype) | ||
538 | { | 545 | { |
539 | spin_lock(&zone->lock); | 546 | spin_lock(&zone->lock); |
540 | zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); | 547 | zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); |
541 | zone->pages_scanned = 0; | 548 | zone->pages_scanned = 0; |
542 | __free_one_page(page, zone, order); | 549 | |
550 | __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); | ||
551 | __free_one_page(page, zone, order, migratetype); | ||
543 | spin_unlock(&zone->lock); | 552 | spin_unlock(&zone->lock); |
544 | } | 553 | } |
545 | 554 | ||
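
With the migratetype now passed in explicitly, __free_one_page() also stops touching NR_FREE_PAGES; the counter is adjusted once per call in free_pages_bulk() (count << order) and free_one_page() (1 << order), and the matching allocation-side decrements move into rmqueue_bulk() and buffered_rmqueue() further down. A rough sketch of why the batched form does the same accounting with fewer counter updates (userspace stand-ins, not kernel code):

    #include <stdio.h>

    static long nr_free_pages;            /* stand-in for NR_FREE_PAGES */

    /* old scheme: one counter update per page inside __free_one_page() */
    static void free_pages_bulk_old(int count, unsigned int order)
    {
        while (count--)
            nr_free_pages += 1L << order;
    }

    /* new scheme: a single batched update for the whole list */
    static void free_pages_bulk_new(int count, unsigned int order)
    {
        nr_free_pages += (long)count << order;
        while (count--)
            ;                             /* merge each page into the buddy lists */
    }

    int main(void)
    {
        free_pages_bulk_old(31, 0);
        free_pages_bulk_new(31, 0);
        printf("free pages: %ld\n", nr_free_pages);   /* 62: same total either way */
        return 0;
    }
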
@@ -548,6 +557,9 @@ static void __free_pages_ok(struct page *page, unsigned int order) | |||
548 | unsigned long flags; | 557 | unsigned long flags; |
549 | int i; | 558 | int i; |
550 | int bad = 0; | 559 | int bad = 0; |
560 | int clearMlocked = PageMlocked(page); | ||
561 | |||
562 | kmemcheck_free_shadow(page, order); | ||
551 | 563 | ||
552 | for (i = 0 ; i < (1 << order) ; ++i) | 564 | for (i = 0 ; i < (1 << order) ; ++i) |
553 | bad += free_pages_check(page + i); | 565 | bad += free_pages_check(page + i); |
@@ -563,8 +575,11 @@ static void __free_pages_ok(struct page *page, unsigned int order) | |||
563 | kernel_map_pages(page, 1 << order, 0); | 575 | kernel_map_pages(page, 1 << order, 0); |
564 | 576 | ||
565 | local_irq_save(flags); | 577 | local_irq_save(flags); |
578 | if (unlikely(clearMlocked)) | ||
579 | free_page_mlock(page); | ||
566 | __count_vm_events(PGFREE, 1 << order); | 580 | __count_vm_events(PGFREE, 1 << order); |
567 | free_one_page(page_zone(page), page, order); | 581 | free_one_page(page_zone(page), page, order, |
582 | get_pageblock_migratetype(page)); | ||
568 | local_irq_restore(flags); | 583 | local_irq_restore(flags); |
569 | } | 584 | } |
570 | 585 | ||
@@ -635,7 +650,7 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) | |||
635 | { | 650 | { |
636 | if (unlikely(page_mapcount(page) | | 651 | if (unlikely(page_mapcount(page) | |
637 | (page->mapping != NULL) | | 652 | (page->mapping != NULL) | |
638 | (page_count(page) != 0) | | 653 | (atomic_read(&page->_count) != 0) | |
639 | (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) { | 654 | (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) { |
640 | bad_page(page); | 655 | bad_page(page); |
641 | return 1; | 656 | return 1; |
@@ -660,7 +675,8 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) | |||
660 | * Go through the free lists for the given migratetype and remove | 675 | * Go through the free lists for the given migratetype and remove |
661 | * the smallest available page from the freelists | 676 | * the smallest available page from the freelists |
662 | */ | 677 | */ |
663 | static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, | 678 | static inline |
679 | struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, | ||
664 | int migratetype) | 680 | int migratetype) |
665 | { | 681 | { |
666 | unsigned int current_order; | 682 | unsigned int current_order; |
@@ -678,7 +694,6 @@ static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, | |||
678 | list_del(&page->lru); | 694 | list_del(&page->lru); |
679 | rmv_page_order(page); | 695 | rmv_page_order(page); |
680 | area->nr_free--; | 696 | area->nr_free--; |
681 | __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order)); | ||
682 | expand(zone, page, order, current_order, area, migratetype); | 697 | expand(zone, page, order, current_order, area, migratetype); |
683 | return page; | 698 | return page; |
684 | } | 699 | } |
@@ -769,8 +784,8 @@ static int move_freepages_block(struct zone *zone, struct page *page, | |||
769 | } | 784 | } |
770 | 785 | ||
771 | /* Remove an element from the buddy allocator from the fallback list */ | 786 | /* Remove an element from the buddy allocator from the fallback list */ |
772 | static struct page *__rmqueue_fallback(struct zone *zone, int order, | 787 | static inline struct page * |
773 | int start_migratetype) | 788 | __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) |
774 | { | 789 | { |
775 | struct free_area * area; | 790 | struct free_area * area; |
776 | int current_order; | 791 | int current_order; |
@@ -818,8 +833,6 @@ static struct page *__rmqueue_fallback(struct zone *zone, int order, | |||
818 | /* Remove the page from the freelists */ | 833 | /* Remove the page from the freelists */ |
819 | list_del(&page->lru); | 834 | list_del(&page->lru); |
820 | rmv_page_order(page); | 835 | rmv_page_order(page); |
821 | __mod_zone_page_state(zone, NR_FREE_PAGES, | ||
822 | -(1UL << order)); | ||
823 | 836 | ||
824 | if (current_order == pageblock_order) | 837 | if (current_order == pageblock_order) |
825 | set_pageblock_migratetype(page, | 838 | set_pageblock_migratetype(page, |
@@ -830,8 +843,7 @@ static struct page *__rmqueue_fallback(struct zone *zone, int order, | |||
830 | } | 843 | } |
831 | } | 844 | } |
832 | 845 | ||
833 | /* Use MIGRATE_RESERVE rather than fail an allocation */ | 846 | return NULL; |
834 | return __rmqueue_smallest(zone, order, MIGRATE_RESERVE); | ||
835 | } | 847 | } |
836 | 848 | ||
837 | /* | 849 | /* |
@@ -843,11 +855,23 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order, | |||
843 | { | 855 | { |
844 | struct page *page; | 856 | struct page *page; |
845 | 857 | ||
858 | retry_reserve: | ||
846 | page = __rmqueue_smallest(zone, order, migratetype); | 859 | page = __rmqueue_smallest(zone, order, migratetype); |
847 | 860 | ||
848 | if (unlikely(!page)) | 861 | if (unlikely(!page) && migratetype != MIGRATE_RESERVE) { |
849 | page = __rmqueue_fallback(zone, order, migratetype); | 862 | page = __rmqueue_fallback(zone, order, migratetype); |
850 | 863 | ||
864 | /* | ||
865 | * Use MIGRATE_RESERVE rather than fail an allocation. goto | ||
866 | * is used because __rmqueue_smallest is an inline function | ||
867 | * and we want just one call site | ||
868 | */ | ||
869 | if (!page) { | ||
870 | migratetype = MIGRATE_RESERVE; | ||
871 | goto retry_reserve; | ||
872 | } | ||
873 | } | ||
874 | |||
851 | return page; | 875 | return page; |
852 | } | 876 | } |
853 | 877 | ||
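
__rmqueue_fallback() no longer dips into MIGRATE_RESERVE itself; __rmqueue() retries with MIGRATE_RESERVE through a goto so that the now-inlined __rmqueue_smallest() keeps a single call site, as the new comment in the hunk explains. A condensed model of that control flow (toy free lists and simplified return types, not part of the patch):

    #include <stddef.h>
    #include <stdio.h>

    enum { MIGRATE_MOVABLE, MIGRATE_RESERVE, MIGRATE_TYPES };

    /* toy free lists: only the reserve has a page left */
    static int freelist[MIGRATE_TYPES] = { [MIGRATE_RESERVE] = 1 };

    static int *rmqueue_smallest(int migratetype)
    {
        return freelist[migratetype] ? &freelist[migratetype] : NULL;
    }

    static int *rmqueue_fallback(int migratetype)
    {
        (void)migratetype;
        return NULL;                      /* every fallback list is empty too */
    }

    static int *rmqueue(int migratetype)
    {
        int *page;

    retry_reserve:
        page = rmqueue_smallest(migratetype);

        if (!page && migratetype != MIGRATE_RESERVE) {
            page = rmqueue_fallback(migratetype);
            /* use MIGRATE_RESERVE rather than fail, reusing the same call site */
            if (!page) {
                migratetype = MIGRATE_RESERVE;
                goto retry_reserve;
            }
        }
        return page;
    }

    int main(void)
    {
        printf("page from reserve: %s\n",
               rmqueue(MIGRATE_MOVABLE) ? "yes" : "no");   /* yes */
        return 0;
    }
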
@@ -881,6 +905,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, | |||
881 | set_page_private(page, migratetype); | 905 | set_page_private(page, migratetype); |
882 | list = &page->lru; | 906 | list = &page->lru; |
883 | } | 907 | } |
908 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); | ||
884 | spin_unlock(&zone->lock); | 909 | spin_unlock(&zone->lock); |
885 | return i; | 910 | return i; |
886 | } | 911 | } |
@@ -996,6 +1021,9 @@ static void free_hot_cold_page(struct page *page, int cold) | |||
996 | struct zone *zone = page_zone(page); | 1021 | struct zone *zone = page_zone(page); |
997 | struct per_cpu_pages *pcp; | 1022 | struct per_cpu_pages *pcp; |
998 | unsigned long flags; | 1023 | unsigned long flags; |
1024 | int clearMlocked = PageMlocked(page); | ||
1025 | |||
1026 | kmemcheck_free_shadow(page, 0); | ||
999 | 1027 | ||
1000 | if (PageAnon(page)) | 1028 | if (PageAnon(page)) |
1001 | page->mapping = NULL; | 1029 | page->mapping = NULL; |
@@ -1010,13 +1038,16 @@ static void free_hot_cold_page(struct page *page, int cold) | |||
1010 | kernel_map_pages(page, 1, 0); | 1038 | kernel_map_pages(page, 1, 0); |
1011 | 1039 | ||
1012 | pcp = &zone_pcp(zone, get_cpu())->pcp; | 1040 | pcp = &zone_pcp(zone, get_cpu())->pcp; |
1041 | set_page_private(page, get_pageblock_migratetype(page)); | ||
1013 | local_irq_save(flags); | 1042 | local_irq_save(flags); |
1043 | if (unlikely(clearMlocked)) | ||
1044 | free_page_mlock(page); | ||
1014 | __count_vm_event(PGFREE); | 1045 | __count_vm_event(PGFREE); |
1046 | |||
1015 | if (cold) | 1047 | if (cold) |
1016 | list_add_tail(&page->lru, &pcp->list); | 1048 | list_add_tail(&page->lru, &pcp->list); |
1017 | else | 1049 | else |
1018 | list_add(&page->lru, &pcp->list); | 1050 | list_add(&page->lru, &pcp->list); |
1019 | set_page_private(page, get_pageblock_migratetype(page)); | ||
1020 | pcp->count++; | 1051 | pcp->count++; |
1021 | if (pcp->count >= pcp->high) { | 1052 | if (pcp->count >= pcp->high) { |
1022 | free_pages_bulk(zone, pcp->batch, &pcp->list, 0); | 1053 | free_pages_bulk(zone, pcp->batch, &pcp->list, 0); |
@@ -1050,6 +1081,16 @@ void split_page(struct page *page, unsigned int order) | |||
1050 | 1081 | ||
1051 | VM_BUG_ON(PageCompound(page)); | 1082 | VM_BUG_ON(PageCompound(page)); |
1052 | VM_BUG_ON(!page_count(page)); | 1083 | VM_BUG_ON(!page_count(page)); |
1084 | |||
1085 | #ifdef CONFIG_KMEMCHECK | ||
1086 | /* | ||
1087 | * Split shadow pages too, because free(page[0]) would | ||
1088 | * otherwise free the whole shadow. | ||
1089 | */ | ||
1090 | if (kmemcheck_page_is_tracked(page)) | ||
1091 | split_page(virt_to_page(page[0].shadow), order); | ||
1092 | #endif | ||
1093 | |||
1053 | for (i = 1; i < (1 << order); i++) | 1094 | for (i = 1; i < (1 << order); i++) |
1054 | set_page_refcounted(page + i); | 1095 | set_page_refcounted(page + i); |
1055 | } | 1096 | } |
@@ -1059,14 +1100,15 @@ void split_page(struct page *page, unsigned int order) | |||
1059 | * we cheat by calling it from here, in the order > 0 path. Saves a branch | 1100 | * we cheat by calling it from here, in the order > 0 path. Saves a branch |
1060 | * or two. | 1101 | * or two. |
1061 | */ | 1102 | */ |
1062 | static struct page *buffered_rmqueue(struct zone *preferred_zone, | 1103 | static inline |
1063 | struct zone *zone, int order, gfp_t gfp_flags) | 1104 | struct page *buffered_rmqueue(struct zone *preferred_zone, |
1105 | struct zone *zone, int order, gfp_t gfp_flags, | ||
1106 | int migratetype) | ||
1064 | { | 1107 | { |
1065 | unsigned long flags; | 1108 | unsigned long flags; |
1066 | struct page *page; | 1109 | struct page *page; |
1067 | int cold = !!(gfp_flags & __GFP_COLD); | 1110 | int cold = !!(gfp_flags & __GFP_COLD); |
1068 | int cpu; | 1111 | int cpu; |
1069 | int migratetype = allocflags_to_migratetype(gfp_flags); | ||
1070 | 1112 | ||
1071 | again: | 1113 | again: |
1072 | cpu = get_cpu(); | 1114 | cpu = get_cpu(); |
@@ -1103,8 +1145,22 @@ again: | |||
1103 | list_del(&page->lru); | 1145 | list_del(&page->lru); |
1104 | pcp->count--; | 1146 | pcp->count--; |
1105 | } else { | 1147 | } else { |
1148 | if (unlikely(gfp_flags & __GFP_NOFAIL)) { | ||
1149 | /* | ||
1150 | * __GFP_NOFAIL is not to be used in new code. | ||
1151 | * | ||
1152 | * All __GFP_NOFAIL callers should be fixed so that they | ||
1153 | * properly detect and handle allocation failures. | ||
1154 | * | ||
1155 | * We most definitely don't want callers attempting to | ||
1156 | * allocate greater than single-page units with | ||
1157 | * __GFP_NOFAIL. | ||
1158 | */ | ||
1159 | WARN_ON_ONCE(order > 0); | ||
1160 | } | ||
1106 | spin_lock_irqsave(&zone->lock, flags); | 1161 | spin_lock_irqsave(&zone->lock, flags); |
1107 | page = __rmqueue(zone, order, migratetype); | 1162 | page = __rmqueue(zone, order, migratetype); |
1163 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order)); | ||
1108 | spin_unlock(&zone->lock); | 1164 | spin_unlock(&zone->lock); |
1109 | if (!page) | 1165 | if (!page) |
1110 | goto failed; | 1166 | goto failed; |
@@ -1126,10 +1182,15 @@ failed: | |||
1126 | return NULL; | 1182 | return NULL; |
1127 | } | 1183 | } |
1128 | 1184 | ||
1129 | #define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ | 1185 | /* The ALLOC_WMARK bits are used as an index to zone->watermark */ |
1130 | #define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */ | 1186 | #define ALLOC_WMARK_MIN WMARK_MIN |
1131 | #define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */ | 1187 | #define ALLOC_WMARK_LOW WMARK_LOW |
1132 | #define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */ | 1188 | #define ALLOC_WMARK_HIGH WMARK_HIGH |
1189 | #define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */ | ||
1190 | |||
1191 | /* Mask to get the watermark bits */ | ||
1192 | #define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1) | ||
1193 | |||
1133 | #define ALLOC_HARDER 0x10 /* try to alloc harder */ | 1194 | #define ALLOC_HARDER 0x10 /* try to alloc harder */ |
1134 | #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ | 1195 | #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ |
1135 | #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ | 1196 | #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ |
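
The hunk above turns ALLOC_WMARK_MIN/LOW/HIGH from independent flag bits into indices into the new per-zone watermark[] array; ALLOC_NO_WATERMARKS becomes the first value above the index range and ALLOC_WMARK_MASK recovers the index from alloc_flags, which is what the BUILD_BUG_ON and the zone->watermark[alloc_flags & ALLOC_WMARK_MASK] lookup later in the diff rely on. A small standalone illustration with made-up watermark values:

    #include <stdio.h>

    enum zone_watermarks { WMARK_MIN, WMARK_LOW, WMARK_HIGH, NR_WMARK };

    #define ALLOC_WMARK_MIN     WMARK_MIN
    #define ALLOC_WMARK_LOW     WMARK_LOW
    #define ALLOC_WMARK_HIGH    WMARK_HIGH
    #define ALLOC_NO_WATERMARKS 0x04      /* first value above the index range */
    #define ALLOC_WMARK_MASK    (ALLOC_NO_WATERMARKS - 1)
    #define ALLOC_HIGH          0x20      /* unrelated flag; must not disturb the index */

    struct zone_model { unsigned long watermark[NR_WMARK]; };

    static unsigned long pick_watermark(const struct zone_model *z, int alloc_flags)
    {
        /* the low bits of alloc_flags select which watermark to test against */
        return z->watermark[alloc_flags & ALLOC_WMARK_MASK];
    }

    int main(void)
    {
        struct zone_model z = { .watermark = { 1000, 1250, 1500 } };  /* made-up values */
        int alloc_flags = ALLOC_WMARK_LOW | ALLOC_HIGH;

        printf("mark = %lu\n", pick_watermark(&z, alloc_flags));      /* 1250 */
        return 0;
    }
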
@@ -1387,23 +1448,18 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) | |||
1387 | */ | 1448 | */ |
1388 | static struct page * | 1449 | static struct page * |
1389 | get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, | 1450 | get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, |
1390 | struct zonelist *zonelist, int high_zoneidx, int alloc_flags) | 1451 | struct zonelist *zonelist, int high_zoneidx, int alloc_flags, |
1452 | struct zone *preferred_zone, int migratetype) | ||
1391 | { | 1453 | { |
1392 | struct zoneref *z; | 1454 | struct zoneref *z; |
1393 | struct page *page = NULL; | 1455 | struct page *page = NULL; |
1394 | int classzone_idx; | 1456 | int classzone_idx; |
1395 | struct zone *zone, *preferred_zone; | 1457 | struct zone *zone; |
1396 | nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ | 1458 | nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ |
1397 | int zlc_active = 0; /* set if using zonelist_cache */ | 1459 | int zlc_active = 0; /* set if using zonelist_cache */ |
1398 | int did_zlc_setup = 0; /* just call zlc_setup() one time */ | 1460 | int did_zlc_setup = 0; /* just call zlc_setup() one time */ |
1399 | 1461 | ||
1400 | (void)first_zones_zonelist(zonelist, high_zoneidx, nodemask, | ||
1401 | &preferred_zone); | ||
1402 | if (!preferred_zone) | ||
1403 | return NULL; | ||
1404 | |||
1405 | classzone_idx = zone_idx(preferred_zone); | 1462 | classzone_idx = zone_idx(preferred_zone); |
1406 | |||
1407 | zonelist_scan: | 1463 | zonelist_scan: |
1408 | /* | 1464 | /* |
1409 | * Scan zonelist, looking for a zone with enough free. | 1465 | * Scan zonelist, looking for a zone with enough free. |
@@ -1418,31 +1474,49 @@ zonelist_scan: | |||
1418 | !cpuset_zone_allowed_softwall(zone, gfp_mask)) | 1474 | !cpuset_zone_allowed_softwall(zone, gfp_mask)) |
1419 | goto try_next_zone; | 1475 | goto try_next_zone; |
1420 | 1476 | ||
1477 | BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); | ||
1421 | if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { | 1478 | if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { |
1422 | unsigned long mark; | 1479 | unsigned long mark; |
1423 | if (alloc_flags & ALLOC_WMARK_MIN) | 1480 | int ret; |
1424 | mark = zone->pages_min; | 1481 | |
1425 | else if (alloc_flags & ALLOC_WMARK_LOW) | 1482 | mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; |
1426 | mark = zone->pages_low; | 1483 | if (zone_watermark_ok(zone, order, mark, |
1427 | else | 1484 | classzone_idx, alloc_flags)) |
1428 | mark = zone->pages_high; | 1485 | goto try_this_zone; |
1429 | if (!zone_watermark_ok(zone, order, mark, | 1486 | |
1430 | classzone_idx, alloc_flags)) { | 1487 | if (zone_reclaim_mode == 0) |
1431 | if (!zone_reclaim_mode || | 1488 | goto this_zone_full; |
1432 | !zone_reclaim(zone, gfp_mask, order)) | 1489 | |
1490 | ret = zone_reclaim(zone, gfp_mask, order); | ||
1491 | switch (ret) { | ||
1492 | case ZONE_RECLAIM_NOSCAN: | ||
1493 | /* did not scan */ | ||
1494 | goto try_next_zone; | ||
1495 | case ZONE_RECLAIM_FULL: | ||
1496 | /* scanned but unreclaimable */ | ||
1497 | goto this_zone_full; | ||
1498 | default: | ||
1499 | /* did we reclaim enough */ | ||
1500 | if (!zone_watermark_ok(zone, order, mark, | ||
1501 | classzone_idx, alloc_flags)) | ||
1433 | goto this_zone_full; | 1502 | goto this_zone_full; |
1434 | } | 1503 | } |
1435 | } | 1504 | } |
1436 | 1505 | ||
1437 | page = buffered_rmqueue(preferred_zone, zone, order, gfp_mask); | 1506 | try_this_zone: |
1507 | page = buffered_rmqueue(preferred_zone, zone, order, | ||
1508 | gfp_mask, migratetype); | ||
1438 | if (page) | 1509 | if (page) |
1439 | break; | 1510 | break; |
1440 | this_zone_full: | 1511 | this_zone_full: |
1441 | if (NUMA_BUILD) | 1512 | if (NUMA_BUILD) |
1442 | zlc_mark_zone_full(zonelist, z); | 1513 | zlc_mark_zone_full(zonelist, z); |
1443 | try_next_zone: | 1514 | try_next_zone: |
1444 | if (NUMA_BUILD && !did_zlc_setup) { | 1515 | if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) { |
1445 | /* we do zlc_setup after the first zone is tried */ | 1516 | /* |
1517 | * we do zlc_setup after the first zone is tried but only | ||
1518 | * if there are multiple nodes make it worthwhile | ||
1519 | */ | ||
1446 | allowednodes = zlc_setup(zonelist, alloc_flags); | 1520 | allowednodes = zlc_setup(zonelist, alloc_flags); |
1447 | zlc_active = 1; | 1521 | zlc_active = 1; |
1448 | did_zlc_setup = 1; | 1522 | did_zlc_setup = 1; |
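
get_page_from_freelist() now distinguishes the zone_reclaim() outcomes instead of treating every failure the same way: ZONE_RECLAIM_NOSCAN moves on to the next zone without marking this one full, ZONE_RECLAIM_FULL marks it full, and anything else re-checks the watermark before using the zone. A hedged sketch of just that decision (ZONE_RECLAIM_SOME is a stand-in name for "reclaimed something"; zone details are omitted):

    #include <stdio.h>

    enum zone_reclaim_ret { ZONE_RECLAIM_NOSCAN, ZONE_RECLAIM_FULL, ZONE_RECLAIM_SOME };
    enum next_step        { TRY_THIS_ZONE, THIS_ZONE_FULL, TRY_NEXT_ZONE };

    /* what to do with a zone whose first watermark check just failed */
    static enum next_step after_zone_reclaim(enum zone_reclaim_ret ret,
                                             int watermark_ok_now)
    {
        switch (ret) {
        case ZONE_RECLAIM_NOSCAN:
            return TRY_NEXT_ZONE;         /* did not scan: leave the zone unmarked */
        case ZONE_RECLAIM_FULL:
            return THIS_ZONE_FULL;        /* scanned but unreclaimable */
        default:
            /* reclaimed something: only use the zone if it now passes */
            return watermark_ok_now ? TRY_THIS_ZONE : THIS_ZONE_FULL;
        }
    }

    int main(void)
    {
        printf("noscan -> %d, full -> %d, progress -> %d\n",
               after_zone_reclaim(ZONE_RECLAIM_NOSCAN, 0),
               after_zone_reclaim(ZONE_RECLAIM_FULL, 0),
               after_zone_reclaim(ZONE_RECLAIM_SOME, 1));
        return 0;
    }
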
@@ -1457,47 +1531,217 @@ try_next_zone: | |||
1457 | return page; | 1531 | return page; |
1458 | } | 1532 | } |
1459 | 1533 | ||
1534 | static inline int | ||
1535 | should_alloc_retry(gfp_t gfp_mask, unsigned int order, | ||
1536 | unsigned long pages_reclaimed) | ||
1537 | { | ||
1538 | /* Do not loop if specifically requested */ | ||
1539 | if (gfp_mask & __GFP_NORETRY) | ||
1540 | return 0; | ||
1541 | |||
1542 | /* | ||
1543 | * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER | ||
1544 | * means __GFP_NOFAIL, but that may not be true in other | ||
1545 | * implementations. | ||
1546 | */ | ||
1547 | if (order <= PAGE_ALLOC_COSTLY_ORDER) | ||
1548 | return 1; | ||
1549 | |||
1550 | /* | ||
1551 | * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is | ||
1552 | * specified, then we retry until we no longer reclaim any pages | ||
1553 | * (above), or we've reclaimed an order of pages at least as | ||
1554 | * large as the allocation's order. In both cases, if the | ||
1555 | * allocation still fails, we stop retrying. | ||
1556 | */ | ||
1557 | if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order)) | ||
1558 | return 1; | ||
1559 | |||
1560 | /* | ||
1561 | * Don't let big-order allocations loop unless the caller | ||
1562 | * explicitly requests that. | ||
1563 | */ | ||
1564 | if (gfp_mask & __GFP_NOFAIL) | ||
1565 | return 1; | ||
1566 | |||
1567 | return 0; | ||
1568 | } | ||
1569 | |||
1570 | static inline struct page * | ||
1571 | __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, | ||
1572 | struct zonelist *zonelist, enum zone_type high_zoneidx, | ||
1573 | nodemask_t *nodemask, struct zone *preferred_zone, | ||
1574 | int migratetype) | ||
1575 | { | ||
1576 | struct page *page; | ||
1577 | |||
1578 | /* Acquire the OOM killer lock for the zones in zonelist */ | ||
1579 | if (!try_set_zone_oom(zonelist, gfp_mask)) { | ||
1580 | schedule_timeout_uninterruptible(1); | ||
1581 | return NULL; | ||
1582 | } | ||
1583 | |||
1584 | /* | ||
1585 | * Go through the zonelist yet one more time, keep very high watermark | ||
1586 | * here, this is only to catch a parallel oom killing, we must fail if | ||
1587 | * we're still under heavy pressure. | ||
1588 | */ | ||
1589 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, | ||
1590 | order, zonelist, high_zoneidx, | ||
1591 | ALLOC_WMARK_HIGH|ALLOC_CPUSET, | ||
1592 | preferred_zone, migratetype); | ||
1593 | if (page) | ||
1594 | goto out; | ||
1595 | |||
1596 | /* The OOM killer will not help higher order allocs */ | ||
1597 | if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_NOFAIL)) | ||
1598 | goto out; | ||
1599 | |||
1600 | /* Exhausted what can be done so it's blamo time */ | ||
1601 | out_of_memory(zonelist, gfp_mask, order); | ||
1602 | |||
1603 | out: | ||
1604 | clear_zonelist_oom(zonelist, gfp_mask); | ||
1605 | return page; | ||
1606 | } | ||
1607 | |||
1608 | /* The really slow allocator path where we enter direct reclaim */ | ||
1609 | static inline struct page * | ||
1610 | __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, | ||
1611 | struct zonelist *zonelist, enum zone_type high_zoneidx, | ||
1612 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | ||
1613 | int migratetype, unsigned long *did_some_progress) | ||
1614 | { | ||
1615 | struct page *page = NULL; | ||
1616 | struct reclaim_state reclaim_state; | ||
1617 | struct task_struct *p = current; | ||
1618 | |||
1619 | cond_resched(); | ||
1620 | |||
1621 | /* We now go into synchronous reclaim */ | ||
1622 | cpuset_memory_pressure_bump(); | ||
1623 | |||
1624 | /* | ||
1625 | * The task's cpuset might have expanded its set of allowable nodes | ||
1626 | */ | ||
1627 | p->flags |= PF_MEMALLOC; | ||
1628 | lockdep_set_current_reclaim_state(gfp_mask); | ||
1629 | reclaim_state.reclaimed_slab = 0; | ||
1630 | p->reclaim_state = &reclaim_state; | ||
1631 | |||
1632 | *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); | ||
1633 | |||
1634 | p->reclaim_state = NULL; | ||
1635 | lockdep_clear_current_reclaim_state(); | ||
1636 | p->flags &= ~PF_MEMALLOC; | ||
1637 | |||
1638 | cond_resched(); | ||
1639 | |||
1640 | if (order != 0) | ||
1641 | drain_all_pages(); | ||
1642 | |||
1643 | if (likely(*did_some_progress)) | ||
1644 | page = get_page_from_freelist(gfp_mask, nodemask, order, | ||
1645 | zonelist, high_zoneidx, | ||
1646 | alloc_flags, preferred_zone, | ||
1647 | migratetype); | ||
1648 | return page; | ||
1649 | } | ||
1650 | |||
1460 | /* | 1651 | /* |
1461 | * This is the 'heart' of the zoned buddy allocator. | 1652 | * This is called in the allocator slow-path if the allocation request is of |
1653 | * sufficient urgency to ignore watermarks and take other desperate measures | ||
1462 | */ | 1654 | */ |
1463 | struct page * | 1655 | static inline struct page * |
1464 | __alloc_pages_internal(gfp_t gfp_mask, unsigned int order, | 1656 | __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, |
1465 | struct zonelist *zonelist, nodemask_t *nodemask) | 1657 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
1658 | nodemask_t *nodemask, struct zone *preferred_zone, | ||
1659 | int migratetype) | ||
1660 | { | ||
1661 | struct page *page; | ||
1662 | |||
1663 | do { | ||
1664 | page = get_page_from_freelist(gfp_mask, nodemask, order, | ||
1665 | zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, | ||
1666 | preferred_zone, migratetype); | ||
1667 | |||
1668 | if (!page && gfp_mask & __GFP_NOFAIL) | ||
1669 | congestion_wait(WRITE, HZ/50); | ||
1670 | } while (!page && (gfp_mask & __GFP_NOFAIL)); | ||
1671 | |||
1672 | return page; | ||
1673 | } | ||
1674 | |||
1675 | static inline | ||
1676 | void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, | ||
1677 | enum zone_type high_zoneidx) | ||
1466 | { | 1678 | { |
1467 | const gfp_t wait = gfp_mask & __GFP_WAIT; | ||
1468 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | ||
1469 | struct zoneref *z; | 1679 | struct zoneref *z; |
1470 | struct zone *zone; | 1680 | struct zone *zone; |
1471 | struct page *page; | ||
1472 | struct reclaim_state reclaim_state; | ||
1473 | struct task_struct *p = current; | ||
1474 | int do_retry; | ||
1475 | int alloc_flags; | ||
1476 | unsigned long did_some_progress; | ||
1477 | unsigned long pages_reclaimed = 0; | ||
1478 | 1681 | ||
1479 | lockdep_trace_alloc(gfp_mask); | 1682 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) |
1683 | wakeup_kswapd(zone, order); | ||
1684 | } | ||
1480 | 1685 | ||
1481 | might_sleep_if(wait); | 1686 | static inline int |
1687 | gfp_to_alloc_flags(gfp_t gfp_mask) | ||
1688 | { | ||
1689 | struct task_struct *p = current; | ||
1690 | int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; | ||
1691 | const gfp_t wait = gfp_mask & __GFP_WAIT; | ||
1482 | 1692 | ||
1483 | if (should_fail_alloc_page(gfp_mask, order)) | 1693 | /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ |
1484 | return NULL; | 1694 | BUILD_BUG_ON(__GFP_HIGH != ALLOC_HIGH); |
1485 | 1695 | ||
1486 | restart: | 1696 | /* |
1487 | z = zonelist->_zonerefs; /* the list of zones suitable for gfp_mask */ | 1697 | * The caller may dip into page reserves a bit more if the caller |
1698 | * cannot run direct reclaim, or if the caller has realtime scheduling | ||
1699 | * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will | ||
1700 | * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). | ||
1701 | */ | ||
1702 | alloc_flags |= (gfp_mask & __GFP_HIGH); | ||
1488 | 1703 | ||
1489 | if (unlikely(!z->zone)) { | 1704 | if (!wait) { |
1705 | alloc_flags |= ALLOC_HARDER; | ||
1490 | /* | 1706 | /* |
1491 | * Happens if we have an empty zonelist as a result of | 1707 | * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. |
1492 | * GFP_THISNODE being used on a memoryless node | 1708 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. |
1493 | */ | 1709 | */ |
1494 | return NULL; | 1710 | alloc_flags &= ~ALLOC_CPUSET; |
1711 | } else if (unlikely(rt_task(p))) | ||
1712 | alloc_flags |= ALLOC_HARDER; | ||
1713 | |||
1714 | if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { | ||
1715 | if (!in_interrupt() && | ||
1716 | ((p->flags & PF_MEMALLOC) || | ||
1717 | unlikely(test_thread_flag(TIF_MEMDIE)))) | ||
1718 | alloc_flags |= ALLOC_NO_WATERMARKS; | ||
1495 | } | 1719 | } |
1496 | 1720 | ||
1497 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, | 1721 | return alloc_flags; |
1498 | zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET); | 1722 | } |
1499 | if (page) | 1723 | |
1500 | goto got_pg; | 1724 | static inline struct page * |
1725 | __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | ||
1726 | struct zonelist *zonelist, enum zone_type high_zoneidx, | ||
1727 | nodemask_t *nodemask, struct zone *preferred_zone, | ||
1728 | int migratetype) | ||
1729 | { | ||
1730 | const gfp_t wait = gfp_mask & __GFP_WAIT; | ||
1731 | struct page *page = NULL; | ||
1732 | int alloc_flags; | ||
1733 | unsigned long pages_reclaimed = 0; | ||
1734 | unsigned long did_some_progress; | ||
1735 | struct task_struct *p = current; | ||
1736 | |||
1737 | /* | ||
1738 | * In the slowpath, we sanity check order to avoid ever trying to | ||
1739 | * reclaim >= MAX_ORDER areas which will never succeed. Callers may | ||
1740 | * be using allocators in order of preference for an area that is | ||
1741 | * too large. | ||
1742 | */ | ||
1743 | if (WARN_ON_ONCE(order >= MAX_ORDER)) | ||
1744 | return NULL; | ||
1501 | 1745 | ||
1502 | /* | 1746 | /* |
1503 | * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and | 1747 | * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and |
@@ -1510,154 +1754,83 @@ restart: | |||
1510 | if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) | 1754 | if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) |
1511 | goto nopage; | 1755 | goto nopage; |
1512 | 1756 | ||
1513 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) | 1757 | wake_all_kswapd(order, zonelist, high_zoneidx); |
1514 | wakeup_kswapd(zone, order); | ||
1515 | 1758 | ||
1516 | /* | 1759 | /* |
1517 | * OK, we're below the kswapd watermark and have kicked background | 1760 | * OK, we're below the kswapd watermark and have kicked background |
1518 | * reclaim. Now things get more complex, so set up alloc_flags according | 1761 | * reclaim. Now things get more complex, so set up alloc_flags according |
1519 | * to how we want to proceed. | 1762 | * to how we want to proceed. |
1520 | * | ||
1521 | * The caller may dip into page reserves a bit more if the caller | ||
1522 | * cannot run direct reclaim, or if the caller has realtime scheduling | ||
1523 | * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will | ||
1524 | * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). | ||
1525 | */ | 1763 | */ |
1526 | alloc_flags = ALLOC_WMARK_MIN; | 1764 | alloc_flags = gfp_to_alloc_flags(gfp_mask); |
1527 | if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait) | ||
1528 | alloc_flags |= ALLOC_HARDER; | ||
1529 | if (gfp_mask & __GFP_HIGH) | ||
1530 | alloc_flags |= ALLOC_HIGH; | ||
1531 | if (wait) | ||
1532 | alloc_flags |= ALLOC_CPUSET; | ||
1533 | 1765 | ||
1534 | /* | 1766 | restart: |
1535 | * Go through the zonelist again. Let __GFP_HIGH and allocations | 1767 | /* This is the last chance, in general, before the goto nopage. */ |
1536 | * coming from realtime tasks go deeper into reserves. | ||
1537 | * | ||
1538 | * This is the last chance, in general, before the goto nopage. | ||
1539 | * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. | ||
1540 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. | ||
1541 | */ | ||
1542 | page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, | 1768 | page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, |
1543 | high_zoneidx, alloc_flags); | 1769 | high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, |
1770 | preferred_zone, migratetype); | ||
1544 | if (page) | 1771 | if (page) |
1545 | goto got_pg; | 1772 | goto got_pg; |
1546 | 1773 | ||
1547 | /* This allocation should allow future memory freeing. */ | ||
1548 | |||
1549 | rebalance: | 1774 | rebalance: |
1550 | if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) | 1775 | /* Allocate without watermarks if the context allows */ |
1551 | && !in_interrupt()) { | 1776 | if (alloc_flags & ALLOC_NO_WATERMARKS) { |
1552 | if (!(gfp_mask & __GFP_NOMEMALLOC)) { | 1777 | page = __alloc_pages_high_priority(gfp_mask, order, |
1553 | nofail_alloc: | 1778 | zonelist, high_zoneidx, nodemask, |
1554 | /* go through the zonelist yet again, ignoring mins */ | 1779 | preferred_zone, migratetype); |
1555 | page = get_page_from_freelist(gfp_mask, nodemask, order, | 1780 | if (page) |
1556 | zonelist, high_zoneidx, ALLOC_NO_WATERMARKS); | 1781 | goto got_pg; |
1557 | if (page) | ||
1558 | goto got_pg; | ||
1559 | if (gfp_mask & __GFP_NOFAIL) { | ||
1560 | congestion_wait(WRITE, HZ/50); | ||
1561 | goto nofail_alloc; | ||
1562 | } | ||
1563 | } | ||
1564 | goto nopage; | ||
1565 | } | 1782 | } |
1566 | 1783 | ||
1567 | /* Atomic allocations - we can't balance anything */ | 1784 | /* Atomic allocations - we can't balance anything */ |
1568 | if (!wait) | 1785 | if (!wait) |
1569 | goto nopage; | 1786 | goto nopage; |
1570 | 1787 | ||
1571 | cond_resched(); | 1788 | /* Avoid recursion of direct reclaim */ |
1789 | if (p->flags & PF_MEMALLOC) | ||
1790 | goto nopage; | ||
1791 | |||
1792 | /* Try direct reclaim and then allocating */ | ||
1793 | page = __alloc_pages_direct_reclaim(gfp_mask, order, | ||
1794 | zonelist, high_zoneidx, | ||
1795 | nodemask, | ||
1796 | alloc_flags, preferred_zone, | ||
1797 | migratetype, &did_some_progress); | ||
1798 | if (page) | ||
1799 | goto got_pg; | ||
1572 | 1800 | ||
1573 | /* We now go into synchronous reclaim */ | ||
1574 | cpuset_memory_pressure_bump(); | ||
1575 | /* | 1801 | /* |
1576 | * The task's cpuset might have expanded its set of allowable nodes | 1802 | * If we failed to make any progress reclaiming, then we are |
1803 | * running out of options and have to consider going OOM | ||
1577 | */ | 1804 | */ |
1578 | cpuset_update_task_memory_state(); | 1805 | if (!did_some_progress) { |
1579 | p->flags |= PF_MEMALLOC; | 1806 | if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { |
1580 | 1807 | if (oom_killer_disabled) | |
1581 | lockdep_set_current_reclaim_state(gfp_mask); | 1808 | goto nopage; |
1582 | reclaim_state.reclaimed_slab = 0; | 1809 | page = __alloc_pages_may_oom(gfp_mask, order, |
1583 | p->reclaim_state = &reclaim_state; | 1810 | zonelist, high_zoneidx, |
1584 | 1811 | nodemask, preferred_zone, | |
1585 | did_some_progress = try_to_free_pages(zonelist, order, | 1812 | migratetype); |
1586 | gfp_mask, nodemask); | 1813 | if (page) |
1587 | 1814 | goto got_pg; | |
1588 | p->reclaim_state = NULL; | ||
1589 | lockdep_clear_current_reclaim_state(); | ||
1590 | p->flags &= ~PF_MEMALLOC; | ||
1591 | |||
1592 | cond_resched(); | ||
1593 | 1815 | ||
1594 | if (order != 0) | 1816 | /* |
1595 | drain_all_pages(); | 1817 | * The OOM killer does not trigger for high-order |
1818 | * ~__GFP_NOFAIL allocations so if no progress is being | ||
1819 | * made, there are no other options and retrying is | ||
1820 | * unlikely to help. | ||
1821 | */ | ||
1822 | if (order > PAGE_ALLOC_COSTLY_ORDER && | ||
1823 | !(gfp_mask & __GFP_NOFAIL)) | ||
1824 | goto nopage; | ||
1596 | 1825 | ||
1597 | if (likely(did_some_progress)) { | ||
1598 | page = get_page_from_freelist(gfp_mask, nodemask, order, | ||
1599 | zonelist, high_zoneidx, alloc_flags); | ||
1600 | if (page) | ||
1601 | goto got_pg; | ||
1602 | } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { | ||
1603 | if (!try_set_zone_oom(zonelist, gfp_mask)) { | ||
1604 | schedule_timeout_uninterruptible(1); | ||
1605 | goto restart; | 1826 | goto restart; |
1606 | } | 1827 | } |
1607 | |||
1608 | /* | ||
1609 | * Go through the zonelist yet one more time, keep | ||
1610 | * very high watermark here, this is only to catch | ||
1611 | * a parallel oom killing, we must fail if we're still | ||
1612 | * under heavy pressure. | ||
1613 | */ | ||
1614 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, | ||
1615 | order, zonelist, high_zoneidx, | ||
1616 | ALLOC_WMARK_HIGH|ALLOC_CPUSET); | ||
1617 | if (page) { | ||
1618 | clear_zonelist_oom(zonelist, gfp_mask); | ||
1619 | goto got_pg; | ||
1620 | } | ||
1621 | |||
1622 | /* The OOM killer will not help higher order allocs so fail */ | ||
1623 | if (order > PAGE_ALLOC_COSTLY_ORDER) { | ||
1624 | clear_zonelist_oom(zonelist, gfp_mask); | ||
1625 | goto nopage; | ||
1626 | } | ||
1627 | |||
1628 | out_of_memory(zonelist, gfp_mask, order); | ||
1629 | clear_zonelist_oom(zonelist, gfp_mask); | ||
1630 | goto restart; | ||
1631 | } | 1828 | } |
1632 | 1829 | ||
1633 | /* | 1830 | /* Check if we should retry the allocation */ |
1634 | * Don't let big-order allocations loop unless the caller explicitly | ||
1635 | * requests that. Wait for some write requests to complete then retry. | ||
1636 | * | ||
1637 | * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER | ||
1638 | * means __GFP_NOFAIL, but that may not be true in other | ||
1639 | * implementations. | ||
1640 | * | ||
1641 | * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is | ||
1642 | * specified, then we retry until we no longer reclaim any pages | ||
1643 | * (above), or we've reclaimed an order of pages at least as | ||
1644 | * large as the allocation's order. In both cases, if the | ||
1645 | * allocation still fails, we stop retrying. | ||
1646 | */ | ||
1647 | pages_reclaimed += did_some_progress; | 1831 | pages_reclaimed += did_some_progress; |
1648 | do_retry = 0; | 1832 | if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) { |
1649 | if (!(gfp_mask & __GFP_NORETRY)) { | 1833 | /* Wait for some write requests to complete then retry */ |
1650 | if (order <= PAGE_ALLOC_COSTLY_ORDER) { | ||
1651 | do_retry = 1; | ||
1652 | } else { | ||
1653 | if (gfp_mask & __GFP_REPEAT && | ||
1654 | pages_reclaimed < (1 << order)) | ||
1655 | do_retry = 1; | ||
1656 | } | ||
1657 | if (gfp_mask & __GFP_NOFAIL) | ||
1658 | do_retry = 1; | ||
1659 | } | ||
1660 | if (do_retry) { | ||
1661 | congestion_wait(WRITE, HZ/50); | 1834 | congestion_wait(WRITE, HZ/50); |
1662 | goto rebalance; | 1835 | goto rebalance; |
1663 | } | 1836 | } |
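
The long hunk above replaces the monolithic __alloc_pages_internal() with small helpers and a single __alloc_pages_slowpath() that tries its options in a fixed order: wake kswapd, retry with watermarks, allocate without watermarks if the context allows, bail out for atomic callers, direct reclaim, the OOM killer only when no progress was made and __GFP_FS is set without __GFP_NORETRY, and finally should_alloc_retry() decides whether to wait for writeback and loop. The stub-driven outline below compresses that ordering into plain C; every helper is a fake returning a canned answer, so it demonstrates only the control flow, not the real reclaim or OOM behaviour:

    #include <stdbool.h>
    #include <stdio.h>

    static int attempts;

    /* canned stand-ins for the real helpers */
    static bool try_with_watermarks(void)    { return ++attempts > 1; } /* succeeds on retry */
    static bool try_no_watermarks(void)      { return false; }          /* PF_MEMALLOC etc. */
    static bool direct_reclaim(long *nr)     { *nr = 32; return false; }/* reclaimed, then raced */
    static bool oom_kill_then_alloc(void)    { return false; }
    static bool should_retry(long reclaimed) { return reclaimed > 0; }

    static bool slowpath(bool can_wait, bool gfp_fs, bool noretry)
    {
        long reclaimed = 0, progress;

        for (;;) {                            /* "restart"/"rebalance" loop */
            if (try_with_watermarks())
                return true;
            if (try_no_watermarks())
                return true;
            if (!can_wait)
                return false;                 /* atomic: nothing left to try */
            if (direct_reclaim(&progress))
                return true;
            if (!progress && gfp_fs && !noretry) {
                if (oom_kill_then_alloc())
                    return true;
                continue;                     /* "goto restart" after the OOM killer */
            }
            reclaimed += progress;
            if (!should_retry(reclaimed))
                return false;
            /* congestion_wait(WRITE, HZ/50) here in the kernel, then retry */
        }
    }

    int main(void)
    {
        printf("allocation %s after %d passes\n",
               slowpath(true, true, false) ? "succeeded" : "failed", attempts);
        return 0;
    }
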
@@ -1670,10 +1843,58 @@ nopage: | |||
1670 | dump_stack(); | 1843 | dump_stack(); |
1671 | show_mem(); | 1844 | show_mem(); |
1672 | } | 1845 | } |
1846 | return page; | ||
1673 | got_pg: | 1847 | got_pg: |
1848 | if (kmemcheck_enabled) | ||
1849 | kmemcheck_pagealloc_alloc(page, order, gfp_mask); | ||
1674 | return page; | 1850 | return page; |
1851 | |||
1675 | } | 1852 | } |
1676 | EXPORT_SYMBOL(__alloc_pages_internal); | 1853 | |
1854 | /* | ||
1855 | * This is the 'heart' of the zoned buddy allocator. | ||
1856 | */ | ||
1857 | struct page * | ||
1858 | __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | ||
1859 | struct zonelist *zonelist, nodemask_t *nodemask) | ||
1860 | { | ||
1861 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | ||
1862 | struct zone *preferred_zone; | ||
1863 | struct page *page; | ||
1864 | int migratetype = allocflags_to_migratetype(gfp_mask); | ||
1865 | |||
1866 | lockdep_trace_alloc(gfp_mask); | ||
1867 | |||
1868 | might_sleep_if(gfp_mask & __GFP_WAIT); | ||
1869 | |||
1870 | if (should_fail_alloc_page(gfp_mask, order)) | ||
1871 | return NULL; | ||
1872 | |||
1873 | /* | ||
1874 | * Check the zones suitable for the gfp_mask contain at least one | ||
1875 | * valid zone. It's possible to have an empty zonelist as a result | ||
1876 | * of GFP_THISNODE and a memoryless node | ||
1877 | */ | ||
1878 | if (unlikely(!zonelist->_zonerefs->zone)) | ||
1879 | return NULL; | ||
1880 | |||
1881 | /* The preferred zone is used for statistics later */ | ||
1882 | first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone); | ||
1883 | if (!preferred_zone) | ||
1884 | return NULL; | ||
1885 | |||
1886 | /* First allocation attempt */ | ||
1887 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, | ||
1888 | zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET, | ||
1889 | preferred_zone, migratetype); | ||
1890 | if (unlikely(!page)) | ||
1891 | page = __alloc_pages_slowpath(gfp_mask, order, | ||
1892 | zonelist, high_zoneidx, nodemask, | ||
1893 | preferred_zone, migratetype); | ||
1894 | |||
1895 | return page; | ||
1896 | } | ||
1897 | EXPORT_SYMBOL(__alloc_pages_nodemask); | ||
1677 | 1898 | ||
1678 | /* | 1899 | /* |
1679 | * Common helper functions. | 1900 | * Common helper functions. |
@@ -1802,7 +2023,7 @@ static unsigned int nr_free_zone_pages(int offset) | |||
1802 | 2023 | ||
1803 | for_each_zone_zonelist(zone, z, zonelist, offset) { | 2024 | for_each_zone_zonelist(zone, z, zonelist, offset) { |
1804 | unsigned long size = zone->present_pages; | 2025 | unsigned long size = zone->present_pages; |
1805 | unsigned long high = zone->pages_high; | 2026 | unsigned long high = high_wmark_pages(zone); |
1806 | if (size > high) | 2027 | if (size > high) |
1807 | sum += size - high; | 2028 | sum += size - high; |
1808 | } | 2029 | } |
@@ -1894,19 +2115,14 @@ void show_free_areas(void) | |||
1894 | 2115 | ||
1895 | printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n" | 2116 | printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n" |
1896 | " inactive_file:%lu" | 2117 | " inactive_file:%lu" |
1897 | //TODO: check/adjust line lengths | ||
1898 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
1899 | " unevictable:%lu" | 2118 | " unevictable:%lu" |
1900 | #endif | ||
1901 | " dirty:%lu writeback:%lu unstable:%lu\n" | 2119 | " dirty:%lu writeback:%lu unstable:%lu\n" |
1902 | " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", | 2120 | " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", |
1903 | global_page_state(NR_ACTIVE_ANON), | 2121 | global_page_state(NR_ACTIVE_ANON), |
1904 | global_page_state(NR_ACTIVE_FILE), | 2122 | global_page_state(NR_ACTIVE_FILE), |
1905 | global_page_state(NR_INACTIVE_ANON), | 2123 | global_page_state(NR_INACTIVE_ANON), |
1906 | global_page_state(NR_INACTIVE_FILE), | 2124 | global_page_state(NR_INACTIVE_FILE), |
1907 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
1908 | global_page_state(NR_UNEVICTABLE), | 2125 | global_page_state(NR_UNEVICTABLE), |
1909 | #endif | ||
1910 | global_page_state(NR_FILE_DIRTY), | 2126 | global_page_state(NR_FILE_DIRTY), |
1911 | global_page_state(NR_WRITEBACK), | 2127 | global_page_state(NR_WRITEBACK), |
1912 | global_page_state(NR_UNSTABLE_NFS), | 2128 | global_page_state(NR_UNSTABLE_NFS), |
@@ -1930,25 +2146,21 @@ void show_free_areas(void) | |||
1930 | " inactive_anon:%lukB" | 2146 | " inactive_anon:%lukB" |
1931 | " active_file:%lukB" | 2147 | " active_file:%lukB" |
1932 | " inactive_file:%lukB" | 2148 | " inactive_file:%lukB" |
1933 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
1934 | " unevictable:%lukB" | 2149 | " unevictable:%lukB" |
1935 | #endif | ||
1936 | " present:%lukB" | 2150 | " present:%lukB" |
1937 | " pages_scanned:%lu" | 2151 | " pages_scanned:%lu" |
1938 | " all_unreclaimable? %s" | 2152 | " all_unreclaimable? %s" |
1939 | "\n", | 2153 | "\n", |
1940 | zone->name, | 2154 | zone->name, |
1941 | K(zone_page_state(zone, NR_FREE_PAGES)), | 2155 | K(zone_page_state(zone, NR_FREE_PAGES)), |
1942 | K(zone->pages_min), | 2156 | K(min_wmark_pages(zone)), |
1943 | K(zone->pages_low), | 2157 | K(low_wmark_pages(zone)), |
1944 | K(zone->pages_high), | 2158 | K(high_wmark_pages(zone)), |
1945 | K(zone_page_state(zone, NR_ACTIVE_ANON)), | 2159 | K(zone_page_state(zone, NR_ACTIVE_ANON)), |
1946 | K(zone_page_state(zone, NR_INACTIVE_ANON)), | 2160 | K(zone_page_state(zone, NR_INACTIVE_ANON)), |
1947 | K(zone_page_state(zone, NR_ACTIVE_FILE)), | 2161 | K(zone_page_state(zone, NR_ACTIVE_FILE)), |
1948 | K(zone_page_state(zone, NR_INACTIVE_FILE)), | 2162 | K(zone_page_state(zone, NR_INACTIVE_FILE)), |
1949 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
1950 | K(zone_page_state(zone, NR_UNEVICTABLE)), | 2163 | K(zone_page_state(zone, NR_UNEVICTABLE)), |
1951 | #endif | ||
1952 | K(zone->present_pages), | 2164 | K(zone->present_pages), |
1953 | zone->pages_scanned, | 2165 | zone->pages_scanned, |
1954 | (zone_is_all_unreclaimable(zone) ? "yes" : "no") | 2166 | (zone_is_all_unreclaimable(zone) ? "yes" : "no") |
@@ -2106,7 +2318,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write, | |||
2106 | } | 2318 | } |
2107 | 2319 | ||
2108 | 2320 | ||
2109 | #define MAX_NODE_LOAD (num_online_nodes()) | 2321 | #define MAX_NODE_LOAD (nr_online_nodes) |
2110 | static int node_load[MAX_NUMNODES]; | 2322 | static int node_load[MAX_NUMNODES]; |
2111 | 2323 | ||
2112 | /** | 2324 | /** |
@@ -2315,7 +2527,7 @@ static void build_zonelists(pg_data_t *pgdat) | |||
2315 | 2527 | ||
2316 | /* NUMA-aware ordering of nodes */ | 2528 | /* NUMA-aware ordering of nodes */ |
2317 | local_node = pgdat->node_id; | 2529 | local_node = pgdat->node_id; |
2318 | load = num_online_nodes(); | 2530 | load = nr_online_nodes; |
2319 | prev_node = local_node; | 2531 | prev_node = local_node; |
2320 | nodes_clear(used_mask); | 2532 | nodes_clear(used_mask); |
2321 | 2533 | ||
@@ -2466,7 +2678,7 @@ void build_all_zonelists(void) | |||
2466 | 2678 | ||
2467 | printk("Built %i zonelists in %s order, mobility grouping %s. " | 2679 | printk("Built %i zonelists in %s order, mobility grouping %s. " |
2468 | "Total pages: %ld\n", | 2680 | "Total pages: %ld\n", |
2469 | num_online_nodes(), | 2681 | nr_online_nodes, |
2470 | zonelist_order_name[current_zonelist_order], | 2682 | zonelist_order_name[current_zonelist_order], |
2471 | page_group_by_mobility_disabled ? "off" : "on", | 2683 | page_group_by_mobility_disabled ? "off" : "on", |
2472 | vm_total_pages); | 2684 | vm_total_pages); |
@@ -2545,8 +2757,8 @@ static inline unsigned long wait_table_bits(unsigned long size) | |||
2545 | 2757 | ||
2546 | /* | 2758 | /* |
2547 | * Mark a number of pageblocks as MIGRATE_RESERVE. The number | 2759 | * Mark a number of pageblocks as MIGRATE_RESERVE. The number |
2548 | * of blocks reserved is based on zone->pages_min. The memory within the | 2760 | * of blocks reserved is based on min_wmark_pages(zone). The memory within |
2549 | * reserve will tend to store contiguous free pages. Setting min_free_kbytes | 2761 | * the reserve will tend to store contiguous free pages. Setting min_free_kbytes |
2550 | * higher will lead to a bigger reserve which will get freed as contiguous | 2762 | * higher will lead to a bigger reserve which will get freed as contiguous |
2551 | * blocks as reclaim kicks in | 2763 | * blocks as reclaim kicks in |
2552 | */ | 2764 | */ |
@@ -2559,7 +2771,7 @@ static void setup_zone_migrate_reserve(struct zone *zone) | |||
2559 | /* Get the start pfn, end pfn and the number of blocks to reserve */ | 2771 | /* Get the start pfn, end pfn and the number of blocks to reserve */ |
2560 | start_pfn = zone->zone_start_pfn; | 2772 | start_pfn = zone->zone_start_pfn; |
2561 | end_pfn = start_pfn + zone->spanned_pages; | 2773 | end_pfn = start_pfn + zone->spanned_pages; |
2562 | reserve = roundup(zone->pages_min, pageblock_nr_pages) >> | 2774 | reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> |
2563 | pageblock_order; | 2775 | pageblock_order; |
2564 | 2776 | ||
2565 | for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { | 2777 | for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { |
@@ -3103,64 +3315,6 @@ void __init sparse_memory_present_with_active_regions(int nid) | |||
3103 | } | 3315 | } |
3104 | 3316 | ||
3105 | /** | 3317 | /** |
3106 | * push_node_boundaries - Push node boundaries to at least the requested boundary | ||
3107 | * @nid: The nid of the node to push the boundary for | ||
3108 | * @start_pfn: The start pfn of the node | ||
3109 | * @end_pfn: The end pfn of the node | ||
3110 | * | ||
3111 | * In reserve-based hot-add, mem_map is allocated that is unused until hotadd | ||
3112 | * time. Specifically, on x86_64, SRAT will report ranges that can potentially | ||
3113 | * be hotplugged even though no physical memory exists. This function allows | ||
3114 | * an arch to push out the node boundaries so mem_map is allocated that can | ||
3115 | * be used later. | ||
3116 | */ | ||
3117 | #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE | ||
3118 | void __init push_node_boundaries(unsigned int nid, | ||
3119 | unsigned long start_pfn, unsigned long end_pfn) | ||
3120 | { | ||
3121 | mminit_dprintk(MMINIT_TRACE, "zoneboundary", | ||
3122 | "Entering push_node_boundaries(%u, %lu, %lu)\n", | ||
3123 | nid, start_pfn, end_pfn); | ||
3124 | |||
3125 | /* Initialise the boundary for this node if necessary */ | ||
3126 | if (node_boundary_end_pfn[nid] == 0) | ||
3127 | node_boundary_start_pfn[nid] = -1UL; | ||
3128 | |||
3129 | /* Update the boundaries */ | ||
3130 | if (node_boundary_start_pfn[nid] > start_pfn) | ||
3131 | node_boundary_start_pfn[nid] = start_pfn; | ||
3132 | if (node_boundary_end_pfn[nid] < end_pfn) | ||
3133 | node_boundary_end_pfn[nid] = end_pfn; | ||
3134 | } | ||
3135 | |||
3136 | /* If necessary, push the node boundary out for reserve hotadd */ | ||
3137 | static void __meminit account_node_boundary(unsigned int nid, | ||
3138 | unsigned long *start_pfn, unsigned long *end_pfn) | ||
3139 | { | ||
3140 | mminit_dprintk(MMINIT_TRACE, "zoneboundary", | ||
3141 | "Entering account_node_boundary(%u, %lu, %lu)\n", | ||
3142 | nid, *start_pfn, *end_pfn); | ||
3143 | |||
3144 | /* Return if boundary information has not been provided */ | ||
3145 | if (node_boundary_end_pfn[nid] == 0) | ||
3146 | return; | ||
3147 | |||
3148 | /* Check the boundaries and update if necessary */ | ||
3149 | if (node_boundary_start_pfn[nid] < *start_pfn) | ||
3150 | *start_pfn = node_boundary_start_pfn[nid]; | ||
3151 | if (node_boundary_end_pfn[nid] > *end_pfn) | ||
3152 | *end_pfn = node_boundary_end_pfn[nid]; | ||
3153 | } | ||
3154 | #else | ||
3155 | void __init push_node_boundaries(unsigned int nid, | ||
3156 | unsigned long start_pfn, unsigned long end_pfn) {} | ||
3157 | |||
3158 | static void __meminit account_node_boundary(unsigned int nid, | ||
3159 | unsigned long *start_pfn, unsigned long *end_pfn) {} | ||
3160 | #endif | ||
3161 | |||
3162 | |||
3163 | /** | ||
3164 | * get_pfn_range_for_nid - Return the start and end page frames for a node | 3318 | * get_pfn_range_for_nid - Return the start and end page frames for a node |
3165 | * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. | 3319 | * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. |
3166 | * @start_pfn: Passed by reference. On return, it will have the node start_pfn. | 3320 | * @start_pfn: Passed by reference. On return, it will have the node start_pfn. |
@@ -3185,9 +3339,6 @@ void __meminit get_pfn_range_for_nid(unsigned int nid, | |||
3185 | 3339 | ||
3186 | if (*start_pfn == -1UL) | 3340 | if (*start_pfn == -1UL) |
3187 | *start_pfn = 0; | 3341 | *start_pfn = 0; |
3188 | |||
3189 | /* Push the node boundaries out if requested */ | ||
3190 | account_node_boundary(nid, start_pfn, end_pfn); | ||
3191 | } | 3342 | } |
3192 | 3343 | ||
3193 | /* | 3344 | /* |
@@ -3552,7 +3703,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
3552 | zone_pcp_init(zone); | 3703 | zone_pcp_init(zone); |
3553 | for_each_lru(l) { | 3704 | for_each_lru(l) { |
3554 | INIT_LIST_HEAD(&zone->lru[l].list); | 3705 | INIT_LIST_HEAD(&zone->lru[l].list); |
3555 | zone->lru[l].nr_scan = 0; | 3706 | zone->lru[l].nr_saved_scan = 0; |
3556 | } | 3707 | } |
3557 | zone->reclaim_stat.recent_rotated[0] = 0; | 3708 | zone->reclaim_stat.recent_rotated[0] = 0; |
3558 | zone->reclaim_stat.recent_rotated[1] = 0; | 3709 | zone->reclaim_stat.recent_rotated[1] = 0; |
@@ -3793,10 +3944,6 @@ void __init remove_all_active_ranges(void) | |||
3793 | { | 3944 | { |
3794 | memset(early_node_map, 0, sizeof(early_node_map)); | 3945 | memset(early_node_map, 0, sizeof(early_node_map)); |
3795 | nr_nodemap_entries = 0; | 3946 | nr_nodemap_entries = 0; |
3796 | #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE | ||
3797 | memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn)); | ||
3798 | memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn)); | ||
3799 | #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ | ||
3800 | } | 3947 | } |
3801 | 3948 | ||
3802 | /* Compare two active node_active_regions */ | 3949 | /* Compare two active node_active_regions */ |
@@ -4093,6 +4240,11 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) | |||
4093 | early_node_map[i].start_pfn, | 4240 | early_node_map[i].start_pfn, |
4094 | early_node_map[i].end_pfn); | 4241 | early_node_map[i].end_pfn); |
4095 | 4242 | ||
4243 | /* | ||
4244 | * find_zone_movable_pfns_for_nodes/early_calculate_totalpages init | ||
4245 | * that node_mask, clear it at first | ||
4246 | */ | ||
4247 | nodes_clear(node_states[N_HIGH_MEMORY]); | ||
4096 | /* Initialise every node */ | 4248 | /* Initialise every node */ |
4097 | mminit_verify_pageflags_layout(); | 4249 | mminit_verify_pageflags_layout(); |
4098 | setup_nr_node_ids(); | 4250 | setup_nr_node_ids(); |
@@ -4227,8 +4379,8 @@ static void calculate_totalreserve_pages(void) | |||
4227 | max = zone->lowmem_reserve[j]; | 4379 | max = zone->lowmem_reserve[j]; |
4228 | } | 4380 | } |
4229 | 4381 | ||
4230 | /* we treat pages_high as reserved pages. */ | 4382 | /* we treat the high watermark as reserved pages. */ |
4231 | max += zone->pages_high; | 4383 | max += high_wmark_pages(zone); |
4232 | 4384 | ||
4233 | if (max > zone->present_pages) | 4385 | if (max > zone->present_pages) |
4234 | max = zone->present_pages; | 4386 | max = zone->present_pages; |
@@ -4278,12 +4430,13 @@ static void setup_per_zone_lowmem_reserve(void) | |||
4278 | } | 4430 | } |
4279 | 4431 | ||
4280 | /** | 4432 | /** |
4281 | * setup_per_zone_pages_min - called when min_free_kbytes changes. | 4433 | * setup_per_zone_wmarks - called when min_free_kbytes changes |
4434 | * or when memory is hot-{added|removed} | ||
4282 | * | 4435 | * |
4283 | * Ensures that the pages_{min,low,high} values for each zone are set correctly | 4436 | * Ensures that the watermark[min,low,high] values for each zone are set |
4284 | * with respect to min_free_kbytes. | 4437 | * correctly with respect to min_free_kbytes. |
4285 | */ | 4438 | */ |
4286 | void setup_per_zone_pages_min(void) | 4439 | void setup_per_zone_wmarks(void) |
4287 | { | 4440 | { |
4288 | unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); | 4441 | unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); |
4289 | unsigned long lowmem_pages = 0; | 4442 | unsigned long lowmem_pages = 0; |
@@ -4308,7 +4461,7 @@ void setup_per_zone_pages_min(void) | |||
4308 | * need highmem pages, so cap pages_min to a small | 4461 | * need highmem pages, so cap pages_min to a small |
4309 | * value here. | 4462 | * value here. |
4310 | * | 4463 | * |
4311 | * The (pages_high-pages_low) and (pages_low-pages_min) | 4464 | * The (WMARK_HIGH-WMARK_LOW) and (WMARK_LOW-WMARK_MIN) |
4312 | * deltas control async page reclaim, and so should | 4465 | * deltas control async page reclaim, and so should |
4313 | * not be capped for highmem. | 4466 | * not be capped for highmem. |
4314 | */ | 4467 | */ |
@@ -4319,17 +4472,17 @@ void setup_per_zone_pages_min(void) | |||
4319 | min_pages = SWAP_CLUSTER_MAX; | 4472 | min_pages = SWAP_CLUSTER_MAX; |
4320 | if (min_pages > 128) | 4473 | if (min_pages > 128) |
4321 | min_pages = 128; | 4474 | min_pages = 128; |
4322 | zone->pages_min = min_pages; | 4475 | zone->watermark[WMARK_MIN] = min_pages; |
4323 | } else { | 4476 | } else { |
4324 | /* | 4477 | /* |
4325 | * If it's a lowmem zone, reserve a number of pages | 4478 | * If it's a lowmem zone, reserve a number of pages |
4326 | * proportionate to the zone's size. | 4479 | * proportionate to the zone's size. |
4327 | */ | 4480 | */ |
4328 | zone->pages_min = tmp; | 4481 | zone->watermark[WMARK_MIN] = tmp; |
4329 | } | 4482 | } |
4330 | 4483 | ||
4331 | zone->pages_low = zone->pages_min + (tmp >> 2); | 4484 | zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); |
4332 | zone->pages_high = zone->pages_min + (tmp >> 1); | 4485 | zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); |
4333 | setup_zone_migrate_reserve(zone); | 4486 | setup_zone_migrate_reserve(zone); |
4334 | spin_unlock_irqrestore(&zone->lock, flags); | 4487 | spin_unlock_irqrestore(&zone->lock, flags); |
4335 | } | 4488 | } |
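
setup_per_zone_wmarks() keeps the old arithmetic and only changes where the results are stored: each lowmem zone gets WMARK_MIN = its proportional share of min_free_kbytes converted to pages, WMARK_LOW = min plus a quarter of that share, and WMARK_HIGH = min plus half of it. A worked example with made-up numbers, assuming 4 KiB pages and a single lowmem zone that owns the whole share:

    #include <stdio.h>

    int main(void)
    {
        unsigned long min_free_kbytes = 4096;                    /* hypothetical sysctl value */
        unsigned long pages_min = min_free_kbytes >> (12 - 10);  /* PAGE_SHIFT=12 -> 1024 pages */
        unsigned long tmp = pages_min;                           /* this zone's share of pages_min */

        unsigned long wmark_min  = tmp;                          /* watermark[WMARK_MIN]  = 1024 */
        unsigned long wmark_low  = wmark_min + (tmp >> 2);       /* watermark[WMARK_LOW]  = 1280 */
        unsigned long wmark_high = wmark_min + (tmp >> 1);       /* watermark[WMARK_HIGH] = 1536 */

        printf("min=%lu low=%lu high=%lu pages\n", wmark_min, wmark_low, wmark_high);
        return 0;
    }
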
@@ -4339,8 +4492,6 @@ void setup_per_zone_pages_min(void) | |||
4339 | } | 4492 | } |
4340 | 4493 | ||
4341 | /** | 4494 | /** |
4342 | * setup_per_zone_inactive_ratio - called when min_free_kbytes changes. | ||
4343 | * | ||
4344 | * The inactive anon list should be small enough that the VM never has to | 4495 | * The inactive anon list should be small enough that the VM never has to |
4345 | * do too much work, but large enough that each inactive page has a chance | 4496 | * do too much work, but large enough that each inactive page has a chance |
4346 | * to be referenced again before it is swapped out. | 4497 | * to be referenced again before it is swapped out. |
@@ -4361,21 +4512,26 @@ void setup_per_zone_pages_min(void) | |||
4361 | * 1TB 101 10GB | 4512 | * 1TB 101 10GB |
4362 | * 10TB 320 32GB | 4513 | * 10TB 320 32GB |
4363 | */ | 4514 | */ |
4364 | static void setup_per_zone_inactive_ratio(void) | 4515 | void calculate_zone_inactive_ratio(struct zone *zone) |
4365 | { | 4516 | { |
4366 | struct zone *zone; | 4517 | unsigned int gb, ratio; |
4367 | 4518 | ||
4368 | for_each_zone(zone) { | 4519 | /* Zone size in gigabytes */ |
4369 | unsigned int gb, ratio; | 4520 | gb = zone->present_pages >> (30 - PAGE_SHIFT); |
4370 | 4521 | if (gb) | |
4371 | /* Zone size in gigabytes */ | ||
4372 | gb = zone->present_pages >> (30 - PAGE_SHIFT); | ||
4373 | ratio = int_sqrt(10 * gb); | 4522 | ratio = int_sqrt(10 * gb); |
4374 | if (!ratio) | 4523 | else |
4375 | ratio = 1; | 4524 | ratio = 1; |
4376 | 4525 | ||
4377 | zone->inactive_ratio = ratio; | 4526 | zone->inactive_ratio = ratio; |
4378 | } | 4527 | } |
4528 | |||
4529 | static void __init setup_per_zone_inactive_ratio(void) | ||
4530 | { | ||
4531 | struct zone *zone; | ||
4532 | |||
4533 | for_each_zone(zone) | ||
4534 | calculate_zone_inactive_ratio(zone); | ||
4379 | } | 4535 | } |
4380 | 4536 | ||
4381 | /* | 4537 | /* |
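
Splitting calculate_zone_inactive_ratio() out of the old for_each_zone() loop lets a single zone's ratio be recomputed (e.g. after memory hot-add), but the formula is unchanged: ratio = int_sqrt(10 * zone size in GB), with a floor of 1, so the target inactive anon list is roughly present_pages / ratio. The quick check below reproduces the table in the comment, using floor(sqrt()) as a userspace stand-in for int_sqrt() (compile with -lm):

    #include <math.h>
    #include <stdio.h>

    static unsigned int inactive_ratio(unsigned long zone_gb)
    {
        unsigned int ratio = (unsigned int)sqrt(10.0 * zone_gb);  /* ~ int_sqrt(10 * gb) */
        return ratio ? ratio : 1;
    }

    int main(void)
    {
        unsigned long sizes_gb[] = { 1, 10, 100, 1024, 10240 };   /* 1 GB .. 10 TB */
        for (unsigned int i = 0; i < sizeof(sizes_gb) / sizeof(sizes_gb[0]); i++)
            printf("%6lu GB -> inactive_ratio %3u\n",
                   sizes_gb[i], inactive_ratio(sizes_gb[i]));
        /* prints 3, 10, 31, 101, 320 -- matching the table in the comment above */
        return 0;
    }
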
@@ -4402,7 +4558,7 @@ static void setup_per_zone_inactive_ratio(void) | |||
4402 | * 8192MB: 11584k | 4558 | * 8192MB: 11584k |
4403 | * 16384MB: 16384k | 4559 | * 16384MB: 16384k |
4404 | */ | 4560 | */ |
4405 | static int __init init_per_zone_pages_min(void) | 4561 | static int __init init_per_zone_wmark_min(void) |
4406 | { | 4562 | { |
4407 | unsigned long lowmem_kbytes; | 4563 | unsigned long lowmem_kbytes; |
4408 | 4564 | ||
@@ -4413,12 +4569,12 @@ static int __init init_per_zone_pages_min(void) | |||
4413 | min_free_kbytes = 128; | 4569 | min_free_kbytes = 128; |
4414 | if (min_free_kbytes > 65536) | 4570 | if (min_free_kbytes > 65536) |
4415 | min_free_kbytes = 65536; | 4571 | min_free_kbytes = 65536; |
4416 | setup_per_zone_pages_min(); | 4572 | setup_per_zone_wmarks(); |
4417 | setup_per_zone_lowmem_reserve(); | 4573 | setup_per_zone_lowmem_reserve(); |
4418 | setup_per_zone_inactive_ratio(); | 4574 | setup_per_zone_inactive_ratio(); |
4419 | return 0; | 4575 | return 0; |
4420 | } | 4576 | } |
4421 | module_init(init_per_zone_pages_min) | 4577 | module_init(init_per_zone_wmark_min) |
4422 | 4578 | ||
4423 | /* | 4579 | /* |
4424 | * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so | 4580 | * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so |
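
For reference, the renamed init_per_zone_wmark_min() still sizes min_free_kbytes from the amount of lowmem and then clamps it to [128, 65536] kB as the hunk shows. The values in the preceding comment (16MB -> 512k ... 16384MB -> 16384k) correspond to roughly sqrt(16 * lowmem_kbytes); that step is inferred from the table, not visible in this hunk, so treat the sketch below as a hedged reconstruction:

/* Sketch: min_free_kbytes ~= sqrt(16 * lowmem_kbytes), clamped to
 * [128, 65536] kB.  Only the clamping appears in this hunk; build
 * with -lm. */
#include <stdio.h>
#include <math.h>

static unsigned long min_free_kbytes_for(unsigned long lowmem_kbytes)
{
        unsigned long min_free_kbytes =
                (unsigned long)sqrt((double)lowmem_kbytes * 16);

        if (min_free_kbytes < 128)
                min_free_kbytes = 128;
        if (min_free_kbytes > 65536)
                min_free_kbytes = 65536;
        return min_free_kbytes;
}

int main(void)
{
        unsigned long lowmem_mb[] = { 16, 256, 1024, 8192, 16384 };
        int i;

        for (i = 0; i < 5; i++)
                printf("%6lu MB lowmem -> min_free_kbytes ~ %lu\n",
                       lowmem_mb[i], min_free_kbytes_for(lowmem_mb[i] * 1024));
        return 0;
}
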
@@ -4430,7 +4586,7 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write, | |||
4430 | { | 4586 | { |
4431 | proc_dointvec(table, write, file, buffer, length, ppos); | 4587 | proc_dointvec(table, write, file, buffer, length, ppos); |
4432 | if (write) | 4588 | if (write) |
4433 | setup_per_zone_pages_min(); | 4589 | setup_per_zone_wmarks(); |
4434 | return 0; | 4590 | return 0; |
4435 | } | 4591 | } |
4436 | 4592 | ||
@@ -4474,7 +4630,7 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, | |||
4474 | * whenever sysctl_lowmem_reserve_ratio changes. | 4630 | * whenever sysctl_lowmem_reserve_ratio changes. |
4475 | * | 4631 | * |
4476 | * The reserve ratio obviously has absolutely no relation with the | 4632 | * The reserve ratio obviously has absolutely no relation with the |
4477 | * pages_min watermarks. The lowmem reserve ratio can only make sense | 4633 | * minimum watermarks. The lowmem reserve ratio can only make sense |
4478 | * if in function of the boot time zone sizes. | 4634 | * if in function of the boot time zone sizes. |
4479 | */ | 4635 | */ |
4480 | int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, | 4636 | int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, |
@@ -4581,23 +4737,13 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
4581 | else if (hashdist) | 4737 | else if (hashdist) |
4582 | table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); | 4738 | table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); |
4583 | else { | 4739 | else { |
4584 | unsigned long order = get_order(size); | ||
4585 | table = (void*) __get_free_pages(GFP_ATOMIC, order); | ||
4586 | /* | 4740 | /* |
4587 | * If bucketsize is not a power-of-two, we may free | 4741 | * If bucketsize is not a power-of-two, we may free |
4588 | * some pages at the end of hash table. | 4742 | * some pages at the end of hash table which |
4743 | * alloc_pages_exact() automatically does | ||
4589 | */ | 4744 | */ |
4590 | if (table) { | 4745 | if (get_order(size) < MAX_ORDER) |
4591 | unsigned long alloc_end = (unsigned long)table + | 4746 | table = alloc_pages_exact(size, GFP_ATOMIC); |
4592 | (PAGE_SIZE << order); | ||
4593 | unsigned long used = (unsigned long)table + | ||
4594 | PAGE_ALIGN(size); | ||
4595 | split_page(virt_to_page(table), order); | ||
4596 | while (used < alloc_end) { | ||
4597 | free_page(used); | ||
4598 | used += PAGE_SIZE; | ||
4599 | } | ||
4600 | } | ||
4601 | } | 4747 | } |
4602 | } while (!table && size > PAGE_SIZE && --log2qty); | 4748 | } while (!table && size > PAGE_SIZE && --log2qty); |
4603 | 4749 | ||
@@ -4615,6 +4761,16 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
4615 | if (_hash_mask) | 4761 | if (_hash_mask) |
4616 | *_hash_mask = (1 << log2qty) - 1; | 4762 | *_hash_mask = (1 << log2qty) - 1; |
4617 | 4763 | ||
4764 | /* | ||
4765 | * If hashdist is set, the table allocation is done with __vmalloc() | ||
4766 | * which invokes the kmemleak_alloc() callback. This function may also | ||
4767 | * be called before the slab and kmemleak are initialised when | ||
4768 | * kmemleak simply buffers the request to be executed later | ||
4769 | * (GFP_ATOMIC flag ignored in this case). | ||
4770 | */ | ||
4771 | if (!hashdist) | ||
4772 | kmemleak_alloc(table, size, 1, GFP_ATOMIC); | ||
4773 | |||
4618 | return table; | 4774 | return table; |
4619 | } | 4775 | } |
4620 | 4776 | ||
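
The kmemleak_alloc() call registers the page-allocator-backed hash table with the leak scanner, since only slab and vmalloc allocations are tracked automatically and the hashdist/__vmalloc() path already reports itself. As a rough sketch of the usual pairing for such manually tracked memory (not from this patch; kmemleak_free() is the documented counterpart of kmemleak_alloc()):

/*
 * Sketch (not from this patch): track a page-allocator buffer with
 * kmemleak by hand, and drop the tracking object before freeing.
 */
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/kmemleak.h>
#include <linux/mm.h>
#include <linux/module.h>

static void *example_buf;
static size_t example_size = 4 * PAGE_SIZE;

static int __init example_init(void)
{
        example_buf = alloc_pages_exact(example_size, GFP_KERNEL);
        if (!example_buf)
                return -ENOMEM;
        /* min_count = 1: warn if no references to example_buf remain */
        kmemleak_alloc(example_buf, example_size, 1, GFP_KERNEL);
        return 0;
}

static void __exit example_exit(void)
{
        kmemleak_free(example_buf);             /* drop the tracking object */
        free_pages_exact(example_buf, example_size);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");
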
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 791905c991df..11a8a10a3909 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c | |||
@@ -69,7 +69,7 @@ static int __init alloc_node_page_cgroup(int nid) | |||
69 | return 0; | 69 | return 0; |
70 | } | 70 | } |
71 | 71 | ||
72 | void __init page_cgroup_init(void) | 72 | void __init page_cgroup_init_flatmem(void) |
73 | { | 73 | { |
74 | 74 | ||
75 | int nid, fail; | 75 | int nid, fail; |
@@ -113,16 +113,11 @@ static int __init_refok init_section_page_cgroup(unsigned long pfn) | |||
113 | if (!section->page_cgroup) { | 113 | if (!section->page_cgroup) { |
114 | nid = page_to_nid(pfn_to_page(pfn)); | 114 | nid = page_to_nid(pfn_to_page(pfn)); |
115 | table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; | 115 | table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; |
116 | if (slab_is_available()) { | 116 | VM_BUG_ON(!slab_is_available()); |
117 | base = kmalloc_node(table_size, | 117 | base = kmalloc_node(table_size, |
118 | GFP_KERNEL | __GFP_NOWARN, nid); | 118 | GFP_KERNEL | __GFP_NOWARN, nid); |
119 | if (!base) | 119 | if (!base) |
120 | base = vmalloc_node(table_size, nid); | 120 | base = vmalloc_node(table_size, nid); |
121 | } else { | ||
122 | base = __alloc_bootmem_node_nopanic(NODE_DATA(nid), | ||
123 | table_size, | ||
124 | PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); | ||
125 | } | ||
126 | } else { | 121 | } else { |
127 | /* | 122 | /* |
128 | * We don't have to allocate page_cgroup again, but | 123 | * We don't have to allocate page_cgroup again, but |
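
With the bootmem branch gone, init_section_page_cgroup() insists that the slab is up (VM_BUG_ON) and falls back from kmalloc_node() to vmalloc_node() only when the contiguous allocation fails. The shape of that fallback, sketched outside the kernel with toy allocators (sizes and helper names here are illustrative):

/* Sketch: try a cheap size-limited allocator first, fall back to a
 * slower any-size allocator, mirroring kmalloc_node() -> vmalloc_node(). */
#include <stdio.h>
#include <stdlib.h>

/* stand-in for kmalloc_node(): refuse anything past an arbitrary cap */
static void *small_alloc(size_t size)
{
        return size <= 128 * 1024 ? malloc(size) : NULL;
}

/* stand-in for vmalloc_node(): always able to satisfy the request */
static void *big_alloc(size_t size)
{
        return malloc(size);
}

int main(void)
{
        size_t table_size = 2 * 1024 * 1024;    /* e.g. per-section metadata */
        void *base = small_alloc(table_size);

        if (!base) {
                printf("contiguous path failed, using fallback\n");
                base = big_alloc(table_size);
        }
        if (!base)
                return 1;
        printf("allocated %zu bytes at %p\n", table_size, base);
        free(base);
        return 0;
}
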
diff --git a/mm/page_io.c b/mm/page_io.c index 3023c475e041..c6f3e5071de3 100644 --- a/mm/page_io.c +++ b/mm/page_io.c | |||
@@ -120,7 +120,7 @@ out: | |||
120 | return ret; | 120 | return ret; |
121 | } | 121 | } |
122 | 122 | ||
123 | int swap_readpage(struct file *file, struct page *page) | 123 | int swap_readpage(struct page *page) |
124 | { | 124 | { |
125 | struct bio *bio; | 125 | struct bio *bio; |
126 | int ret = 0; | 126 | int ret = 0; |
diff --git a/mm/percpu.c b/mm/percpu.c index 1aa5d8fbca12..c0b2c1a76e81 100644 --- a/mm/percpu.c +++ b/mm/percpu.c | |||
@@ -23,7 +23,7 @@ | |||
23 | * Allocation is done in offset-size areas of single unit space. Ie, | 23 | * Allocation is done in offset-size areas of single unit space. Ie, |
24 | * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0, | 24 | * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0, |
25 | * c1:u1, c1:u2 and c1:u3. Percpu access can be done by configuring | 25 | * c1:u1, c1:u2 and c1:u3. Percpu access can be done by configuring |
26 | * percpu base registers UNIT_SIZE apart. | 26 | * percpu base registers pcpu_unit_size apart. |
27 | * | 27 | * |
28 | * There are usually many small percpu allocations many of them as | 28 | * There are usually many small percpu allocations many of them as |
29 | * small as 4 bytes. The allocator organizes chunks into lists | 29 | * small as 4 bytes. The allocator organizes chunks into lists |
@@ -38,8 +38,8 @@ | |||
38 | * region and negative allocated. Allocation inside a chunk is done | 38 | * region and negative allocated. Allocation inside a chunk is done |
39 | * by scanning this map sequentially and serving the first matching | 39 | * by scanning this map sequentially and serving the first matching |
40 | * entry. This is mostly copied from the percpu_modalloc() allocator. | 40 | * entry. This is mostly copied from the percpu_modalloc() allocator. |
41 | * Chunks are also linked into a rb tree to ease address to chunk | 41 | * Chunks can be determined from the address using the index field |
42 | * mapping during free. | 42 | * in the page struct. The index field contains a pointer to the chunk. |
43 | * | 43 | * |
44 | * To use this allocator, arch code should do the followings. | 44 | * To use this allocator, arch code should do the followings. |
45 | * | 45 | * |
@@ -61,7 +61,6 @@ | |||
61 | #include <linux/mutex.h> | 61 | #include <linux/mutex.h> |
62 | #include <linux/percpu.h> | 62 | #include <linux/percpu.h> |
63 | #include <linux/pfn.h> | 63 | #include <linux/pfn.h> |
64 | #include <linux/rbtree.h> | ||
65 | #include <linux/slab.h> | 64 | #include <linux/slab.h> |
66 | #include <linux/spinlock.h> | 65 | #include <linux/spinlock.h> |
67 | #include <linux/vmalloc.h> | 66 | #include <linux/vmalloc.h> |
@@ -88,7 +87,6 @@ | |||
88 | 87 | ||
89 | struct pcpu_chunk { | 88 | struct pcpu_chunk { |
90 | struct list_head list; /* linked to pcpu_slot lists */ | 89 | struct list_head list; /* linked to pcpu_slot lists */ |
91 | struct rb_node rb_node; /* key is chunk->vm->addr */ | ||
92 | int free_size; /* free bytes in the chunk */ | 90 | int free_size; /* free bytes in the chunk */ |
93 | int contig_hint; /* max contiguous size hint */ | 91 | int contig_hint; /* max contiguous size hint */ |
94 | struct vm_struct *vm; /* mapped vmalloc region */ | 92 | struct vm_struct *vm; /* mapped vmalloc region */ |
@@ -110,9 +108,21 @@ static size_t pcpu_chunk_struct_size __read_mostly; | |||
110 | void *pcpu_base_addr __read_mostly; | 108 | void *pcpu_base_addr __read_mostly; |
111 | EXPORT_SYMBOL_GPL(pcpu_base_addr); | 109 | EXPORT_SYMBOL_GPL(pcpu_base_addr); |
112 | 110 | ||
113 | /* optional reserved chunk, only accessible for reserved allocations */ | 111 | /* |
112 | * The first chunk which always exists. Note that unlike other | ||
113 | * chunks, this one can be allocated and mapped in several different | ||
114 | * ways and thus often doesn't live in the vmalloc area. | ||
115 | */ | ||
116 | static struct pcpu_chunk *pcpu_first_chunk; | ||
117 | |||
118 | /* | ||
119 | * Optional reserved chunk. This chunk reserves part of the first | ||
120 | * chunk and serves it for reserved allocations. The amount of | ||
121 | * reserved offset is in pcpu_reserved_chunk_limit. When reserved | ||
122 | * area doesn't exist, the following variables contain NULL and 0 | ||
123 | * respectively. | ||
124 | */ | ||
114 | static struct pcpu_chunk *pcpu_reserved_chunk; | 125 | static struct pcpu_chunk *pcpu_reserved_chunk; |
115 | /* offset limit of the reserved chunk */ | ||
116 | static int pcpu_reserved_chunk_limit; | 126 | static int pcpu_reserved_chunk_limit; |
117 | 127 | ||
118 | /* | 128 | /* |
@@ -121,7 +131,7 @@ static int pcpu_reserved_chunk_limit; | |||
121 | * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former | 131 | * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former |
122 | * protects allocation/reclaim paths, chunks and chunk->page arrays. | 132 | * protects allocation/reclaim paths, chunks and chunk->page arrays. |
123 | * The latter is a spinlock and protects the index data structures - | 133 | * The latter is a spinlock and protects the index data structures - |
124 | * chunk slots, rbtree, chunks and area maps in chunks. | 134 | * chunk slots, chunks and area maps in chunks. |
125 | * | 135 | * |
126 | * During allocation, pcpu_alloc_mutex is kept locked all the time and | 136 | * During allocation, pcpu_alloc_mutex is kept locked all the time and |
127 | * pcpu_lock is grabbed and released as necessary. All actual memory | 137 | * pcpu_lock is grabbed and released as necessary. All actual memory |
@@ -140,7 +150,6 @@ static DEFINE_MUTEX(pcpu_alloc_mutex); /* protects whole alloc and reclaim */ | |||
140 | static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */ | 150 | static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */ |
141 | 151 | ||
142 | static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ | 152 | static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ |
143 | static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */ | ||
144 | 153 | ||
145 | /* reclaim work to release fully free chunks, scheduled from free path */ | 154 | /* reclaim work to release fully free chunks, scheduled from free path */ |
146 | static void pcpu_reclaim(struct work_struct *work); | 155 | static void pcpu_reclaim(struct work_struct *work); |
@@ -191,6 +200,18 @@ static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk, | |||
191 | return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL; | 200 | return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL; |
192 | } | 201 | } |
193 | 202 | ||
203 | /* set the pointer to a chunk in a page struct */ | ||
204 | static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu) | ||
205 | { | ||
206 | page->index = (unsigned long)pcpu; | ||
207 | } | ||
208 | |||
209 | /* obtain pointer to a chunk from a page struct */ | ||
210 | static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page) | ||
211 | { | ||
212 | return (struct pcpu_chunk *)page->index; | ||
213 | } | ||
214 | |||
194 | /** | 215 | /** |
195 | * pcpu_mem_alloc - allocate memory | 216 | * pcpu_mem_alloc - allocate memory |
196 | * @size: bytes to allocate | 217 | * @size: bytes to allocate |
@@ -257,93 +278,26 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) | |||
257 | } | 278 | } |
258 | } | 279 | } |
259 | 280 | ||
260 | static struct rb_node **pcpu_chunk_rb_search(void *addr, | ||
261 | struct rb_node **parentp) | ||
262 | { | ||
263 | struct rb_node **p = &pcpu_addr_root.rb_node; | ||
264 | struct rb_node *parent = NULL; | ||
265 | struct pcpu_chunk *chunk; | ||
266 | |||
267 | while (*p) { | ||
268 | parent = *p; | ||
269 | chunk = rb_entry(parent, struct pcpu_chunk, rb_node); | ||
270 | |||
271 | if (addr < chunk->vm->addr) | ||
272 | p = &(*p)->rb_left; | ||
273 | else if (addr > chunk->vm->addr) | ||
274 | p = &(*p)->rb_right; | ||
275 | else | ||
276 | break; | ||
277 | } | ||
278 | |||
279 | if (parentp) | ||
280 | *parentp = parent; | ||
281 | return p; | ||
282 | } | ||
283 | |||
284 | /** | 281 | /** |
285 | * pcpu_chunk_addr_search - search for chunk containing specified address | 282 | * pcpu_chunk_addr_search - determine chunk containing specified address |
286 | * @addr: address to search for | 283 | * @addr: address for which the chunk needs to be determined. |
287 | * | ||
288 | * Look for chunk which might contain @addr. More specifically, it | ||
289 | * searchs for the chunk with the highest start address which isn't | ||
290 | * beyond @addr. | ||
291 | * | ||
292 | * CONTEXT: | ||
293 | * pcpu_lock. | ||
294 | * | 284 | * |
295 | * RETURNS: | 285 | * RETURNS: |
296 | * The address of the found chunk. | 286 | * The address of the found chunk. |
297 | */ | 287 | */ |
298 | static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) | 288 | static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) |
299 | { | 289 | { |
300 | struct rb_node *n, *parent; | 290 | void *first_start = pcpu_first_chunk->vm->addr; |
301 | struct pcpu_chunk *chunk; | ||
302 | 291 | ||
303 | /* is it in the reserved chunk? */ | 292 | /* is it in the first chunk? */ |
304 | if (pcpu_reserved_chunk) { | 293 | if (addr >= first_start && addr < first_start + pcpu_chunk_size) { |
305 | void *start = pcpu_reserved_chunk->vm->addr; | 294 | /* is it in the reserved area? */ |
306 | 295 | if (addr < first_start + pcpu_reserved_chunk_limit) | |
307 | if (addr >= start && addr < start + pcpu_reserved_chunk_limit) | ||
308 | return pcpu_reserved_chunk; | 296 | return pcpu_reserved_chunk; |
297 | return pcpu_first_chunk; | ||
309 | } | 298 | } |
310 | 299 | ||
311 | /* nah... search the regular ones */ | 300 | return pcpu_get_page_chunk(vmalloc_to_page(addr)); |
312 | n = *pcpu_chunk_rb_search(addr, &parent); | ||
313 | if (!n) { | ||
314 | /* no exactly matching chunk, the parent is the closest */ | ||
315 | n = parent; | ||
316 | BUG_ON(!n); | ||
317 | } | ||
318 | chunk = rb_entry(n, struct pcpu_chunk, rb_node); | ||
319 | |||
320 | if (addr < chunk->vm->addr) { | ||
321 | /* the parent was the next one, look for the previous one */ | ||
322 | n = rb_prev(n); | ||
323 | BUG_ON(!n); | ||
324 | chunk = rb_entry(n, struct pcpu_chunk, rb_node); | ||
325 | } | ||
326 | |||
327 | return chunk; | ||
328 | } | ||
329 | |||
330 | /** | ||
331 | * pcpu_chunk_addr_insert - insert chunk into address rb tree | ||
332 | * @new: chunk to insert | ||
333 | * | ||
334 | * Insert @new into address rb tree. | ||
335 | * | ||
336 | * CONTEXT: | ||
337 | * pcpu_lock. | ||
338 | */ | ||
339 | static void pcpu_chunk_addr_insert(struct pcpu_chunk *new) | ||
340 | { | ||
341 | struct rb_node **p, *parent; | ||
342 | |||
343 | p = pcpu_chunk_rb_search(new->vm->addr, &parent); | ||
344 | BUG_ON(*p); | ||
345 | rb_link_node(&new->rb_node, parent, p); | ||
346 | rb_insert_color(&new->rb_node, &pcpu_addr_root); | ||
347 | } | 301 | } |
348 | 302 | ||
349 | /** | 303 | /** |
@@ -755,6 +709,7 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) | |||
755 | alloc_mask, 0); | 709 | alloc_mask, 0); |
756 | if (!*pagep) | 710 | if (!*pagep) |
757 | goto err; | 711 | goto err; |
712 | pcpu_set_page_chunk(*pagep, chunk); | ||
758 | } | 713 | } |
759 | } | 714 | } |
760 | 715 | ||
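
Instead of the rb-tree, each populated percpu page now carries a back-pointer to its chunk in page->index, so pcpu_chunk_addr_search() can go address -> page -> chunk directly, with the first/reserved chunk handled by a plain range check because it may not live in the vmalloc area at all. A stand-alone sketch of the back-pointer idea (toy types, not the kernel's struct page):

/* Sketch: stash an owner pointer in an otherwise unused field of a
 * per-page descriptor, as pcpu_set_page_chunk()/pcpu_get_page_chunk()
 * do with page->index above. */
#include <stdio.h>

struct toy_page {
        unsigned long index;            /* reused as an owner back-pointer */
};

struct toy_chunk {
        const char *name;
};

static void set_page_chunk(struct toy_page *page, struct toy_chunk *chunk)
{
        page->index = (unsigned long)chunk;
}

static struct toy_chunk *get_page_chunk(struct toy_page *page)
{
        return (struct toy_chunk *)page->index;
}

int main(void)
{
        struct toy_chunk chunk = { .name = "dynamic chunk" };
        struct toy_page page;

        set_page_chunk(&page, &chunk);          /* done at populate time */
        printf("page belongs to: %s\n", get_page_chunk(&page)->name);
        return 0;
}
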
@@ -879,7 +834,6 @@ restart: | |||
879 | 834 | ||
880 | spin_lock_irq(&pcpu_lock); | 835 | spin_lock_irq(&pcpu_lock); |
881 | pcpu_chunk_relocate(chunk, -1); | 836 | pcpu_chunk_relocate(chunk, -1); |
882 | pcpu_chunk_addr_insert(chunk); | ||
883 | goto restart; | 837 | goto restart; |
884 | 838 | ||
885 | area_found: | 839 | area_found: |
@@ -968,7 +922,6 @@ static void pcpu_reclaim(struct work_struct *work) | |||
968 | if (chunk == list_first_entry(head, struct pcpu_chunk, list)) | 922 | if (chunk == list_first_entry(head, struct pcpu_chunk, list)) |
969 | continue; | 923 | continue; |
970 | 924 | ||
971 | rb_erase(&chunk->rb_node, &pcpu_addr_root); | ||
972 | list_move(&chunk->list, &todo); | 925 | list_move(&chunk->list, &todo); |
973 | } | 926 | } |
974 | 927 | ||
@@ -1147,7 +1100,8 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, | |||
1147 | 1100 | ||
1148 | if (reserved_size) { | 1101 | if (reserved_size) { |
1149 | schunk->free_size = reserved_size; | 1102 | schunk->free_size = reserved_size; |
1150 | pcpu_reserved_chunk = schunk; /* not for dynamic alloc */ | 1103 | pcpu_reserved_chunk = schunk; |
1104 | pcpu_reserved_chunk_limit = static_size + reserved_size; | ||
1151 | } else { | 1105 | } else { |
1152 | schunk->free_size = dyn_size; | 1106 | schunk->free_size = dyn_size; |
1153 | dyn_size = 0; /* dynamic area covered */ | 1107 | dyn_size = 0; /* dynamic area covered */ |
@@ -1158,8 +1112,6 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, | |||
1158 | if (schunk->free_size) | 1112 | if (schunk->free_size) |
1159 | schunk->map[schunk->map_used++] = schunk->free_size; | 1113 | schunk->map[schunk->map_used++] = schunk->free_size; |
1160 | 1114 | ||
1161 | pcpu_reserved_chunk_limit = static_size + schunk->free_size; | ||
1162 | |||
1163 | /* init dynamic chunk if necessary */ | 1115 | /* init dynamic chunk if necessary */ |
1164 | if (dyn_size) { | 1116 | if (dyn_size) { |
1165 | dchunk = alloc_bootmem(sizeof(struct pcpu_chunk)); | 1117 | dchunk = alloc_bootmem(sizeof(struct pcpu_chunk)); |
@@ -1226,13 +1178,8 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, | |||
1226 | } | 1178 | } |
1227 | 1179 | ||
1228 | /* link the first chunk in */ | 1180 | /* link the first chunk in */ |
1229 | if (!dchunk) { | 1181 | pcpu_first_chunk = dchunk ?: schunk; |
1230 | pcpu_chunk_relocate(schunk, -1); | 1182 | pcpu_chunk_relocate(pcpu_first_chunk, -1); |
1231 | pcpu_chunk_addr_insert(schunk); | ||
1232 | } else { | ||
1233 | pcpu_chunk_relocate(dchunk, -1); | ||
1234 | pcpu_chunk_addr_insert(dchunk); | ||
1235 | } | ||
1236 | 1183 | ||
1237 | /* we're done */ | 1184 | /* we're done */ |
1238 | pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0); | 1185 | pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0); |
diff --git a/mm/readahead.c b/mm/readahead.c index 133b6d525513..aa1aa2345235 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
@@ -133,15 +133,12 @@ out: | |||
133 | } | 133 | } |
134 | 134 | ||
135 | /* | 135 | /* |
136 | * do_page_cache_readahead actually reads a chunk of disk. It allocates all | 136 | * __do_page_cache_readahead() actually reads a chunk of disk. It allocates all |
137 | * the pages first, then submits them all for I/O. This avoids the very bad | 137 | * the pages first, then submits them all for I/O. This avoids the very bad |
138 | * behaviour which would occur if page allocations are causing VM writeback. | 138 | * behaviour which would occur if page allocations are causing VM writeback. |
139 | * We really don't want to intermingle reads and writes like that. | 139 | * We really don't want to intermingle reads and writes like that. |
140 | * | 140 | * |
141 | * Returns the number of pages requested, or the maximum amount of I/O allowed. | 141 | * Returns the number of pages requested, or the maximum amount of I/O allowed. |
142 | * | ||
143 | * do_page_cache_readahead() returns -1 if it encountered request queue | ||
144 | * congestion. | ||
145 | */ | 142 | */ |
146 | static int | 143 | static int |
147 | __do_page_cache_readahead(struct address_space *mapping, struct file *filp, | 144 | __do_page_cache_readahead(struct address_space *mapping, struct file *filp, |
@@ -210,6 +207,7 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, | |||
210 | if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages)) | 207 | if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages)) |
211 | return -EINVAL; | 208 | return -EINVAL; |
212 | 209 | ||
210 | nr_to_read = max_sane_readahead(nr_to_read); | ||
213 | while (nr_to_read) { | 211 | while (nr_to_read) { |
214 | int err; | 212 | int err; |
215 | 213 | ||
@@ -231,22 +229,6 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, | |||
231 | } | 229 | } |
232 | 230 | ||
233 | /* | 231 | /* |
234 | * This version skips the IO if the queue is read-congested, and will tell the | ||
235 | * block layer to abandon the readahead if request allocation would block. | ||
236 | * | ||
237 | * force_page_cache_readahead() will ignore queue congestion and will block on | ||
238 | * request queues. | ||
239 | */ | ||
240 | int do_page_cache_readahead(struct address_space *mapping, struct file *filp, | ||
241 | pgoff_t offset, unsigned long nr_to_read) | ||
242 | { | ||
243 | if (bdi_read_congested(mapping->backing_dev_info)) | ||
244 | return -1; | ||
245 | |||
246 | return __do_page_cache_readahead(mapping, filp, offset, nr_to_read, 0); | ||
247 | } | ||
248 | |||
249 | /* | ||
250 | * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a | 232 | * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a |
251 | * sensible upper limit. | 233 | * sensible upper limit. |
252 | */ | 234 | */ |
@@ -259,7 +241,7 @@ unsigned long max_sane_readahead(unsigned long nr) | |||
259 | /* | 241 | /* |
260 | * Submit IO for the read-ahead request in file_ra_state. | 242 | * Submit IO for the read-ahead request in file_ra_state. |
261 | */ | 243 | */ |
262 | static unsigned long ra_submit(struct file_ra_state *ra, | 244 | unsigned long ra_submit(struct file_ra_state *ra, |
263 | struct address_space *mapping, struct file *filp) | 245 | struct address_space *mapping, struct file *filp) |
264 | { | 246 | { |
265 | int actual; | 247 | int actual; |
@@ -348,6 +330,59 @@ static unsigned long get_next_ra_size(struct file_ra_state *ra, | |||
348 | */ | 330 | */ |
349 | 331 | ||
350 | /* | 332 | /* |
333 | * Count contiguously cached pages from @offset-1 to @offset-@max, | ||
334 | * this count is a conservative estimation of | ||
335 | * - length of the sequential read sequence, or | ||
336 | * - thrashing threshold in memory tight systems | ||
337 | */ | ||
338 | static pgoff_t count_history_pages(struct address_space *mapping, | ||
339 | struct file_ra_state *ra, | ||
340 | pgoff_t offset, unsigned long max) | ||
341 | { | ||
342 | pgoff_t head; | ||
343 | |||
344 | rcu_read_lock(); | ||
345 | head = radix_tree_prev_hole(&mapping->page_tree, offset - 1, max); | ||
346 | rcu_read_unlock(); | ||
347 | |||
348 | return offset - 1 - head; | ||
349 | } | ||
350 | |||
351 | /* | ||
352 | * page cache context based read-ahead | ||
353 | */ | ||
354 | static int try_context_readahead(struct address_space *mapping, | ||
355 | struct file_ra_state *ra, | ||
356 | pgoff_t offset, | ||
357 | unsigned long req_size, | ||
358 | unsigned long max) | ||
359 | { | ||
360 | pgoff_t size; | ||
361 | |||
362 | size = count_history_pages(mapping, ra, offset, max); | ||
363 | |||
364 | /* | ||
365 | * no history pages: | ||
366 | * it could be a random read | ||
367 | */ | ||
368 | if (!size) | ||
369 | return 0; | ||
370 | |||
371 | /* | ||
372 | * starts from beginning of file: | ||
373 | * it is a strong indication of long-run stream (or whole-file-read) | ||
374 | */ | ||
375 | if (size >= offset) | ||
376 | size *= 2; | ||
377 | |||
378 | ra->start = offset; | ||
379 | ra->size = get_init_ra_size(size + req_size, max); | ||
380 | ra->async_size = ra->size; | ||
381 | |||
382 | return 1; | ||
383 | } | ||
384 | |||
385 | /* | ||
351 | * A minimal readahead algorithm for trivial sequential/random reads. | 386 | * A minimal readahead algorithm for trivial sequential/random reads. |
352 | */ | 387 | */ |
353 | static unsigned long | 388 | static unsigned long |
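
try_context_readahead() infers a sequential stream from how many pages immediately before @offset are already cached: zero history means treat the read as random, a history reaching back to the start of the file doubles the estimate, and the result seeds a new readahead window. A compact sketch of that decision with the history count passed in directly instead of probing the radix tree, and a simple clamp standing in for get_init_ra_size():

/* Sketch of the context read-ahead decision above. */
#include <stdio.h>

struct ra_state {
        unsigned long start;
        unsigned long size;
        unsigned long async_size;
};

static unsigned long init_ra_size(unsigned long req, unsigned long max)
{
        return req < max ? req : max;   /* stand-in, not the kernel's ramp-up */
}

/* returns 1 if a readahead window was set up, 0 for "looks random" */
static int try_context_readahead(struct ra_state *ra, unsigned long offset,
                                 unsigned long req_size, unsigned long max,
                                 unsigned long history_pages)
{
        unsigned long size = history_pages;

        if (!size)                      /* no cached history: likely random */
                return 0;

        if (size >= offset)             /* cached back to file start: stream */
                size *= 2;

        ra->start = offset;
        ra->size = init_ra_size(size + req_size, max);
        ra->async_size = ra->size;
        return 1;
}

int main(void)
{
        struct ra_state ra = { 0 };

        if (try_context_readahead(&ra, 64, 8, 128, 64))
                printf("window: start=%lu size=%lu async=%lu\n",
                       ra.start, ra.size, ra.async_size);
        return 0;
}
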
@@ -356,34 +391,26 @@ ondemand_readahead(struct address_space *mapping, | |||
356 | bool hit_readahead_marker, pgoff_t offset, | 391 | bool hit_readahead_marker, pgoff_t offset, |
357 | unsigned long req_size) | 392 | unsigned long req_size) |
358 | { | 393 | { |
359 | int max = ra->ra_pages; /* max readahead pages */ | 394 | unsigned long max = max_sane_readahead(ra->ra_pages); |
360 | pgoff_t prev_offset; | 395 | |
361 | int sequential; | 396 | /* |
397 | * start of file | ||
398 | */ | ||
399 | if (!offset) | ||
400 | goto initial_readahead; | ||
362 | 401 | ||
363 | /* | 402 | /* |
364 | * It's the expected callback offset, assume sequential access. | 403 | * It's the expected callback offset, assume sequential access. |
365 | * Ramp up sizes, and push forward the readahead window. | 404 | * Ramp up sizes, and push forward the readahead window. |
366 | */ | 405 | */ |
367 | if (offset && (offset == (ra->start + ra->size - ra->async_size) || | 406 | if ((offset == (ra->start + ra->size - ra->async_size) || |
368 | offset == (ra->start + ra->size))) { | 407 | offset == (ra->start + ra->size))) { |
369 | ra->start += ra->size; | 408 | ra->start += ra->size; |
370 | ra->size = get_next_ra_size(ra, max); | 409 | ra->size = get_next_ra_size(ra, max); |
371 | ra->async_size = ra->size; | 410 | ra->async_size = ra->size; |
372 | goto readit; | 411 | goto readit; |
373 | } | 412 | } |
374 | 413 | ||
375 | prev_offset = ra->prev_pos >> PAGE_CACHE_SHIFT; | ||
376 | sequential = offset - prev_offset <= 1UL || req_size > max; | ||
377 | |||
378 | /* | ||
379 | * Standalone, small read. | ||
380 | * Read as is, and do not pollute the readahead state. | ||
381 | */ | ||
382 | if (!hit_readahead_marker && !sequential) { | ||
383 | return __do_page_cache_readahead(mapping, filp, | ||
384 | offset, req_size, 0); | ||
385 | } | ||
386 | |||
387 | /* | 414 | /* |
388 | * Hit a marked page without valid readahead state. | 415 | * Hit a marked page without valid readahead state. |
389 | * E.g. interleaved reads. | 416 | * E.g. interleaved reads. |
@@ -394,7 +421,7 @@ ondemand_readahead(struct address_space *mapping, | |||
394 | pgoff_t start; | 421 | pgoff_t start; |
395 | 422 | ||
396 | rcu_read_lock(); | 423 | rcu_read_lock(); |
397 | start = radix_tree_next_hole(&mapping->page_tree, offset,max+1); | 424 | start = radix_tree_next_hole(&mapping->page_tree, offset+1,max); |
398 | rcu_read_unlock(); | 425 | rcu_read_unlock(); |
399 | 426 | ||
400 | if (!start || start - offset > max) | 427 | if (!start || start - offset > max) |
@@ -402,23 +429,53 @@ ondemand_readahead(struct address_space *mapping, | |||
402 | 429 | ||
403 | ra->start = start; | 430 | ra->start = start; |
404 | ra->size = start - offset; /* old async_size */ | 431 | ra->size = start - offset; /* old async_size */ |
432 | ra->size += req_size; | ||
405 | ra->size = get_next_ra_size(ra, max); | 433 | ra->size = get_next_ra_size(ra, max); |
406 | ra->async_size = ra->size; | 434 | ra->async_size = ra->size; |
407 | goto readit; | 435 | goto readit; |
408 | } | 436 | } |
409 | 437 | ||
410 | /* | 438 | /* |
411 | * It may be one of | 439 | * oversize read |
412 | * - first read on start of file | 440 | */ |
413 | * - sequential cache miss | 441 | if (req_size > max) |
414 | * - oversize random read | 442 | goto initial_readahead; |
415 | * Start readahead for it. | 443 | |
444 | /* | ||
445 | * sequential cache miss | ||
446 | */ | ||
447 | if (offset - (ra->prev_pos >> PAGE_CACHE_SHIFT) <= 1UL) | ||
448 | goto initial_readahead; | ||
449 | |||
450 | /* | ||
451 | * Query the page cache and look for the traces(cached history pages) | ||
452 | * that a sequential stream would leave behind. | ||
453 | */ | ||
454 | if (try_context_readahead(mapping, ra, offset, req_size, max)) | ||
455 | goto readit; | ||
456 | |||
457 | /* | ||
458 | * standalone, small random read | ||
459 | * Read as is, and do not pollute the readahead state. | ||
416 | */ | 460 | */ |
461 | return __do_page_cache_readahead(mapping, filp, offset, req_size, 0); | ||
462 | |||
463 | initial_readahead: | ||
417 | ra->start = offset; | 464 | ra->start = offset; |
418 | ra->size = get_init_ra_size(req_size, max); | 465 | ra->size = get_init_ra_size(req_size, max); |
419 | ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size; | 466 | ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size; |
420 | 467 | ||
421 | readit: | 468 | readit: |
469 | /* | ||
470 | * Will this read hit the readahead marker made by itself? | ||
471 | * If so, trigger the readahead marker hit now, and merge | ||
472 | * the resulted next readahead window into the current one. | ||
473 | */ | ||
474 | if (offset == ra->start && ra->size == ra->async_size) { | ||
475 | ra->async_size = get_next_ra_size(ra, max); | ||
476 | ra->size += ra->async_size; | ||
477 | } | ||
478 | |||
422 | return ra_submit(ra, mapping, filp); | 479 | return ra_submit(ra, mapping, filp); |
423 | } | 480 | } |
424 | 481 | ||
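
The rewritten ondemand path funnels start-of-file reads, oversize reads and sequential cache misses through one initial_readahead label, and the readit epilogue merges the async window into the current one when the freshly built window would immediately trip its own marker. A sketch of just that final merge step (toy fields; a doubling stand-in replaces get_next_ra_size()):

/* Sketch of the "marker hit by the same read" merge done at readit:
 * if the new window starts at offset and is entirely async, grow it
 * once more and fold the two windows into one submission. */
#include <stdio.h>

struct ra_window {
        unsigned long start;
        unsigned long size;
        unsigned long async_size;
};

/* doubling stand-in for get_next_ra_size(), capped at max */
static unsigned long next_ra_size(struct ra_window *ra, unsigned long max)
{
        unsigned long sz = ra->size * 2;

        return sz < max ? sz : max;
}

static void finish_window(struct ra_window *ra, unsigned long offset,
                          unsigned long max)
{
        if (offset == ra->start && ra->size == ra->async_size) {
                ra->async_size = next_ra_size(ra, max);
                ra->size += ra->async_size;
        }
}

int main(void)
{
        struct ra_window ra = { .start = 100, .size = 16, .async_size = 16 };

        finish_window(&ra, 100, 256);
        printf("size=%lu async=%lu (submitted as one request)\n",
               ra.size, ra.async_size);
        return 0;
}
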
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
@@ -333,7 +333,9 @@ static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) | |||
333 | * repeatedly from either page_referenced_anon or page_referenced_file. | 333 | * repeatedly from either page_referenced_anon or page_referenced_file. |
334 | */ | 334 | */ |
335 | static int page_referenced_one(struct page *page, | 335 | static int page_referenced_one(struct page *page, |
336 | struct vm_area_struct *vma, unsigned int *mapcount) | 336 | struct vm_area_struct *vma, |
337 | unsigned int *mapcount, | ||
338 | unsigned long *vm_flags) | ||
337 | { | 339 | { |
338 | struct mm_struct *mm = vma->vm_mm; | 340 | struct mm_struct *mm = vma->vm_mm; |
339 | unsigned long address; | 341 | unsigned long address; |
@@ -381,11 +383,14 @@ out_unmap: | |||
381 | (*mapcount)--; | 383 | (*mapcount)--; |
382 | pte_unmap_unlock(pte, ptl); | 384 | pte_unmap_unlock(pte, ptl); |
383 | out: | 385 | out: |
386 | if (referenced) | ||
387 | *vm_flags |= vma->vm_flags; | ||
384 | return referenced; | 388 | return referenced; |
385 | } | 389 | } |
386 | 390 | ||
387 | static int page_referenced_anon(struct page *page, | 391 | static int page_referenced_anon(struct page *page, |
388 | struct mem_cgroup *mem_cont) | 392 | struct mem_cgroup *mem_cont, |
393 | unsigned long *vm_flags) | ||
389 | { | 394 | { |
390 | unsigned int mapcount; | 395 | unsigned int mapcount; |
391 | struct anon_vma *anon_vma; | 396 | struct anon_vma *anon_vma; |
@@ -405,7 +410,8 @@ static int page_referenced_anon(struct page *page, | |||
405 | */ | 410 | */ |
406 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) | 411 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) |
407 | continue; | 412 | continue; |
408 | referenced += page_referenced_one(page, vma, &mapcount); | 413 | referenced += page_referenced_one(page, vma, |
414 | &mapcount, vm_flags); | ||
409 | if (!mapcount) | 415 | if (!mapcount) |
410 | break; | 416 | break; |
411 | } | 417 | } |
@@ -418,6 +424,7 @@ static int page_referenced_anon(struct page *page, | |||
418 | * page_referenced_file - referenced check for object-based rmap | 424 | * page_referenced_file - referenced check for object-based rmap |
419 | * @page: the page we're checking references on. | 425 | * @page: the page we're checking references on. |
420 | * @mem_cont: target memory controller | 426 | * @mem_cont: target memory controller |
427 | * @vm_flags: collect encountered vma->vm_flags who actually referenced the page | ||
421 | * | 428 | * |
422 | * For an object-based mapped page, find all the places it is mapped and | 429 | * For an object-based mapped page, find all the places it is mapped and |
423 | * check/clear the referenced flag. This is done by following the page->mapping | 430 | * check/clear the referenced flag. This is done by following the page->mapping |
@@ -427,7 +434,8 @@ static int page_referenced_anon(struct page *page, | |||
427 | * This function is only called from page_referenced for object-based pages. | 434 | * This function is only called from page_referenced for object-based pages. |
428 | */ | 435 | */ |
429 | static int page_referenced_file(struct page *page, | 436 | static int page_referenced_file(struct page *page, |
430 | struct mem_cgroup *mem_cont) | 437 | struct mem_cgroup *mem_cont, |
438 | unsigned long *vm_flags) | ||
431 | { | 439 | { |
432 | unsigned int mapcount; | 440 | unsigned int mapcount; |
433 | struct address_space *mapping = page->mapping; | 441 | struct address_space *mapping = page->mapping; |
@@ -467,7 +475,8 @@ static int page_referenced_file(struct page *page, | |||
467 | */ | 475 | */ |
468 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) | 476 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) |
469 | continue; | 477 | continue; |
470 | referenced += page_referenced_one(page, vma, &mapcount); | 478 | referenced += page_referenced_one(page, vma, |
479 | &mapcount, vm_flags); | ||
471 | if (!mapcount) | 480 | if (!mapcount) |
472 | break; | 481 | break; |
473 | } | 482 | } |
@@ -481,29 +490,35 @@ static int page_referenced_file(struct page *page, | |||
481 | * @page: the page to test | 490 | * @page: the page to test |
482 | * @is_locked: caller holds lock on the page | 491 | * @is_locked: caller holds lock on the page |
483 | * @mem_cont: target memory controller | 492 | * @mem_cont: target memory controller |
493 | * @vm_flags: collect encountered vma->vm_flags who actually referenced the page | ||
484 | * | 494 | * |
485 | * Quick test_and_clear_referenced for all mappings to a page, | 495 | * Quick test_and_clear_referenced for all mappings to a page, |
486 | * returns the number of ptes which referenced the page. | 496 | * returns the number of ptes which referenced the page. |
487 | */ | 497 | */ |
488 | int page_referenced(struct page *page, int is_locked, | 498 | int page_referenced(struct page *page, |
489 | struct mem_cgroup *mem_cont) | 499 | int is_locked, |
500 | struct mem_cgroup *mem_cont, | ||
501 | unsigned long *vm_flags) | ||
490 | { | 502 | { |
491 | int referenced = 0; | 503 | int referenced = 0; |
492 | 504 | ||
493 | if (TestClearPageReferenced(page)) | 505 | if (TestClearPageReferenced(page)) |
494 | referenced++; | 506 | referenced++; |
495 | 507 | ||
508 | *vm_flags = 0; | ||
496 | if (page_mapped(page) && page->mapping) { | 509 | if (page_mapped(page) && page->mapping) { |
497 | if (PageAnon(page)) | 510 | if (PageAnon(page)) |
498 | referenced += page_referenced_anon(page, mem_cont); | 511 | referenced += page_referenced_anon(page, mem_cont, |
512 | vm_flags); | ||
499 | else if (is_locked) | 513 | else if (is_locked) |
500 | referenced += page_referenced_file(page, mem_cont); | 514 | referenced += page_referenced_file(page, mem_cont, |
515 | vm_flags); | ||
501 | else if (!trylock_page(page)) | 516 | else if (!trylock_page(page)) |
502 | referenced++; | 517 | referenced++; |
503 | else { | 518 | else { |
504 | if (page->mapping) | 519 | if (page->mapping) |
505 | referenced += | 520 | referenced += page_referenced_file(page, |
506 | page_referenced_file(page, mem_cont); | 521 | mem_cont, vm_flags); |
507 | unlock_page(page); | 522 | unlock_page(page); |
508 | } | 523 | } |
509 | } | 524 | } |
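
page_referenced() now reports, alongside the reference count, the union of vm_flags of the VMAs that actually referenced the page, so callers can tell, for example, executable or locked mappings apart. A stand-alone sketch of the accumulation pattern with toy types and invented flag values:

/* Sketch of the new out-parameter pattern: count references and OR
 * together the flags of only those mappings that held them. */
#include <stdio.h>

#define TOY_VM_EXEC    0x4UL
#define TOY_VM_LOCKED  0x2000UL

struct toy_vma {
        unsigned long vm_flags;
        int referenced;                 /* did this mapping touch the page? */
};

static int page_referenced_one(struct toy_vma *vma, unsigned long *vm_flags)
{
        if (vma->referenced)
                *vm_flags |= vma->vm_flags;     /* only referencing VMAs count */
        return vma->referenced;
}

static int page_referenced(struct toy_vma *vmas, int n, unsigned long *vm_flags)
{
        int referenced = 0;
        int i;

        *vm_flags = 0;                  /* caller sees only this call's result */
        for (i = 0; i < n; i++)
                referenced += page_referenced_one(&vmas[i], vm_flags);
        return referenced;
}

int main(void)
{
        struct toy_vma vmas[] = {
                { TOY_VM_EXEC,   1 },
                { TOY_VM_LOCKED, 0 },   /* mapped but not referenced */
        };
        unsigned long flags;
        int refs = page_referenced(vmas, 2, &flags);

        printf("refs=%d flags=%#lx\n", refs, flags);    /* lacks TOY_VM_LOCKED */
        return 0;
}
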
@@ -1202,7 +1217,6 @@ int try_to_unmap(struct page *page, int migration) | |||
1202 | return ret; | 1217 | return ret; |
1203 | } | 1218 | } |
1204 | 1219 | ||
1205 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
1206 | /** | 1220 | /** |
1207 | * try_to_munlock - try to munlock a page | 1221 | * try_to_munlock - try to munlock a page |
1208 | * @page: the page to be munlocked | 1222 | * @page: the page to be munlocked |
@@ -1226,4 +1240,4 @@ int try_to_munlock(struct page *page) | |||
1226 | else | 1240 | else |
1227 | return try_to_unmap_file(page, 1, 0); | 1241 | return try_to_unmap_file(page, 1, 0); |
1228 | } | 1242 | } |
1229 | #endif | 1243 | |
diff --git a/mm/shmem.c b/mm/shmem.c index b25f95ce3db7..e89d7ec18eda 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -1097,7 +1097,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) | |||
1097 | shmem_swp_unmap(entry); | 1097 | shmem_swp_unmap(entry); |
1098 | unlock: | 1098 | unlock: |
1099 | spin_unlock(&info->lock); | 1099 | spin_unlock(&info->lock); |
1100 | swap_free(swap); | 1100 | swapcache_free(swap, NULL); |
1101 | redirty: | 1101 | redirty: |
1102 | set_page_dirty(page); | 1102 | set_page_dirty(page); |
1103 | if (wbc->for_reclaim) | 1103 | if (wbc->for_reclaim) |
@@ -2612,7 +2612,7 @@ int shmem_unuse(swp_entry_t entry, struct page *page) | |||
2612 | * @size: size to be set for the file | 2612 | * @size: size to be set for the file |
2613 | * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size | 2613 | * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size |
2614 | */ | 2614 | */ |
2615 | struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) | 2615 | struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags) |
2616 | { | 2616 | { |
2617 | int error; | 2617 | int error; |
2618 | struct file *file; | 2618 | struct file *file; |
@@ -2659,6 +2659,7 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) | |||
2659 | if (error) | 2659 | if (error) |
2660 | goto close_file; | 2660 | goto close_file; |
2661 | #endif | 2661 | #endif |
2662 | ima_counts_get(file); | ||
2662 | return file; | 2663 | return file; |
2663 | 2664 | ||
2664 | close_file: | 2665 | close_file: |
@@ -2684,7 +2685,6 @@ int shmem_zero_setup(struct vm_area_struct *vma) | |||
2684 | if (IS_ERR(file)) | 2685 | if (IS_ERR(file)) |
2685 | return PTR_ERR(file); | 2686 | return PTR_ERR(file); |
2686 | 2687 | ||
2687 | ima_shm_check(file); | ||
2688 | if (vma->vm_file) | 2688 | if (vma->vm_file) |
2689 | fput(vma->vm_file); | 2689 | fput(vma->vm_file); |
2690 | vma->vm_file = file; | 2690 | vma->vm_file = file; |
diff --git a/mm/slab.c b/mm/slab.c --- a/mm/slab.c +++ b/mm/slab.c | |||
@@ -102,17 +102,19 @@ | |||
102 | #include <linux/cpu.h> | 102 | #include <linux/cpu.h> |
103 | #include <linux/sysctl.h> | 103 | #include <linux/sysctl.h> |
104 | #include <linux/module.h> | 104 | #include <linux/module.h> |
105 | #include <trace/kmemtrace.h> | 105 | #include <linux/kmemtrace.h> |
106 | #include <linux/rcupdate.h> | 106 | #include <linux/rcupdate.h> |
107 | #include <linux/string.h> | 107 | #include <linux/string.h> |
108 | #include <linux/uaccess.h> | 108 | #include <linux/uaccess.h> |
109 | #include <linux/nodemask.h> | 109 | #include <linux/nodemask.h> |
110 | #include <linux/kmemleak.h> | ||
110 | #include <linux/mempolicy.h> | 111 | #include <linux/mempolicy.h> |
111 | #include <linux/mutex.h> | 112 | #include <linux/mutex.h> |
112 | #include <linux/fault-inject.h> | 113 | #include <linux/fault-inject.h> |
113 | #include <linux/rtmutex.h> | 114 | #include <linux/rtmutex.h> |
114 | #include <linux/reciprocal_div.h> | 115 | #include <linux/reciprocal_div.h> |
115 | #include <linux/debugobjects.h> | 116 | #include <linux/debugobjects.h> |
117 | #include <linux/kmemcheck.h> | ||
116 | 118 | ||
117 | #include <asm/cacheflush.h> | 119 | #include <asm/cacheflush.h> |
118 | #include <asm/tlbflush.h> | 120 | #include <asm/tlbflush.h> |
@@ -178,13 +180,13 @@ | |||
178 | SLAB_STORE_USER | \ | 180 | SLAB_STORE_USER | \ |
179 | SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ | 181 | SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ |
180 | SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ | 182 | SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ |
181 | SLAB_DEBUG_OBJECTS) | 183 | SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK) |
182 | #else | 184 | #else |
183 | # define CREATE_MASK (SLAB_HWCACHE_ALIGN | \ | 185 | # define CREATE_MASK (SLAB_HWCACHE_ALIGN | \ |
184 | SLAB_CACHE_DMA | \ | 186 | SLAB_CACHE_DMA | \ |
185 | SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ | 187 | SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ |
186 | SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ | 188 | SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ |
187 | SLAB_DEBUG_OBJECTS) | 189 | SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK) |
188 | #endif | 190 | #endif |
189 | 191 | ||
190 | /* | 192 | /* |
@@ -303,6 +305,12 @@ struct kmem_list3 { | |||
303 | }; | 305 | }; |
304 | 306 | ||
305 | /* | 307 | /* |
308 | * The slab allocator is initialized with interrupts disabled. Therefore, make | ||
309 | * sure early boot allocations don't accidentally enable interrupts. | ||
310 | */ | ||
311 | static gfp_t slab_gfp_mask __read_mostly = SLAB_GFP_BOOT_MASK; | ||
312 | |||
313 | /* | ||
306 | * Need this for bootstrapping a per node allocator. | 314 | * Need this for bootstrapping a per node allocator. |
307 | */ | 315 | */ |
308 | #define NUM_INIT_LISTS (3 * MAX_NUMNODES) | 316 | #define NUM_INIT_LISTS (3 * MAX_NUMNODES) |
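
slab_gfp_mask starts out as SLAB_GFP_BOOT_MASK and is widened to __GFP_BITS_MASK in kmem_cache_init_late() further down, the point being that until interrupts are enabled, allocation flags get clamped against it so nothing can sleep or re-enable interrupts (the masking site itself is outside this hunk). A toy illustration of masking requested flags against a "currently safe" set, with invented bit values rather than the kernel's gfp bits:

/* Sketch: clamp requested allocation flags to what the current boot
 * stage can honour.  The bit values are made up for the example. */
#include <stdio.h>

#define TOY_GFP_WAIT    0x1u    /* may sleep */
#define TOY_GFP_IO      0x2u    /* may start I/O */
#define TOY_GFP_FS      0x4u    /* may call into the filesystem */
#define TOY_GFP_ALL     (TOY_GFP_WAIT | TOY_GFP_IO | TOY_GFP_FS)

/* nothing may sleep during early boot; widened later, like slab_gfp_mask */
static unsigned int slab_gfp_mask = 0;

static unsigned int effective_flags(unsigned int requested)
{
        return requested & slab_gfp_mask;
}

int main(void)
{
        unsigned int req = TOY_GFP_WAIT | TOY_GFP_IO;   /* GFP_KERNEL-ish */

        printf("early boot:      %#x -> %#x\n", req, effective_flags(req));

        slab_gfp_mask = TOY_GFP_ALL;    /* what kmem_cache_init_late() does */
        printf("after init_late: %#x -> %#x\n", req, effective_flags(req));
        return 0;
}
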
@@ -315,7 +323,7 @@ static int drain_freelist(struct kmem_cache *cache, | |||
315 | struct kmem_list3 *l3, int tofree); | 323 | struct kmem_list3 *l3, int tofree); |
316 | static void free_block(struct kmem_cache *cachep, void **objpp, int len, | 324 | static void free_block(struct kmem_cache *cachep, void **objpp, int len, |
317 | int node); | 325 | int node); |
318 | static int enable_cpucache(struct kmem_cache *cachep); | 326 | static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp); |
319 | static void cache_reap(struct work_struct *unused); | 327 | static void cache_reap(struct work_struct *unused); |
320 | 328 | ||
321 | /* | 329 | /* |
@@ -373,87 +381,6 @@ static void kmem_list3_init(struct kmem_list3 *parent) | |||
373 | MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ | 381 | MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ |
374 | } while (0) | 382 | } while (0) |
375 | 383 | ||
376 | /* | ||
377 | * struct kmem_cache | ||
378 | * | ||
379 | * manages a cache. | ||
380 | */ | ||
381 | |||
382 | struct kmem_cache { | ||
383 | /* 1) per-cpu data, touched during every alloc/free */ | ||
384 | struct array_cache *array[NR_CPUS]; | ||
385 | /* 2) Cache tunables. Protected by cache_chain_mutex */ | ||
386 | unsigned int batchcount; | ||
387 | unsigned int limit; | ||
388 | unsigned int shared; | ||
389 | |||
390 | unsigned int buffer_size; | ||
391 | u32 reciprocal_buffer_size; | ||
392 | /* 3) touched by every alloc & free from the backend */ | ||
393 | |||
394 | unsigned int flags; /* constant flags */ | ||
395 | unsigned int num; /* # of objs per slab */ | ||
396 | |||
397 | /* 4) cache_grow/shrink */ | ||
398 | /* order of pgs per slab (2^n) */ | ||
399 | unsigned int gfporder; | ||
400 | |||
401 | /* force GFP flags, e.g. GFP_DMA */ | ||
402 | gfp_t gfpflags; | ||
403 | |||
404 | size_t colour; /* cache colouring range */ | ||
405 | unsigned int colour_off; /* colour offset */ | ||
406 | struct kmem_cache *slabp_cache; | ||
407 | unsigned int slab_size; | ||
408 | unsigned int dflags; /* dynamic flags */ | ||
409 | |||
410 | /* constructor func */ | ||
411 | void (*ctor)(void *obj); | ||
412 | |||
413 | /* 5) cache creation/removal */ | ||
414 | const char *name; | ||
415 | struct list_head next; | ||
416 | |||
417 | /* 6) statistics */ | ||
418 | #if STATS | ||
419 | unsigned long num_active; | ||
420 | unsigned long num_allocations; | ||
421 | unsigned long high_mark; | ||
422 | unsigned long grown; | ||
423 | unsigned long reaped; | ||
424 | unsigned long errors; | ||
425 | unsigned long max_freeable; | ||
426 | unsigned long node_allocs; | ||
427 | unsigned long node_frees; | ||
428 | unsigned long node_overflow; | ||
429 | atomic_t allochit; | ||
430 | atomic_t allocmiss; | ||
431 | atomic_t freehit; | ||
432 | atomic_t freemiss; | ||
433 | #endif | ||
434 | #if DEBUG | ||
435 | /* | ||
436 | * If debugging is enabled, then the allocator can add additional | ||
437 | * fields and/or padding to every object. buffer_size contains the total | ||
438 | * object size including these internal fields, the following two | ||
439 | * variables contain the offset to the user object and its size. | ||
440 | */ | ||
441 | int obj_offset; | ||
442 | int obj_size; | ||
443 | #endif | ||
444 | /* | ||
445 | * We put nodelists[] at the end of kmem_cache, because we want to size | ||
446 | * this array to nr_node_ids slots instead of MAX_NUMNODES | ||
447 | * (see kmem_cache_init()) | ||
448 | * We still use [MAX_NUMNODES] and not [1] or [0] because cache_cache | ||
449 | * is statically defined, so we reserve the max number of nodes. | ||
450 | */ | ||
451 | struct kmem_list3 *nodelists[MAX_NUMNODES]; | ||
452 | /* | ||
453 | * Do not add fields after nodelists[] | ||
454 | */ | ||
455 | }; | ||
456 | |||
457 | #define CFLGS_OFF_SLAB (0x80000000UL) | 384 | #define CFLGS_OFF_SLAB (0x80000000UL) |
458 | #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) | 385 | #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) |
459 | 386 | ||
@@ -752,6 +679,7 @@ static enum { | |||
752 | NONE, | 679 | NONE, |
753 | PARTIAL_AC, | 680 | PARTIAL_AC, |
754 | PARTIAL_L3, | 681 | PARTIAL_L3, |
682 | EARLY, | ||
755 | FULL | 683 | FULL |
756 | } g_cpucache_up; | 684 | } g_cpucache_up; |
757 | 685 | ||
@@ -760,7 +688,7 @@ static enum { | |||
760 | */ | 688 | */ |
761 | int slab_is_available(void) | 689 | int slab_is_available(void) |
762 | { | 690 | { |
763 | return g_cpucache_up == FULL; | 691 | return g_cpucache_up >= EARLY; |
764 | } | 692 | } |
765 | 693 | ||
766 | static DEFINE_PER_CPU(struct delayed_work, reap_work); | 694 | static DEFINE_PER_CPU(struct delayed_work, reap_work); |
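
slab_is_available() now returns true as soon as the bootstrap caches work (the new EARLY state), even though the per-CPU head arrays are only resized later in kmem_cache_init_late(); the page_cgroup change earlier in this patch relies on that earlier "available" point. A minimal sketch of the staged state machine:

/* Sketch of the staged bring-up flag: "available" now means EARLY or
 * later, not only FULL. */
#include <stdio.h>

enum cpucache_up { NONE, PARTIAL_AC, PARTIAL_L3, EARLY, FULL };

static enum cpucache_up g_cpucache_up = NONE;

static int slab_is_available(void)
{
        return g_cpucache_up >= EARLY;
}

int main(void)
{
        g_cpucache_up = PARTIAL_L3;
        printf("after nodelists bootstrap: %d\n", slab_is_available());

        g_cpucache_up = EARLY;          /* end of kmem_cache_init() */
        printf("after kmem_cache_init:    %d\n", slab_is_available());

        g_cpucache_up = FULL;           /* end of kmem_cache_init_late() */
        printf("after init_late:          %d\n", slab_is_available());
        return 0;
}
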
@@ -890,7 +818,6 @@ static void __slab_error(const char *function, struct kmem_cache *cachep, | |||
890 | */ | 818 | */ |
891 | 819 | ||
892 | static int use_alien_caches __read_mostly = 1; | 820 | static int use_alien_caches __read_mostly = 1; |
893 | static int numa_platform __read_mostly = 1; | ||
894 | static int __init noaliencache_setup(char *s) | 821 | static int __init noaliencache_setup(char *s) |
895 | { | 822 | { |
896 | use_alien_caches = 0; | 823 | use_alien_caches = 0; |
@@ -958,12 +885,20 @@ static void __cpuinit start_cpu_timer(int cpu) | |||
958 | } | 885 | } |
959 | 886 | ||
960 | static struct array_cache *alloc_arraycache(int node, int entries, | 887 | static struct array_cache *alloc_arraycache(int node, int entries, |
961 | int batchcount) | 888 | int batchcount, gfp_t gfp) |
962 | { | 889 | { |
963 | int memsize = sizeof(void *) * entries + sizeof(struct array_cache); | 890 | int memsize = sizeof(void *) * entries + sizeof(struct array_cache); |
964 | struct array_cache *nc = NULL; | 891 | struct array_cache *nc = NULL; |
965 | 892 | ||
966 | nc = kmalloc_node(memsize, GFP_KERNEL, node); | 893 | nc = kmalloc_node(memsize, gfp, node); |
894 | /* | ||
895 | * The array_cache structures contain pointers to free object. | ||
896 | * However, when such objects are allocated or transfered to another | ||
897 | * cache the pointers are not cleared and they could be counted as | ||
898 | * valid references during a kmemleak scan. Therefore, kmemleak must | ||
899 | * not scan such objects. | ||
900 | */ | ||
901 | kmemleak_no_scan(nc); | ||
967 | if (nc) { | 902 | if (nc) { |
968 | nc->avail = 0; | 903 | nc->avail = 0; |
969 | nc->limit = entries; | 904 | nc->limit = entries; |
@@ -1003,7 +938,7 @@ static int transfer_objects(struct array_cache *to, | |||
1003 | #define drain_alien_cache(cachep, alien) do { } while (0) | 938 | #define drain_alien_cache(cachep, alien) do { } while (0) |
1004 | #define reap_alien(cachep, l3) do { } while (0) | 939 | #define reap_alien(cachep, l3) do { } while (0) |
1005 | 940 | ||
1006 | static inline struct array_cache **alloc_alien_cache(int node, int limit) | 941 | static inline struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) |
1007 | { | 942 | { |
1008 | return (struct array_cache **)BAD_ALIEN_MAGIC; | 943 | return (struct array_cache **)BAD_ALIEN_MAGIC; |
1009 | } | 944 | } |
@@ -1034,7 +969,7 @@ static inline void *____cache_alloc_node(struct kmem_cache *cachep, | |||
1034 | static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); | 969 | static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); |
1035 | static void *alternate_node_alloc(struct kmem_cache *, gfp_t); | 970 | static void *alternate_node_alloc(struct kmem_cache *, gfp_t); |
1036 | 971 | ||
1037 | static struct array_cache **alloc_alien_cache(int node, int limit) | 972 | static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) |
1038 | { | 973 | { |
1039 | struct array_cache **ac_ptr; | 974 | struct array_cache **ac_ptr; |
1040 | int memsize = sizeof(void *) * nr_node_ids; | 975 | int memsize = sizeof(void *) * nr_node_ids; |
@@ -1042,14 +977,14 @@ static struct array_cache **alloc_alien_cache(int node, int limit) | |||
1042 | 977 | ||
1043 | if (limit > 1) | 978 | if (limit > 1) |
1044 | limit = 12; | 979 | limit = 12; |
1045 | ac_ptr = kmalloc_node(memsize, GFP_KERNEL, node); | 980 | ac_ptr = kmalloc_node(memsize, gfp, node); |
1046 | if (ac_ptr) { | 981 | if (ac_ptr) { |
1047 | for_each_node(i) { | 982 | for_each_node(i) { |
1048 | if (i == node || !node_online(i)) { | 983 | if (i == node || !node_online(i)) { |
1049 | ac_ptr[i] = NULL; | 984 | ac_ptr[i] = NULL; |
1050 | continue; | 985 | continue; |
1051 | } | 986 | } |
1052 | ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d); | 987 | ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp); |
1053 | if (!ac_ptr[i]) { | 988 | if (!ac_ptr[i]) { |
1054 | for (i--; i >= 0; i--) | 989 | for (i--; i >= 0; i--) |
1055 | kfree(ac_ptr[i]); | 990 | kfree(ac_ptr[i]); |
@@ -1282,20 +1217,20 @@ static int __cpuinit cpuup_prepare(long cpu) | |||
1282 | struct array_cache **alien = NULL; | 1217 | struct array_cache **alien = NULL; |
1283 | 1218 | ||
1284 | nc = alloc_arraycache(node, cachep->limit, | 1219 | nc = alloc_arraycache(node, cachep->limit, |
1285 | cachep->batchcount); | 1220 | cachep->batchcount, GFP_KERNEL); |
1286 | if (!nc) | 1221 | if (!nc) |
1287 | goto bad; | 1222 | goto bad; |
1288 | if (cachep->shared) { | 1223 | if (cachep->shared) { |
1289 | shared = alloc_arraycache(node, | 1224 | shared = alloc_arraycache(node, |
1290 | cachep->shared * cachep->batchcount, | 1225 | cachep->shared * cachep->batchcount, |
1291 | 0xbaadf00d); | 1226 | 0xbaadf00d, GFP_KERNEL); |
1292 | if (!shared) { | 1227 | if (!shared) { |
1293 | kfree(nc); | 1228 | kfree(nc); |
1294 | goto bad; | 1229 | goto bad; |
1295 | } | 1230 | } |
1296 | } | 1231 | } |
1297 | if (use_alien_caches) { | 1232 | if (use_alien_caches) { |
1298 | alien = alloc_alien_cache(node, cachep->limit); | 1233 | alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL); |
1299 | if (!alien) { | 1234 | if (!alien) { |
1300 | kfree(shared); | 1235 | kfree(shared); |
1301 | kfree(nc); | 1236 | kfree(nc); |
@@ -1399,10 +1334,9 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, | |||
1399 | { | 1334 | { |
1400 | struct kmem_list3 *ptr; | 1335 | struct kmem_list3 *ptr; |
1401 | 1336 | ||
1402 | ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid); | 1337 | ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_NOWAIT, nodeid); |
1403 | BUG_ON(!ptr); | 1338 | BUG_ON(!ptr); |
1404 | 1339 | ||
1405 | local_irq_disable(); | ||
1406 | memcpy(ptr, list, sizeof(struct kmem_list3)); | 1340 | memcpy(ptr, list, sizeof(struct kmem_list3)); |
1407 | /* | 1341 | /* |
1408 | * Do not assume that spinlocks can be initialized via memcpy: | 1342 | * Do not assume that spinlocks can be initialized via memcpy: |
@@ -1411,7 +1345,6 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, | |||
1411 | 1345 | ||
1412 | MAKE_ALL_LISTS(cachep, ptr, nodeid); | 1346 | MAKE_ALL_LISTS(cachep, ptr, nodeid); |
1413 | cachep->nodelists[nodeid] = ptr; | 1347 | cachep->nodelists[nodeid] = ptr; |
1414 | local_irq_enable(); | ||
1415 | } | 1348 | } |
1416 | 1349 | ||
1417 | /* | 1350 | /* |
@@ -1443,10 +1376,8 @@ void __init kmem_cache_init(void) | |||
1443 | int order; | 1376 | int order; |
1444 | int node; | 1377 | int node; |
1445 | 1378 | ||
1446 | if (num_possible_nodes() == 1) { | 1379 | if (num_possible_nodes() == 1) |
1447 | use_alien_caches = 0; | 1380 | use_alien_caches = 0; |
1448 | numa_platform = 0; | ||
1449 | } | ||
1450 | 1381 | ||
1451 | for (i = 0; i < NUM_INIT_LISTS; i++) { | 1382 | for (i = 0; i < NUM_INIT_LISTS; i++) { |
1452 | kmem_list3_init(&initkmem_list3[i]); | 1383 | kmem_list3_init(&initkmem_list3[i]); |
@@ -1575,9 +1506,8 @@ void __init kmem_cache_init(void) | |||
1575 | { | 1506 | { |
1576 | struct array_cache *ptr; | 1507 | struct array_cache *ptr; |
1577 | 1508 | ||
1578 | ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); | 1509 | ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); |
1579 | 1510 | ||
1580 | local_irq_disable(); | ||
1581 | BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache); | 1511 | BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache); |
1582 | memcpy(ptr, cpu_cache_get(&cache_cache), | 1512 | memcpy(ptr, cpu_cache_get(&cache_cache), |
1583 | sizeof(struct arraycache_init)); | 1513 | sizeof(struct arraycache_init)); |
@@ -1587,11 +1517,9 @@ void __init kmem_cache_init(void) | |||
1587 | spin_lock_init(&ptr->lock); | 1517 | spin_lock_init(&ptr->lock); |
1588 | 1518 | ||
1589 | cache_cache.array[smp_processor_id()] = ptr; | 1519 | cache_cache.array[smp_processor_id()] = ptr; |
1590 | local_irq_enable(); | ||
1591 | 1520 | ||
1592 | ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); | 1521 | ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); |
1593 | 1522 | ||
1594 | local_irq_disable(); | ||
1595 | BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep) | 1523 | BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep) |
1596 | != &initarray_generic.cache); | 1524 | != &initarray_generic.cache); |
1597 | memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep), | 1525 | memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep), |
@@ -1603,7 +1531,6 @@ void __init kmem_cache_init(void) | |||
1603 | 1531 | ||
1604 | malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] = | 1532 | malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] = |
1605 | ptr; | 1533 | ptr; |
1606 | local_irq_enable(); | ||
1607 | } | 1534 | } |
1608 | /* 5) Replace the bootstrap kmem_list3's */ | 1535 | /* 5) Replace the bootstrap kmem_list3's */ |
1609 | { | 1536 | { |
@@ -1622,19 +1549,27 @@ void __init kmem_cache_init(void) | |||
1622 | } | 1549 | } |
1623 | } | 1550 | } |
1624 | 1551 | ||
1625 | /* 6) resize the head arrays to their final sizes */ | 1552 | g_cpucache_up = EARLY; |
1626 | { | ||
1627 | struct kmem_cache *cachep; | ||
1628 | mutex_lock(&cache_chain_mutex); | ||
1629 | list_for_each_entry(cachep, &cache_chain, next) | ||
1630 | if (enable_cpucache(cachep)) | ||
1631 | BUG(); | ||
1632 | mutex_unlock(&cache_chain_mutex); | ||
1633 | } | ||
1634 | 1553 | ||
1635 | /* Annotate slab for lockdep -- annotate the malloc caches */ | 1554 | /* Annotate slab for lockdep -- annotate the malloc caches */ |
1636 | init_lock_keys(); | 1555 | init_lock_keys(); |
1556 | } | ||
1557 | |||
1558 | void __init kmem_cache_init_late(void) | ||
1559 | { | ||
1560 | struct kmem_cache *cachep; | ||
1637 | 1561 | ||
1562 | /* | ||
1563 | * Interrupts are enabled now so all GFP allocations are safe. | ||
1564 | */ | ||
1565 | slab_gfp_mask = __GFP_BITS_MASK; | ||
1566 | |||
1567 | /* 6) resize the head arrays to their final sizes */ | ||
1568 | mutex_lock(&cache_chain_mutex); | ||
1569 | list_for_each_entry(cachep, &cache_chain, next) | ||
1570 | if (enable_cpucache(cachep, GFP_NOWAIT)) | ||
1571 | BUG(); | ||
1572 | mutex_unlock(&cache_chain_mutex); | ||
1638 | 1573 | ||
1639 | /* Done! */ | 1574 | /* Done! */ |
1640 | g_cpucache_up = FULL; | 1575 | g_cpucache_up = FULL; |
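The hunk above splits SLAB bootstrap in two: kmem_cache_init() now runs with interrupts still disabled and allocates with GFP_NOWAIT, while the new kmem_cache_init_late() widens slab_gfp_mask and resizes the head arrays once interrupts are enabled. Below is a minimal user-space sketch of the same gating idea; the names (boot_gfp_mask, my_alloc, MYGFP_*) are invented for the example and are not kernel APIs.

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical flag bits standing in for GFP flags. */
#define MYGFP_WAIT   0x1u   /* caller may sleep              */
#define MYGFP_IO     0x2u   /* caller may start I/O          */
#define MYGFP_ALL    0x3u   /* every bit the allocator knows */

/* During "boot" only non-sleeping allocations are allowed. */
static unsigned int boot_gfp_mask = 0;   /* plays the role of SLAB_GFP_BOOT_MASK */

static void *my_alloc(size_t size, unsigned int flags)
{
        /* Silently drop the bits that are not safe yet, the same way
         * the patch applies `flags &= slab_gfp_mask` on every allocation. */
        flags &= boot_gfp_mask;
        printf("allocating %zu bytes with effective flags 0x%x\n", size, flags);
        return malloc(size);
}

static void my_init_late(void)
{
        /* Interrupts are "enabled" now: allow every flag. */
        boot_gfp_mask = MYGFP_ALL;
}

int main(void)
{
        void *a = my_alloc(64, MYGFP_WAIT);   /* WAIT is masked off */
        my_init_late();
        void *b = my_alloc(64, MYGFP_WAIT);   /* WAIT now honoured  */
        free(a);
        free(b);
        return 0;
}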
@@ -1689,7 +1624,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
1689 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) | 1624 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) |
1690 | flags |= __GFP_RECLAIMABLE; | 1625 | flags |= __GFP_RECLAIMABLE; |
1691 | 1626 | ||
1692 | page = alloc_pages_node(nodeid, flags, cachep->gfporder); | 1627 | page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); |
1693 | if (!page) | 1628 | if (!page) |
1694 | return NULL; | 1629 | return NULL; |
1695 | 1630 | ||
@@ -1702,6 +1637,16 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
1702 | NR_SLAB_UNRECLAIMABLE, nr_pages); | 1637 | NR_SLAB_UNRECLAIMABLE, nr_pages); |
1703 | for (i = 0; i < nr_pages; i++) | 1638 | for (i = 0; i < nr_pages; i++) |
1704 | __SetPageSlab(page + i); | 1639 | __SetPageSlab(page + i); |
1640 | |||
1641 | if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { | ||
1642 | kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); | ||
1643 | |||
1644 | if (cachep->ctor) | ||
1645 | kmemcheck_mark_uninitialized_pages(page, nr_pages); | ||
1646 | else | ||
1647 | kmemcheck_mark_unallocated_pages(page, nr_pages); | ||
1648 | } | ||
1649 | |||
1705 | return page_address(page); | 1650 | return page_address(page); |
1706 | } | 1651 | } |
1707 | 1652 | ||
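kmem_getpages() now hands freshly allocated slab pages to kmemcheck, marking them "uninitialized" when the cache has a constructor and "unallocated" otherwise. The stand-alone sketch below mimics that shadow-state idea with a plain byte array; every name in it (shadow, mark_pages, check_read) is invented for illustration and is not the kmemcheck API.

#include <stdio.h>
#include <string.h>

/* Simplified shadow states for one "page" of 64 bytes. */
enum shadow_state { SH_UNALLOCATED, SH_UNINITIALIZED, SH_INITIALIZED };

#define PAGE_BYTES 64
static unsigned char shadow[PAGE_BYTES];

static void mark_pages(enum shadow_state st)
{
        memset(shadow, st, sizeof(shadow));
}

static void check_read(size_t off)
{
        if (shadow[off] != SH_INITIALIZED)
                printf("WARNING: read of %s byte at offset %zu\n",
                       shadow[off] == SH_UNALLOCATED ?
                       "unallocated" : "uninitialized", off);
}

int main(void)
{
        int has_ctor = 0;

        /* A cache with a constructor hands out objects whose contents are
         * already meaningful, so they start "uninitialized" rather than
         * "unallocated" -- the same distinction the hunk above draws. */
        mark_pages(has_ctor ? SH_UNINITIALIZED : SH_UNALLOCATED);
        check_read(10);

        shadow[10] = SH_INITIALIZED;   /* simulate a tracked write */
        check_read(10);                /* no warning this time     */
        return 0;
}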
@@ -1714,6 +1659,8 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) | |||
1714 | struct page *page = virt_to_page(addr); | 1659 | struct page *page = virt_to_page(addr); |
1715 | const unsigned long nr_freed = i; | 1660 | const unsigned long nr_freed = i; |
1716 | 1661 | ||
1662 | kmemcheck_free_shadow(page, cachep->gfporder); | ||
1663 | |||
1717 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) | 1664 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) |
1718 | sub_zone_page_state(page_zone(page), | 1665 | sub_zone_page_state(page_zone(page), |
1719 | NR_SLAB_RECLAIMABLE, nr_freed); | 1666 | NR_SLAB_RECLAIMABLE, nr_freed); |
@@ -2064,10 +2011,10 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, | |||
2064 | return left_over; | 2011 | return left_over; |
2065 | } | 2012 | } |
2066 | 2013 | ||
2067 | static int __init_refok setup_cpu_cache(struct kmem_cache *cachep) | 2014 | static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) |
2068 | { | 2015 | { |
2069 | if (g_cpucache_up == FULL) | 2016 | if (g_cpucache_up == FULL) |
2070 | return enable_cpucache(cachep); | 2017 | return enable_cpucache(cachep, gfp); |
2071 | 2018 | ||
2072 | if (g_cpucache_up == NONE) { | 2019 | if (g_cpucache_up == NONE) { |
2073 | /* | 2020 | /* |
@@ -2089,7 +2036,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep) | |||
2089 | g_cpucache_up = PARTIAL_AC; | 2036 | g_cpucache_up = PARTIAL_AC; |
2090 | } else { | 2037 | } else { |
2091 | cachep->array[smp_processor_id()] = | 2038 | cachep->array[smp_processor_id()] = |
2092 | kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); | 2039 | kmalloc(sizeof(struct arraycache_init), gfp); |
2093 | 2040 | ||
2094 | if (g_cpucache_up == PARTIAL_AC) { | 2041 | if (g_cpucache_up == PARTIAL_AC) { |
2095 | set_up_list3s(cachep, SIZE_L3); | 2042 | set_up_list3s(cachep, SIZE_L3); |
@@ -2099,7 +2046,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep) | |||
2099 | for_each_online_node(node) { | 2046 | for_each_online_node(node) { |
2100 | cachep->nodelists[node] = | 2047 | cachep->nodelists[node] = |
2101 | kmalloc_node(sizeof(struct kmem_list3), | 2048 | kmalloc_node(sizeof(struct kmem_list3), |
2102 | GFP_KERNEL, node); | 2049 | gfp, node); |
2103 | BUG_ON(!cachep->nodelists[node]); | 2050 | BUG_ON(!cachep->nodelists[node]); |
2104 | kmem_list3_init(cachep->nodelists[node]); | 2051 | kmem_list3_init(cachep->nodelists[node]); |
2105 | } | 2052 | } |
@@ -2153,6 +2100,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2153 | { | 2100 | { |
2154 | size_t left_over, slab_size, ralign; | 2101 | size_t left_over, slab_size, ralign; |
2155 | struct kmem_cache *cachep = NULL, *pc; | 2102 | struct kmem_cache *cachep = NULL, *pc; |
2103 | gfp_t gfp; | ||
2156 | 2104 | ||
2157 | /* | 2105 | /* |
2158 | * Sanity checks... these are all serious usage bugs. | 2106 | * Sanity checks... these are all serious usage bugs. |
@@ -2168,8 +2116,10 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2168 | * We use cache_chain_mutex to ensure a consistent view of | 2116 | * We use cache_chain_mutex to ensure a consistent view of |
2169 | * cpu_online_mask as well. Please see cpuup_callback | 2117 | * cpu_online_mask as well. Please see cpuup_callback |
2170 | */ | 2118 | */ |
2171 | get_online_cpus(); | 2119 | if (slab_is_available()) { |
2172 | mutex_lock(&cache_chain_mutex); | 2120 | get_online_cpus(); |
2121 | mutex_lock(&cache_chain_mutex); | ||
2122 | } | ||
2173 | 2123 | ||
2174 | list_for_each_entry(pc, &cache_chain, next) { | 2124 | list_for_each_entry(pc, &cache_chain, next) { |
2175 | char tmp; | 2125 | char tmp; |
@@ -2278,8 +2228,13 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2278 | */ | 2228 | */ |
2279 | align = ralign; | 2229 | align = ralign; |
2280 | 2230 | ||
2231 | if (slab_is_available()) | ||
2232 | gfp = GFP_KERNEL; | ||
2233 | else | ||
2234 | gfp = GFP_NOWAIT; | ||
2235 | |||
2281 | /* Get cache's description obj. */ | 2236 | /* Get cache's description obj. */ |
2282 | cachep = kmem_cache_zalloc(&cache_cache, GFP_KERNEL); | 2237 | cachep = kmem_cache_zalloc(&cache_cache, gfp); |
2283 | if (!cachep) | 2238 | if (!cachep) |
2284 | goto oops; | 2239 | goto oops; |
2285 | 2240 | ||
@@ -2353,6 +2308,15 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2353 | /* really off slab. No need for manual alignment */ | 2308 | /* really off slab. No need for manual alignment */ |
2354 | slab_size = | 2309 | slab_size = |
2355 | cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab); | 2310 | cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab); |
2311 | |||
2312 | #ifdef CONFIG_PAGE_POISONING | ||
2313 | /* If we're going to use the generic kernel_map_pages() | ||
2314 | * poisoning, then it's going to smash the contents of | ||
2315 | * the redzone and userword anyhow, so switch them off. | ||
2316 | */ | ||
2317 | if (size % PAGE_SIZE == 0 && flags & SLAB_POISON) | ||
2318 | flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); | ||
2319 | #endif | ||
2356 | } | 2320 | } |
2357 | 2321 | ||
2358 | cachep->colour_off = cache_line_size(); | 2322 | cachep->colour_off = cache_line_size(); |
@@ -2382,7 +2346,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2382 | cachep->ctor = ctor; | 2346 | cachep->ctor = ctor; |
2383 | cachep->name = name; | 2347 | cachep->name = name; |
2384 | 2348 | ||
2385 | if (setup_cpu_cache(cachep)) { | 2349 | if (setup_cpu_cache(cachep, gfp)) { |
2386 | __kmem_cache_destroy(cachep); | 2350 | __kmem_cache_destroy(cachep); |
2387 | cachep = NULL; | 2351 | cachep = NULL; |
2388 | goto oops; | 2352 | goto oops; |
@@ -2394,8 +2358,10 @@ oops: | |||
2394 | if (!cachep && (flags & SLAB_PANIC)) | 2358 | if (!cachep && (flags & SLAB_PANIC)) |
2395 | panic("kmem_cache_create(): failed to create slab `%s'\n", | 2359 | panic("kmem_cache_create(): failed to create slab `%s'\n", |
2396 | name); | 2360 | name); |
2397 | mutex_unlock(&cache_chain_mutex); | 2361 | if (slab_is_available()) { |
2398 | put_online_cpus(); | 2362 | mutex_unlock(&cache_chain_mutex); |
2363 | put_online_cpus(); | ||
2364 | } | ||
2399 | return cachep; | 2365 | return cachep; |
2400 | } | 2366 | } |
2401 | EXPORT_SYMBOL(kmem_cache_create); | 2367 | EXPORT_SYMBOL(kmem_cache_create); |
@@ -2621,6 +2587,14 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, | |||
2621 | /* Slab management obj is off-slab. */ | 2587 | /* Slab management obj is off-slab. */ |
2622 | slabp = kmem_cache_alloc_node(cachep->slabp_cache, | 2588 | slabp = kmem_cache_alloc_node(cachep->slabp_cache, |
2623 | local_flags, nodeid); | 2589 | local_flags, nodeid); |
2590 | /* | ||
2591 | * If the first object in the slab is leaked (it's allocated | ||
2592 | * but no one has a reference to it), we want to make sure | ||
2593 | * kmemleak does not treat the ->s_mem pointer as a reference | ||
2594 | * to the object. Otherwise we will not report the leak. | ||
2595 | */ | ||
2596 | kmemleak_scan_area(slabp, offsetof(struct slab, list), | ||
2597 | sizeof(struct list_head), local_flags); | ||
2624 | if (!slabp) | 2598 | if (!slabp) |
2625 | return NULL; | 2599 | return NULL; |
2626 | } else { | 2600 | } else { |
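The kmemleak_scan_area() call above narrows scanning of an off-slab management structure to its list field, so its pointer to the first object is not mistaken for a live reference. Below is a toy, user-space model of that idea: a scanner that only looks for pointers inside a declared window of a block. The struct and function names are made up for the sketch, not kmemleak's API.

#include <stdio.h>
#include <string.h>

struct slab_mgmt {
        void *list_next;   /* inside the declared scan area            */
        void *list_prev;   /* inside the declared scan area            */
        void *s_mem;       /* first object -- outside, so never scanned */
};

/* Scan only [start, start+len) of the block for references to obj. */
static int scanned_reference(const void *block, size_t start, size_t len,
                             const void *obj)
{
        const unsigned char *p = (const unsigned char *)block + start;
        size_t off;

        for (off = 0; off + sizeof(void *) <= len; off += sizeof(void *)) {
                void *cand;

                memcpy(&cand, p + off, sizeof(cand));
                if (cand == obj)
                        return 1;
        }
        return 0;
}

int main(void)
{
        static int first_object;
        struct slab_mgmt mgmt = { NULL, NULL, &first_object };

        /* Scan only the two list pointers, mirroring the
         * offsetof(struct slab, list) + sizeof(struct list_head)
         * window declared in the hunk above. */
        int found = scanned_reference(&mgmt, 0, 2 * sizeof(void *),
                                      &first_object);

        printf("scanner sees a reference: %s\n", found ? "yes" : "no");
        return 0;
}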
@@ -3141,6 +3115,12 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
3141 | STATS_INC_ALLOCMISS(cachep); | 3115 | STATS_INC_ALLOCMISS(cachep); |
3142 | objp = cache_alloc_refill(cachep, flags); | 3116 | objp = cache_alloc_refill(cachep, flags); |
3143 | } | 3117 | } |
3118 | /* | ||
3119 | * To avoid a false negative, if an object that is in one of the | ||
3120 | * per-CPU caches is leaked, we need to make sure kmemleak doesn't | ||
3121 | * treat the array pointers as a reference to the object. | ||
3122 | */ | ||
3123 | kmemleak_erase(&ac->entry[ac->avail]); | ||
3144 | return objp; | 3124 | return objp; |
3145 | } | 3125 | } |
3146 | 3126 | ||
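kmemleak_erase() clears the just-consumed slot of the per-CPU array so a stale copy of the pointer cannot hide a leak from the scanner. The self-contained sketch below shows the same pattern on a toy object cache; toy_cache and toy_pop are invented names for this example only.

#include <stdio.h>
#include <stdlib.h>

struct toy_cache {
        void *entry[8];
        int   avail;
};

static void *toy_pop(struct toy_cache *ac)
{
        if (ac->avail == 0)
                return NULL;

        void *obj = ac->entry[--ac->avail];
        /* The equivalent of kmemleak_erase(): forget the stale copy so
         * only the caller's pointer keeps the object reachable. */
        ac->entry[ac->avail] = NULL;
        return obj;
}

int main(void)
{
        struct toy_cache ac = { .avail = 0 };

        ac.entry[ac.avail++] = malloc(32);

        void *obj = toy_pop(&ac);
        printf("stale slot now %p (expected nil)\n", ac.entry[0]);
        free(obj);
        return 0;
}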
@@ -3219,7 +3199,7 @@ retry: | |||
3219 | if (local_flags & __GFP_WAIT) | 3199 | if (local_flags & __GFP_WAIT) |
3220 | local_irq_enable(); | 3200 | local_irq_enable(); |
3221 | kmem_flagcheck(cache, flags); | 3201 | kmem_flagcheck(cache, flags); |
3222 | obj = kmem_getpages(cache, local_flags, -1); | 3202 | obj = kmem_getpages(cache, local_flags, numa_node_id()); |
3223 | if (local_flags & __GFP_WAIT) | 3203 | if (local_flags & __GFP_WAIT) |
3224 | local_irq_disable(); | 3204 | local_irq_disable(); |
3225 | if (obj) { | 3205 | if (obj) { |
@@ -3327,6 +3307,8 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, | |||
3327 | unsigned long save_flags; | 3307 | unsigned long save_flags; |
3328 | void *ptr; | 3308 | void *ptr; |
3329 | 3309 | ||
3310 | flags &= slab_gfp_mask; | ||
3311 | |||
3330 | lockdep_trace_alloc(flags); | 3312 | lockdep_trace_alloc(flags); |
3331 | 3313 | ||
3332 | if (slab_should_failslab(cachep, flags)) | 3314 | if (slab_should_failslab(cachep, flags)) |
@@ -3360,6 +3342,11 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, | |||
3360 | out: | 3342 | out: |
3361 | local_irq_restore(save_flags); | 3343 | local_irq_restore(save_flags); |
3362 | ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); | 3344 | ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); |
3345 | kmemleak_alloc_recursive(ptr, obj_size(cachep), 1, cachep->flags, | ||
3346 | flags); | ||
3347 | |||
3348 | if (likely(ptr)) | ||
3349 | kmemcheck_slab_alloc(cachep, flags, ptr, obj_size(cachep)); | ||
3363 | 3350 | ||
3364 | if (unlikely((flags & __GFP_ZERO) && ptr)) | 3351 | if (unlikely((flags & __GFP_ZERO) && ptr)) |
3365 | memset(ptr, 0, obj_size(cachep)); | 3352 | memset(ptr, 0, obj_size(cachep)); |
@@ -3405,6 +3392,8 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) | |||
3405 | unsigned long save_flags; | 3392 | unsigned long save_flags; |
3406 | void *objp; | 3393 | void *objp; |
3407 | 3394 | ||
3395 | flags &= slab_gfp_mask; | ||
3396 | |||
3408 | lockdep_trace_alloc(flags); | 3397 | lockdep_trace_alloc(flags); |
3409 | 3398 | ||
3410 | if (slab_should_failslab(cachep, flags)) | 3399 | if (slab_should_failslab(cachep, flags)) |
@@ -3415,8 +3404,13 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) | |||
3415 | objp = __do_cache_alloc(cachep, flags); | 3404 | objp = __do_cache_alloc(cachep, flags); |
3416 | local_irq_restore(save_flags); | 3405 | local_irq_restore(save_flags); |
3417 | objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); | 3406 | objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); |
3407 | kmemleak_alloc_recursive(objp, obj_size(cachep), 1, cachep->flags, | ||
3408 | flags); | ||
3418 | prefetchw(objp); | 3409 | prefetchw(objp); |
3419 | 3410 | ||
3411 | if (likely(objp)) | ||
3412 | kmemcheck_slab_alloc(cachep, flags, objp, obj_size(cachep)); | ||
3413 | |||
3420 | if (unlikely((flags & __GFP_ZERO) && objp)) | 3414 | if (unlikely((flags & __GFP_ZERO) && objp)) |
3421 | memset(objp, 0, obj_size(cachep)); | 3415 | memset(objp, 0, obj_size(cachep)); |
3422 | 3416 | ||
@@ -3530,8 +3524,11 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp) | |||
3530 | struct array_cache *ac = cpu_cache_get(cachep); | 3524 | struct array_cache *ac = cpu_cache_get(cachep); |
3531 | 3525 | ||
3532 | check_irq_off(); | 3526 | check_irq_off(); |
3527 | kmemleak_free_recursive(objp, cachep->flags); | ||
3533 | objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); | 3528 | objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); |
3534 | 3529 | ||
3530 | kmemcheck_slab_free(cachep, objp, obj_size(cachep)); | ||
3531 | |||
3535 | /* | 3532 | /* |
3536 | * Skip calling cache_free_alien() when the platform is not numa. | 3533 | * Skip calling cache_free_alien() when the platform is not numa. |
3537 | * This will avoid cache misses that happen while accessing slabp (which | 3534 | * This will avoid cache misses that happen while accessing slabp (which |
@@ -3539,7 +3536,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp) | |||
3539 | * variable to skip the call, which is most likely to be present in | 3536 | * variable to skip the call, which is most likely to be present in |
3540 | * the cache. | 3537 | * the cache. |
3541 | */ | 3538 | */ |
3542 | if (numa_platform && cache_free_alien(cachep, objp)) | 3539 | if (nr_online_nodes > 1 && cache_free_alien(cachep, objp)) |
3543 | return; | 3540 | return; |
3544 | 3541 | ||
3545 | if (likely(ac->avail < ac->limit)) { | 3542 | if (likely(ac->avail < ac->limit)) { |
@@ -3802,7 +3799,7 @@ EXPORT_SYMBOL_GPL(kmem_cache_name); | |||
3802 | /* | 3799 | /* |
3803 | * This initializes kmem_list3 or resizes various caches for all nodes. | 3800 | * This initializes kmem_list3 or resizes various caches for all nodes. |
3804 | */ | 3801 | */ |
3805 | static int alloc_kmemlist(struct kmem_cache *cachep) | 3802 | static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp) |
3806 | { | 3803 | { |
3807 | int node; | 3804 | int node; |
3808 | struct kmem_list3 *l3; | 3805 | struct kmem_list3 *l3; |
@@ -3812,7 +3809,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep) | |||
3812 | for_each_online_node(node) { | 3809 | for_each_online_node(node) { |
3813 | 3810 | ||
3814 | if (use_alien_caches) { | 3811 | if (use_alien_caches) { |
3815 | new_alien = alloc_alien_cache(node, cachep->limit); | 3812 | new_alien = alloc_alien_cache(node, cachep->limit, gfp); |
3816 | if (!new_alien) | 3813 | if (!new_alien) |
3817 | goto fail; | 3814 | goto fail; |
3818 | } | 3815 | } |
@@ -3821,7 +3818,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep) | |||
3821 | if (cachep->shared) { | 3818 | if (cachep->shared) { |
3822 | new_shared = alloc_arraycache(node, | 3819 | new_shared = alloc_arraycache(node, |
3823 | cachep->shared*cachep->batchcount, | 3820 | cachep->shared*cachep->batchcount, |
3824 | 0xbaadf00d); | 3821 | 0xbaadf00d, gfp); |
3825 | if (!new_shared) { | 3822 | if (!new_shared) { |
3826 | free_alien_cache(new_alien); | 3823 | free_alien_cache(new_alien); |
3827 | goto fail; | 3824 | goto fail; |
@@ -3850,7 +3847,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep) | |||
3850 | free_alien_cache(new_alien); | 3847 | free_alien_cache(new_alien); |
3851 | continue; | 3848 | continue; |
3852 | } | 3849 | } |
3853 | l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node); | 3850 | l3 = kmalloc_node(sizeof(struct kmem_list3), gfp, node); |
3854 | if (!l3) { | 3851 | if (!l3) { |
3855 | free_alien_cache(new_alien); | 3852 | free_alien_cache(new_alien); |
3856 | kfree(new_shared); | 3853 | kfree(new_shared); |
@@ -3906,18 +3903,18 @@ static void do_ccupdate_local(void *info) | |||
3906 | 3903 | ||
3907 | /* Always called with the cache_chain_mutex held */ | 3904 | /* Always called with the cache_chain_mutex held */ |
3908 | static int do_tune_cpucache(struct kmem_cache *cachep, int limit, | 3905 | static int do_tune_cpucache(struct kmem_cache *cachep, int limit, |
3909 | int batchcount, int shared) | 3906 | int batchcount, int shared, gfp_t gfp) |
3910 | { | 3907 | { |
3911 | struct ccupdate_struct *new; | 3908 | struct ccupdate_struct *new; |
3912 | int i; | 3909 | int i; |
3913 | 3910 | ||
3914 | new = kzalloc(sizeof(*new), GFP_KERNEL); | 3911 | new = kzalloc(sizeof(*new), gfp); |
3915 | if (!new) | 3912 | if (!new) |
3916 | return -ENOMEM; | 3913 | return -ENOMEM; |
3917 | 3914 | ||
3918 | for_each_online_cpu(i) { | 3915 | for_each_online_cpu(i) { |
3919 | new->new[i] = alloc_arraycache(cpu_to_node(i), limit, | 3916 | new->new[i] = alloc_arraycache(cpu_to_node(i), limit, |
3920 | batchcount); | 3917 | batchcount, gfp); |
3921 | if (!new->new[i]) { | 3918 | if (!new->new[i]) { |
3922 | for (i--; i >= 0; i--) | 3919 | for (i--; i >= 0; i--) |
3923 | kfree(new->new[i]); | 3920 | kfree(new->new[i]); |
@@ -3944,11 +3941,11 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, | |||
3944 | kfree(ccold); | 3941 | kfree(ccold); |
3945 | } | 3942 | } |
3946 | kfree(new); | 3943 | kfree(new); |
3947 | return alloc_kmemlist(cachep); | 3944 | return alloc_kmemlist(cachep, gfp); |
3948 | } | 3945 | } |
3949 | 3946 | ||
3950 | /* Called with cache_chain_mutex held always */ | 3947 | /* Called with cache_chain_mutex held always */ |
3951 | static int enable_cpucache(struct kmem_cache *cachep) | 3948 | static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) |
3952 | { | 3949 | { |
3953 | int err; | 3950 | int err; |
3954 | int limit, shared; | 3951 | int limit, shared; |
@@ -3994,7 +3991,7 @@ static int enable_cpucache(struct kmem_cache *cachep) | |||
3994 | if (limit > 32) | 3991 | if (limit > 32) |
3995 | limit = 32; | 3992 | limit = 32; |
3996 | #endif | 3993 | #endif |
3997 | err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared); | 3994 | err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared, gfp); |
3998 | if (err) | 3995 | if (err) |
3999 | printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", | 3996 | printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", |
4000 | cachep->name, -err); | 3997 | cachep->name, -err); |
@@ -4300,7 +4297,8 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer, | |||
4300 | res = 0; | 4297 | res = 0; |
4301 | } else { | 4298 | } else { |
4302 | res = do_tune_cpucache(cachep, limit, | 4299 | res = do_tune_cpucache(cachep, limit, |
4303 | batchcount, shared); | 4300 | batchcount, shared, |
4301 | GFP_KERNEL); | ||
4304 | } | 4302 | } |
4305 | break; | 4303 | break; |
4306 | } | 4304 | } |
diff --git a/mm/slob.c b/mm/slob.c --- a/mm/slob.c +++ b/mm/slob.c | |||
@@ -46,7 +46,7 @@ | |||
46 | * NUMA support in SLOB is fairly simplistic, pushing most of the real | 46 | * NUMA support in SLOB is fairly simplistic, pushing most of the real |
47 | * logic down to the page allocator, and simply doing the node accounting | 47 | * logic down to the page allocator, and simply doing the node accounting |
48 | * on the upper levels. In the event that a node id is explicitly | 48 | * on the upper levels. In the event that a node id is explicitly |
49 | * provided, alloc_pages_node() with the specified node id is used | 49 | * provided, alloc_pages_exact_node() with the specified node id is used |
50 | * instead. The common case (or when the node id isn't explicitly provided) | 50 | * instead. The common case (or when the node id isn't explicitly provided) |
51 | * will default to the current node, as per numa_node_id(). | 51 | * will default to the current node, as per numa_node_id(). |
52 | * | 52 | * |
@@ -66,7 +66,8 @@ | |||
66 | #include <linux/module.h> | 66 | #include <linux/module.h> |
67 | #include <linux/rcupdate.h> | 67 | #include <linux/rcupdate.h> |
68 | #include <linux/list.h> | 68 | #include <linux/list.h> |
69 | #include <trace/kmemtrace.h> | 69 | #include <linux/kmemtrace.h> |
70 | #include <linux/kmemleak.h> | ||
70 | #include <asm/atomic.h> | 71 | #include <asm/atomic.h> |
71 | 72 | ||
72 | /* | 73 | /* |
@@ -132,17 +133,17 @@ static LIST_HEAD(free_slob_large); | |||
132 | */ | 133 | */ |
133 | static inline int is_slob_page(struct slob_page *sp) | 134 | static inline int is_slob_page(struct slob_page *sp) |
134 | { | 135 | { |
135 | return PageSlobPage((struct page *)sp); | 136 | return PageSlab((struct page *)sp); |
136 | } | 137 | } |
137 | 138 | ||
138 | static inline void set_slob_page(struct slob_page *sp) | 139 | static inline void set_slob_page(struct slob_page *sp) |
139 | { | 140 | { |
140 | __SetPageSlobPage((struct page *)sp); | 141 | __SetPageSlab((struct page *)sp); |
141 | } | 142 | } |
142 | 143 | ||
143 | static inline void clear_slob_page(struct slob_page *sp) | 144 | static inline void clear_slob_page(struct slob_page *sp) |
144 | { | 145 | { |
145 | __ClearPageSlobPage((struct page *)sp); | 146 | __ClearPageSlab((struct page *)sp); |
146 | } | 147 | } |
147 | 148 | ||
148 | static inline struct slob_page *slob_page(const void *addr) | 149 | static inline struct slob_page *slob_page(const void *addr) |
@@ -243,7 +244,7 @@ static void *slob_new_pages(gfp_t gfp, int order, int node) | |||
243 | 244 | ||
244 | #ifdef CONFIG_NUMA | 245 | #ifdef CONFIG_NUMA |
245 | if (node != -1) | 246 | if (node != -1) |
246 | page = alloc_pages_node(node, gfp, order); | 247 | page = alloc_pages_exact_node(node, gfp, order); |
247 | else | 248 | else |
248 | #endif | 249 | #endif |
249 | page = alloc_pages(gfp, order); | 250 | page = alloc_pages(gfp, order); |
@@ -509,6 +510,7 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node) | |||
509 | size, PAGE_SIZE << order, gfp, node); | 510 | size, PAGE_SIZE << order, gfp, node); |
510 | } | 511 | } |
511 | 512 | ||
513 | kmemleak_alloc(ret, size, 1, gfp); | ||
512 | return ret; | 514 | return ret; |
513 | } | 515 | } |
514 | EXPORT_SYMBOL(__kmalloc_node); | 516 | EXPORT_SYMBOL(__kmalloc_node); |
@@ -521,6 +523,7 @@ void kfree(const void *block) | |||
521 | 523 | ||
522 | if (unlikely(ZERO_OR_NULL_PTR(block))) | 524 | if (unlikely(ZERO_OR_NULL_PTR(block))) |
523 | return; | 525 | return; |
526 | kmemleak_free(block); | ||
524 | 527 | ||
525 | sp = slob_page(block); | 528 | sp = slob_page(block); |
526 | if (is_slob_page(sp)) { | 529 | if (is_slob_page(sp)) { |
@@ -584,12 +587,14 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, | |||
584 | } else if (flags & SLAB_PANIC) | 587 | } else if (flags & SLAB_PANIC) |
585 | panic("Cannot create slab cache %s\n", name); | 588 | panic("Cannot create slab cache %s\n", name); |
586 | 589 | ||
590 | kmemleak_alloc(c, sizeof(struct kmem_cache), 1, GFP_KERNEL); | ||
587 | return c; | 591 | return c; |
588 | } | 592 | } |
589 | EXPORT_SYMBOL(kmem_cache_create); | 593 | EXPORT_SYMBOL(kmem_cache_create); |
590 | 594 | ||
591 | void kmem_cache_destroy(struct kmem_cache *c) | 595 | void kmem_cache_destroy(struct kmem_cache *c) |
592 | { | 596 | { |
597 | kmemleak_free(c); | ||
593 | slob_free(c, sizeof(struct kmem_cache)); | 598 | slob_free(c, sizeof(struct kmem_cache)); |
594 | } | 599 | } |
595 | EXPORT_SYMBOL(kmem_cache_destroy); | 600 | EXPORT_SYMBOL(kmem_cache_destroy); |
@@ -613,6 +618,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) | |||
613 | if (c->ctor) | 618 | if (c->ctor) |
614 | c->ctor(b); | 619 | c->ctor(b); |
615 | 620 | ||
621 | kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags); | ||
616 | return b; | 622 | return b; |
617 | } | 623 | } |
618 | EXPORT_SYMBOL(kmem_cache_alloc_node); | 624 | EXPORT_SYMBOL(kmem_cache_alloc_node); |
@@ -635,6 +641,7 @@ static void kmem_rcu_free(struct rcu_head *head) | |||
635 | 641 | ||
636 | void kmem_cache_free(struct kmem_cache *c, void *b) | 642 | void kmem_cache_free(struct kmem_cache *c, void *b) |
637 | { | 643 | { |
644 | kmemleak_free_recursive(b, c->flags); | ||
638 | if (unlikely(c->flags & SLAB_DESTROY_BY_RCU)) { | 645 | if (unlikely(c->flags & SLAB_DESTROY_BY_RCU)) { |
639 | struct slob_rcu *slob_rcu; | 646 | struct slob_rcu *slob_rcu; |
640 | slob_rcu = b + (c->size - sizeof(struct slob_rcu)); | 647 | slob_rcu = b + (c->size - sizeof(struct slob_rcu)); |
diff --git a/mm/slub.c b/mm/slub.c --- a/mm/slub.c +++ b/mm/slub.c | |||
@@ -17,9 +17,11 @@ | |||
17 | #include <linux/slab.h> | 17 | #include <linux/slab.h> |
18 | #include <linux/proc_fs.h> | 18 | #include <linux/proc_fs.h> |
19 | #include <linux/seq_file.h> | 19 | #include <linux/seq_file.h> |
20 | #include <trace/kmemtrace.h> | 20 | #include <linux/kmemtrace.h> |
21 | #include <linux/kmemcheck.h> | ||
21 | #include <linux/cpu.h> | 22 | #include <linux/cpu.h> |
22 | #include <linux/cpuset.h> | 23 | #include <linux/cpuset.h> |
24 | #include <linux/kmemleak.h> | ||
23 | #include <linux/mempolicy.h> | 25 | #include <linux/mempolicy.h> |
24 | #include <linux/ctype.h> | 26 | #include <linux/ctype.h> |
25 | #include <linux/debugobjects.h> | 27 | #include <linux/debugobjects.h> |
@@ -143,10 +145,10 @@ | |||
143 | * Set of flags that will prevent slab merging | 145 | * Set of flags that will prevent slab merging |
144 | */ | 146 | */ |
145 | #define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ | 147 | #define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ |
146 | SLAB_TRACE | SLAB_DESTROY_BY_RCU) | 148 | SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE) |
147 | 149 | ||
148 | #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ | 150 | #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ |
149 | SLAB_CACHE_DMA) | 151 | SLAB_CACHE_DMA | SLAB_NOTRACK) |
150 | 152 | ||
151 | #ifndef ARCH_KMALLOC_MINALIGN | 153 | #ifndef ARCH_KMALLOC_MINALIGN |
152 | #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) | 154 | #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) |
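Adding SLAB_NOLEAKTRACE to SLUB_NEVER_MERGE and SLAB_NOTRACK to SLUB_MERGE_SAME changes which caches may be aliased. The sketch below reimplements just that flag test with made-up bit values, to show how the two sets interact; it is not the kernel's cache-merging code.

#include <stdio.h>

#define F_RED_ZONE     0x01u
#define F_NOLEAKTRACE  0x02u   /* newly added to the never-merge set */
#define F_CACHE_DMA    0x04u
#define F_NOTRACK      0x08u   /* newly added to the must-match set  */

#define NEVER_MERGE  (F_RED_ZONE | F_NOLEAKTRACE)
#define MERGE_SAME   (F_CACHE_DMA | F_NOTRACK)

/* Two caches may merge only if neither carries a never-merge flag and
 * they agree on every must-match flag. */
static int can_merge(unsigned int a, unsigned int b)
{
        if ((a | b) & NEVER_MERGE)
                return 0;
        return (a & MERGE_SAME) == (b & MERGE_SAME);
}

int main(void)
{
        printf("%d\n", can_merge(F_CACHE_DMA, F_CACHE_DMA));             /* 1 */
        printf("%d\n", can_merge(F_CACHE_DMA, F_CACHE_DMA | F_NOTRACK)); /* 0 */
        printf("%d\n", can_merge(F_NOLEAKTRACE, 0));                     /* 0 */
        return 0;
}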
@@ -177,6 +179,12 @@ static enum { | |||
177 | SYSFS /* Sysfs up */ | 179 | SYSFS /* Sysfs up */ |
178 | } slab_state = DOWN; | 180 | } slab_state = DOWN; |
179 | 181 | ||
182 | /* | ||
183 | * The slab allocator is initialized with interrupts disabled. Therefore, make | ||
184 | * sure early boot allocations don't accidentally enable interrupts. | ||
185 | */ | ||
186 | static gfp_t slab_gfp_mask __read_mostly = SLAB_GFP_BOOT_MASK; | ||
187 | |||
180 | /* A list of all slab caches on the system */ | 188 | /* A list of all slab caches on the system */ |
181 | static DECLARE_RWSEM(slub_lock); | 189 | static DECLARE_RWSEM(slub_lock); |
182 | static LIST_HEAD(slab_caches); | 190 | static LIST_HEAD(slab_caches); |
@@ -1071,6 +1079,8 @@ static inline struct page *alloc_slab_page(gfp_t flags, int node, | |||
1071 | { | 1079 | { |
1072 | int order = oo_order(oo); | 1080 | int order = oo_order(oo); |
1073 | 1081 | ||
1082 | flags |= __GFP_NOTRACK; | ||
1083 | |||
1074 | if (node == -1) | 1084 | if (node == -1) |
1075 | return alloc_pages(flags, order); | 1085 | return alloc_pages(flags, order); |
1076 | else | 1086 | else |
@@ -1098,6 +1108,24 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1098 | 1108 | ||
1099 | stat(get_cpu_slab(s, raw_smp_processor_id()), ORDER_FALLBACK); | 1109 | stat(get_cpu_slab(s, raw_smp_processor_id()), ORDER_FALLBACK); |
1100 | } | 1110 | } |
1111 | |||
1112 | if (kmemcheck_enabled | ||
1113 | && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) | ||
1114 | { | ||
1115 | int pages = 1 << oo_order(oo); | ||
1116 | |||
1117 | kmemcheck_alloc_shadow(page, oo_order(oo), flags, node); | ||
1118 | |||
1119 | /* | ||
1120 | * Objects from caches that have a constructor don't get | ||
1121 | * cleared when they're allocated, so we need to do it here. | ||
1122 | */ | ||
1123 | if (s->ctor) | ||
1124 | kmemcheck_mark_uninitialized_pages(page, pages); | ||
1125 | else | ||
1126 | kmemcheck_mark_unallocated_pages(page, pages); | ||
1127 | } | ||
1128 | |||
1101 | page->objects = oo_objects(oo); | 1129 | page->objects = oo_objects(oo); |
1102 | mod_zone_page_state(page_zone(page), | 1130 | mod_zone_page_state(page_zone(page), |
1103 | (s->flags & SLAB_RECLAIM_ACCOUNT) ? | 1131 | (s->flags & SLAB_RECLAIM_ACCOUNT) ? |
@@ -1171,6 +1199,8 @@ static void __free_slab(struct kmem_cache *s, struct page *page) | |||
1171 | __ClearPageSlubDebug(page); | 1199 | __ClearPageSlubDebug(page); |
1172 | } | 1200 | } |
1173 | 1201 | ||
1202 | kmemcheck_free_shadow(page, compound_order(page)); | ||
1203 | |||
1174 | mod_zone_page_state(page_zone(page), | 1204 | mod_zone_page_state(page_zone(page), |
1175 | (s->flags & SLAB_RECLAIM_ACCOUNT) ? | 1205 | (s->flags & SLAB_RECLAIM_ACCOUNT) ? |
1176 | NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, | 1206 | NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, |
@@ -1662,6 +1692,8 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, | |||
1662 | unsigned long flags; | 1692 | unsigned long flags; |
1663 | unsigned int objsize; | 1693 | unsigned int objsize; |
1664 | 1694 | ||
1695 | gfpflags &= slab_gfp_mask; | ||
1696 | |||
1665 | lockdep_trace_alloc(gfpflags); | 1697 | lockdep_trace_alloc(gfpflags); |
1666 | might_sleep_if(gfpflags & __GFP_WAIT); | 1698 | might_sleep_if(gfpflags & __GFP_WAIT); |
1667 | 1699 | ||
@@ -1685,6 +1717,9 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, | |||
1685 | if (unlikely((gfpflags & __GFP_ZERO) && object)) | 1717 | if (unlikely((gfpflags & __GFP_ZERO) && object)) |
1686 | memset(object, 0, objsize); | 1718 | memset(object, 0, objsize); |
1687 | 1719 | ||
1720 | kmemcheck_slab_alloc(s, gfpflags, object, c->objsize); | ||
1721 | kmemleak_alloc_recursive(object, objsize, 1, s->flags, gfpflags); | ||
1722 | |||
1688 | return object; | 1723 | return object; |
1689 | } | 1724 | } |
1690 | 1725 | ||
@@ -1814,8 +1849,10 @@ static __always_inline void slab_free(struct kmem_cache *s, | |||
1814 | struct kmem_cache_cpu *c; | 1849 | struct kmem_cache_cpu *c; |
1815 | unsigned long flags; | 1850 | unsigned long flags; |
1816 | 1851 | ||
1852 | kmemleak_free_recursive(x, s->flags); | ||
1817 | local_irq_save(flags); | 1853 | local_irq_save(flags); |
1818 | c = get_cpu_slab(s, smp_processor_id()); | 1854 | c = get_cpu_slab(s, smp_processor_id()); |
1855 | kmemcheck_slab_free(s, object, c->objsize); | ||
1819 | debug_check_no_locks_freed(object, c->objsize); | 1856 | debug_check_no_locks_freed(object, c->objsize); |
1820 | if (!(s->flags & SLAB_DEBUG_OBJECTS)) | 1857 | if (!(s->flags & SLAB_DEBUG_OBJECTS)) |
1821 | debug_check_no_obj_freed(object, c->objsize); | 1858 | debug_check_no_obj_freed(object, c->objsize); |
@@ -2625,13 +2662,16 @@ static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s, | |||
2625 | if (gfp_flags & SLUB_DMA) | 2662 | if (gfp_flags & SLUB_DMA) |
2626 | flags = SLAB_CACHE_DMA; | 2663 | flags = SLAB_CACHE_DMA; |
2627 | 2664 | ||
2628 | down_write(&slub_lock); | 2665 | /* |
2666 | * This function is called with IRQs disabled during early boot on a | ||
2667 | * single CPU, so there's no need to take slub_lock here. | ||
2668 | */ | ||
2629 | if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN, | 2669 | if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN, |
2630 | flags, NULL)) | 2670 | flags, NULL)) |
2631 | goto panic; | 2671 | goto panic; |
2632 | 2672 | ||
2633 | list_add(&s->list, &slab_caches); | 2673 | list_add(&s->list, &slab_caches); |
2634 | up_write(&slub_lock); | 2674 | |
2635 | if (sysfs_slab_add(s)) | 2675 | if (sysfs_slab_add(s)) |
2636 | goto panic; | 2676 | goto panic; |
2637 | return s; | 2677 | return s; |
@@ -2687,7 +2727,8 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) | |||
2687 | 2727 | ||
2688 | if (!s || !text || !kmem_cache_open(s, flags, text, | 2728 | if (!s || !text || !kmem_cache_open(s, flags, text, |
2689 | realsize, ARCH_KMALLOC_MINALIGN, | 2729 | realsize, ARCH_KMALLOC_MINALIGN, |
2690 | SLAB_CACHE_DMA|__SYSFS_ADD_DEFERRED, NULL)) { | 2730 | SLAB_CACHE_DMA|SLAB_NOTRACK|__SYSFS_ADD_DEFERRED, |
2731 | NULL)) { | ||
2691 | kfree(s); | 2732 | kfree(s); |
2692 | kfree(text); | 2733 | kfree(text); |
2693 | goto unlock_out; | 2734 | goto unlock_out; |
@@ -2781,9 +2822,10 @@ EXPORT_SYMBOL(__kmalloc); | |||
2781 | 2822 | ||
2782 | static void *kmalloc_large_node(size_t size, gfp_t flags, int node) | 2823 | static void *kmalloc_large_node(size_t size, gfp_t flags, int node) |
2783 | { | 2824 | { |
2784 | struct page *page = alloc_pages_node(node, flags | __GFP_COMP, | 2825 | struct page *page; |
2785 | get_order(size)); | ||
2786 | 2826 | ||
2827 | flags |= __GFP_COMP | __GFP_NOTRACK; | ||
2828 | page = alloc_pages_node(node, flags, get_order(size)); | ||
2787 | if (page) | 2829 | if (page) |
2788 | return page_address(page); | 2830 | return page_address(page); |
2789 | else | 2831 | else |
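kmalloc_large_node() now ORs in __GFP_COMP and __GFP_NOTRACK and goes straight to the page allocator at get_order(size). The order computation itself is easy to reproduce; the sketch below does so in user space with a stand-in my_get_order(), assuming 4 KiB pages -- it is an illustration, not the kernel's get_order().

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

/* Smallest order such that (1 << order) pages cover `size` bytes. */
static int my_get_order(unsigned long size)
{
        unsigned long pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
        int order = 0;

        while ((1UL << order) < pages)
                order++;
        return order;
}

int main(void)
{
        printf("8 KiB   -> order %d\n", my_get_order(8 * 1024));    /* 1 */
        printf("100 KiB -> order %d\n", my_get_order(100 * 1024));  /* 5 */
        return 0;
}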
@@ -3089,7 +3131,7 @@ void __init kmem_cache_init(void) | |||
3089 | * kmem_cache_open for slab_state == DOWN. | 3131 | * kmem_cache_open for slab_state == DOWN. |
3090 | */ | 3132 | */ |
3091 | create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", | 3133 | create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", |
3092 | sizeof(struct kmem_cache_node), GFP_KERNEL); | 3134 | sizeof(struct kmem_cache_node), GFP_NOWAIT); |
3093 | kmalloc_caches[0].refcount = -1; | 3135 | kmalloc_caches[0].refcount = -1; |
3094 | caches++; | 3136 | caches++; |
3095 | 3137 | ||
@@ -3102,16 +3144,16 @@ void __init kmem_cache_init(void) | |||
3102 | /* Caches that are not of the two-to-the-power-of size */ | 3144 | /* Caches that are not of the two-to-the-power-of size */ |
3103 | if (KMALLOC_MIN_SIZE <= 64) { | 3145 | if (KMALLOC_MIN_SIZE <= 64) { |
3104 | create_kmalloc_cache(&kmalloc_caches[1], | 3146 | create_kmalloc_cache(&kmalloc_caches[1], |
3105 | "kmalloc-96", 96, GFP_KERNEL); | 3147 | "kmalloc-96", 96, GFP_NOWAIT); |
3106 | caches++; | 3148 | caches++; |
3107 | create_kmalloc_cache(&kmalloc_caches[2], | 3149 | create_kmalloc_cache(&kmalloc_caches[2], |
3108 | "kmalloc-192", 192, GFP_KERNEL); | 3150 | "kmalloc-192", 192, GFP_NOWAIT); |
3109 | caches++; | 3151 | caches++; |
3110 | } | 3152 | } |
3111 | 3153 | ||
3112 | for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { | 3154 | for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { |
3113 | create_kmalloc_cache(&kmalloc_caches[i], | 3155 | create_kmalloc_cache(&kmalloc_caches[i], |
3114 | "kmalloc", 1 << i, GFP_KERNEL); | 3156 | "kmalloc", 1 << i, GFP_NOWAIT); |
3115 | caches++; | 3157 | caches++; |
3116 | } | 3158 | } |
3117 | 3159 | ||
@@ -3148,7 +3190,7 @@ void __init kmem_cache_init(void) | |||
3148 | /* Provide the correct kmalloc names now that the caches are up */ | 3190 | /* Provide the correct kmalloc names now that the caches are up */ |
3149 | for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) | 3191 | for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) |
3150 | kmalloc_caches[i]. name = | 3192 | kmalloc_caches[i]. name = |
3151 | kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i); | 3193 | kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i); |
3152 | 3194 | ||
3153 | #ifdef CONFIG_SMP | 3195 | #ifdef CONFIG_SMP |
3154 | register_cpu_notifier(&slab_notifier); | 3196 | register_cpu_notifier(&slab_notifier); |
@@ -3166,6 +3208,14 @@ void __init kmem_cache_init(void) | |||
3166 | nr_cpu_ids, nr_node_ids); | 3208 | nr_cpu_ids, nr_node_ids); |
3167 | } | 3209 | } |
3168 | 3210 | ||
3211 | void __init kmem_cache_init_late(void) | ||
3212 | { | ||
3213 | /* | ||
3214 | * Interrupts are enabled now so all GFP allocations are safe. | ||
3215 | */ | ||
3216 | slab_gfp_mask = __GFP_BITS_MASK; | ||
3217 | } | ||
3218 | |||
3169 | /* | 3219 | /* |
3170 | * Find a mergeable slab cache | 3220 | * Find a mergeable slab cache |
3171 | */ | 3221 | */ |
@@ -3764,7 +3814,7 @@ static int list_locations(struct kmem_cache *s, char *buf, | |||
3764 | to_cpumask(l->cpus)); | 3814 | to_cpumask(l->cpus)); |
3765 | } | 3815 | } |
3766 | 3816 | ||
3767 | if (num_online_nodes() > 1 && !nodes_empty(l->nodes) && | 3817 | if (nr_online_nodes > 1 && !nodes_empty(l->nodes) && |
3768 | len < PAGE_SIZE - 60) { | 3818 | len < PAGE_SIZE - 60) { |
3769 | len += sprintf(buf + len, " nodes="); | 3819 | len += sprintf(buf + len, " nodes="); |
3770 | len += nodelist_scnprintf(buf + len, PAGE_SIZE - len - 50, | 3820 | len += nodelist_scnprintf(buf + len, PAGE_SIZE - len - 50, |
@@ -4439,6 +4489,8 @@ static char *create_unique_id(struct kmem_cache *s) | |||
4439 | *p++ = 'a'; | 4489 | *p++ = 'a'; |
4440 | if (s->flags & SLAB_DEBUG_FREE) | 4490 | if (s->flags & SLAB_DEBUG_FREE) |
4441 | *p++ = 'F'; | 4491 | *p++ = 'F'; |
4492 | if (!(s->flags & SLAB_NOTRACK)) | ||
4493 | *p++ = 't'; | ||
4442 | if (p != name + 1) | 4494 | if (p != name + 1) |
4443 | *p++ = '-'; | 4495 | *p++ = '-'; |
4444 | p += sprintf(p, "%07d", s->size); | 4496 | p += sprintf(p, "%07d", s->size); |
diff --git a/mm/swap_state.c b/mm/swap_state.c index 3ecea98ecb45..42cd38eba79f 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -109,8 +109,6 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) | |||
109 | */ | 109 | */ |
110 | void __delete_from_swap_cache(struct page *page) | 110 | void __delete_from_swap_cache(struct page *page) |
111 | { | 111 | { |
112 | swp_entry_t ent = {.val = page_private(page)}; | ||
113 | |||
114 | VM_BUG_ON(!PageLocked(page)); | 112 | VM_BUG_ON(!PageLocked(page)); |
115 | VM_BUG_ON(!PageSwapCache(page)); | 113 | VM_BUG_ON(!PageSwapCache(page)); |
116 | VM_BUG_ON(PageWriteback(page)); | 114 | VM_BUG_ON(PageWriteback(page)); |
@@ -121,13 +119,11 @@ void __delete_from_swap_cache(struct page *page) | |||
121 | total_swapcache_pages--; | 119 | total_swapcache_pages--; |
122 | __dec_zone_page_state(page, NR_FILE_PAGES); | 120 | __dec_zone_page_state(page, NR_FILE_PAGES); |
123 | INC_CACHE_INFO(del_total); | 121 | INC_CACHE_INFO(del_total); |
124 | mem_cgroup_uncharge_swapcache(page, ent); | ||
125 | } | 122 | } |
126 | 123 | ||
127 | /** | 124 | /** |
128 | * add_to_swap - allocate swap space for a page | 125 | * add_to_swap - allocate swap space for a page |
129 | * @page: page we want to move to swap | 126 | * @page: page we want to move to swap |
130 | * @gfp_mask: memory allocation flags | ||
131 | * | 127 | * |
132 | * Allocate swap space for the page and add the page to the | 128 | * Allocate swap space for the page and add the page to the |
133 | * swap cache. Caller needs to hold the page lock. | 129 | * swap cache. Caller needs to hold the page lock. |
@@ -165,11 +161,11 @@ int add_to_swap(struct page *page) | |||
165 | return 1; | 161 | return 1; |
166 | case -EEXIST: | 162 | case -EEXIST: |
167 | /* Raced with "speculative" read_swap_cache_async */ | 163 | /* Raced with "speculative" read_swap_cache_async */ |
168 | swap_free(entry); | 164 | swapcache_free(entry, NULL); |
169 | continue; | 165 | continue; |
170 | default: | 166 | default: |
171 | /* -ENOMEM radix-tree allocation failure */ | 167 | /* -ENOMEM radix-tree allocation failure */ |
172 | swap_free(entry); | 168 | swapcache_free(entry, NULL); |
173 | return 0; | 169 | return 0; |
174 | } | 170 | } |
175 | } | 171 | } |
@@ -191,7 +187,7 @@ void delete_from_swap_cache(struct page *page) | |||
191 | __delete_from_swap_cache(page); | 187 | __delete_from_swap_cache(page); |
192 | spin_unlock_irq(&swapper_space.tree_lock); | 188 | spin_unlock_irq(&swapper_space.tree_lock); |
193 | 189 | ||
194 | swap_free(entry); | 190 | swapcache_free(entry, page); |
195 | page_cache_release(page); | 191 | page_cache_release(page); |
196 | } | 192 | } |
197 | 193 | ||
@@ -295,7 +291,10 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, | |||
295 | /* | 291 | /* |
296 | * Swap entry may have been freed since our caller observed it. | 292 | * Swap entry may have been freed since our caller observed it. |
297 | */ | 293 | */ |
298 | if (!swap_duplicate(entry)) | 294 | err = swapcache_prepare(entry); |
295 | if (err == -EEXIST) /* seems racy */ | ||
296 | continue; | ||
297 | if (err) /* swp entry is obsolete ? */ | ||
299 | break; | 298 | break; |
300 | 299 | ||
301 | /* | 300 | /* |
@@ -314,12 +313,12 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, | |||
314 | * Initiate read into locked page and return. | 313 | * Initiate read into locked page and return. |
315 | */ | 314 | */ |
316 | lru_cache_add_anon(new_page); | 315 | lru_cache_add_anon(new_page); |
317 | swap_readpage(NULL, new_page); | 316 | swap_readpage(new_page); |
318 | return new_page; | 317 | return new_page; |
319 | } | 318 | } |
320 | ClearPageSwapBacked(new_page); | 319 | ClearPageSwapBacked(new_page); |
321 | __clear_page_locked(new_page); | 320 | __clear_page_locked(new_page); |
322 | swap_free(entry); | 321 | swapcache_free(entry, NULL); |
323 | } while (err != -ENOMEM); | 322 | } while (err != -ENOMEM); |
324 | 323 | ||
325 | if (new_page) | 324 | if (new_page) |
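read_swap_cache_async() now distinguishes a transient race (swapcache_prepare() returning -EEXIST, retry) from a stale entry (any other error, stop). The snippet below reproduces just that control flow in isolation; fake_prepare() is a stub, invented for the sketch, that simulates losing the race once.

#include <stdio.h>
#include <errno.h>

static int attempts;

/* First call pretends another CPU won the race; second call succeeds. */
static int fake_prepare(void)
{
        return (attempts++ == 0) ? -EEXIST : 0;
}

int main(void)
{
        int err;

        do {
                err = fake_prepare();
                if (err == -EEXIST) {          /* raced: try again */
                        printf("raced, retrying\n");
                        continue;
                }
                if (err) {                     /* entry is gone: stop */
                        printf("entry gone, giving up\n");
                        break;
                }
                printf("reference taken, proceeding with read\n");
                break;
        } while (1);

        return 0;
}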
diff --git a/mm/swapfile.c b/mm/swapfile.c index 312fafe0ab6e..28faa01cf578 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -53,6 +53,59 @@ static struct swap_info_struct swap_info[MAX_SWAPFILES]; | |||
53 | 53 | ||
54 | static DEFINE_MUTEX(swapon_mutex); | 54 | static DEFINE_MUTEX(swapon_mutex); |
55 | 55 | ||
56 | /* For reference count accounting in swap_map */ | ||
57 | /* enum for swap_map[] handling. internal use only */ | ||
58 | enum { | ||
59 | SWAP_MAP = 0, /* ops for reference from swap users */ | ||
60 | SWAP_CACHE, /* ops for reference from swap cache */ | ||
61 | }; | ||
62 | |||
63 | static inline int swap_count(unsigned short ent) | ||
64 | { | ||
65 | return ent & SWAP_COUNT_MASK; | ||
66 | } | ||
67 | |||
68 | static inline bool swap_has_cache(unsigned short ent) | ||
69 | { | ||
70 | return !!(ent & SWAP_HAS_CACHE); | ||
71 | } | ||
72 | |||
73 | static inline unsigned short encode_swapmap(int count, bool has_cache) | ||
74 | { | ||
75 | unsigned short ret = count; | ||
76 | |||
77 | if (has_cache) | ||
78 | return SWAP_HAS_CACHE | ret; | ||
79 | return ret; | ||
80 | } | ||
81 | |||
82 | /* returns 1 if the swap entry is freed */ | ||
83 | static int | ||
84 | __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) | ||
85 | { | ||
86 | int type = si - swap_info; | ||
87 | swp_entry_t entry = swp_entry(type, offset); | ||
88 | struct page *page; | ||
89 | int ret = 0; | ||
90 | |||
91 | page = find_get_page(&swapper_space, entry.val); | ||
92 | if (!page) | ||
93 | return 0; | ||
94 | /* | ||
95 | * This function is called from scan_swap_map(), which in turn is | ||
96 | * called by vmscan.c while reclaiming pages, so a page lock may | ||
97 | * already be held here. We have to use trylock to avoid deadlock. | ||
98 | * This is a special case; in normal operation use try_to_free_swap() | ||
99 | * with an explicit lock_page(). | ||
100 | */ | ||
101 | if (trylock_page(page)) { | ||
102 | ret = try_to_free_swap(page); | ||
103 | unlock_page(page); | ||
104 | } | ||
105 | page_cache_release(page); | ||
106 | return ret; | ||
107 | } | ||
108 | |||
56 | /* | 109 | /* |
57 | * We need this because the bdev->unplug_fn can sleep and we cannot | 110 | * We need this because the bdev->unplug_fn can sleep and we cannot |
58 | * hold swap_lock while calling the unplug_fn. And swap_lock | 111 | * hold swap_lock while calling the unplug_fn. And swap_lock |
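The new helpers pack a user reference count and a "page is in swap cache" flag into each unsigned short swap_map entry. The program below is a stand-alone copy of that encoding so it can be compiled and poked at directly; the two constants are defined locally for the sketch and may not match the kernel's exact values.

#include <stdio.h>
#include <stdbool.h>

#define SWAP_HAS_CACHE  0x8000   /* top bit: entry has a swap-cache page */
#define SWAP_COUNT_MASK 0x7fff   /* low bits: user reference count       */

static inline int swap_count(unsigned short ent)
{
        return ent & SWAP_COUNT_MASK;
}

static inline bool swap_has_cache(unsigned short ent)
{
        return !!(ent & SWAP_HAS_CACHE);
}

static inline unsigned short encode_swapmap(int count, bool has_cache)
{
        unsigned short ret = count;

        if (has_cache)
                return SWAP_HAS_CACHE | ret;
        return ret;
}

int main(void)
{
        /* Slot allocated for the swap cache at swap-out time. */
        unsigned short ent = encode_swapmap(0, true);
        printf("count=%d cache=%d\n", swap_count(ent), swap_has_cache(ent));

        /* A process takes a reference, as swap_duplicate() would. */
        ent = encode_swapmap(swap_count(ent) + 1, swap_has_cache(ent));
        printf("count=%d cache=%d\n", swap_count(ent), swap_has_cache(ent));
        return 0;
}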
@@ -167,7 +220,8 @@ static int wait_for_discard(void *word) | |||
167 | #define SWAPFILE_CLUSTER 256 | 220 | #define SWAPFILE_CLUSTER 256 |
168 | #define LATENCY_LIMIT 256 | 221 | #define LATENCY_LIMIT 256 |
169 | 222 | ||
170 | static inline unsigned long scan_swap_map(struct swap_info_struct *si) | 223 | static inline unsigned long scan_swap_map(struct swap_info_struct *si, |
224 | int cache) | ||
171 | { | 225 | { |
172 | unsigned long offset; | 226 | unsigned long offset; |
173 | unsigned long scan_base; | 227 | unsigned long scan_base; |
@@ -273,6 +327,19 @@ checks: | |||
273 | goto no_page; | 327 | goto no_page; |
274 | if (offset > si->highest_bit) | 328 | if (offset > si->highest_bit) |
275 | scan_base = offset = si->lowest_bit; | 329 | scan_base = offset = si->lowest_bit; |
330 | |||
331 | /* reuse swap entry of cache-only swap if not busy. */ | ||
332 | if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { | ||
333 | int swap_was_freed; | ||
334 | spin_unlock(&swap_lock); | ||
335 | swap_was_freed = __try_to_reclaim_swap(si, offset); | ||
336 | spin_lock(&swap_lock); | ||
337 | /* entry was freed successfully, try to use this again */ | ||
338 | if (swap_was_freed) | ||
339 | goto checks; | ||
340 | goto scan; /* check next one */ | ||
341 | } | ||
342 | |||
276 | if (si->swap_map[offset]) | 343 | if (si->swap_map[offset]) |
277 | goto scan; | 344 | goto scan; |
278 | 345 | ||
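The cache-only reuse path above calls __try_to_reclaim_swap(), which, as its comment earlier in this patch notes, uses trylock_page() because it can be reached from the reclaim path where the page may already be locked. The user-space sketch below shows the same try-then-back-off pattern with a pthread mutex standing in for the page lock (build with cc -pthread).

#include <stdio.h>
#include <pthread.h>

static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER;

static int try_reclaim(void)
{
        if (pthread_mutex_trylock(&page_lock) != 0) {
                /* The lock is held elsewhere in the call chain:
                 * back off instead of deadlocking. */
                return 0;
        }
        /* ... free the swap slot here ... */
        pthread_mutex_unlock(&page_lock);
        return 1;
}

int main(void)
{
        printf("reclaimed: %d\n", try_reclaim());   /* lock free: 1 */

        pthread_mutex_lock(&page_lock);             /* simulate a busy page */
        printf("reclaimed: %d\n", try_reclaim());   /* lock held: 0 */
        pthread_mutex_unlock(&page_lock);
        return 0;
}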
@@ -285,7 +352,10 @@ checks: | |||
285 | si->lowest_bit = si->max; | 352 | si->lowest_bit = si->max; |
286 | si->highest_bit = 0; | 353 | si->highest_bit = 0; |
287 | } | 354 | } |
288 | si->swap_map[offset] = 1; | 355 | if (cache == SWAP_CACHE) /* at usual swap-out via vmscan.c */ |
356 | si->swap_map[offset] = encode_swapmap(0, true); | ||
357 | else /* at suspend */ | ||
358 | si->swap_map[offset] = encode_swapmap(1, false); | ||
289 | si->cluster_next = offset + 1; | 359 | si->cluster_next = offset + 1; |
290 | si->flags -= SWP_SCANNING; | 360 | si->flags -= SWP_SCANNING; |
291 | 361 | ||
@@ -351,6 +421,10 @@ scan: | |||
351 | spin_lock(&swap_lock); | 421 | spin_lock(&swap_lock); |
352 | goto checks; | 422 | goto checks; |
353 | } | 423 | } |
424 | if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { | ||
425 | spin_lock(&swap_lock); | ||
426 | goto checks; | ||
427 | } | ||
354 | if (unlikely(--latency_ration < 0)) { | 428 | if (unlikely(--latency_ration < 0)) { |
355 | cond_resched(); | 429 | cond_resched(); |
356 | latency_ration = LATENCY_LIMIT; | 430 | latency_ration = LATENCY_LIMIT; |
@@ -362,6 +436,10 @@ scan: | |||
362 | spin_lock(&swap_lock); | 436 | spin_lock(&swap_lock); |
363 | goto checks; | 437 | goto checks; |
364 | } | 438 | } |
439 | if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { | ||
440 | spin_lock(&swap_lock); | ||
441 | goto checks; | ||
442 | } | ||
365 | if (unlikely(--latency_ration < 0)) { | 443 | if (unlikely(--latency_ration < 0)) { |
366 | cond_resched(); | 444 | cond_resched(); |
367 | latency_ration = LATENCY_LIMIT; | 445 | latency_ration = LATENCY_LIMIT; |
@@ -401,7 +479,8 @@ swp_entry_t get_swap_page(void) | |||
401 | continue; | 479 | continue; |
402 | 480 | ||
403 | swap_list.next = next; | 481 | swap_list.next = next; |
404 | offset = scan_swap_map(si); | 482 | /* This is called for allocating swap entry for cache */ |
483 | offset = scan_swap_map(si, SWAP_CACHE); | ||
405 | if (offset) { | 484 | if (offset) { |
406 | spin_unlock(&swap_lock); | 485 | spin_unlock(&swap_lock); |
407 | return swp_entry(type, offset); | 486 | return swp_entry(type, offset); |
@@ -415,6 +494,7 @@ noswap: | |||
415 | return (swp_entry_t) {0}; | 494 | return (swp_entry_t) {0}; |
416 | } | 495 | } |
417 | 496 | ||
497 | /* The only caller of this function is now the suspend routine */ | ||
418 | swp_entry_t get_swap_page_of_type(int type) | 498 | swp_entry_t get_swap_page_of_type(int type) |
419 | { | 499 | { |
420 | struct swap_info_struct *si; | 500 | struct swap_info_struct *si; |
@@ -424,7 +504,8 @@ swp_entry_t get_swap_page_of_type(int type) | |||
424 | si = swap_info + type; | 504 | si = swap_info + type; |
425 | if (si->flags & SWP_WRITEOK) { | 505 | if (si->flags & SWP_WRITEOK) { |
426 | nr_swap_pages--; | 506 | nr_swap_pages--; |
427 | offset = scan_swap_map(si); | 507 | /* This is called for allocating swap entry, not cache */ |
508 | offset = scan_swap_map(si, SWAP_MAP); | ||
428 | if (offset) { | 509 | if (offset) { |
429 | spin_unlock(&swap_lock); | 510 | spin_unlock(&swap_lock); |
430 | return swp_entry(type, offset); | 511 | return swp_entry(type, offset); |
@@ -471,25 +552,38 @@ out: | |||
471 | return NULL; | 552 | return NULL; |
472 | } | 553 | } |
473 | 554 | ||
474 | static int swap_entry_free(struct swap_info_struct *p, swp_entry_t ent) | 555 | static int swap_entry_free(struct swap_info_struct *p, |
556 | swp_entry_t ent, int cache) | ||
475 | { | 557 | { |
476 | unsigned long offset = swp_offset(ent); | 558 | unsigned long offset = swp_offset(ent); |
477 | int count = p->swap_map[offset]; | 559 | int count = swap_count(p->swap_map[offset]); |
478 | 560 | bool has_cache; | |
479 | if (count < SWAP_MAP_MAX) { | 561 | |
480 | count--; | 562 | has_cache = swap_has_cache(p->swap_map[offset]); |
481 | p->swap_map[offset] = count; | 563 | |
482 | if (!count) { | 564 | if (cache == SWAP_MAP) { /* dropping usage count of swap */ |
483 | if (offset < p->lowest_bit) | 565 | if (count < SWAP_MAP_MAX) { |
484 | p->lowest_bit = offset; | 566 | count--; |
485 | if (offset > p->highest_bit) | 567 | p->swap_map[offset] = encode_swapmap(count, has_cache); |
486 | p->highest_bit = offset; | ||
487 | if (p->prio > swap_info[swap_list.next].prio) | ||
488 | swap_list.next = p - swap_info; | ||
489 | nr_swap_pages++; | ||
490 | p->inuse_pages--; | ||
491 | mem_cgroup_uncharge_swap(ent); | ||
492 | } | 568 | } |
569 | } else { /* dropping swap cache flag */ | ||
570 | VM_BUG_ON(!has_cache); | ||
571 | p->swap_map[offset] = encode_swapmap(count, false); | ||
572 | |||
573 | } | ||
574 | /* return code. */ | ||
575 | count = p->swap_map[offset]; | ||
576 | /* free if no reference */ | ||
577 | if (!count) { | ||
578 | if (offset < p->lowest_bit) | ||
579 | p->lowest_bit = offset; | ||
580 | if (offset > p->highest_bit) | ||
581 | p->highest_bit = offset; | ||
582 | if (p->prio > swap_info[swap_list.next].prio) | ||
583 | swap_list.next = p - swap_info; | ||
584 | nr_swap_pages++; | ||
585 | p->inuse_pages--; | ||
586 | mem_cgroup_uncharge_swap(ent); | ||
493 | } | 587 | } |
494 | return count; | 588 | return count; |
495 | } | 589 | } |
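swap_entry_free() now takes which kind of reference is being dropped (SWAP_MAP or SWAP_CACHE) and only releases the slot once both the user count and the cache bit are zero. A compact, self-contained model of that release logic follows, with constants and names defined locally for the example.

#include <stdio.h>
#include <stdbool.h>

#define SWAP_HAS_CACHE  0x8000
#define SWAP_COUNT_MASK 0x7fff

enum { DROP_MAP, DROP_CACHE };   /* stands in for SWAP_MAP / SWAP_CACHE */

static unsigned short drop_ref(unsigned short ent, int what)
{
        int count = ent & SWAP_COUNT_MASK;
        bool has_cache = ent & SWAP_HAS_CACHE;

        if (what == DROP_MAP) {
                if (count > 0)
                        count--;        /* drop one user reference */
        } else {
                has_cache = false;      /* drop the swap-cache flag */
        }
        return (has_cache ? SWAP_HAS_CACHE : 0) | count;
}

int main(void)
{
        unsigned short ent = SWAP_HAS_CACHE | 1;   /* one user + swap cache */

        ent = drop_ref(ent, DROP_MAP);
        printf("after swap_free:      0x%04x (still cached)\n", ent);

        ent = drop_ref(ent, DROP_CACHE);
        printf("after swapcache_free: 0x%04x (%s)\n", ent,
               ent == 0 ? "slot can be reused" : "still referenced");
        return 0;
}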
@@ -504,9 +598,26 @@ void swap_free(swp_entry_t entry) | |||
504 | 598 | ||
505 | p = swap_info_get(entry); | 599 | p = swap_info_get(entry); |
506 | if (p) { | 600 | if (p) { |
507 | swap_entry_free(p, entry); | 601 | swap_entry_free(p, entry, SWAP_MAP); |
602 | spin_unlock(&swap_lock); | ||
603 | } | ||
604 | } | ||
605 | |||
606 | /* | ||
607 | * Called after dropping swapcache to decrease refcnt to swap entries. | ||
608 | */ | ||
609 | void swapcache_free(swp_entry_t entry, struct page *page) | ||
610 | { | ||
611 | struct swap_info_struct *p; | ||
612 | |||
613 | if (page) | ||
614 | mem_cgroup_uncharge_swapcache(page, entry); | ||
615 | p = swap_info_get(entry); | ||
616 | if (p) { | ||
617 | swap_entry_free(p, entry, SWAP_CACHE); | ||
508 | spin_unlock(&swap_lock); | 618 | spin_unlock(&swap_lock); |
509 | } | 619 | } |
620 | return; | ||
510 | } | 621 | } |
511 | 622 | ||
512 | /* | 623 | /* |
@@ -521,8 +632,7 @@ static inline int page_swapcount(struct page *page) | |||
521 | entry.val = page_private(page); | 632 | entry.val = page_private(page); |
522 | p = swap_info_get(entry); | 633 | p = swap_info_get(entry); |
523 | if (p) { | 634 | if (p) { |
524 | /* Subtract the 1 for the swap cache itself */ | 635 | count = swap_count(p->swap_map[swp_offset(entry)]); |
525 | count = p->swap_map[swp_offset(entry)] - 1; | ||
526 | spin_unlock(&swap_lock); | 636 | spin_unlock(&swap_lock); |
527 | } | 637 | } |
528 | return count; | 638 | return count; |
@@ -584,7 +694,7 @@ int free_swap_and_cache(swp_entry_t entry) | |||
584 | 694 | ||
585 | p = swap_info_get(entry); | 695 | p = swap_info_get(entry); |
586 | if (p) { | 696 | if (p) { |
587 | if (swap_entry_free(p, entry) == 1) { | 697 | if (swap_entry_free(p, entry, SWAP_MAP) == SWAP_HAS_CACHE) { |
588 | page = find_get_page(&swapper_space, entry.val); | 698 | page = find_get_page(&swapper_space, entry.val); |
589 | if (page && !trylock_page(page)) { | 699 | if (page && !trylock_page(page)) { |
590 | page_cache_release(page); | 700 | page_cache_release(page); |
@@ -891,7 +1001,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, | |||
891 | i = 1; | 1001 | i = 1; |
892 | } | 1002 | } |
893 | count = si->swap_map[i]; | 1003 | count = si->swap_map[i]; |
894 | if (count && count != SWAP_MAP_BAD) | 1004 | if (count && swap_count(count) != SWAP_MAP_BAD) |
895 | break; | 1005 | break; |
896 | } | 1006 | } |
897 | return i; | 1007 | return i; |
@@ -995,13 +1105,13 @@ static int try_to_unuse(unsigned int type) | |||
995 | */ | 1105 | */ |
996 | shmem = 0; | 1106 | shmem = 0; |
997 | swcount = *swap_map; | 1107 | swcount = *swap_map; |
998 | if (swcount > 1) { | 1108 | if (swap_count(swcount)) { |
999 | if (start_mm == &init_mm) | 1109 | if (start_mm == &init_mm) |
1000 | shmem = shmem_unuse(entry, page); | 1110 | shmem = shmem_unuse(entry, page); |
1001 | else | 1111 | else |
1002 | retval = unuse_mm(start_mm, entry, page); | 1112 | retval = unuse_mm(start_mm, entry, page); |
1003 | } | 1113 | } |
1004 | if (*swap_map > 1) { | 1114 | if (swap_count(*swap_map)) { |
1005 | int set_start_mm = (*swap_map >= swcount); | 1115 | int set_start_mm = (*swap_map >= swcount); |
1006 | struct list_head *p = &start_mm->mmlist; | 1116 | struct list_head *p = &start_mm->mmlist; |
1007 | struct mm_struct *new_start_mm = start_mm; | 1117 | struct mm_struct *new_start_mm = start_mm; |
@@ -1011,7 +1121,7 @@ static int try_to_unuse(unsigned int type) | |||
1011 | atomic_inc(&new_start_mm->mm_users); | 1121 | atomic_inc(&new_start_mm->mm_users); |
1012 | atomic_inc(&prev_mm->mm_users); | 1122 | atomic_inc(&prev_mm->mm_users); |
1013 | spin_lock(&mmlist_lock); | 1123 | spin_lock(&mmlist_lock); |
1014 | while (*swap_map > 1 && !retval && !shmem && | 1124 | while (swap_count(*swap_map) && !retval && !shmem && |
1015 | (p = p->next) != &start_mm->mmlist) { | 1125 | (p = p->next) != &start_mm->mmlist) { |
1016 | mm = list_entry(p, struct mm_struct, mmlist); | 1126 | mm = list_entry(p, struct mm_struct, mmlist); |
1017 | if (!atomic_inc_not_zero(&mm->mm_users)) | 1127 | if (!atomic_inc_not_zero(&mm->mm_users)) |
@@ -1023,14 +1133,16 @@ static int try_to_unuse(unsigned int type) | |||
1023 | cond_resched(); | 1133 | cond_resched(); |
1024 | 1134 | ||
1025 | swcount = *swap_map; | 1135 | swcount = *swap_map; |
1026 | if (swcount <= 1) | 1136 | if (!swap_count(swcount)) /* any usage ? */ |
1027 | ; | 1137 | ; |
1028 | else if (mm == &init_mm) { | 1138 | else if (mm == &init_mm) { |
1029 | set_start_mm = 1; | 1139 | set_start_mm = 1; |
1030 | shmem = shmem_unuse(entry, page); | 1140 | shmem = shmem_unuse(entry, page); |
1031 | } else | 1141 | } else |
1032 | retval = unuse_mm(mm, entry, page); | 1142 | retval = unuse_mm(mm, entry, page); |
1033 | if (set_start_mm && *swap_map < swcount) { | 1143 | |
1144 | if (set_start_mm && | ||
1145 | swap_count(*swap_map) < swcount) { | ||
1034 | mmput(new_start_mm); | 1146 | mmput(new_start_mm); |
1035 | atomic_inc(&mm->mm_users); | 1147 | atomic_inc(&mm->mm_users); |
1036 | new_start_mm = mm; | 1148 | new_start_mm = mm; |
@@ -1057,21 +1169,25 @@ static int try_to_unuse(unsigned int type) | |||
1057 | } | 1169 | } |
1058 | 1170 | ||
1059 | /* | 1171 | /* |
1060 | * How could swap count reach 0x7fff when the maximum | 1172 | * How could swap count reach 0x7ffe ? |
1061 | * pid is 0x7fff, and there's no way to repeat a swap | 1173 | * There's no way to repeat a swap page within an mm |
1062 | * page within an mm (except in shmem, where it's the | 1174 | * (except in shmem, where it's the shared object which takes |
1063 | * shared object which takes the reference count)? | 1175 | * the reference count)? |
1064 | * We believe SWAP_MAP_MAX cannot occur in Linux 2.4. | 1176 | * We believe SWAP_MAP_MAX cannot occur (if it did, an unsigned |
1065 | * | 1177 | * short would be too small...) |
1066 | * If that's wrong, then we should worry more about | 1178 | * If that's wrong, then we should worry more about |
1067 | * exit_mmap() and do_munmap() cases described above: | 1179 | * exit_mmap() and do_munmap() cases described above: |
1068 | * we might be resetting SWAP_MAP_MAX too early here. | 1180 | * we might be resetting SWAP_MAP_MAX too early here. |
1069 | * We know "Undead"s can happen, they're okay, so don't | 1181 | * We know "Undead"s can happen, they're okay, so don't |
1070 | * report them; but do report if we reset SWAP_MAP_MAX. | 1182 | * report them; but do report if we reset SWAP_MAP_MAX. |
1071 | */ | 1183 | */ |
1072 | if (*swap_map == SWAP_MAP_MAX) { | 1184 | /* We might release the lock_page() in unuse_mm(). */ |
1185 | if (!PageSwapCache(page) || page_private(page) != entry.val) | ||
1186 | goto retry; | ||
1187 | |||
1188 | if (swap_count(*swap_map) == SWAP_MAP_MAX) { | ||
1073 | spin_lock(&swap_lock); | 1189 | spin_lock(&swap_lock); |
1074 | *swap_map = 1; | 1190 | *swap_map = encode_swapmap(0, true); |
1075 | spin_unlock(&swap_lock); | 1191 | spin_unlock(&swap_lock); |
1076 | reset_overflow = 1; | 1192 | reset_overflow = 1; |
1077 | } | 1193 | } |
@@ -1089,7 +1205,8 @@ static int try_to_unuse(unsigned int type) | |||
1089 | * pages would be incorrect if swap supported "shared | 1205 | * pages would be incorrect if swap supported "shared |
1090 | * private" pages, but they are handled by tmpfs files. | 1206 | * private" pages, but they are handled by tmpfs files. |
1091 | */ | 1207 | */ |
1092 | if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) { | 1208 | if (swap_count(*swap_map) && |
1209 | PageDirty(page) && PageSwapCache(page)) { | ||
1093 | struct writeback_control wbc = { | 1210 | struct writeback_control wbc = { |
1094 | .sync_mode = WB_SYNC_NONE, | 1211 | .sync_mode = WB_SYNC_NONE, |
1095 | }; | 1212 | }; |
@@ -1116,6 +1233,7 @@ static int try_to_unuse(unsigned int type) | |||
1116 | * mark page dirty so shrink_page_list will preserve it. | 1233 | * mark page dirty so shrink_page_list will preserve it. |
1117 | */ | 1234 | */ |
1118 | SetPageDirty(page); | 1235 | SetPageDirty(page); |
1236 | retry: | ||
1119 | unlock_page(page); | 1237 | unlock_page(page); |
1120 | page_cache_release(page); | 1238 | page_cache_release(page); |
1121 | 1239 | ||
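The hunks above replace raw comparisons such as *swap_map > 1 with swap_count(), and rewrite stores through encode_swapmap(), so each swap_map slot now carries both a reference count and a "this entry has a swap cache page" flag. The helper definitions are not part of this excerpt; the sketch below shows one plausible encoding, assuming the flag lives in the top bit of the unsigned short entry (the names and exact bit layout are assumptions, not taken from this diff).

	/* Illustrative sketch only; the real helpers live elsewhere in
	 * mm/swapfile.c / include/linux/swap.h and are not shown here. */
	#define SWAP_HAS_CACHE	0x8000		/* assumed: top bit marks the swap-cache reference */
	#define SWAP_COUNT_MASK	(~SWAP_HAS_CACHE)

	static inline int swap_count(unsigned short ent)
	{
		return ent & SWAP_COUNT_MASK;	/* strip the cache flag, keep the pte references */
	}

	static inline bool swap_has_cache(unsigned short ent)
	{
		return ent & SWAP_HAS_CACHE;
	}

	static inline unsigned short encode_swapmap(int count, bool has_cache)
	{
		return has_cache ? (count | SWAP_HAS_CACHE) : count;
	}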
@@ -1942,15 +2060,23 @@ void si_swapinfo(struct sysinfo *val) | |||
1942 | * | 2060 | * |
1943 | * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as | 2061 | * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as |
1944 | * "permanent", but will be reclaimed by the next swapoff. | 2062 | * "permanent", but will be reclaimed by the next swapoff. |
2063 | * Returns error code in following case. | ||
2064 | * - success -> 0 | ||
2065 | * - swp_entry is invalid -> EINVAL | ||
2066 | * - swp_entry is migration entry -> EINVAL | ||
2067 | * - swap-cache reference is requested but there is already one. -> EEXIST | ||
2068 | * - swap-cache reference is requested but the entry is not used. -> ENOENT | ||
1945 | */ | 2069 | */ |
1946 | int swap_duplicate(swp_entry_t entry) | 2070 | static int __swap_duplicate(swp_entry_t entry, bool cache) |
1947 | { | 2071 | { |
1948 | struct swap_info_struct * p; | 2072 | struct swap_info_struct * p; |
1949 | unsigned long offset, type; | 2073 | unsigned long offset, type; |
1950 | int result = 0; | 2074 | int result = -EINVAL; |
2075 | int count; | ||
2076 | bool has_cache; | ||
1951 | 2077 | ||
1952 | if (is_migration_entry(entry)) | 2078 | if (is_migration_entry(entry)) |
1953 | return 1; | 2079 | return -EINVAL; |
1954 | 2080 | ||
1955 | type = swp_type(entry); | 2081 | type = swp_type(entry); |
1956 | if (type >= nr_swapfiles) | 2082 | if (type >= nr_swapfiles) |
@@ -1959,17 +2085,40 @@ int swap_duplicate(swp_entry_t entry) | |||
1959 | offset = swp_offset(entry); | 2085 | offset = swp_offset(entry); |
1960 | 2086 | ||
1961 | spin_lock(&swap_lock); | 2087 | spin_lock(&swap_lock); |
1962 | if (offset < p->max && p->swap_map[offset]) { | 2088 | |
1963 | if (p->swap_map[offset] < SWAP_MAP_MAX - 1) { | 2089 | if (unlikely(offset >= p->max)) |
1964 | p->swap_map[offset]++; | 2090 | goto unlock_out; |
1965 | result = 1; | 2091 | |
1966 | } else if (p->swap_map[offset] <= SWAP_MAP_MAX) { | 2092 | count = swap_count(p->swap_map[offset]); |
2093 | has_cache = swap_has_cache(p->swap_map[offset]); | ||
2094 | |||
2095 | if (cache == SWAP_CACHE) { /* called for swapcache/swapin-readahead */ | ||
2096 | |||
2097 | /* set SWAP_HAS_CACHE if there is no cache and entry is used */ | ||
2098 | if (!has_cache && count) { | ||
2099 | p->swap_map[offset] = encode_swapmap(count, true); | ||
2100 | result = 0; | ||
2101 | } else if (has_cache) /* someone added cache */ | ||
2102 | result = -EEXIST; | ||
2103 | else if (!count) /* no users */ | ||
2104 | result = -ENOENT; | ||
2105 | |||
2106 | } else if (count || has_cache) { | ||
2107 | if (count < SWAP_MAP_MAX - 1) { | ||
2108 | p->swap_map[offset] = encode_swapmap(count + 1, | ||
2109 | has_cache); | ||
2110 | result = 0; | ||
2111 | } else if (count <= SWAP_MAP_MAX) { | ||
1967 | if (swap_overflow++ < 5) | 2112 | if (swap_overflow++ < 5) |
1968 | printk(KERN_WARNING "swap_dup: swap entry overflow\n"); | 2113 | printk(KERN_WARNING |
1969 | p->swap_map[offset] = SWAP_MAP_MAX; | 2114 | "swap_dup: swap entry overflow\n"); |
1970 | result = 1; | 2115 | p->swap_map[offset] = encode_swapmap(SWAP_MAP_MAX, |
2116 | has_cache); | ||
2117 | result = 0; | ||
1971 | } | 2118 | } |
1972 | } | 2119 | } else |
2120 | result = -ENOENT; /* unused swap entry */ | ||
2121 | unlock_out: | ||
1973 | spin_unlock(&swap_lock); | 2122 | spin_unlock(&swap_lock); |
1974 | out: | 2123 | out: |
1975 | return result; | 2124 | return result; |
@@ -1978,6 +2127,27 @@ bad_file: | |||
1978 | printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); | 2127 | printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); |
1979 | goto out; | 2128 | goto out; |
1980 | } | 2129 | } |
2130 | /* | ||
2131 | * increase reference count of swap entry by 1. | ||
2132 | */ | ||
2133 | void swap_duplicate(swp_entry_t entry) | ||
2134 | { | ||
2135 | __swap_duplicate(entry, SWAP_MAP); | ||
2136 | } | ||
2137 | |||
2138 | /* | ||
2139 | * @entry: swap entry for which we allocate swap cache. | ||
2140 | * | ||
2141 | * Called when allocating swap cache for exising swap entry, | ||
2142 | * This can return error codes. Returns 0 at success. | ||
2143 | * -EBUSY means there is a swap cache. | ||
2144 | * Note: return code is different from swap_duplicate(). | ||
2145 | */ | ||
2146 | int swapcache_prepare(swp_entry_t entry) | ||
2147 | { | ||
2148 | return __swap_duplicate(entry, SWAP_CACHE); | ||
2149 | } | ||
2150 | |||
1981 | 2151 | ||
1982 | struct swap_info_struct * | 2152 | struct swap_info_struct * |
1983 | get_swap_info_struct(unsigned type) | 2153 | get_swap_info_struct(unsigned type) |
@@ -2016,7 +2186,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset) | |||
2016 | /* Don't read in free or bad pages */ | 2186 | /* Don't read in free or bad pages */ |
2017 | if (!si->swap_map[toff]) | 2187 | if (!si->swap_map[toff]) |
2018 | break; | 2188 | break; |
2019 | if (si->swap_map[toff] == SWAP_MAP_BAD) | 2189 | if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD) |
2020 | break; | 2190 | break; |
2021 | } | 2191 | } |
2022 | /* Count contiguous allocated slots below our target */ | 2192 | /* Count contiguous allocated slots below our target */ |
@@ -2024,7 +2194,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset) | |||
2024 | /* Don't read in free or bad pages */ | 2194 | /* Don't read in free or bad pages */ |
2025 | if (!si->swap_map[toff]) | 2195 | if (!si->swap_map[toff]) |
2026 | break; | 2196 | break; |
2027 | if (si->swap_map[toff] == SWAP_MAP_BAD) | 2197 | if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD) |
2028 | break; | 2198 | break; |
2029 | } | 2199 | } |
2030 | spin_unlock(&swap_lock); | 2200 | spin_unlock(&swap_lock); |
diff --git a/mm/truncate.c b/mm/truncate.c index 55206fab7b99..ccc3ecf7cb98 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -267,8 +267,21 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart) | |||
267 | } | 267 | } |
268 | EXPORT_SYMBOL(truncate_inode_pages); | 268 | EXPORT_SYMBOL(truncate_inode_pages); |
269 | 269 | ||
270 | unsigned long __invalidate_mapping_pages(struct address_space *mapping, | 270 | /** |
271 | pgoff_t start, pgoff_t end, bool be_atomic) | 271 | * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode |
272 | * @mapping: the address_space which holds the pages to invalidate | ||
273 | * @start: the offset 'from' which to invalidate | ||
274 | * @end: the offset 'to' which to invalidate (inclusive) | ||
275 | * | ||
276 | * This function only removes the unlocked pages, if you want to | ||
277 | * remove all the pages of one inode, you must call truncate_inode_pages. | ||
278 | * | ||
279 | * invalidate_mapping_pages() will not block on IO activity. It will not | ||
280 | * invalidate pages which are dirty, locked, under writeback or mapped into | ||
281 | * pagetables. | ||
282 | */ | ||
283 | unsigned long invalidate_mapping_pages(struct address_space *mapping, | ||
284 | pgoff_t start, pgoff_t end) | ||
272 | { | 285 | { |
273 | struct pagevec pvec; | 286 | struct pagevec pvec; |
274 | pgoff_t next = start; | 287 | pgoff_t next = start; |
@@ -309,30 +322,10 @@ unlock: | |||
309 | break; | 322 | break; |
310 | } | 323 | } |
311 | pagevec_release(&pvec); | 324 | pagevec_release(&pvec); |
312 | if (likely(!be_atomic)) | 325 | cond_resched(); |
313 | cond_resched(); | ||
314 | } | 326 | } |
315 | return ret; | 327 | return ret; |
316 | } | 328 | } |
317 | |||
318 | /** | ||
319 | * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode | ||
320 | * @mapping: the address_space which holds the pages to invalidate | ||
321 | * @start: the offset 'from' which to invalidate | ||
322 | * @end: the offset 'to' which to invalidate (inclusive) | ||
323 | * | ||
324 | * This function only removes the unlocked pages, if you want to | ||
325 | * remove all the pages of one inode, you must call truncate_inode_pages. | ||
326 | * | ||
327 | * invalidate_mapping_pages() will not block on IO activity. It will not | ||
328 | * invalidate pages which are dirty, locked, under writeback or mapped into | ||
329 | * pagetables. | ||
330 | */ | ||
331 | unsigned long invalidate_mapping_pages(struct address_space *mapping, | ||
332 | pgoff_t start, pgoff_t end) | ||
333 | { | ||
334 | return __invalidate_mapping_pages(mapping, start, end, false); | ||
335 | } | ||
336 | EXPORT_SYMBOL(invalidate_mapping_pages); | 329 | EXPORT_SYMBOL(invalidate_mapping_pages); |
337 | 330 | ||
338 | /* | 331 | /* |
@@ -359,6 +352,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) | |||
359 | BUG_ON(page_has_private(page)); | 352 | BUG_ON(page_has_private(page)); |
360 | __remove_from_page_cache(page); | 353 | __remove_from_page_cache(page); |
361 | spin_unlock_irq(&mapping->tree_lock); | 354 | spin_unlock_irq(&mapping->tree_lock); |
355 | mem_cgroup_uncharge_cache_page(page); | ||
362 | page_cache_release(page); /* pagecache ref */ | 356 | page_cache_release(page); /* pagecache ref */ |
363 | return 1; | 357 | return 1; |
364 | failed: | 358 | failed: |
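With the __invalidate_mapping_pages()/be_atomic split removed, invalidate_mapping_pages() is again the single exported entry point and always cond_resched()s between pagevecs. A typical caller dropping clean page cache for a whole file is sketched below; the inode pointer and the calling context are assumed for illustration.

	/* Illustrative only: drop every clean, unlocked, unmapped page of an inode. */
	static void drop_clean_pagecache(struct inode *inode)
	{
		unsigned long nr;

		/* 0 .. -1 covers the whole file; dirty, locked and mapped pages survive. */
		nr = invalidate_mapping_pages(inode->i_mapping, 0, (pgoff_t)-1);
		pr_debug("invalidated %lu pages\n", nr);
	}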
diff --git a/mm/util.c b/mm/util.c --- a/mm/util.c +++ b/mm/util.c | |||
@@ -4,9 +4,11 @@ | |||
4 | #include <linux/module.h> | 4 | #include <linux/module.h> |
5 | #include <linux/err.h> | 5 | #include <linux/err.h> |
6 | #include <linux/sched.h> | 6 | #include <linux/sched.h> |
7 | #include <linux/tracepoint.h> | ||
8 | #include <asm/uaccess.h> | 7 | #include <asm/uaccess.h> |
9 | 8 | ||
9 | #define CREATE_TRACE_POINTS | ||
10 | #include <trace/events/kmem.h> | ||
11 | |||
10 | /** | 12 | /** |
11 | * kstrdup - allocate space for and copy an existing string | 13 | * kstrdup - allocate space for and copy an existing string |
12 | * @s: the string to duplicate | 14 | * @s: the string to duplicate |
@@ -166,6 +168,10 @@ EXPORT_SYMBOL(krealloc); | |||
166 | * | 168 | * |
167 | * The memory of the object @p points to is zeroed before freed. | 169 | * The memory of the object @p points to is zeroed before freed. |
168 | * If @p is %NULL, kzfree() does nothing. | 170 | * If @p is %NULL, kzfree() does nothing. |
171 | * | ||
172 | * Note: this function zeroes the whole allocated buffer which can be a good | ||
173 | * deal bigger than the requested buffer size passed to kmalloc(). So be | ||
174 | * careful when using this function in performance sensitive code. | ||
169 | */ | 175 | */ |
170 | void kzfree(const void *p) | 176 | void kzfree(const void *p) |
171 | { | 177 | { |
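The added note matters because kzfree() clears the whole slab object (presumably via ksize()), which may be rounded well above the requested size. A small illustration, with the sizes assumed:

	/* Illustration: the zeroing covers the slab object, not just the request. */
	char *secret = kmalloc(100, GFP_KERNEL);	/* slab may hand back a 128-byte object */
	if (secret) {
		/* ... use the buffer for key material ... */
		kzfree(secret);		/* zeroes the full object (e.g. 128 bytes), then frees it */
	}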
@@ -231,13 +237,21 @@ void arch_pick_mmap_layout(struct mm_struct *mm) | |||
231 | * @pages: array that receives pointers to the pages pinned. | 237 | * @pages: array that receives pointers to the pages pinned. |
232 | * Should be at least nr_pages long. | 238 | * Should be at least nr_pages long. |
233 | * | 239 | * |
234 | * Attempt to pin user pages in memory without taking mm->mmap_sem. | ||
235 | * If not successful, it will fall back to taking the lock and | ||
236 | * calling get_user_pages(). | ||
237 | * | ||
238 | * Returns number of pages pinned. This may be fewer than the number | 240 | * Returns number of pages pinned. This may be fewer than the number |
239 | * requested. If nr_pages is 0 or negative, returns 0. If no pages | 241 | * requested. If nr_pages is 0 or negative, returns 0. If no pages |
240 | * were pinned, returns -errno. | 242 | * were pinned, returns -errno. |
243 | * | ||
244 | * get_user_pages_fast provides equivalent functionality to get_user_pages, | ||
245 | * operating on current and current->mm, with force=0 and vma=NULL. However | ||
246 | * unlike get_user_pages, it must be called without mmap_sem held. | ||
247 | * | ||
248 | * get_user_pages_fast may take mmap_sem and page table locks, so no | ||
249 | * assumptions can be made about lack of locking. get_user_pages_fast is to be | ||
250 | * implemented in a way that is advantageous (vs get_user_pages()) when the | ||
251 | * user memory area is already faulted in and present in ptes. However if the | ||
252 | * pages have to be faulted in, it may turn out to be slightly slower so | ||
253 | * callers need to carefully consider what to use. On many architectures, | ||
254 | * get_user_pages_fast simply falls back to get_user_pages. | ||
241 | */ | 255 | */ |
242 | int __attribute__((weak)) get_user_pages_fast(unsigned long start, | 256 | int __attribute__((weak)) get_user_pages_fast(unsigned long start, |
243 | int nr_pages, int write, struct page **pages) | 257 | int nr_pages, int write, struct page **pages) |
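The reworded kerneldoc spells out the contract: get_user_pages_fast() has get_user_pages() semantics on current->mm, must be called without mmap_sem held, and may or may not take locks internally. A minimal caller pinning a user buffer is sketched below; user_addr is an assumed variable.

	/* Illustrative caller: pin up to 16 user pages for writing, then release them. */
	struct page *pages[16];
	int i, got;

	got = get_user_pages_fast(user_addr, 16, 1 /* write */, pages);
	if (got < 0)
		return got;		/* nothing pinned: propagate -errno */

	/* ... access the (possibly fewer than 16) pinned pages ... */

	for (i = 0; i < got; i++)
		put_page(pages[i]);	/* drop the references taken by the pin */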
@@ -255,13 +269,6 @@ int __attribute__((weak)) get_user_pages_fast(unsigned long start, | |||
255 | EXPORT_SYMBOL_GPL(get_user_pages_fast); | 269 | EXPORT_SYMBOL_GPL(get_user_pages_fast); |
256 | 270 | ||
257 | /* Tracepoints definitions. */ | 271 | /* Tracepoints definitions. */ |
258 | DEFINE_TRACE(kmalloc); | ||
259 | DEFINE_TRACE(kmem_cache_alloc); | ||
260 | DEFINE_TRACE(kmalloc_node); | ||
261 | DEFINE_TRACE(kmem_cache_alloc_node); | ||
262 | DEFINE_TRACE(kfree); | ||
263 | DEFINE_TRACE(kmem_cache_free); | ||
264 | |||
265 | EXPORT_TRACEPOINT_SYMBOL(kmalloc); | 272 | EXPORT_TRACEPOINT_SYMBOL(kmalloc); |
266 | EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); | 273 | EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); |
267 | EXPORT_TRACEPOINT_SYMBOL(kmalloc_node); | 274 | EXPORT_TRACEPOINT_SYMBOL(kmalloc_node); |
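The DEFINE_TRACE() lines disappear because defining CREATE_TRACE_POINTS before including the event header makes that header expand its TRACE_EVENT() descriptions into the tracepoint definitions for this compilation unit. The pattern, restated as a sketch:

	/* In exactly one .c file per event header (here mm/util.c for the kmem events): */
	#define CREATE_TRACE_POINTS
	#include <trace/events/kmem.h>	/* TRACE_EVENT() entries now emit definitions */

	/* Every other file just includes <trace/events/kmem.h> normally and calls
	 * trace_kmalloc(), trace_kfree(), etc. to fire the tracepoints. */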
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 083716ea38c9..f8189a4b3e13 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -23,8 +23,8 @@ | |||
23 | #include <linux/rbtree.h> | 23 | #include <linux/rbtree.h> |
24 | #include <linux/radix-tree.h> | 24 | #include <linux/radix-tree.h> |
25 | #include <linux/rcupdate.h> | 25 | #include <linux/rcupdate.h> |
26 | #include <linux/bootmem.h> | ||
27 | #include <linux/pfn.h> | 26 | #include <linux/pfn.h> |
27 | #include <linux/kmemleak.h> | ||
28 | 28 | ||
29 | #include <asm/atomic.h> | 29 | #include <asm/atomic.h> |
30 | #include <asm/uaccess.h> | 30 | #include <asm/uaccess.h> |
@@ -1032,7 +1032,7 @@ void __init vmalloc_init(void) | |||
1032 | 1032 | ||
1033 | /* Import existing vmlist entries. */ | 1033 | /* Import existing vmlist entries. */ |
1034 | for (tmp = vmlist; tmp; tmp = tmp->next) { | 1034 | for (tmp = vmlist; tmp; tmp = tmp->next) { |
1035 | va = alloc_bootmem(sizeof(struct vmap_area)); | 1035 | va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT); |
1036 | va->flags = tmp->flags | VM_VM_AREA; | 1036 | va->flags = tmp->flags | VM_VM_AREA; |
1037 | va->va_start = (unsigned long)tmp->addr; | 1037 | va->va_start = (unsigned long)tmp->addr; |
1038 | va->va_end = va->va_start + tmp->size; | 1038 | va->va_end = va->va_start + tmp->size; |
@@ -1327,6 +1327,9 @@ static void __vunmap(const void *addr, int deallocate_pages) | |||
1327 | void vfree(const void *addr) | 1327 | void vfree(const void *addr) |
1328 | { | 1328 | { |
1329 | BUG_ON(in_interrupt()); | 1329 | BUG_ON(in_interrupt()); |
1330 | |||
1331 | kmemleak_free(addr); | ||
1332 | |||
1330 | __vunmap(addr, 1); | 1333 | __vunmap(addr, 1); |
1331 | } | 1334 | } |
1332 | EXPORT_SYMBOL(vfree); | 1335 | EXPORT_SYMBOL(vfree); |
@@ -1439,8 +1442,17 @@ fail: | |||
1439 | 1442 | ||
1440 | void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) | 1443 | void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) |
1441 | { | 1444 | { |
1442 | return __vmalloc_area_node(area, gfp_mask, prot, -1, | 1445 | void *addr = __vmalloc_area_node(area, gfp_mask, prot, -1, |
1443 | __builtin_return_address(0)); | 1446 | __builtin_return_address(0)); |
1447 | |||
1448 | /* | ||
1449 | * A ref_count = 3 is needed because the vm_struct and vmap_area | ||
1450 | * structures allocated in the __get_vm_area_node() function contain | ||
1451 | * references to the virtual address of the vmalloc'ed block. | ||
1452 | */ | ||
1453 | kmemleak_alloc(addr, area->size - PAGE_SIZE, 3, gfp_mask); | ||
1454 | |||
1455 | return addr; | ||
1444 | } | 1456 | } |
1445 | 1457 | ||
1446 | /** | 1458 | /** |
@@ -1459,6 +1471,8 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, | |||
1459 | int node, void *caller) | 1471 | int node, void *caller) |
1460 | { | 1472 | { |
1461 | struct vm_struct *area; | 1473 | struct vm_struct *area; |
1474 | void *addr; | ||
1475 | unsigned long real_size = size; | ||
1462 | 1476 | ||
1463 | size = PAGE_ALIGN(size); | 1477 | size = PAGE_ALIGN(size); |
1464 | if (!size || (size >> PAGE_SHIFT) > num_physpages) | 1478 | if (!size || (size >> PAGE_SHIFT) > num_physpages) |
@@ -1470,7 +1484,16 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, | |||
1470 | if (!area) | 1484 | if (!area) |
1471 | return NULL; | 1485 | return NULL; |
1472 | 1486 | ||
1473 | return __vmalloc_area_node(area, gfp_mask, prot, node, caller); | 1487 | addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller); |
1488 | |||
1489 | /* | ||
1490 | * A ref_count = 3 is needed because the vm_struct and vmap_area | ||
1491 | * structures allocated in the __get_vm_area_node() function contain | ||
1492 | * references to the virtual address of the vmalloc'ed block. | ||
1493 | */ | ||
1494 | kmemleak_alloc(addr, real_size, 3, gfp_mask); | ||
1495 | |||
1496 | return addr; | ||
1474 | } | 1497 | } |
1475 | 1498 | ||
1476 | void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) | 1499 | void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) |
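Both vmalloc entry points now register their blocks with kmemleak, and vfree() unregisters them, so leaked vmalloc areas show up in the scanner like slab objects; min_count is 3 because, as the comment notes, vm_struct and vmap_area keep internal pointers to the block. Restated as a minimal pattern for any allocator wanting kmemleak coverage (low_level_alloc/low_level_free are assumed placeholders):

	#include <linux/kmemleak.h>

	void *my_alloc(size_t size, gfp_t gfp)
	{
		void *p = low_level_alloc(size, gfp);	/* assumed allocator */

		/* Register the object; below min_count references kmemleak reports a leak. */
		kmemleak_alloc(p, size, 1, gfp);
		return p;
	}

	void my_free(void *p)
	{
		kmemleak_free(p);	/* must precede the real free */
		low_level_free(p);
	}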
diff --git a/mm/vmscan.c b/mm/vmscan.c index 5fa3eda1f03f..4139aa52b941 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -470,10 +470,11 @@ static int __remove_mapping(struct address_space *mapping, struct page *page) | |||
470 | swp_entry_t swap = { .val = page_private(page) }; | 470 | swp_entry_t swap = { .val = page_private(page) }; |
471 | __delete_from_swap_cache(page); | 471 | __delete_from_swap_cache(page); |
472 | spin_unlock_irq(&mapping->tree_lock); | 472 | spin_unlock_irq(&mapping->tree_lock); |
473 | swap_free(swap); | 473 | swapcache_free(swap, page); |
474 | } else { | 474 | } else { |
475 | __remove_from_page_cache(page); | 475 | __remove_from_page_cache(page); |
476 | spin_unlock_irq(&mapping->tree_lock); | 476 | spin_unlock_irq(&mapping->tree_lock); |
477 | mem_cgroup_uncharge_cache_page(page); | ||
477 | } | 478 | } |
478 | 479 | ||
479 | return 1; | 480 | return 1; |
@@ -512,7 +513,6 @@ int remove_mapping(struct address_space *mapping, struct page *page) | |||
512 | * | 513 | * |
513 | * lru_lock must not be held, interrupts must be enabled. | 514 | * lru_lock must not be held, interrupts must be enabled. |
514 | */ | 515 | */ |
515 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
516 | void putback_lru_page(struct page *page) | 516 | void putback_lru_page(struct page *page) |
517 | { | 517 | { |
518 | int lru; | 518 | int lru; |
@@ -566,20 +566,6 @@ redo: | |||
566 | put_page(page); /* drop ref from isolate */ | 566 | put_page(page); /* drop ref from isolate */ |
567 | } | 567 | } |
568 | 568 | ||
569 | #else /* CONFIG_UNEVICTABLE_LRU */ | ||
570 | |||
571 | void putback_lru_page(struct page *page) | ||
572 | { | ||
573 | int lru; | ||
574 | VM_BUG_ON(PageLRU(page)); | ||
575 | |||
576 | lru = !!TestClearPageActive(page) + page_is_file_cache(page); | ||
577 | lru_cache_add_lru(page, lru); | ||
578 | put_page(page); | ||
579 | } | ||
580 | #endif /* CONFIG_UNEVICTABLE_LRU */ | ||
581 | |||
582 | |||
583 | /* | 569 | /* |
584 | * shrink_page_list() returns the number of reclaimed pages | 570 | * shrink_page_list() returns the number of reclaimed pages |
585 | */ | 571 | */ |
@@ -591,6 +577,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
591 | struct pagevec freed_pvec; | 577 | struct pagevec freed_pvec; |
592 | int pgactivate = 0; | 578 | int pgactivate = 0; |
593 | unsigned long nr_reclaimed = 0; | 579 | unsigned long nr_reclaimed = 0; |
580 | unsigned long vm_flags; | ||
594 | 581 | ||
595 | cond_resched(); | 582 | cond_resched(); |
596 | 583 | ||
@@ -641,7 +628,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
641 | goto keep_locked; | 628 | goto keep_locked; |
642 | } | 629 | } |
643 | 630 | ||
644 | referenced = page_referenced(page, 1, sc->mem_cgroup); | 631 | referenced = page_referenced(page, 1, |
632 | sc->mem_cgroup, &vm_flags); | ||
645 | /* In active use or really unfreeable? Activate it. */ | 633 | /* In active use or really unfreeable? Activate it. */ |
646 | if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && | 634 | if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && |
647 | referenced && page_mapping_inuse(page)) | 635 | referenced && page_mapping_inuse(page)) |
@@ -941,18 +929,10 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
941 | /* Check that we have not crossed a zone boundary. */ | 929 | /* Check that we have not crossed a zone boundary. */ |
942 | if (unlikely(page_zone_id(cursor_page) != zone_id)) | 930 | if (unlikely(page_zone_id(cursor_page) != zone_id)) |
943 | continue; | 931 | continue; |
944 | switch (__isolate_lru_page(cursor_page, mode, file)) { | 932 | if (__isolate_lru_page(cursor_page, mode, file) == 0) { |
945 | case 0: | ||
946 | list_move(&cursor_page->lru, dst); | 933 | list_move(&cursor_page->lru, dst); |
947 | nr_taken++; | 934 | nr_taken++; |
948 | scan++; | 935 | scan++; |
949 | break; | ||
950 | |||
951 | case -EBUSY: | ||
952 | /* else it is being freed elsewhere */ | ||
953 | list_move(&cursor_page->lru, src); | ||
954 | default: | ||
955 | break; /* ! on LRU or wrong list */ | ||
956 | } | 936 | } |
957 | } | 937 | } |
958 | } | 938 | } |
@@ -1059,6 +1039,19 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
1059 | unsigned long nr_scanned = 0; | 1039 | unsigned long nr_scanned = 0; |
1060 | unsigned long nr_reclaimed = 0; | 1040 | unsigned long nr_reclaimed = 0; |
1061 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); | 1041 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); |
1042 | int lumpy_reclaim = 0; | ||
1043 | |||
1044 | /* | ||
1045 | * If we need a large contiguous chunk of memory, or have | ||
1046 | * trouble getting a small set of contiguous pages, we | ||
1047 | * will reclaim both active and inactive pages. | ||
1048 | * | ||
1049 | * We use the same threshold as pageout congestion_wait below. | ||
1050 | */ | ||
1051 | if (sc->order > PAGE_ALLOC_COSTLY_ORDER) | ||
1052 | lumpy_reclaim = 1; | ||
1053 | else if (sc->order && priority < DEF_PRIORITY - 2) | ||
1054 | lumpy_reclaim = 1; | ||
1062 | 1055 | ||
1063 | pagevec_init(&pvec, 1); | 1056 | pagevec_init(&pvec, 1); |
1064 | 1057 | ||
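Hoisting the decision into lumpy_reclaim lets one flag drive both the ISOLATE_BOTH isolation mode chosen below and the congestion_wait() throttle further down. The predicate, restated on its own for clarity (DEF_PRIORITY is 12 here and PAGE_ALLOC_COSTLY_ORDER is 3):

	/* Allocations above the costly order always use lumpy reclaim; smaller
	 * high-order allocations switch over once priority has dropped below
	 * DEF_PRIORITY - 2, i.e. after a couple of unsuccessful passes. */
	static int want_lumpy_reclaim(int order, int priority)
	{
		if (order > PAGE_ALLOC_COSTLY_ORDER)
			return 1;
		if (order && priority < DEF_PRIORITY - 2)
			return 1;
		return 0;
	}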
@@ -1071,19 +1064,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
1071 | unsigned long nr_freed; | 1064 | unsigned long nr_freed; |
1072 | unsigned long nr_active; | 1065 | unsigned long nr_active; |
1073 | unsigned int count[NR_LRU_LISTS] = { 0, }; | 1066 | unsigned int count[NR_LRU_LISTS] = { 0, }; |
1074 | int mode = ISOLATE_INACTIVE; | 1067 | int mode = lumpy_reclaim ? ISOLATE_BOTH : ISOLATE_INACTIVE; |
1075 | |||
1076 | /* | ||
1077 | * If we need a large contiguous chunk of memory, or have | ||
1078 | * trouble getting a small set of contiguous pages, we | ||
1079 | * will reclaim both active and inactive pages. | ||
1080 | * | ||
1081 | * We use the same threshold as pageout congestion_wait below. | ||
1082 | */ | ||
1083 | if (sc->order > PAGE_ALLOC_COSTLY_ORDER) | ||
1084 | mode = ISOLATE_BOTH; | ||
1085 | else if (sc->order && priority < DEF_PRIORITY - 2) | ||
1086 | mode = ISOLATE_BOTH; | ||
1087 | 1068 | ||
1088 | nr_taken = sc->isolate_pages(sc->swap_cluster_max, | 1069 | nr_taken = sc->isolate_pages(sc->swap_cluster_max, |
1089 | &page_list, &nr_scan, sc->order, mode, | 1070 | &page_list, &nr_scan, sc->order, mode, |
@@ -1120,7 +1101,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
1120 | * but that should be acceptable to the caller | 1101 | * but that should be acceptable to the caller |
1121 | */ | 1102 | */ |
1122 | if (nr_freed < nr_taken && !current_is_kswapd() && | 1103 | if (nr_freed < nr_taken && !current_is_kswapd() && |
1123 | sc->order > PAGE_ALLOC_COSTLY_ORDER) { | 1104 | lumpy_reclaim) { |
1124 | congestion_wait(WRITE, HZ/10); | 1105 | congestion_wait(WRITE, HZ/10); |
1125 | 1106 | ||
1126 | /* | 1107 | /* |
@@ -1215,18 +1196,54 @@ static inline void note_zone_scanning_priority(struct zone *zone, int priority) | |||
1215 | * But we had to alter page->flags anyway. | 1196 | * But we had to alter page->flags anyway. |
1216 | */ | 1197 | */ |
1217 | 1198 | ||
1199 | static void move_active_pages_to_lru(struct zone *zone, | ||
1200 | struct list_head *list, | ||
1201 | enum lru_list lru) | ||
1202 | { | ||
1203 | unsigned long pgmoved = 0; | ||
1204 | struct pagevec pvec; | ||
1205 | struct page *page; | ||
1206 | |||
1207 | pagevec_init(&pvec, 1); | ||
1208 | |||
1209 | while (!list_empty(list)) { | ||
1210 | page = lru_to_page(list); | ||
1211 | prefetchw_prev_lru_page(page, list, flags); | ||
1212 | |||
1213 | VM_BUG_ON(PageLRU(page)); | ||
1214 | SetPageLRU(page); | ||
1215 | |||
1216 | VM_BUG_ON(!PageActive(page)); | ||
1217 | if (!is_active_lru(lru)) | ||
1218 | ClearPageActive(page); /* we are de-activating */ | ||
1219 | |||
1220 | list_move(&page->lru, &zone->lru[lru].list); | ||
1221 | mem_cgroup_add_lru_list(page, lru); | ||
1222 | pgmoved++; | ||
1223 | |||
1224 | if (!pagevec_add(&pvec, page) || list_empty(list)) { | ||
1225 | spin_unlock_irq(&zone->lru_lock); | ||
1226 | if (buffer_heads_over_limit) | ||
1227 | pagevec_strip(&pvec); | ||
1228 | __pagevec_release(&pvec); | ||
1229 | spin_lock_irq(&zone->lru_lock); | ||
1230 | } | ||
1231 | } | ||
1232 | __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); | ||
1233 | if (!is_active_lru(lru)) | ||
1234 | __count_vm_events(PGDEACTIVATE, pgmoved); | ||
1235 | } | ||
1218 | 1236 | ||
1219 | static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | 1237 | static void shrink_active_list(unsigned long nr_pages, struct zone *zone, |
1220 | struct scan_control *sc, int priority, int file) | 1238 | struct scan_control *sc, int priority, int file) |
1221 | { | 1239 | { |
1222 | unsigned long pgmoved; | 1240 | unsigned long pgmoved; |
1223 | int pgdeactivate = 0; | ||
1224 | unsigned long pgscanned; | 1241 | unsigned long pgscanned; |
1242 | unsigned long vm_flags; | ||
1225 | LIST_HEAD(l_hold); /* The pages which were snipped off */ | 1243 | LIST_HEAD(l_hold); /* The pages which were snipped off */ |
1244 | LIST_HEAD(l_active); | ||
1226 | LIST_HEAD(l_inactive); | 1245 | LIST_HEAD(l_inactive); |
1227 | struct page *page; | 1246 | struct page *page; |
1228 | struct pagevec pvec; | ||
1229 | enum lru_list lru; | ||
1230 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); | 1247 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); |
1231 | 1248 | ||
1232 | lru_add_drain(); | 1249 | lru_add_drain(); |
@@ -1243,13 +1260,14 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
1243 | } | 1260 | } |
1244 | reclaim_stat->recent_scanned[!!file] += pgmoved; | 1261 | reclaim_stat->recent_scanned[!!file] += pgmoved; |
1245 | 1262 | ||
1263 | __count_zone_vm_events(PGREFILL, zone, pgscanned); | ||
1246 | if (file) | 1264 | if (file) |
1247 | __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved); | 1265 | __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved); |
1248 | else | 1266 | else |
1249 | __mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved); | 1267 | __mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved); |
1250 | spin_unlock_irq(&zone->lru_lock); | 1268 | spin_unlock_irq(&zone->lru_lock); |
1251 | 1269 | ||
1252 | pgmoved = 0; | 1270 | pgmoved = 0; /* count referenced (mapping) mapped pages */ |
1253 | while (!list_empty(&l_hold)) { | 1271 | while (!list_empty(&l_hold)) { |
1254 | cond_resched(); | 1272 | cond_resched(); |
1255 | page = lru_to_page(&l_hold); | 1273 | page = lru_to_page(&l_hold); |
@@ -1262,58 +1280,44 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
1262 | 1280 | ||
1263 | /* page_referenced clears PageReferenced */ | 1281 | /* page_referenced clears PageReferenced */ |
1264 | if (page_mapping_inuse(page) && | 1282 | if (page_mapping_inuse(page) && |
1265 | page_referenced(page, 0, sc->mem_cgroup)) | 1283 | page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { |
1266 | pgmoved++; | 1284 | pgmoved++; |
1285 | /* | ||
1286 | * Identify referenced, file-backed active pages and | ||
1287 | * give them one more trip around the active list. So | ||
1288 | * that executable code get better chances to stay in | ||
1289 | * memory under moderate memory pressure. Anon pages | ||
1290 | * are not likely to be evicted by use-once streaming | ||
1291 | * IO, plus JVM can create lots of anon VM_EXEC pages, | ||
1292 | * so we ignore them here. | ||
1293 | */ | ||
1294 | if ((vm_flags & VM_EXEC) && !PageAnon(page)) { | ||
1295 | list_add(&page->lru, &l_active); | ||
1296 | continue; | ||
1297 | } | ||
1298 | } | ||
1267 | 1299 | ||
1268 | list_add(&page->lru, &l_inactive); | 1300 | list_add(&page->lru, &l_inactive); |
1269 | } | 1301 | } |
1270 | 1302 | ||
1271 | /* | 1303 | /* |
1272 | * Move the pages to the [file or anon] inactive list. | 1304 | * Move pages back to the lru list. |
1273 | */ | 1305 | */ |
1274 | pagevec_init(&pvec, 1); | ||
1275 | lru = LRU_BASE + file * LRU_FILE; | ||
1276 | |||
1277 | spin_lock_irq(&zone->lru_lock); | 1306 | spin_lock_irq(&zone->lru_lock); |
1278 | /* | 1307 | /* |
1279 | * Count referenced pages from currently used mappings as | 1308 | * Count referenced pages from currently used mappings as rotated, |
1280 | * rotated, even though they are moved to the inactive list. | 1309 | * even though only some of them are actually re-activated. This |
1281 | * This helps balance scan pressure between file and anonymous | 1310 | * helps balance scan pressure between file and anonymous pages in |
1282 | * pages in get_scan_ratio. | 1311 | * get_scan_ratio. |
1283 | */ | 1312 | */ |
1284 | reclaim_stat->recent_rotated[!!file] += pgmoved; | 1313 | reclaim_stat->recent_rotated[!!file] += pgmoved; |
1285 | 1314 | ||
1286 | pgmoved = 0; | 1315 | move_active_pages_to_lru(zone, &l_active, |
1287 | while (!list_empty(&l_inactive)) { | 1316 | LRU_ACTIVE + file * LRU_FILE); |
1288 | page = lru_to_page(&l_inactive); | 1317 | move_active_pages_to_lru(zone, &l_inactive, |
1289 | prefetchw_prev_lru_page(page, &l_inactive, flags); | 1318 | LRU_BASE + file * LRU_FILE); |
1290 | VM_BUG_ON(PageLRU(page)); | ||
1291 | SetPageLRU(page); | ||
1292 | VM_BUG_ON(!PageActive(page)); | ||
1293 | ClearPageActive(page); | ||
1294 | 1319 | ||
1295 | list_move(&page->lru, &zone->lru[lru].list); | ||
1296 | mem_cgroup_add_lru_list(page, lru); | ||
1297 | pgmoved++; | ||
1298 | if (!pagevec_add(&pvec, page)) { | ||
1299 | __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); | ||
1300 | spin_unlock_irq(&zone->lru_lock); | ||
1301 | pgdeactivate += pgmoved; | ||
1302 | pgmoved = 0; | ||
1303 | if (buffer_heads_over_limit) | ||
1304 | pagevec_strip(&pvec); | ||
1305 | __pagevec_release(&pvec); | ||
1306 | spin_lock_irq(&zone->lru_lock); | ||
1307 | } | ||
1308 | } | ||
1309 | __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); | ||
1310 | pgdeactivate += pgmoved; | ||
1311 | __count_zone_vm_events(PGREFILL, zone, pgscanned); | ||
1312 | __count_vm_events(PGDEACTIVATE, pgdeactivate); | ||
1313 | spin_unlock_irq(&zone->lru_lock); | 1320 | spin_unlock_irq(&zone->lru_lock); |
1314 | if (buffer_heads_over_limit) | ||
1315 | pagevec_strip(&pvec); | ||
1316 | pagevec_release(&pvec); | ||
1317 | } | 1321 | } |
1318 | 1322 | ||
1319 | static int inactive_anon_is_low_global(struct zone *zone) | 1323 | static int inactive_anon_is_low_global(struct zone *zone) |
@@ -1348,12 +1352,48 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc) | |||
1348 | return low; | 1352 | return low; |
1349 | } | 1353 | } |
1350 | 1354 | ||
1355 | static int inactive_file_is_low_global(struct zone *zone) | ||
1356 | { | ||
1357 | unsigned long active, inactive; | ||
1358 | |||
1359 | active = zone_page_state(zone, NR_ACTIVE_FILE); | ||
1360 | inactive = zone_page_state(zone, NR_INACTIVE_FILE); | ||
1361 | |||
1362 | return (active > inactive); | ||
1363 | } | ||
1364 | |||
1365 | /** | ||
1366 | * inactive_file_is_low - check if file pages need to be deactivated | ||
1367 | * @zone: zone to check | ||
1368 | * @sc: scan control of this context | ||
1369 | * | ||
1370 | * When the system is doing streaming IO, memory pressure here | ||
1371 | * ensures that active file pages get deactivated, until more | ||
1372 | * than half of the file pages are on the inactive list. | ||
1373 | * | ||
1374 | * Once we get to that situation, protect the system's working | ||
1375 | * set from being evicted by disabling active file page aging. | ||
1376 | * | ||
1377 | * This uses a different ratio than the anonymous pages, because | ||
1378 | * the page cache uses a use-once replacement algorithm. | ||
1379 | */ | ||
1380 | static int inactive_file_is_low(struct zone *zone, struct scan_control *sc) | ||
1381 | { | ||
1382 | int low; | ||
1383 | |||
1384 | if (scanning_global_lru(sc)) | ||
1385 | low = inactive_file_is_low_global(zone); | ||
1386 | else | ||
1387 | low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup); | ||
1388 | return low; | ||
1389 | } | ||
1390 | |||
1351 | static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, | 1391 | static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, |
1352 | struct zone *zone, struct scan_control *sc, int priority) | 1392 | struct zone *zone, struct scan_control *sc, int priority) |
1353 | { | 1393 | { |
1354 | int file = is_file_lru(lru); | 1394 | int file = is_file_lru(lru); |
1355 | 1395 | ||
1356 | if (lru == LRU_ACTIVE_FILE) { | 1396 | if (lru == LRU_ACTIVE_FILE && inactive_file_is_low(zone, sc)) { |
1357 | shrink_active_list(nr_to_scan, zone, sc, priority, file); | 1397 | shrink_active_list(nr_to_scan, zone, sc, priority, file); |
1358 | return 0; | 1398 | return 0; |
1359 | } | 1399 | } |
@@ -1382,13 +1422,6 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, | |||
1382 | unsigned long ap, fp; | 1422 | unsigned long ap, fp; |
1383 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); | 1423 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); |
1384 | 1424 | ||
1385 | /* If we have no swap space, do not bother scanning anon pages. */ | ||
1386 | if (!sc->may_swap || (nr_swap_pages <= 0)) { | ||
1387 | percent[0] = 0; | ||
1388 | percent[1] = 100; | ||
1389 | return; | ||
1390 | } | ||
1391 | |||
1392 | anon = zone_nr_pages(zone, sc, LRU_ACTIVE_ANON) + | 1425 | anon = zone_nr_pages(zone, sc, LRU_ACTIVE_ANON) + |
1393 | zone_nr_pages(zone, sc, LRU_INACTIVE_ANON); | 1426 | zone_nr_pages(zone, sc, LRU_INACTIVE_ANON); |
1394 | file = zone_nr_pages(zone, sc, LRU_ACTIVE_FILE) + | 1427 | file = zone_nr_pages(zone, sc, LRU_ACTIVE_FILE) + |
@@ -1398,7 +1431,7 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, | |||
1398 | free = zone_page_state(zone, NR_FREE_PAGES); | 1431 | free = zone_page_state(zone, NR_FREE_PAGES); |
1399 | /* If we have very few page cache pages, | 1432 | /* If we have very few page cache pages, |
1400 | force-scan anon pages. */ | 1433 | force-scan anon pages. */ |
1401 | if (unlikely(file + free <= zone->pages_high)) { | 1434 | if (unlikely(file + free <= high_wmark_pages(zone))) { |
1402 | percent[0] = 100; | 1435 | percent[0] = 100; |
1403 | percent[1] = 0; | 1436 | percent[1] = 0; |
1404 | return; | 1437 | return; |
@@ -1453,6 +1486,26 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, | |||
1453 | percent[1] = 100 - percent[0]; | 1486 | percent[1] = 100 - percent[0]; |
1454 | } | 1487 | } |
1455 | 1488 | ||
1489 | /* | ||
1490 | * Smallish @nr_to_scan's are deposited in @nr_saved_scan, | ||
1491 | * until we collected @swap_cluster_max pages to scan. | ||
1492 | */ | ||
1493 | static unsigned long nr_scan_try_batch(unsigned long nr_to_scan, | ||
1494 | unsigned long *nr_saved_scan, | ||
1495 | unsigned long swap_cluster_max) | ||
1496 | { | ||
1497 | unsigned long nr; | ||
1498 | |||
1499 | *nr_saved_scan += nr_to_scan; | ||
1500 | nr = *nr_saved_scan; | ||
1501 | |||
1502 | if (nr >= swap_cluster_max) | ||
1503 | *nr_saved_scan = 0; | ||
1504 | else | ||
1505 | nr = 0; | ||
1506 | |||
1507 | return nr; | ||
1508 | } | ||
1456 | 1509 | ||
1457 | /* | 1510 | /* |
1458 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. | 1511 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. |
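nr_scan_try_batch() turns many tiny scan targets into occasional full batches, so low-priority passes that request only a handful of pages per list do not trigger real scanning every time. A worked example, assuming swap_cluster_max is 32:

	/* nr_saved_scan starts at 0; each call adds nr_to_scan and only releases
	 * the accumulated total once it reaches swap_cluster_max. */
	unsigned long saved = 0;

	nr_scan_try_batch(10, &saved, 32);	/* returns 0,  saved = 10 */
	nr_scan_try_batch(10, &saved, 32);	/* returns 0,  saved = 20 */
	nr_scan_try_batch(10, &saved, 32);	/* returns 0,  saved = 30 */
	nr_scan_try_batch(10, &saved, 32);	/* returns 40, saved reset to 0 */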
@@ -1466,26 +1519,30 @@ static void shrink_zone(int priority, struct zone *zone, | |||
1466 | enum lru_list l; | 1519 | enum lru_list l; |
1467 | unsigned long nr_reclaimed = sc->nr_reclaimed; | 1520 | unsigned long nr_reclaimed = sc->nr_reclaimed; |
1468 | unsigned long swap_cluster_max = sc->swap_cluster_max; | 1521 | unsigned long swap_cluster_max = sc->swap_cluster_max; |
1522 | int noswap = 0; | ||
1469 | 1523 | ||
1470 | get_scan_ratio(zone, sc, percent); | 1524 | /* If we have no swap space, do not bother scanning anon pages. */ |
1525 | if (!sc->may_swap || (nr_swap_pages <= 0)) { | ||
1526 | noswap = 1; | ||
1527 | percent[0] = 0; | ||
1528 | percent[1] = 100; | ||
1529 | } else | ||
1530 | get_scan_ratio(zone, sc, percent); | ||
1471 | 1531 | ||
1472 | for_each_evictable_lru(l) { | 1532 | for_each_evictable_lru(l) { |
1473 | int file = is_file_lru(l); | 1533 | int file = is_file_lru(l); |
1474 | unsigned long scan; | 1534 | unsigned long scan; |
1475 | 1535 | ||
1476 | scan = zone_nr_pages(zone, sc, l); | 1536 | scan = zone_nr_pages(zone, sc, l); |
1477 | if (priority) { | 1537 | if (priority || noswap) { |
1478 | scan >>= priority; | 1538 | scan >>= priority; |
1479 | scan = (scan * percent[file]) / 100; | 1539 | scan = (scan * percent[file]) / 100; |
1480 | } | 1540 | } |
1481 | if (scanning_global_lru(sc)) { | 1541 | if (scanning_global_lru(sc)) |
1482 | zone->lru[l].nr_scan += scan; | 1542 | nr[l] = nr_scan_try_batch(scan, |
1483 | nr[l] = zone->lru[l].nr_scan; | 1543 | &zone->lru[l].nr_saved_scan, |
1484 | if (nr[l] >= swap_cluster_max) | 1544 | swap_cluster_max); |
1485 | zone->lru[l].nr_scan = 0; | 1545 | else |
1486 | else | ||
1487 | nr[l] = 0; | ||
1488 | } else | ||
1489 | nr[l] = scan; | 1546 | nr[l] = scan; |
1490 | } | 1547 | } |
1491 | 1548 | ||
@@ -1519,7 +1576,7 @@ static void shrink_zone(int priority, struct zone *zone, | |||
1519 | * Even if we did not try to evict anon pages at all, we want to | 1576 | * Even if we did not try to evict anon pages at all, we want to |
1520 | * rebalance the anon lru active/inactive ratio. | 1577 | * rebalance the anon lru active/inactive ratio. |
1521 | */ | 1578 | */ |
1522 | if (inactive_anon_is_low(zone, sc)) | 1579 | if (inactive_anon_is_low(zone, sc) && nr_swap_pages > 0) |
1523 | shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); | 1580 | shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); |
1524 | 1581 | ||
1525 | throttle_vm_writeout(sc->gfp_mask); | 1582 | throttle_vm_writeout(sc->gfp_mask); |
@@ -1530,11 +1587,13 @@ static void shrink_zone(int priority, struct zone *zone, | |||
1530 | * try to reclaim pages from zones which will satisfy the caller's allocation | 1587 | * try to reclaim pages from zones which will satisfy the caller's allocation |
1531 | * request. | 1588 | * request. |
1532 | * | 1589 | * |
1533 | * We reclaim from a zone even if that zone is over pages_high. Because: | 1590 | * We reclaim from a zone even if that zone is over high_wmark_pages(zone). |
1591 | * Because: | ||
1534 | * a) The caller may be trying to free *extra* pages to satisfy a higher-order | 1592 | * a) The caller may be trying to free *extra* pages to satisfy a higher-order |
1535 | * allocation or | 1593 | * allocation or |
1536 | * b) The zones may be over pages_high but they must go *over* pages_high to | 1594 | * b) The target zone may be at high_wmark_pages(zone) but the lower zones |
1537 | * satisfy the `incremental min' zone defense algorithm. | 1595 | * must go *over* high_wmark_pages(zone) to satisfy the `incremental min' |
1596 | * zone defense algorithm. | ||
1538 | * | 1597 | * |
1539 | * If a zone is deemed to be full of pinned pages then just give it a light | 1598 | * If a zone is deemed to be full of pinned pages then just give it a light |
1540 | * scan then give up on it. | 1599 | * scan then give up on it. |
@@ -1740,7 +1799,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | |||
1740 | 1799 | ||
1741 | /* | 1800 | /* |
1742 | * For kswapd, balance_pgdat() will work across all this node's zones until | 1801 | * For kswapd, balance_pgdat() will work across all this node's zones until |
1743 | * they are all at pages_high. | 1802 | * they are all at high_wmark_pages(zone). |
1744 | * | 1803 | * |
1745 | * Returns the number of pages which were actually freed. | 1804 | * Returns the number of pages which were actually freed. |
1746 | * | 1805 | * |
@@ -1753,11 +1812,11 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | |||
1753 | * the zone for when the problem goes away. | 1812 | * the zone for when the problem goes away. |
1754 | * | 1813 | * |
1755 | * kswapd scans the zones in the highmem->normal->dma direction. It skips | 1814 | * kswapd scans the zones in the highmem->normal->dma direction. It skips |
1756 | * zones which have free_pages > pages_high, but once a zone is found to have | 1815 | * zones which have free_pages > high_wmark_pages(zone), but once a zone is |
1757 | * free_pages <= pages_high, we scan that zone and the lower zones regardless | 1816 | * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the |
1758 | * of the number of free pages in the lower zones. This interoperates with | 1817 | * lower zones regardless of the number of free pages in the lower zones. This |
1759 | * the page allocator fallback scheme to ensure that aging of pages is balanced | 1818 | * interoperates with the page allocator fallback scheme to ensure that aging |
1760 | * across the zones. | 1819 | * of pages is balanced across the zones. |
1761 | */ | 1820 | */ |
1762 | static unsigned long balance_pgdat(pg_data_t *pgdat, int order) | 1821 | static unsigned long balance_pgdat(pg_data_t *pgdat, int order) |
1763 | { | 1822 | { |
@@ -1778,7 +1837,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) | |||
1778 | }; | 1837 | }; |
1779 | /* | 1838 | /* |
1780 | * temp_priority is used to remember the scanning priority at which | 1839 | * temp_priority is used to remember the scanning priority at which |
1781 | * this zone was successfully refilled to free_pages == pages_high. | 1840 | * this zone was successfully refilled to |
1841 | * free_pages == high_wmark_pages(zone). | ||
1782 | */ | 1842 | */ |
1783 | int temp_priority[MAX_NR_ZONES]; | 1843 | int temp_priority[MAX_NR_ZONES]; |
1784 | 1844 | ||
@@ -1823,8 +1883,8 @@ loop_again: | |||
1823 | shrink_active_list(SWAP_CLUSTER_MAX, zone, | 1883 | shrink_active_list(SWAP_CLUSTER_MAX, zone, |
1824 | &sc, priority, 0); | 1884 | &sc, priority, 0); |
1825 | 1885 | ||
1826 | if (!zone_watermark_ok(zone, order, zone->pages_high, | 1886 | if (!zone_watermark_ok(zone, order, |
1827 | 0, 0)) { | 1887 | high_wmark_pages(zone), 0, 0)) { |
1828 | end_zone = i; | 1888 | end_zone = i; |
1829 | break; | 1889 | break; |
1830 | } | 1890 | } |
@@ -1858,8 +1918,8 @@ loop_again: | |||
1858 | priority != DEF_PRIORITY) | 1918 | priority != DEF_PRIORITY) |
1859 | continue; | 1919 | continue; |
1860 | 1920 | ||
1861 | if (!zone_watermark_ok(zone, order, zone->pages_high, | 1921 | if (!zone_watermark_ok(zone, order, |
1862 | end_zone, 0)) | 1922 | high_wmark_pages(zone), end_zone, 0)) |
1863 | all_zones_ok = 0; | 1923 | all_zones_ok = 0; |
1864 | temp_priority[i] = priority; | 1924 | temp_priority[i] = priority; |
1865 | sc.nr_scanned = 0; | 1925 | sc.nr_scanned = 0; |
@@ -1868,8 +1928,8 @@ loop_again: | |||
1868 | * We put equal pressure on every zone, unless one | 1928 | * We put equal pressure on every zone, unless one |
1869 | * zone has way too many pages free already. | 1929 | * zone has way too many pages free already. |
1870 | */ | 1930 | */ |
1871 | if (!zone_watermark_ok(zone, order, 8*zone->pages_high, | 1931 | if (!zone_watermark_ok(zone, order, |
1872 | end_zone, 0)) | 1932 | 8*high_wmark_pages(zone), end_zone, 0)) |
1873 | shrink_zone(priority, zone, &sc); | 1933 | shrink_zone(priority, zone, &sc); |
1874 | reclaim_state->reclaimed_slab = 0; | 1934 | reclaim_state->reclaimed_slab = 0; |
1875 | nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, | 1935 | nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, |
@@ -2035,7 +2095,7 @@ void wakeup_kswapd(struct zone *zone, int order) | |||
2035 | return; | 2095 | return; |
2036 | 2096 | ||
2037 | pgdat = zone->zone_pgdat; | 2097 | pgdat = zone->zone_pgdat; |
2038 | if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0)) | 2098 | if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) |
2039 | return; | 2099 | return; |
2040 | if (pgdat->kswapd_max_order < order) | 2100 | if (pgdat->kswapd_max_order < order) |
2041 | pgdat->kswapd_max_order = order; | 2101 | pgdat->kswapd_max_order = order; |
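The zone->pages_min/pages_low/pages_high fields referenced by the old code are replaced throughout by the min_wmark_pages()/low_wmark_pages()/high_wmark_pages() accessors. Their definitions are not part of this excerpt; presumably (matching the accompanying page allocator changes) they read a small per-zone watermark array, roughly:

	/* Sketch of the assumed accessors; the real definitions live in
	 * include/linux/mmzone.h and are not shown in this diff. */
	enum zone_watermarks { WMARK_MIN, WMARK_LOW, WMARK_HIGH, NR_WMARK };

	#define min_wmark_pages(z)  ((z)->watermark[WMARK_MIN])
	#define low_wmark_pages(z)  ((z)->watermark[WMARK_LOW])
	#define high_wmark_pages(z) ((z)->watermark[WMARK_HIGH])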
@@ -2054,7 +2114,7 @@ unsigned long global_lru_pages(void) | |||
2054 | + global_page_state(NR_INACTIVE_FILE); | 2114 | + global_page_state(NR_INACTIVE_FILE); |
2055 | } | 2115 | } |
2056 | 2116 | ||
2057 | #ifdef CONFIG_PM | 2117 | #ifdef CONFIG_HIBERNATION |
2058 | /* | 2118 | /* |
2059 | * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages | 2119 | * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages |
2060 | * from LRU lists system-wide, for given pass and priority. | 2120 | * from LRU lists system-wide, for given pass and priority. |
@@ -2082,11 +2142,11 @@ static void shrink_all_zones(unsigned long nr_pages, int prio, | |||
2082 | l == LRU_ACTIVE_FILE)) | 2142 | l == LRU_ACTIVE_FILE)) |
2083 | continue; | 2143 | continue; |
2084 | 2144 | ||
2085 | zone->lru[l].nr_scan += (lru_pages >> prio) + 1; | 2145 | zone->lru[l].nr_saved_scan += (lru_pages >> prio) + 1; |
2086 | if (zone->lru[l].nr_scan >= nr_pages || pass > 3) { | 2146 | if (zone->lru[l].nr_saved_scan >= nr_pages || pass > 3) { |
2087 | unsigned long nr_to_scan; | 2147 | unsigned long nr_to_scan; |
2088 | 2148 | ||
2089 | zone->lru[l].nr_scan = 0; | 2149 | zone->lru[l].nr_saved_scan = 0; |
2090 | nr_to_scan = min(nr_pages, lru_pages); | 2150 | nr_to_scan = min(nr_pages, lru_pages); |
2091 | nr_reclaimed += shrink_list(l, nr_to_scan, zone, | 2151 | nr_reclaimed += shrink_list(l, nr_to_scan, zone, |
2092 | sc, prio); | 2152 | sc, prio); |
@@ -2194,7 +2254,7 @@ out: | |||
2194 | 2254 | ||
2195 | return sc.nr_reclaimed; | 2255 | return sc.nr_reclaimed; |
2196 | } | 2256 | } |
2197 | #endif | 2257 | #endif /* CONFIG_HIBERNATION */ |
2198 | 2258 | ||
2199 | /* It's optimal to keep kswapds on the same CPUs as their memory, but | 2259 | /* It's optimal to keep kswapds on the same CPUs as their memory, but |
2200 | not required for correctness. So if the last cpu in a node goes | 2260 | not required for correctness. So if the last cpu in a node goes |
@@ -2288,6 +2348,48 @@ int sysctl_min_unmapped_ratio = 1; | |||
2288 | */ | 2348 | */ |
2289 | int sysctl_min_slab_ratio = 5; | 2349 | int sysctl_min_slab_ratio = 5; |
2290 | 2350 | ||
2351 | static inline unsigned long zone_unmapped_file_pages(struct zone *zone) | ||
2352 | { | ||
2353 | unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED); | ||
2354 | unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) + | ||
2355 | zone_page_state(zone, NR_ACTIVE_FILE); | ||
2356 | |||
2357 | /* | ||
2358 | * It's possible for there to be more file mapped pages than | ||
2359 | * accounted for by the pages on the file LRU lists because | ||
2360 | * tmpfs pages accounted for as ANON can also be FILE_MAPPED | ||
2361 | */ | ||
2362 | return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0; | ||
2363 | } | ||
2364 | |||
2365 | /* Work out how many page cache pages we can reclaim in this reclaim_mode */ | ||
2366 | static long zone_pagecache_reclaimable(struct zone *zone) | ||
2367 | { | ||
2368 | long nr_pagecache_reclaimable; | ||
2369 | long delta = 0; | ||
2370 | |||
2371 | /* | ||
2372 | * If RECLAIM_SWAP is set, then all file pages are considered | ||
2373 | * potentially reclaimable. Otherwise, we have to worry about | ||
2374 | * pages like swapcache and zone_unmapped_file_pages() provides | ||
2375 | * a better estimate | ||
2376 | */ | ||
2377 | if (zone_reclaim_mode & RECLAIM_SWAP) | ||
2378 | nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES); | ||
2379 | else | ||
2380 | nr_pagecache_reclaimable = zone_unmapped_file_pages(zone); | ||
2381 | |||
2382 | /* If we can't clean pages, remove dirty pages from consideration */ | ||
2383 | if (!(zone_reclaim_mode & RECLAIM_WRITE)) | ||
2384 | delta += zone_page_state(zone, NR_FILE_DIRTY); | ||
2385 | |||
2386 | /* Watch for any possible underflows due to delta */ | ||
2387 | if (unlikely(delta > nr_pagecache_reclaimable)) | ||
2388 | delta = nr_pagecache_reclaimable; | ||
2389 | |||
2390 | return nr_pagecache_reclaimable - delta; | ||
2391 | } | ||
2392 | |||
2291 | /* | 2393 | /* |
2292 | * Try to free up some pages from this zone through reclaim. | 2394 | * Try to free up some pages from this zone through reclaim. |
2293 | */ | 2395 | */ |
@@ -2322,9 +2424,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
2322 | reclaim_state.reclaimed_slab = 0; | 2424 | reclaim_state.reclaimed_slab = 0; |
2323 | p->reclaim_state = &reclaim_state; | 2425 | p->reclaim_state = &reclaim_state; |
2324 | 2426 | ||
2325 | if (zone_page_state(zone, NR_FILE_PAGES) - | 2427 | if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) { |
2326 | zone_page_state(zone, NR_FILE_MAPPED) > | ||
2327 | zone->min_unmapped_pages) { | ||
2328 | /* | 2428 | /* |
2329 | * Free memory by calling shrink zone with increasing | 2429 | * Free memory by calling shrink zone with increasing |
2330 | * priorities until we have enough memory freed. | 2430 | * priorities until we have enough memory freed. |
@@ -2382,20 +2482,18 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
2382 | * if less than a specified percentage of the zone is used by | 2482 | * if less than a specified percentage of the zone is used by |
2383 | * unmapped file backed pages. | 2483 | * unmapped file backed pages. |
2384 | */ | 2484 | */ |
2385 | if (zone_page_state(zone, NR_FILE_PAGES) - | 2485 | if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages && |
2386 | zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages | 2486 | zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) |
2387 | && zone_page_state(zone, NR_SLAB_RECLAIMABLE) | 2487 | return ZONE_RECLAIM_FULL; |
2388 | <= zone->min_slab_pages) | ||
2389 | return 0; | ||
2390 | 2488 | ||
2391 | if (zone_is_all_unreclaimable(zone)) | 2489 | if (zone_is_all_unreclaimable(zone)) |
2392 | return 0; | 2490 | return ZONE_RECLAIM_FULL; |
2393 | 2491 | ||
2394 | /* | 2492 | /* |
2395 | * Do not scan if the allocation should not be delayed. | 2493 | * Do not scan if the allocation should not be delayed. |
2396 | */ | 2494 | */ |
2397 | if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC)) | 2495 | if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC)) |
2398 | return 0; | 2496 | return ZONE_RECLAIM_NOSCAN; |
2399 | 2497 | ||
2400 | /* | 2498 | /* |
2401 | * Only run zone reclaim on the local zone or on zones that do not | 2499 | * Only run zone reclaim on the local zone or on zones that do not |
@@ -2405,18 +2503,21 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
2405 | */ | 2503 | */ |
2406 | node_id = zone_to_nid(zone); | 2504 | node_id = zone_to_nid(zone); |
2407 | if (node_state(node_id, N_CPU) && node_id != numa_node_id()) | 2505 | if (node_state(node_id, N_CPU) && node_id != numa_node_id()) |
2408 | return 0; | 2506 | return ZONE_RECLAIM_NOSCAN; |
2409 | 2507 | ||
2410 | if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED)) | 2508 | if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED)) |
2411 | return 0; | 2509 | return ZONE_RECLAIM_NOSCAN; |
2510 | |||
2412 | ret = __zone_reclaim(zone, gfp_mask, order); | 2511 | ret = __zone_reclaim(zone, gfp_mask, order); |
2413 | zone_clear_flag(zone, ZONE_RECLAIM_LOCKED); | 2512 | zone_clear_flag(zone, ZONE_RECLAIM_LOCKED); |
2414 | 2513 | ||
2514 | if (!ret) | ||
2515 | count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED); | ||
2516 | |||
2415 | return ret; | 2517 | return ret; |
2416 | } | 2518 | } |
2417 | #endif | 2519 | #endif |
2418 | 2520 | ||
2419 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
2420 | /* | 2521 | /* |
2421 | * page_evictable - test whether a page is evictable | 2522 | * page_evictable - test whether a page is evictable |
2422 | * @page: the page to test | 2523 | * @page: the page to test |
@@ -2663,4 +2764,3 @@ void scan_unevictable_unregister_node(struct node *node) | |||
2663 | sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages); | 2764 | sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages); |
2664 | } | 2765 | } |
2665 | 2766 | ||
2666 | #endif | ||
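zone_reclaim() now distinguishes "did not even scan" from "scanned but the zone is effectively full" instead of collapsing everything to 0, and counts the latter as PGSCAN_ZONE_RECLAIM_FAILED. The ZONE_RECLAIM_* symbols are defined elsewhere in this series; a plausible sketch of how a page allocator caller might treat them, with the constants and labels assumed:

	/* Assumed values for illustration:
	 *   ZONE_RECLAIM_NOSCAN   reclaim was not attempted at all
	 *   ZONE_RECLAIM_FULL     scanning is pointless, zone effectively full
	 *   other (>= 0)          some progress was made */
	switch (zone_reclaim(zone, gfp_mask, order)) {
	case ZONE_RECLAIM_NOSCAN:
		goto try_next_zone;	/* nothing scanned; don't penalise this zone */
	case ZONE_RECLAIM_FULL:
		goto this_zone_full;	/* scanned in vain; skip the zone for a while */
	default:
		/* some pages were freed: re-check the watermark before giving up */
		break;
	}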
diff --git a/mm/vmstat.c b/mm/vmstat.c index 74d66dba0cbe..138bed53706e 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -629,10 +629,8 @@ static const char * const vmstat_text[] = { | |||
629 | "nr_active_anon", | 629 | "nr_active_anon", |
630 | "nr_inactive_file", | 630 | "nr_inactive_file", |
631 | "nr_active_file", | 631 | "nr_active_file", |
632 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
633 | "nr_unevictable", | 632 | "nr_unevictable", |
634 | "nr_mlock", | 633 | "nr_mlock", |
635 | #endif | ||
636 | "nr_anon_pages", | 634 | "nr_anon_pages", |
637 | "nr_mapped", | 635 | "nr_mapped", |
638 | "nr_file_pages", | 636 | "nr_file_pages", |
@@ -675,6 +673,9 @@ static const char * const vmstat_text[] = { | |||
675 | TEXTS_FOR_ZONES("pgscan_kswapd") | 673 | TEXTS_FOR_ZONES("pgscan_kswapd") |
676 | TEXTS_FOR_ZONES("pgscan_direct") | 674 | TEXTS_FOR_ZONES("pgscan_direct") |
677 | 675 | ||
676 | #ifdef CONFIG_NUMA | ||
677 | "zone_reclaim_failed", | ||
678 | #endif | ||
678 | "pginodesteal", | 679 | "pginodesteal", |
679 | "slabs_scanned", | 680 | "slabs_scanned", |
680 | "kswapd_steal", | 681 | "kswapd_steal", |
@@ -687,7 +688,6 @@ static const char * const vmstat_text[] = { | |||
687 | "htlb_buddy_alloc_success", | 688 | "htlb_buddy_alloc_success", |
688 | "htlb_buddy_alloc_fail", | 689 | "htlb_buddy_alloc_fail", |
689 | #endif | 690 | #endif |
690 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
691 | "unevictable_pgs_culled", | 691 | "unevictable_pgs_culled", |
692 | "unevictable_pgs_scanned", | 692 | "unevictable_pgs_scanned", |
693 | "unevictable_pgs_rescued", | 693 | "unevictable_pgs_rescued", |
@@ -697,7 +697,6 @@ static const char * const vmstat_text[] = { | |||
697 | "unevictable_pgs_stranded", | 697 | "unevictable_pgs_stranded", |
698 | "unevictable_pgs_mlockfreed", | 698 | "unevictable_pgs_mlockfreed", |
699 | #endif | 699 | #endif |
700 | #endif | ||
701 | }; | 700 | }; |
702 | 701 | ||
703 | static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, | 702 | static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, |
@@ -710,18 +709,14 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, | |||
710 | "\n min %lu" | 709 | "\n min %lu" |
711 | "\n low %lu" | 710 | "\n low %lu" |
712 | "\n high %lu" | 711 | "\n high %lu" |
713 | "\n scanned %lu (aa: %lu ia: %lu af: %lu if: %lu)" | 712 | "\n scanned %lu" |
714 | "\n spanned %lu" | 713 | "\n spanned %lu" |
715 | "\n present %lu", | 714 | "\n present %lu", |
716 | zone_page_state(zone, NR_FREE_PAGES), | 715 | zone_page_state(zone, NR_FREE_PAGES), |
717 | zone->pages_min, | 716 | min_wmark_pages(zone), |
718 | zone->pages_low, | 717 | low_wmark_pages(zone), |
719 | zone->pages_high, | 718 | high_wmark_pages(zone), |
720 | zone->pages_scanned, | 719 | zone->pages_scanned, |
721 | zone->lru[LRU_ACTIVE_ANON].nr_scan, | ||
722 | zone->lru[LRU_INACTIVE_ANON].nr_scan, | ||
723 | zone->lru[LRU_ACTIVE_FILE].nr_scan, | ||
724 | zone->lru[LRU_INACTIVE_FILE].nr_scan, | ||
725 | zone->spanned_pages, | 720 | zone->spanned_pages, |
726 | zone->present_pages); | 721 | zone->present_pages); |
727 | 722 | ||