diff options
author | Dan Williams <dan.j.williams@intel.com> | 2009-09-08 20:55:21 -0400 |
---|---|---|
committer | Dan Williams <dan.j.williams@intel.com> | 2009-09-08 20:55:21 -0400 |
commit | bbb20089a3275a19e475dbc21320c3742e3ca423 (patch) | |
tree | 216fdc1cbef450ca688135c5b8969169482d9a48 /mm | |
parent | 3e48e656903e9fd8bc805c6a2c4264d7808d315b (diff) | |
parent | 657a77fa7284d8ae28dfa48f1dc5d919bf5b2843 (diff) |
Merge branch 'dmaengine' into async-tx-next
Conflicts:
crypto/async_tx/async_xor.c
drivers/dma/ioat/dma_v2.h
drivers/dma/ioat/pci.c
drivers/md/raid5.c
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 37 | ||||
-rw-r--r-- | mm/Kconfig.debug | 1 | ||||
-rw-r--r-- | mm/Makefile | 4 | ||||
-rw-r--r-- | mm/bootmem.c | 26 | ||||
-rw-r--r-- | mm/bounce.c | 10 | ||||
-rw-r--r-- | mm/fadvise.c | 2 | ||||
-rw-r--r-- | mm/filemap.c | 169 | ||||
-rw-r--r-- | mm/highmem.c | 1 | ||||
-rw-r--r-- | mm/hugetlb.c | 123 | ||||
-rw-r--r-- | mm/init-mm.c | 20 | ||||
-rw-r--r-- | mm/internal.h | 33 | ||||
-rw-r--r-- | mm/kmemcheck.c | 122 | ||||
-rw-r--r-- | mm/kmemleak-test.c | 111 | ||||
-rw-r--r-- | mm/kmemleak.c | 1497 | ||||
-rw-r--r-- | mm/maccess.c | 2 | ||||
-rw-r--r-- | mm/madvise.c | 26 | ||||
-rw-r--r-- | mm/memcontrol.c | 137 | ||||
-rw-r--r-- | mm/memory.c | 178 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 6 | ||||
-rw-r--r-- | mm/mempolicy.c | 145 | ||||
-rw-r--r-- | mm/migrate.c | 6 | ||||
-rw-r--r-- | mm/mlock.c | 73 | ||||
-rw-r--r-- | mm/mmap.c | 8 | ||||
-rw-r--r-- | mm/mprotect.c | 2 | ||||
-rw-r--r-- | mm/nommu.c | 3 | ||||
-rw-r--r-- | mm/oom_kill.c | 64 | ||||
-rw-r--r-- | mm/page-writeback.c | 19 | ||||
-rw-r--r-- | mm/page_alloc.c | 860 | ||||
-rw-r--r-- | mm/page_cgroup.c | 41 | ||||
-rw-r--r-- | mm/page_io.c | 2 | ||||
-rw-r--r-- | mm/percpu.c | 141 | ||||
-rw-r--r-- | mm/readahead.c | 145 | ||||
-rw-r--r-- | mm/rmap.c | 45 | ||||
-rw-r--r-- | mm/shmem.c | 12 | ||||
-rw-r--r-- | mm/shmem_acl.c | 29 | ||||
-rw-r--r-- | mm/slab.c | 269 | ||||
-rw-r--r-- | mm/slob.c | 19 | ||||
-rw-r--r-- | mm/slub.c | 181 | ||||
-rw-r--r-- | mm/swap_state.c | 17 | ||||
-rw-r--r-- | mm/swapfile.c | 284 | ||||
-rw-r--r-- | mm/thrash.c | 32 | ||||
-rw-r--r-- | mm/truncate.c | 39 | ||||
-rw-r--r-- | mm/util.c | 31 | ||||
-rw-r--r-- | mm/vmalloc.c | 33 | ||||
-rw-r--r-- | mm/vmscan.c | 380 | ||||
-rw-r--r-- | mm/vmstat.c | 19 |
46 files changed, 4006 insertions, 1398 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index c2b57d81e153..c948d4ca8bde 100644 --- a/mm/Kconfig +++ b/mm/Kconfig | |||
@@ -128,11 +128,11 @@ config SPARSEMEM_VMEMMAP | |||
128 | config MEMORY_HOTPLUG | 128 | config MEMORY_HOTPLUG |
129 | bool "Allow for memory hot-add" | 129 | bool "Allow for memory hot-add" |
130 | depends on SPARSEMEM || X86_64_ACPI_NUMA | 130 | depends on SPARSEMEM || X86_64_ACPI_NUMA |
131 | depends on HOTPLUG && !HIBERNATION && ARCH_ENABLE_MEMORY_HOTPLUG | 131 | depends on HOTPLUG && !(HIBERNATION && !S390) && ARCH_ENABLE_MEMORY_HOTPLUG |
132 | depends on (IA64 || X86 || PPC64 || SUPERH || S390) | 132 | depends on (IA64 || X86 || PPC64 || SUPERH || S390) |
133 | 133 | ||
134 | comment "Memory hotplug is currently incompatible with Software Suspend" | 134 | comment "Memory hotplug is currently incompatible with Software Suspend" |
135 | depends on SPARSEMEM && HOTPLUG && HIBERNATION | 135 | depends on SPARSEMEM && HOTPLUG && HIBERNATION && !S390 |
136 | 136 | ||
137 | config MEMORY_HOTPLUG_SPARSE | 137 | config MEMORY_HOTPLUG_SPARSE |
138 | def_bool y | 138 | def_bool y |
@@ -203,29 +203,36 @@ config VIRT_TO_BUS | |||
203 | def_bool y | 203 | def_bool y |
204 | depends on !ARCH_NO_VIRT_TO_BUS | 204 | depends on !ARCH_NO_VIRT_TO_BUS |
205 | 205 | ||
206 | config UNEVICTABLE_LRU | ||
207 | bool "Add LRU list to track non-evictable pages" | ||
208 | default y | ||
209 | help | ||
210 | Keeps unevictable pages off of the active and inactive pageout | ||
211 | lists, so kswapd will not waste CPU time or have its balancing | ||
212 | algorithms thrown off by scanning these pages. Selecting this | ||
213 | will use one page flag and increase the code size a little, | ||
214 | say Y unless you know what you are doing. | ||
215 | |||
216 | See Documentation/vm/unevictable-lru.txt for more information. | ||
217 | |||
218 | config HAVE_MLOCK | 206 | config HAVE_MLOCK |
219 | bool | 207 | bool |
220 | default y if MMU=y | 208 | default y if MMU=y |
221 | 209 | ||
222 | config HAVE_MLOCKED_PAGE_BIT | 210 | config HAVE_MLOCKED_PAGE_BIT |
223 | bool | 211 | bool |
224 | default y if HAVE_MLOCK=y && UNEVICTABLE_LRU=y | 212 | default y if HAVE_MLOCK=y |
225 | 213 | ||
226 | config MMU_NOTIFIER | 214 | config MMU_NOTIFIER |
227 | bool | 215 | bool |
228 | 216 | ||
217 | config DEFAULT_MMAP_MIN_ADDR | ||
218 | int "Low address space to protect from user allocation" | ||
219 | default 4096 | ||
220 | help | ||
221 | This is the portion of low virtual memory which should be protected | ||
222 | from userspace allocation. Keeping a user from writing to low pages | ||
223 | can help reduce the impact of kernel NULL pointer bugs. | ||
224 | |||
225 | For most ia64, ppc64 and x86 users with lots of address space | ||
226 | a value of 65536 is reasonable and should cause no problems. | ||
227 | On arm and other archs it should not be higher than 32768. | ||
228 | Programs which use vm86 functionality would either need additional | ||
229 | permissions from either the LSM or the capabilities module or have | ||
230 | this protection disabled. | ||
231 | |||
232 | This value can be changed after boot using the | ||
233 | /proc/sys/vm/mmap_min_addr tunable. | ||
234 | |||
235 | |||
229 | config NOMMU_INITIAL_TRIM_EXCESS | 236 | config NOMMU_INITIAL_TRIM_EXCESS |
230 | int "Turn on mmap() excess space trimming before booting" | 237 | int "Turn on mmap() excess space trimming before booting" |
231 | depends on !MMU | 238 | depends on !MMU |
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index bb01e298f260..aa99fd1f7109 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug | |||
@@ -2,6 +2,7 @@ config DEBUG_PAGEALLOC | |||
2 | bool "Debug page memory allocations" | 2 | bool "Debug page memory allocations" |
3 | depends on DEBUG_KERNEL && ARCH_SUPPORTS_DEBUG_PAGEALLOC | 3 | depends on DEBUG_KERNEL && ARCH_SUPPORTS_DEBUG_PAGEALLOC |
4 | depends on !HIBERNATION || !PPC && !SPARC | 4 | depends on !HIBERNATION || !PPC && !SPARC |
5 | depends on !KMEMCHECK | ||
5 | ---help--- | 6 | ---help--- |
6 | Unmap pages from the kernel linear mapping after free_pages(). | 7 | Unmap pages from the kernel linear mapping after free_pages(). |
7 | This results in a large slowdown, but helps to find certain types | 8 | This results in a large slowdown, but helps to find certain types |
diff --git a/mm/Makefile b/mm/Makefile index ec73c68b6015..5e0bd6426693 100644 --- a/mm/Makefile +++ b/mm/Makefile | |||
@@ -12,6 +12,7 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ | |||
12 | readahead.o swap.o truncate.o vmscan.o shmem.o \ | 12 | readahead.o swap.o truncate.o vmscan.o shmem.o \ |
13 | prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ | 13 | prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ |
14 | page_isolation.o mm_init.o $(mmu-y) | 14 | page_isolation.o mm_init.o $(mmu-y) |
15 | obj-y += init-mm.o | ||
15 | 16 | ||
16 | obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o | 17 | obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o |
17 | obj-$(CONFIG_BOUNCE) += bounce.o | 18 | obj-$(CONFIG_BOUNCE) += bounce.o |
@@ -27,6 +28,7 @@ obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o | |||
27 | obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o | 28 | obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o |
28 | obj-$(CONFIG_SLAB) += slab.o | 29 | obj-$(CONFIG_SLAB) += slab.o |
29 | obj-$(CONFIG_SLUB) += slub.o | 30 | obj-$(CONFIG_SLUB) += slub.o |
31 | obj-$(CONFIG_KMEMCHECK) += kmemcheck.o | ||
30 | obj-$(CONFIG_FAILSLAB) += failslab.o | 32 | obj-$(CONFIG_FAILSLAB) += failslab.o |
31 | obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o | 33 | obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o |
32 | obj-$(CONFIG_FS_XIP) += filemap_xip.o | 34 | obj-$(CONFIG_FS_XIP) += filemap_xip.o |
@@ -38,3 +40,5 @@ obj-$(CONFIG_SMP) += allocpercpu.o | |||
38 | endif | 40 | endif |
39 | obj-$(CONFIG_QUICKLIST) += quicklist.o | 41 | obj-$(CONFIG_QUICKLIST) += quicklist.o |
40 | obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o | 42 | obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o |
43 | obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o | ||
44 | obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o | ||
diff --git a/mm/bootmem.c b/mm/bootmem.c index daf92713f7de..d2a9ce952768 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c | |||
@@ -532,12 +532,19 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata, | |||
532 | unsigned long size, unsigned long align, | 532 | unsigned long size, unsigned long align, |
533 | unsigned long goal, unsigned long limit) | 533 | unsigned long goal, unsigned long limit) |
534 | { | 534 | { |
535 | #ifdef CONFIG_HAVE_ARCH_BOOTMEM | 535 | if (WARN_ON_ONCE(slab_is_available())) |
536 | bootmem_data_t *p_bdata; | 536 | return kzalloc(size, GFP_NOWAIT); |
537 | 537 | ||
538 | p_bdata = bootmem_arch_preferred_node(bdata, size, align, goal, limit); | 538 | #ifdef CONFIG_HAVE_ARCH_BOOTMEM |
539 | if (p_bdata) | 539 | { |
540 | return alloc_bootmem_core(p_bdata, size, align, goal, limit); | 540 | bootmem_data_t *p_bdata; |
541 | |||
542 | p_bdata = bootmem_arch_preferred_node(bdata, size, align, | ||
543 | goal, limit); | ||
544 | if (p_bdata) | ||
545 | return alloc_bootmem_core(p_bdata, size, align, | ||
546 | goal, limit); | ||
547 | } | ||
541 | #endif | 548 | #endif |
542 | return NULL; | 549 | return NULL; |
543 | } | 550 | } |
@@ -662,6 +669,9 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, | |||
662 | void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, | 669 | void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, |
663 | unsigned long align, unsigned long goal) | 670 | unsigned long align, unsigned long goal) |
664 | { | 671 | { |
672 | if (WARN_ON_ONCE(slab_is_available())) | ||
673 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | ||
674 | |||
665 | return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); | 675 | return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); |
666 | } | 676 | } |
667 | 677 | ||
@@ -693,6 +703,9 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, | |||
693 | { | 703 | { |
694 | void *ptr; | 704 | void *ptr; |
695 | 705 | ||
706 | if (WARN_ON_ONCE(slab_is_available())) | ||
707 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | ||
708 | |||
696 | ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0); | 709 | ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0); |
697 | if (ptr) | 710 | if (ptr) |
698 | return ptr; | 711 | return ptr; |
@@ -745,6 +758,9 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, | |||
745 | void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, | 758 | void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, |
746 | unsigned long align, unsigned long goal) | 759 | unsigned long align, unsigned long goal) |
747 | { | 760 | { |
761 | if (WARN_ON_ONCE(slab_is_available())) | ||
762 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | ||
763 | |||
748 | return ___alloc_bootmem_node(pgdat->bdata, size, align, | 764 | return ___alloc_bootmem_node(pgdat->bdata, size, align, |
749 | goal, ARCH_LOW_ADDRESS_LIMIT); | 765 | goal, ARCH_LOW_ADDRESS_LIMIT); |
750 | } | 766 | } |
diff --git a/mm/bounce.c b/mm/bounce.c index e590272fe7a8..a2b76a588e34 100644 --- a/mm/bounce.c +++ b/mm/bounce.c | |||
@@ -13,17 +13,15 @@ | |||
13 | #include <linux/init.h> | 13 | #include <linux/init.h> |
14 | #include <linux/hash.h> | 14 | #include <linux/hash.h> |
15 | #include <linux/highmem.h> | 15 | #include <linux/highmem.h> |
16 | #include <linux/blktrace_api.h> | ||
17 | #include <trace/block.h> | ||
18 | #include <asm/tlbflush.h> | 16 | #include <asm/tlbflush.h> |
19 | 17 | ||
18 | #include <trace/events/block.h> | ||
19 | |||
20 | #define POOL_SIZE 64 | 20 | #define POOL_SIZE 64 |
21 | #define ISA_POOL_SIZE 16 | 21 | #define ISA_POOL_SIZE 16 |
22 | 22 | ||
23 | static mempool_t *page_pool, *isa_page_pool; | 23 | static mempool_t *page_pool, *isa_page_pool; |
24 | 24 | ||
25 | DEFINE_TRACE(block_bio_bounce); | ||
26 | |||
27 | #ifdef CONFIG_HIGHMEM | 25 | #ifdef CONFIG_HIGHMEM |
28 | static __init int init_emergency_pool(void) | 26 | static __init int init_emergency_pool(void) |
29 | { | 27 | { |
@@ -192,7 +190,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, | |||
192 | /* | 190 | /* |
193 | * is destination page below bounce pfn? | 191 | * is destination page below bounce pfn? |
194 | */ | 192 | */ |
195 | if (page_to_pfn(page) <= q->bounce_pfn) | 193 | if (page_to_pfn(page) <= queue_bounce_pfn(q)) |
196 | continue; | 194 | continue; |
197 | 195 | ||
198 | /* | 196 | /* |
@@ -284,7 +282,7 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) | |||
284 | * don't waste time iterating over bio segments | 282 | * don't waste time iterating over bio segments |
285 | */ | 283 | */ |
286 | if (!(q->bounce_gfp & GFP_DMA)) { | 284 | if (!(q->bounce_gfp & GFP_DMA)) { |
287 | if (q->bounce_pfn >= blk_max_pfn) | 285 | if (queue_bounce_pfn(q) >= blk_max_pfn) |
288 | return; | 286 | return; |
289 | pool = page_pool; | 287 | pool = page_pool; |
290 | } else { | 288 | } else { |
diff --git a/mm/fadvise.c b/mm/fadvise.c index 54a0f8040afa..e43359214f6f 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c | |||
@@ -101,7 +101,7 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) | |||
101 | 101 | ||
102 | ret = force_page_cache_readahead(mapping, file, | 102 | ret = force_page_cache_readahead(mapping, file, |
103 | start_index, | 103 | start_index, |
104 | max_sane_readahead(nrpages)); | 104 | nrpages); |
105 | if (ret > 0) | 105 | if (ret > 0) |
106 | ret = 0; | 106 | ret = 0; |
107 | break; | 107 | break; |
diff --git a/mm/filemap.c b/mm/filemap.c index 1b60f30cebfa..22396713feb9 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
@@ -521,7 +521,7 @@ struct page *__page_cache_alloc(gfp_t gfp) | |||
521 | { | 521 | { |
522 | if (cpuset_do_page_mem_spread()) { | 522 | if (cpuset_do_page_mem_spread()) { |
523 | int n = cpuset_mem_spread_node(); | 523 | int n = cpuset_mem_spread_node(); |
524 | return alloc_pages_node(n, gfp, 0); | 524 | return alloc_pages_exact_node(n, gfp, 0); |
525 | } | 525 | } |
526 | return alloc_pages(gfp, 0); | 526 | return alloc_pages(gfp, 0); |
527 | } | 527 | } |
@@ -1004,9 +1004,6 @@ EXPORT_SYMBOL(grab_cache_page_nowait); | |||
1004 | static void shrink_readahead_size_eio(struct file *filp, | 1004 | static void shrink_readahead_size_eio(struct file *filp, |
1005 | struct file_ra_state *ra) | 1005 | struct file_ra_state *ra) |
1006 | { | 1006 | { |
1007 | if (!ra->ra_pages) | ||
1008 | return; | ||
1009 | |||
1010 | ra->ra_pages /= 4; | 1007 | ra->ra_pages /= 4; |
1011 | } | 1008 | } |
1012 | 1009 | ||
@@ -1390,8 +1387,7 @@ do_readahead(struct address_space *mapping, struct file *filp, | |||
1390 | if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) | 1387 | if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) |
1391 | return -EINVAL; | 1388 | return -EINVAL; |
1392 | 1389 | ||
1393 | force_page_cache_readahead(mapping, filp, index, | 1390 | force_page_cache_readahead(mapping, filp, index, nr); |
1394 | max_sane_readahead(nr)); | ||
1395 | return 0; | 1391 | return 0; |
1396 | } | 1392 | } |
1397 | 1393 | ||
@@ -1457,6 +1453,73 @@ static int page_cache_read(struct file *file, pgoff_t offset) | |||
1457 | 1453 | ||
1458 | #define MMAP_LOTSAMISS (100) | 1454 | #define MMAP_LOTSAMISS (100) |
1459 | 1455 | ||
1456 | /* | ||
1457 | * Synchronous readahead happens when we don't even find | ||
1458 | * a page in the page cache at all. | ||
1459 | */ | ||
1460 | static void do_sync_mmap_readahead(struct vm_area_struct *vma, | ||
1461 | struct file_ra_state *ra, | ||
1462 | struct file *file, | ||
1463 | pgoff_t offset) | ||
1464 | { | ||
1465 | unsigned long ra_pages; | ||
1466 | struct address_space *mapping = file->f_mapping; | ||
1467 | |||
1468 | /* If we don't want any read-ahead, don't bother */ | ||
1469 | if (VM_RandomReadHint(vma)) | ||
1470 | return; | ||
1471 | |||
1472 | if (VM_SequentialReadHint(vma) || | ||
1473 | offset - 1 == (ra->prev_pos >> PAGE_CACHE_SHIFT)) { | ||
1474 | page_cache_sync_readahead(mapping, ra, file, offset, | ||
1475 | ra->ra_pages); | ||
1476 | return; | ||
1477 | } | ||
1478 | |||
1479 | if (ra->mmap_miss < INT_MAX) | ||
1480 | ra->mmap_miss++; | ||
1481 | |||
1482 | /* | ||
1483 | * Do we miss much more than hit in this file? If so, | ||
1484 | * stop bothering with read-ahead. It will only hurt. | ||
1485 | */ | ||
1486 | if (ra->mmap_miss > MMAP_LOTSAMISS) | ||
1487 | return; | ||
1488 | |||
1489 | /* | ||
1490 | * mmap read-around | ||
1491 | */ | ||
1492 | ra_pages = max_sane_readahead(ra->ra_pages); | ||
1493 | if (ra_pages) { | ||
1494 | ra->start = max_t(long, 0, offset - ra_pages/2); | ||
1495 | ra->size = ra_pages; | ||
1496 | ra->async_size = 0; | ||
1497 | ra_submit(ra, mapping, file); | ||
1498 | } | ||
1499 | } | ||
1500 | |||
1501 | /* | ||
1502 | * Asynchronous readahead happens when we find the page and PG_readahead, | ||
1503 | * so we want to possibly extend the readahead further.. | ||
1504 | */ | ||
1505 | static void do_async_mmap_readahead(struct vm_area_struct *vma, | ||
1506 | struct file_ra_state *ra, | ||
1507 | struct file *file, | ||
1508 | struct page *page, | ||
1509 | pgoff_t offset) | ||
1510 | { | ||
1511 | struct address_space *mapping = file->f_mapping; | ||
1512 | |||
1513 | /* If we don't want any read-ahead, don't bother */ | ||
1514 | if (VM_RandomReadHint(vma)) | ||
1515 | return; | ||
1516 | if (ra->mmap_miss > 0) | ||
1517 | ra->mmap_miss--; | ||
1518 | if (PageReadahead(page)) | ||
1519 | page_cache_async_readahead(mapping, ra, file, | ||
1520 | page, offset, ra->ra_pages); | ||
1521 | } | ||
1522 | |||
1460 | /** | 1523 | /** |
1461 | * filemap_fault - read in file data for page fault handling | 1524 | * filemap_fault - read in file data for page fault handling |
1462 | * @vma: vma in which the fault was taken | 1525 | * @vma: vma in which the fault was taken |
@@ -1476,78 +1539,44 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1476 | struct address_space *mapping = file->f_mapping; | 1539 | struct address_space *mapping = file->f_mapping; |
1477 | struct file_ra_state *ra = &file->f_ra; | 1540 | struct file_ra_state *ra = &file->f_ra; |
1478 | struct inode *inode = mapping->host; | 1541 | struct inode *inode = mapping->host; |
1542 | pgoff_t offset = vmf->pgoff; | ||
1479 | struct page *page; | 1543 | struct page *page; |
1480 | pgoff_t size; | 1544 | pgoff_t size; |
1481 | int did_readaround = 0; | ||
1482 | int ret = 0; | 1545 | int ret = 0; |
1483 | 1546 | ||
1484 | size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | 1547 | size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; |
1485 | if (vmf->pgoff >= size) | 1548 | if (offset >= size) |
1486 | return VM_FAULT_SIGBUS; | 1549 | return VM_FAULT_SIGBUS; |
1487 | 1550 | ||
1488 | /* If we don't want any read-ahead, don't bother */ | ||
1489 | if (VM_RandomReadHint(vma)) | ||
1490 | goto no_cached_page; | ||
1491 | |||
1492 | /* | 1551 | /* |
1493 | * Do we have something in the page cache already? | 1552 | * Do we have something in the page cache already? |
1494 | */ | 1553 | */ |
1495 | retry_find: | 1554 | page = find_get_page(mapping, offset); |
1496 | page = find_lock_page(mapping, vmf->pgoff); | 1555 | if (likely(page)) { |
1497 | /* | ||
1498 | * For sequential accesses, we use the generic readahead logic. | ||
1499 | */ | ||
1500 | if (VM_SequentialReadHint(vma)) { | ||
1501 | if (!page) { | ||
1502 | page_cache_sync_readahead(mapping, ra, file, | ||
1503 | vmf->pgoff, 1); | ||
1504 | page = find_lock_page(mapping, vmf->pgoff); | ||
1505 | if (!page) | ||
1506 | goto no_cached_page; | ||
1507 | } | ||
1508 | if (PageReadahead(page)) { | ||
1509 | page_cache_async_readahead(mapping, ra, file, page, | ||
1510 | vmf->pgoff, 1); | ||
1511 | } | ||
1512 | } | ||
1513 | |||
1514 | if (!page) { | ||
1515 | unsigned long ra_pages; | ||
1516 | |||
1517 | ra->mmap_miss++; | ||
1518 | |||
1519 | /* | 1556 | /* |
1520 | * Do we miss much more than hit in this file? If so, | 1557 | * We found the page, so try async readahead before |
1521 | * stop bothering with read-ahead. It will only hurt. | 1558 | * waiting for the lock. |
1522 | */ | 1559 | */ |
1523 | if (ra->mmap_miss > MMAP_LOTSAMISS) | 1560 | do_async_mmap_readahead(vma, ra, file, page, offset); |
1524 | goto no_cached_page; | 1561 | lock_page(page); |
1525 | 1562 | ||
1526 | /* | 1563 | /* Did it get truncated? */ |
1527 | * To keep the pgmajfault counter straight, we need to | 1564 | if (unlikely(page->mapping != mapping)) { |
1528 | * check did_readaround, as this is an inner loop. | 1565 | unlock_page(page); |
1529 | */ | 1566 | put_page(page); |
1530 | if (!did_readaround) { | 1567 | goto no_cached_page; |
1531 | ret = VM_FAULT_MAJOR; | ||
1532 | count_vm_event(PGMAJFAULT); | ||
1533 | } | ||
1534 | did_readaround = 1; | ||
1535 | ra_pages = max_sane_readahead(file->f_ra.ra_pages); | ||
1536 | if (ra_pages) { | ||
1537 | pgoff_t start = 0; | ||
1538 | |||
1539 | if (vmf->pgoff > ra_pages / 2) | ||
1540 | start = vmf->pgoff - ra_pages / 2; | ||
1541 | do_page_cache_readahead(mapping, file, start, ra_pages); | ||
1542 | } | 1568 | } |
1543 | page = find_lock_page(mapping, vmf->pgoff); | 1569 | } else { |
1570 | /* No page in the page cache at all */ | ||
1571 | do_sync_mmap_readahead(vma, ra, file, offset); | ||
1572 | count_vm_event(PGMAJFAULT); | ||
1573 | ret = VM_FAULT_MAJOR; | ||
1574 | retry_find: | ||
1575 | page = find_lock_page(mapping, offset); | ||
1544 | if (!page) | 1576 | if (!page) |
1545 | goto no_cached_page; | 1577 | goto no_cached_page; |
1546 | } | 1578 | } |
1547 | 1579 | ||
1548 | if (!did_readaround) | ||
1549 | ra->mmap_miss--; | ||
1550 | |||
1551 | /* | 1580 | /* |
1552 | * We have a locked page in the page cache, now we need to check | 1581 | * We have a locked page in the page cache, now we need to check |
1553 | * that it's up-to-date. If not, it is going to be due to an error. | 1582 | * that it's up-to-date. If not, it is going to be due to an error. |
@@ -1555,18 +1584,18 @@ retry_find: | |||
1555 | if (unlikely(!PageUptodate(page))) | 1584 | if (unlikely(!PageUptodate(page))) |
1556 | goto page_not_uptodate; | 1585 | goto page_not_uptodate; |
1557 | 1586 | ||
1558 | /* Must recheck i_size under page lock */ | 1587 | /* |
1588 | * Found the page and have a reference on it. | ||
1589 | * We must recheck i_size under page lock. | ||
1590 | */ | ||
1559 | size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | 1591 | size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; |
1560 | if (unlikely(vmf->pgoff >= size)) { | 1592 | if (unlikely(offset >= size)) { |
1561 | unlock_page(page); | 1593 | unlock_page(page); |
1562 | page_cache_release(page); | 1594 | page_cache_release(page); |
1563 | return VM_FAULT_SIGBUS; | 1595 | return VM_FAULT_SIGBUS; |
1564 | } | 1596 | } |
1565 | 1597 | ||
1566 | /* | 1598 | ra->prev_pos = (loff_t)offset << PAGE_CACHE_SHIFT; |
1567 | * Found the page and have a reference on it. | ||
1568 | */ | ||
1569 | ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT; | ||
1570 | vmf->page = page; | 1599 | vmf->page = page; |
1571 | return ret | VM_FAULT_LOCKED; | 1600 | return ret | VM_FAULT_LOCKED; |
1572 | 1601 | ||
@@ -1575,7 +1604,7 @@ no_cached_page: | |||
1575 | * We're only likely to ever get here if MADV_RANDOM is in | 1604 | * We're only likely to ever get here if MADV_RANDOM is in |
1576 | * effect. | 1605 | * effect. |
1577 | */ | 1606 | */ |
1578 | error = page_cache_read(file, vmf->pgoff); | 1607 | error = page_cache_read(file, offset); |
1579 | 1608 | ||
1580 | /* | 1609 | /* |
1581 | * The page we want has now been added to the page cache. | 1610 | * The page we want has now been added to the page cache. |
@@ -1595,12 +1624,6 @@ no_cached_page: | |||
1595 | return VM_FAULT_SIGBUS; | 1624 | return VM_FAULT_SIGBUS; |
1596 | 1625 | ||
1597 | page_not_uptodate: | 1626 | page_not_uptodate: |
1598 | /* IO error path */ | ||
1599 | if (!did_readaround) { | ||
1600 | ret = VM_FAULT_MAJOR; | ||
1601 | count_vm_event(PGMAJFAULT); | ||
1602 | } | ||
1603 | |||
1604 | /* | 1627 | /* |
1605 | * Umm, take care of errors if the page isn't up-to-date. | 1628 | * Umm, take care of errors if the page isn't up-to-date. |
1606 | * Try to re-read it _once_. We do this synchronously, | 1629 | * Try to re-read it _once_. We do this synchronously, |
diff --git a/mm/highmem.c b/mm/highmem.c index 68eb1d9b63fa..25878cc49daa 100644 --- a/mm/highmem.c +++ b/mm/highmem.c | |||
@@ -26,7 +26,6 @@ | |||
26 | #include <linux/init.h> | 26 | #include <linux/init.h> |
27 | #include <linux/hash.h> | 27 | #include <linux/hash.h> |
28 | #include <linux/highmem.h> | 28 | #include <linux/highmem.h> |
29 | #include <linux/blktrace_api.h> | ||
30 | #include <asm/tlbflush.h> | 29 | #include <asm/tlbflush.h> |
31 | 30 | ||
32 | /* | 31 | /* |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e83ad2c9228c..d0351e31f474 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -578,41 +578,6 @@ static void free_huge_page(struct page *page) | |||
578 | hugetlb_put_quota(mapping, 1); | 578 | hugetlb_put_quota(mapping, 1); |
579 | } | 579 | } |
580 | 580 | ||
581 | /* | ||
582 | * Increment or decrement surplus_huge_pages. Keep node-specific counters | ||
583 | * balanced by operating on them in a round-robin fashion. | ||
584 | * Returns 1 if an adjustment was made. | ||
585 | */ | ||
586 | static int adjust_pool_surplus(struct hstate *h, int delta) | ||
587 | { | ||
588 | static int prev_nid; | ||
589 | int nid = prev_nid; | ||
590 | int ret = 0; | ||
591 | |||
592 | VM_BUG_ON(delta != -1 && delta != 1); | ||
593 | do { | ||
594 | nid = next_node(nid, node_online_map); | ||
595 | if (nid == MAX_NUMNODES) | ||
596 | nid = first_node(node_online_map); | ||
597 | |||
598 | /* To shrink on this node, there must be a surplus page */ | ||
599 | if (delta < 0 && !h->surplus_huge_pages_node[nid]) | ||
600 | continue; | ||
601 | /* Surplus cannot exceed the total number of pages */ | ||
602 | if (delta > 0 && h->surplus_huge_pages_node[nid] >= | ||
603 | h->nr_huge_pages_node[nid]) | ||
604 | continue; | ||
605 | |||
606 | h->surplus_huge_pages += delta; | ||
607 | h->surplus_huge_pages_node[nid] += delta; | ||
608 | ret = 1; | ||
609 | break; | ||
610 | } while (nid != prev_nid); | ||
611 | |||
612 | prev_nid = nid; | ||
613 | return ret; | ||
614 | } | ||
615 | |||
616 | static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) | 581 | static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) |
617 | { | 582 | { |
618 | set_compound_page_dtor(page, free_huge_page); | 583 | set_compound_page_dtor(page, free_huge_page); |
@@ -623,6 +588,34 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) | |||
623 | put_page(page); /* free it into the hugepage allocator */ | 588 | put_page(page); /* free it into the hugepage allocator */ |
624 | } | 589 | } |
625 | 590 | ||
591 | static void prep_compound_gigantic_page(struct page *page, unsigned long order) | ||
592 | { | ||
593 | int i; | ||
594 | int nr_pages = 1 << order; | ||
595 | struct page *p = page + 1; | ||
596 | |||
597 | /* we rely on prep_new_huge_page to set the destructor */ | ||
598 | set_compound_order(page, order); | ||
599 | __SetPageHead(page); | ||
600 | for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { | ||
601 | __SetPageTail(p); | ||
602 | p->first_page = page; | ||
603 | } | ||
604 | } | ||
605 | |||
606 | int PageHuge(struct page *page) | ||
607 | { | ||
608 | compound_page_dtor *dtor; | ||
609 | |||
610 | if (!PageCompound(page)) | ||
611 | return 0; | ||
612 | |||
613 | page = compound_head(page); | ||
614 | dtor = get_compound_page_dtor(page); | ||
615 | |||
616 | return dtor == free_huge_page; | ||
617 | } | ||
618 | |||
626 | static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) | 619 | static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) |
627 | { | 620 | { |
628 | struct page *page; | 621 | struct page *page; |
@@ -630,7 +623,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) | |||
630 | if (h->order >= MAX_ORDER) | 623 | if (h->order >= MAX_ORDER) |
631 | return NULL; | 624 | return NULL; |
632 | 625 | ||
633 | page = alloc_pages_node(nid, | 626 | page = alloc_pages_exact_node(nid, |
634 | htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| | 627 | htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| |
635 | __GFP_REPEAT|__GFP_NOWARN, | 628 | __GFP_REPEAT|__GFP_NOWARN, |
636 | huge_page_order(h)); | 629 | huge_page_order(h)); |
@@ -649,7 +642,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) | |||
649 | * Use a helper variable to find the next node and then | 642 | * Use a helper variable to find the next node and then |
650 | * copy it back to hugetlb_next_nid afterwards: | 643 | * copy it back to hugetlb_next_nid afterwards: |
651 | * otherwise there's a window in which a racer might | 644 | * otherwise there's a window in which a racer might |
652 | * pass invalid nid MAX_NUMNODES to alloc_pages_node. | 645 | * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node. |
653 | * But we don't need to use a spin_lock here: it really | 646 | * But we don't need to use a spin_lock here: it really |
654 | * doesn't matter if occasionally a racer chooses the | 647 | * doesn't matter if occasionally a racer chooses the |
655 | * same nid as we do. Move nid forward in the mask even | 648 | * same nid as we do. Move nid forward in the mask even |
@@ -875,7 +868,7 @@ static void return_unused_surplus_pages(struct hstate *h, | |||
875 | * can no longer free unreserved surplus pages. This occurs when | 868 | * can no longer free unreserved surplus pages. This occurs when |
876 | * the nodes with surplus pages have no free pages. | 869 | * the nodes with surplus pages have no free pages. |
877 | */ | 870 | */ |
878 | unsigned long remaining_iterations = num_online_nodes(); | 871 | unsigned long remaining_iterations = nr_online_nodes; |
879 | 872 | ||
880 | /* Uncommit the reservation */ | 873 | /* Uncommit the reservation */ |
881 | h->resv_huge_pages -= unused_resv_pages; | 874 | h->resv_huge_pages -= unused_resv_pages; |
@@ -904,7 +897,7 @@ static void return_unused_surplus_pages(struct hstate *h, | |||
904 | h->surplus_huge_pages--; | 897 | h->surplus_huge_pages--; |
905 | h->surplus_huge_pages_node[nid]--; | 898 | h->surplus_huge_pages_node[nid]--; |
906 | nr_pages--; | 899 | nr_pages--; |
907 | remaining_iterations = num_online_nodes(); | 900 | remaining_iterations = nr_online_nodes; |
908 | } | 901 | } |
909 | } | 902 | } |
910 | } | 903 | } |
@@ -1140,6 +1133,41 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count) | |||
1140 | } | 1133 | } |
1141 | #endif | 1134 | #endif |
1142 | 1135 | ||
1136 | /* | ||
1137 | * Increment or decrement surplus_huge_pages. Keep node-specific counters | ||
1138 | * balanced by operating on them in a round-robin fashion. | ||
1139 | * Returns 1 if an adjustment was made. | ||
1140 | */ | ||
1141 | static int adjust_pool_surplus(struct hstate *h, int delta) | ||
1142 | { | ||
1143 | static int prev_nid; | ||
1144 | int nid = prev_nid; | ||
1145 | int ret = 0; | ||
1146 | |||
1147 | VM_BUG_ON(delta != -1 && delta != 1); | ||
1148 | do { | ||
1149 | nid = next_node(nid, node_online_map); | ||
1150 | if (nid == MAX_NUMNODES) | ||
1151 | nid = first_node(node_online_map); | ||
1152 | |||
1153 | /* To shrink on this node, there must be a surplus page */ | ||
1154 | if (delta < 0 && !h->surplus_huge_pages_node[nid]) | ||
1155 | continue; | ||
1156 | /* Surplus cannot exceed the total number of pages */ | ||
1157 | if (delta > 0 && h->surplus_huge_pages_node[nid] >= | ||
1158 | h->nr_huge_pages_node[nid]) | ||
1159 | continue; | ||
1160 | |||
1161 | h->surplus_huge_pages += delta; | ||
1162 | h->surplus_huge_pages_node[nid] += delta; | ||
1163 | ret = 1; | ||
1164 | break; | ||
1165 | } while (nid != prev_nid); | ||
1166 | |||
1167 | prev_nid = nid; | ||
1168 | return ret; | ||
1169 | } | ||
1170 | |||
1143 | #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) | 1171 | #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) |
1144 | static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) | 1172 | static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) |
1145 | { | 1173 | { |
@@ -1957,7 +1985,7 @@ static struct page *hugetlbfs_pagecache_page(struct hstate *h, | |||
1957 | } | 1985 | } |
1958 | 1986 | ||
1959 | static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, | 1987 | static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, |
1960 | unsigned long address, pte_t *ptep, int write_access) | 1988 | unsigned long address, pte_t *ptep, unsigned int flags) |
1961 | { | 1989 | { |
1962 | struct hstate *h = hstate_vma(vma); | 1990 | struct hstate *h = hstate_vma(vma); |
1963 | int ret = VM_FAULT_SIGBUS; | 1991 | int ret = VM_FAULT_SIGBUS; |
@@ -2025,7 +2053,7 @@ retry: | |||
2025 | * any allocations necessary to record that reservation occur outside | 2053 | * any allocations necessary to record that reservation occur outside |
2026 | * the spinlock. | 2054 | * the spinlock. |
2027 | */ | 2055 | */ |
2028 | if (write_access && !(vma->vm_flags & VM_SHARED)) | 2056 | if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) |
2029 | if (vma_needs_reservation(h, vma, address) < 0) { | 2057 | if (vma_needs_reservation(h, vma, address) < 0) { |
2030 | ret = VM_FAULT_OOM; | 2058 | ret = VM_FAULT_OOM; |
2031 | goto backout_unlocked; | 2059 | goto backout_unlocked; |
@@ -2044,7 +2072,7 @@ retry: | |||
2044 | && (vma->vm_flags & VM_SHARED))); | 2072 | && (vma->vm_flags & VM_SHARED))); |
2045 | set_huge_pte_at(mm, address, ptep, new_pte); | 2073 | set_huge_pte_at(mm, address, ptep, new_pte); |
2046 | 2074 | ||
2047 | if (write_access && !(vma->vm_flags & VM_SHARED)) { | 2075 | if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { |
2048 | /* Optimization, do the COW without a second fault */ | 2076 | /* Optimization, do the COW without a second fault */ |
2049 | ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page); | 2077 | ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page); |
2050 | } | 2078 | } |
@@ -2063,7 +2091,7 @@ backout_unlocked: | |||
2063 | } | 2091 | } |
2064 | 2092 | ||
2065 | int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 2093 | int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, |
2066 | unsigned long address, int write_access) | 2094 | unsigned long address, unsigned int flags) |
2067 | { | 2095 | { |
2068 | pte_t *ptep; | 2096 | pte_t *ptep; |
2069 | pte_t entry; | 2097 | pte_t entry; |
@@ -2084,7 +2112,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2084 | mutex_lock(&hugetlb_instantiation_mutex); | 2112 | mutex_lock(&hugetlb_instantiation_mutex); |
2085 | entry = huge_ptep_get(ptep); | 2113 | entry = huge_ptep_get(ptep); |
2086 | if (huge_pte_none(entry)) { | 2114 | if (huge_pte_none(entry)) { |
2087 | ret = hugetlb_no_page(mm, vma, address, ptep, write_access); | 2115 | ret = hugetlb_no_page(mm, vma, address, ptep, flags); |
2088 | goto out_mutex; | 2116 | goto out_mutex; |
2089 | } | 2117 | } |
2090 | 2118 | ||
@@ -2098,7 +2126,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2098 | * page now as it is used to determine if a reservation has been | 2126 | * page now as it is used to determine if a reservation has been |
2099 | * consumed. | 2127 | * consumed. |
2100 | */ | 2128 | */ |
2101 | if (write_access && !pte_write(entry)) { | 2129 | if ((flags & FAULT_FLAG_WRITE) && !pte_write(entry)) { |
2102 | if (vma_needs_reservation(h, vma, address) < 0) { | 2130 | if (vma_needs_reservation(h, vma, address) < 0) { |
2103 | ret = VM_FAULT_OOM; | 2131 | ret = VM_FAULT_OOM; |
2104 | goto out_mutex; | 2132 | goto out_mutex; |
@@ -2115,7 +2143,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2115 | goto out_page_table_lock; | 2143 | goto out_page_table_lock; |
2116 | 2144 | ||
2117 | 2145 | ||
2118 | if (write_access) { | 2146 | if (flags & FAULT_FLAG_WRITE) { |
2119 | if (!pte_write(entry)) { | 2147 | if (!pte_write(entry)) { |
2120 | ret = hugetlb_cow(mm, vma, address, ptep, entry, | 2148 | ret = hugetlb_cow(mm, vma, address, ptep, entry, |
2121 | pagecache_page); | 2149 | pagecache_page); |
@@ -2124,7 +2152,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2124 | entry = pte_mkdirty(entry); | 2152 | entry = pte_mkdirty(entry); |
2125 | } | 2153 | } |
2126 | entry = pte_mkyoung(entry); | 2154 | entry = pte_mkyoung(entry); |
2127 | if (huge_ptep_set_access_flags(vma, address, ptep, entry, write_access)) | 2155 | if (huge_ptep_set_access_flags(vma, address, ptep, entry, |
2156 | flags & FAULT_FLAG_WRITE)) | ||
2128 | update_mmu_cache(vma, address, entry); | 2157 | update_mmu_cache(vma, address, entry); |
2129 | 2158 | ||
2130 | out_page_table_lock: | 2159 | out_page_table_lock: |
diff --git a/mm/init-mm.c b/mm/init-mm.c new file mode 100644 index 000000000000..57aba0da9668 --- /dev/null +++ b/mm/init-mm.c | |||
@@ -0,0 +1,20 @@ | |||
1 | #include <linux/mm_types.h> | ||
2 | #include <linux/rbtree.h> | ||
3 | #include <linux/rwsem.h> | ||
4 | #include <linux/spinlock.h> | ||
5 | #include <linux/list.h> | ||
6 | #include <linux/cpumask.h> | ||
7 | |||
8 | #include <asm/atomic.h> | ||
9 | #include <asm/pgtable.h> | ||
10 | |||
11 | struct mm_struct init_mm = { | ||
12 | .mm_rb = RB_ROOT, | ||
13 | .pgd = swapper_pg_dir, | ||
14 | .mm_users = ATOMIC_INIT(2), | ||
15 | .mm_count = ATOMIC_INIT(1), | ||
16 | .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), | ||
17 | .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), | ||
18 | .mmlist = LIST_HEAD_INIT(init_mm.mmlist), | ||
19 | .cpu_vm_mask = CPU_MASK_ALL, | ||
20 | }; | ||
diff --git a/mm/internal.h b/mm/internal.h index 987bb03fbdd8..f290c4db528b 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -16,9 +16,6 @@ | |||
16 | void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, | 16 | void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, |
17 | unsigned long floor, unsigned long ceiling); | 17 | unsigned long floor, unsigned long ceiling); |
18 | 18 | ||
19 | extern void prep_compound_page(struct page *page, unsigned long order); | ||
20 | extern void prep_compound_gigantic_page(struct page *page, unsigned long order); | ||
21 | |||
22 | static inline void set_page_count(struct page *page, int v) | 19 | static inline void set_page_count(struct page *page, int v) |
23 | { | 20 | { |
24 | atomic_set(&page->_count, v); | 21 | atomic_set(&page->_count, v); |
@@ -51,6 +48,8 @@ extern void putback_lru_page(struct page *page); | |||
51 | */ | 48 | */ |
52 | extern unsigned long highest_memmap_pfn; | 49 | extern unsigned long highest_memmap_pfn; |
53 | extern void __free_pages_bootmem(struct page *page, unsigned int order); | 50 | extern void __free_pages_bootmem(struct page *page, unsigned int order); |
51 | extern void prep_compound_page(struct page *page, unsigned long order); | ||
52 | |||
54 | 53 | ||
55 | /* | 54 | /* |
56 | * function for dealing with page's order in buddy system. | 55 | * function for dealing with page's order in buddy system. |
@@ -74,7 +73,6 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) | |||
74 | } | 73 | } |
75 | #endif | 74 | #endif |
76 | 75 | ||
77 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
78 | /* | 76 | /* |
79 | * unevictable_migrate_page() called only from migrate_page_copy() to | 77 | * unevictable_migrate_page() called only from migrate_page_copy() to |
80 | * migrate unevictable flag to new page. | 78 | * migrate unevictable flag to new page. |
@@ -86,11 +84,6 @@ static inline void unevictable_migrate_page(struct page *new, struct page *old) | |||
86 | if (TestClearPageUnevictable(old)) | 84 | if (TestClearPageUnevictable(old)) |
87 | SetPageUnevictable(new); | 85 | SetPageUnevictable(new); |
88 | } | 86 | } |
89 | #else | ||
90 | static inline void unevictable_migrate_page(struct page *new, struct page *old) | ||
91 | { | ||
92 | } | ||
93 | #endif | ||
94 | 87 | ||
95 | #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT | 88 | #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT |
96 | /* | 89 | /* |
@@ -150,23 +143,6 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page) | |||
150 | } | 143 | } |
151 | } | 144 | } |
152 | 145 | ||
153 | /* | ||
154 | * free_page_mlock() -- clean up attempts to free an mlocked() page. | ||
155 | * Page should not be on lru, so no need to fix that up. | ||
156 | * free_pages_check() will verify... | ||
157 | */ | ||
158 | static inline void free_page_mlock(struct page *page) | ||
159 | { | ||
160 | if (unlikely(TestClearPageMlocked(page))) { | ||
161 | unsigned long flags; | ||
162 | |||
163 | local_irq_save(flags); | ||
164 | __dec_zone_page_state(page, NR_MLOCK); | ||
165 | __count_vm_event(UNEVICTABLE_MLOCKFREED); | ||
166 | local_irq_restore(flags); | ||
167 | } | ||
168 | } | ||
169 | |||
170 | #else /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ | 146 | #else /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ |
171 | static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) | 147 | static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) |
172 | { | 148 | { |
@@ -175,7 +151,6 @@ static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) | |||
175 | static inline void clear_page_mlock(struct page *page) { } | 151 | static inline void clear_page_mlock(struct page *page) { } |
176 | static inline void mlock_vma_page(struct page *page) { } | 152 | static inline void mlock_vma_page(struct page *page) { } |
177 | static inline void mlock_migrate_page(struct page *new, struct page *old) { } | 153 | static inline void mlock_migrate_page(struct page *new, struct page *old) { } |
178 | static inline void free_page_mlock(struct page *page) { } | ||
179 | 154 | ||
180 | #endif /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ | 155 | #endif /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ |
181 | 156 | ||
@@ -284,4 +259,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
284 | unsigned long start, int len, int flags, | 259 | unsigned long start, int len, int flags, |
285 | struct page **pages, struct vm_area_struct **vmas); | 260 | struct page **pages, struct vm_area_struct **vmas); |
286 | 261 | ||
262 | #define ZONE_RECLAIM_NOSCAN -2 | ||
263 | #define ZONE_RECLAIM_FULL -1 | ||
264 | #define ZONE_RECLAIM_SOME 0 | ||
265 | #define ZONE_RECLAIM_SUCCESS 1 | ||
287 | #endif | 266 | #endif |
diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c new file mode 100644 index 000000000000..fd814fd61319 --- /dev/null +++ b/mm/kmemcheck.c | |||
@@ -0,0 +1,122 @@ | |||
1 | #include <linux/gfp.h> | ||
2 | #include <linux/mm_types.h> | ||
3 | #include <linux/mm.h> | ||
4 | #include <linux/slab.h> | ||
5 | #include <linux/kmemcheck.h> | ||
6 | |||
7 | void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node) | ||
8 | { | ||
9 | struct page *shadow; | ||
10 | int pages; | ||
11 | int i; | ||
12 | |||
13 | pages = 1 << order; | ||
14 | |||
15 | /* | ||
16 | * With kmemcheck enabled, we need to allocate a memory area for the | ||
17 | * shadow bits as well. | ||
18 | */ | ||
19 | shadow = alloc_pages_node(node, flags | __GFP_NOTRACK, order); | ||
20 | if (!shadow) { | ||
21 | if (printk_ratelimit()) | ||
22 | printk(KERN_ERR "kmemcheck: failed to allocate " | ||
23 | "shadow bitmap\n"); | ||
24 | return; | ||
25 | } | ||
26 | |||
27 | for(i = 0; i < pages; ++i) | ||
28 | page[i].shadow = page_address(&shadow[i]); | ||
29 | |||
30 | /* | ||
31 | * Mark it as non-present for the MMU so that our accesses to | ||
32 | * this memory will trigger a page fault and let us analyze | ||
33 | * the memory accesses. | ||
34 | */ | ||
35 | kmemcheck_hide_pages(page, pages); | ||
36 | } | ||
37 | |||
38 | void kmemcheck_free_shadow(struct page *page, int order) | ||
39 | { | ||
40 | struct page *shadow; | ||
41 | int pages; | ||
42 | int i; | ||
43 | |||
44 | if (!kmemcheck_page_is_tracked(page)) | ||
45 | return; | ||
46 | |||
47 | pages = 1 << order; | ||
48 | |||
49 | kmemcheck_show_pages(page, pages); | ||
50 | |||
51 | shadow = virt_to_page(page[0].shadow); | ||
52 | |||
53 | for(i = 0; i < pages; ++i) | ||
54 | page[i].shadow = NULL; | ||
55 | |||
56 | __free_pages(shadow, order); | ||
57 | } | ||
58 | |||
59 | void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object, | ||
60 | size_t size) | ||
61 | { | ||
62 | /* | ||
63 | * Has already been memset(), which initializes the shadow for us | ||
64 | * as well. | ||
65 | */ | ||
66 | if (gfpflags & __GFP_ZERO) | ||
67 | return; | ||
68 | |||
69 | /* No need to initialize the shadow of a non-tracked slab. */ | ||
70 | if (s->flags & SLAB_NOTRACK) | ||
71 | return; | ||
72 | |||
73 | if (!kmemcheck_enabled || gfpflags & __GFP_NOTRACK) { | ||
74 | /* | ||
75 | * Allow notracked objects to be allocated from | ||
76 | * tracked caches. Note however that these objects | ||
77 | * will still get page faults on access, they just | ||
78 | * won't ever be flagged as uninitialized. If page | ||
79 | * faults are not acceptable, the slab cache itself | ||
80 | * should be marked NOTRACK. | ||
81 | */ | ||
82 | kmemcheck_mark_initialized(object, size); | ||
83 | } else if (!s->ctor) { | ||
84 | /* | ||
85 | * New objects should be marked uninitialized before | ||
86 | * they're returned to the caller. | ||
87 | */ | ||
88 | kmemcheck_mark_uninitialized(object, size); | ||
89 | } | ||
90 | } | ||
91 | |||
92 | void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size) | ||
93 | { | ||
94 | /* TODO: RCU freeing is unsupported for now; hide false positives. */ | ||
95 | if (!s->ctor && !(s->flags & SLAB_DESTROY_BY_RCU)) | ||
96 | kmemcheck_mark_freed(object, size); | ||
97 | } | ||
98 | |||
99 | void kmemcheck_pagealloc_alloc(struct page *page, unsigned int order, | ||
100 | gfp_t gfpflags) | ||
101 | { | ||
102 | int pages; | ||
103 | |||
104 | if (gfpflags & (__GFP_HIGHMEM | __GFP_NOTRACK)) | ||
105 | return; | ||
106 | |||
107 | pages = 1 << order; | ||
108 | |||
109 | /* | ||
110 | * NOTE: We choose to track GFP_ZERO pages too; in fact, they | ||
111 | * can become uninitialized by copying uninitialized memory | ||
112 | * into them. | ||
113 | */ | ||
114 | |||
115 | /* XXX: Can use zone->node for node? */ | ||
116 | kmemcheck_alloc_shadow(page, order, gfpflags, -1); | ||
117 | |||
118 | if (gfpflags & __GFP_ZERO) | ||
119 | kmemcheck_mark_initialized_pages(page, pages); | ||
120 | else | ||
121 | kmemcheck_mark_uninitialized_pages(page, pages); | ||
122 | } | ||
diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c new file mode 100644 index 000000000000..d5292fc6f523 --- /dev/null +++ b/mm/kmemleak-test.c | |||
@@ -0,0 +1,111 @@ | |||
1 | /* | ||
2 | * mm/kmemleak-test.c | ||
3 | * | ||
4 | * Copyright (C) 2008 ARM Limited | ||
5 | * Written by Catalin Marinas <catalin.marinas@arm.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
14 | * GNU General Public License for more details. | ||
15 | * | ||
16 | * You should have received a copy of the GNU General Public License | ||
17 | * along with this program; if not, write to the Free Software | ||
18 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
19 | */ | ||
20 | |||
21 | #include <linux/init.h> | ||
22 | #include <linux/kernel.h> | ||
23 | #include <linux/module.h> | ||
24 | #include <linux/slab.h> | ||
25 | #include <linux/vmalloc.h> | ||
26 | #include <linux/list.h> | ||
27 | #include <linux/percpu.h> | ||
28 | #include <linux/fdtable.h> | ||
29 | |||
30 | #include <linux/kmemleak.h> | ||
31 | |||
32 | struct test_node { | ||
33 | long header[25]; | ||
34 | struct list_head list; | ||
35 | long footer[25]; | ||
36 | }; | ||
37 | |||
38 | static LIST_HEAD(test_list); | ||
39 | static DEFINE_PER_CPU(void *, test_pointer); | ||
40 | |||
41 | /* | ||
42 | * Some very simple testing. This function needs to be extended for | ||
43 | * proper testing. | ||
44 | */ | ||
45 | static int __init kmemleak_test_init(void) | ||
46 | { | ||
47 | struct test_node *elem; | ||
48 | int i; | ||
49 | |||
50 | printk(KERN_INFO "Kmemleak testing\n"); | ||
51 | |||
52 | /* make some orphan objects */ | ||
53 | pr_info("kmemleak: kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL)); | ||
54 | pr_info("kmemleak: kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL)); | ||
55 | pr_info("kmemleak: kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL)); | ||
56 | pr_info("kmemleak: kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL)); | ||
57 | pr_info("kmemleak: kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL)); | ||
58 | pr_info("kmemleak: kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL)); | ||
59 | pr_info("kmemleak: kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL)); | ||
60 | pr_info("kmemleak: kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL)); | ||
61 | #ifndef CONFIG_MODULES | ||
62 | pr_info("kmemleak: kmem_cache_alloc(files_cachep) = %p\n", | ||
63 | kmem_cache_alloc(files_cachep, GFP_KERNEL)); | ||
64 | pr_info("kmemleak: kmem_cache_alloc(files_cachep) = %p\n", | ||
65 | kmem_cache_alloc(files_cachep, GFP_KERNEL)); | ||
66 | #endif | ||
67 | pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64)); | ||
68 | pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64)); | ||
69 | pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64)); | ||
70 | pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64)); | ||
71 | pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64)); | ||
72 | |||
73 | /* | ||
74 | * Add elements to a list. They should only appear as orphan | ||
75 | * after the module is removed. | ||
76 | */ | ||
77 | for (i = 0; i < 10; i++) { | ||
78 | elem = kmalloc(sizeof(*elem), GFP_KERNEL); | ||
79 | pr_info("kmemleak: kmalloc(sizeof(*elem)) = %p\n", elem); | ||
80 | if (!elem) | ||
81 | return -ENOMEM; | ||
82 | memset(elem, 0, sizeof(*elem)); | ||
83 | INIT_LIST_HEAD(&elem->list); | ||
84 | |||
85 | list_add_tail(&elem->list, &test_list); | ||
86 | } | ||
87 | |||
88 | for_each_possible_cpu(i) { | ||
89 | per_cpu(test_pointer, i) = kmalloc(129, GFP_KERNEL); | ||
90 | pr_info("kmemleak: kmalloc(129) = %p\n", | ||
91 | per_cpu(test_pointer, i)); | ||
92 | } | ||
93 | |||
94 | return 0; | ||
95 | } | ||
96 | module_init(kmemleak_test_init); | ||
97 | |||
98 | static void __exit kmemleak_test_exit(void) | ||
99 | { | ||
100 | struct test_node *elem, *tmp; | ||
101 | |||
102 | /* | ||
103 | * Remove the list elements without actually freeing the | ||
104 | * memory. | ||
105 | */ | ||
106 | list_for_each_entry_safe(elem, tmp, &test_list, list) | ||
107 | list_del(&elem->list); | ||
108 | } | ||
109 | module_exit(kmemleak_test_exit); | ||
110 | |||
111 | MODULE_LICENSE("GPL"); | ||
diff --git a/mm/kmemleak.c b/mm/kmemleak.c new file mode 100644 index 000000000000..c96f2c8700aa --- /dev/null +++ b/mm/kmemleak.c | |||
@@ -0,0 +1,1497 @@ | |||
1 | /* | ||
2 | * mm/kmemleak.c | ||
3 | * | ||
4 | * Copyright (C) 2008 ARM Limited | ||
5 | * Written by Catalin Marinas <catalin.marinas@arm.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
14 | * GNU General Public License for more details. | ||
15 | * | ||
16 | * You should have received a copy of the GNU General Public License | ||
17 | * along with this program; if not, write to the Free Software | ||
18 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
19 | * | ||
20 | * | ||
21 | * For more information on the algorithm and kmemleak usage, please see | ||
22 | * Documentation/kmemleak.txt. | ||
23 | * | ||
24 | * Notes on locking | ||
25 | * ---------------- | ||
26 | * | ||
27 | * The following locks and mutexes are used by kmemleak: | ||
28 | * | ||
29 | * - kmemleak_lock (rwlock): protects the object_list modifications and | ||
30 | * accesses to the object_tree_root. The object_list is the main list | ||
31 | * holding the metadata (struct kmemleak_object) for the allocated memory | ||
32 | * blocks. The object_tree_root is a priority search tree used to look-up | ||
33 | * metadata based on a pointer to the corresponding memory block. The | ||
34 | * kmemleak_object structures are added to the object_list and | ||
35 | * object_tree_root in the create_object() function called from the | ||
36 | * kmemleak_alloc() callback and removed in delete_object() called from the | ||
37 | * kmemleak_free() callback | ||
38 | * - kmemleak_object.lock (spinlock): protects a kmemleak_object. Accesses to | ||
39 | * the metadata (e.g. count) are protected by this lock. Note that some | ||
40 | * members of this structure may be protected by other means (atomic or | ||
41 | * kmemleak_lock). This lock is also held when scanning the corresponding | ||
42 | * memory block to avoid the kernel freeing it via the kmemleak_free() | ||
43 | * callback. This is less heavyweight than holding a global lock like | ||
44 | * kmemleak_lock during scanning | ||
45 | * - scan_mutex (mutex): ensures that only one thread may scan the memory for | ||
46 | * unreferenced objects at a time. The gray_list contains the objects which | ||
47 | * are already referenced or marked as false positives and need to be | ||
48 | * scanned. This list is only modified during a scanning episode when the | ||
49 | * scan_mutex is held. At the end of a scan, the gray_list is always empty. | ||
50 | * Note that the kmemleak_object.use_count is incremented when an object is | ||
51 | * added to the gray_list and therefore cannot be freed | ||
52 | * - kmemleak_mutex (mutex): prevents multiple users of the "kmemleak" debugfs | ||
53 | * file together with modifications to the memory scanning parameters | ||
54 | * including the scan_thread pointer | ||
55 | * | ||
56 | * The kmemleak_object structures have a use_count incremented or decremented | ||
57 | * using the get_object()/put_object() functions. When the use_count becomes | ||
58 | * 0, this count can no longer be incremented and put_object() schedules the | ||
59 | * kmemleak_object freeing via an RCU callback. All calls to the get_object() | ||
60 | * function must be protected by rcu_read_lock() to avoid accessing a freed | ||
61 | * structure. | ||
62 | */ | ||
63 | |||
64 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
65 | |||
66 | #include <linux/init.h> | ||
67 | #include <linux/kernel.h> | ||
68 | #include <linux/list.h> | ||
69 | #include <linux/sched.h> | ||
70 | #include <linux/jiffies.h> | ||
71 | #include <linux/delay.h> | ||
72 | #include <linux/module.h> | ||
73 | #include <linux/kthread.h> | ||
74 | #include <linux/prio_tree.h> | ||
75 | #include <linux/gfp.h> | ||
76 | #include <linux/fs.h> | ||
77 | #include <linux/debugfs.h> | ||
78 | #include <linux/seq_file.h> | ||
79 | #include <linux/cpumask.h> | ||
80 | #include <linux/spinlock.h> | ||
81 | #include <linux/mutex.h> | ||
82 | #include <linux/rcupdate.h> | ||
83 | #include <linux/stacktrace.h> | ||
84 | #include <linux/cache.h> | ||
85 | #include <linux/percpu.h> | ||
86 | #include <linux/hardirq.h> | ||
87 | #include <linux/mmzone.h> | ||
88 | #include <linux/slab.h> | ||
89 | #include <linux/thread_info.h> | ||
90 | #include <linux/err.h> | ||
91 | #include <linux/uaccess.h> | ||
92 | #include <linux/string.h> | ||
93 | #include <linux/nodemask.h> | ||
94 | #include <linux/mm.h> | ||
95 | |||
96 | #include <asm/sections.h> | ||
97 | #include <asm/processor.h> | ||
98 | #include <asm/atomic.h> | ||
99 | |||
100 | #include <linux/kmemleak.h> | ||
101 | |||
102 | /* | ||
103 | * Kmemleak configuration and common defines. | ||
104 | */ | ||
105 | #define MAX_TRACE 16 /* stack trace length */ | ||
106 | #define REPORTS_NR 50 /* maximum number of reported leaks */ | ||
107 | #define MSECS_MIN_AGE 5000 /* minimum object age for reporting */ | ||
108 | #define MSECS_SCAN_YIELD 10 /* CPU yielding period */ | ||
109 | #define SECS_FIRST_SCAN 60 /* delay before the first scan */ | ||
110 | #define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */ | ||
111 | |||
112 | #define BYTES_PER_POINTER sizeof(void *) | ||
113 | |||
114 | /* GFP bitmask for kmemleak internal allocations */ | ||
115 | #define GFP_KMEMLEAK_MASK (GFP_KERNEL | GFP_ATOMIC) | ||
116 | |||
117 | /* scanning area inside a memory block */ | ||
118 | struct kmemleak_scan_area { | ||
119 | struct hlist_node node; | ||
120 | unsigned long offset; | ||
121 | size_t length; | ||
122 | }; | ||
123 | |||
124 | /* | ||
125 | * Structure holding the metadata for each allocated memory block. | ||
126 | * Modifications to such objects should be made while holding the | ||
127 | * object->lock. Insertions or deletions from object_list, gray_list or | ||
128 | * tree_node are already protected by the corresponding locks or mutex (see | ||
129 | * the notes on locking above). These objects are reference-counted | ||
130 | * (use_count) and freed using the RCU mechanism. | ||
131 | */ | ||
132 | struct kmemleak_object { | ||
133 | spinlock_t lock; | ||
134 | unsigned long flags; /* object status flags */ | ||
135 | struct list_head object_list; | ||
136 | struct list_head gray_list; | ||
137 | struct prio_tree_node tree_node; | ||
138 | struct rcu_head rcu; /* object_list lockless traversal */ | ||
139 | /* object usage count; object freed when use_count == 0 */ | ||
140 | atomic_t use_count; | ||
141 | unsigned long pointer; | ||
142 | size_t size; | ||
143 | /* minimum number of pointers found before it is considered a leak */ | ||
144 | int min_count; | ||
145 | /* the total number of pointers found pointing to this object */ | ||
146 | int count; | ||
147 | /* memory ranges to be scanned inside an object (empty for all) */ | ||
148 | struct hlist_head area_list; | ||
149 | unsigned long trace[MAX_TRACE]; | ||
150 | unsigned int trace_len; | ||
151 | unsigned long jiffies; /* creation timestamp */ | ||
152 | pid_t pid; /* pid of the current task */ | ||
153 | char comm[TASK_COMM_LEN]; /* executable name */ | ||
154 | }; | ||
155 | |||
156 | /* flag representing the memory block allocation status */ | ||
157 | #define OBJECT_ALLOCATED (1 << 0) | ||
158 | /* flag set after the first reporting of an unreferenced object */ | ||
159 | #define OBJECT_REPORTED (1 << 1) | ||
160 | /* flag set to not scan the object */ | ||
161 | #define OBJECT_NO_SCAN (1 << 2) | ||
162 | |||
163 | /* the list of all allocated objects */ | ||
164 | static LIST_HEAD(object_list); | ||
165 | /* the list of gray-colored objects (see color_gray comment below) */ | ||
166 | static LIST_HEAD(gray_list); | ||
167 | /* prio search tree for object boundaries */ | ||
168 | static struct prio_tree_root object_tree_root; | ||
169 | /* rw_lock protecting the access to object_list and object_tree_root */ | ||
170 | static DEFINE_RWLOCK(kmemleak_lock); | ||
171 | |||
172 | /* allocation caches for kmemleak internal data */ | ||
173 | static struct kmem_cache *object_cache; | ||
174 | static struct kmem_cache *scan_area_cache; | ||
175 | |||
176 | /* set if tracing memory operations is enabled */ | ||
177 | static atomic_t kmemleak_enabled = ATOMIC_INIT(0); | ||
178 | /* set in the late_initcall if there were no errors */ | ||
179 | static atomic_t kmemleak_initialized = ATOMIC_INIT(0); | ||
180 | /* enables or disables early logging of the memory operations */ | ||
181 | static atomic_t kmemleak_early_log = ATOMIC_INIT(1); | ||
182 | /* set if a fatal kmemleak error has occurred */ | ||
183 | static atomic_t kmemleak_error = ATOMIC_INIT(0); | ||
184 | |||
185 | /* minimum and maximum address that may be valid pointers */ | ||
186 | static unsigned long min_addr = ULONG_MAX; | ||
187 | static unsigned long max_addr; | ||
188 | |||
189 | /* used for yielding the CPU to other tasks during scanning */ | ||
190 | static unsigned long next_scan_yield; | ||
191 | static struct task_struct *scan_thread; | ||
192 | static unsigned long jiffies_scan_yield; | ||
193 | static unsigned long jiffies_min_age; | ||
194 | /* delay between automatic memory scannings */ | ||
195 | static signed long jiffies_scan_wait; | ||
196 | /* enables or disables the task stacks scanning */ | ||
197 | static int kmemleak_stack_scan; | ||
198 | /* mutex protecting the memory scanning */ | ||
199 | static DEFINE_MUTEX(scan_mutex); | ||
200 | /* mutex protecting the access to the /sys/kernel/debug/kmemleak file */ | ||
201 | static DEFINE_MUTEX(kmemleak_mutex); | ||
202 | |||
203 | /* number of leaks reported (for limitation purposes) */ | ||
204 | static int reported_leaks; | ||
205 | |||
206 | /* | ||
207 | * Early object allocation/freeing logging. Kmemleak is initialized after the | ||
208 | * kernel allocator. However, both the kernel allocator and kmemleak may | ||
209 | * allocate memory blocks which need to be tracked. Kmemleak defines an | ||
210 | * arbitrary buffer to hold the allocation/freeing information before it is | ||
211 | * fully initialized. | ||
212 | */ | ||
213 | |||
214 | /* kmemleak operation type for early logging */ | ||
215 | enum { | ||
216 | KMEMLEAK_ALLOC, | ||
217 | KMEMLEAK_FREE, | ||
218 | KMEMLEAK_NOT_LEAK, | ||
219 | KMEMLEAK_IGNORE, | ||
220 | KMEMLEAK_SCAN_AREA, | ||
221 | KMEMLEAK_NO_SCAN | ||
222 | }; | ||
223 | |||
/*
 * One recorded kmemleak callback invocation. Only the fields relevant to the
 * given op_type carry meaningful values; the callers pass 0 for the others.
 */
struct early_log {
	/* kmemleak operation type (KMEMLEAK_* value) */
	int op_type;
	/* allocated/freed memory block */
	const void *ptr;
	/* memory block size */
	size_t size;
	/* minimum reference count */
	int min_count;
	/* scan area offset */
	unsigned long offset;
	/* scan area length */
	size_t length;
};
236 | |||
237 | /* early logging buffer and current position */ | ||
238 | static struct early_log early_log[200]; | ||
239 | static int crt_early_log; | ||
240 | |||
241 | static void kmemleak_disable(void); | ||
242 | |||
/*
 * Emit a warning message (printk-style arguments) followed by a dump of the
 * current stack trace.
 */
#define kmemleak_warn(args...)	do {	\
	pr_warning(args);		\
	dump_stack();			\
} while (0)
250 | |||
/*
 * Macro invoked when a serious kmemleak condition occurred that cannot be
 * recovered from. The condition is reported via kmemleak_warn() and kmemleak
 * is then disabled, so further allocation/freeing tracing is no longer
 * available.
 */
#define kmemleak_stop(args...)	do {	\
	kmemleak_warn(args);		\
	kmemleak_disable();		\
} while (0)
260 | |||
261 | /* | ||
262 | * Object colors, encoded with count and min_count: | ||
263 | * - white - orphan object, not enough references to it (count < min_count) | ||
264 | * - gray - not orphan, not marked as false positive (min_count == 0) or | ||
265 | * sufficient references to it (count >= min_count) | ||
266 | * - black - ignore, it doesn't contain references (e.g. text section) | ||
267 | * (min_count == -1). No function defined for this color. | ||
268 | * Newly created objects don't have any color assigned (object->count == -1) | ||
269 | * before the next memory scan when they become white. | ||
270 | */ | ||
271 | static int color_white(const struct kmemleak_object *object) | ||
272 | { | ||
273 | return object->count != -1 && object->count < object->min_count; | ||
274 | } | ||
275 | |||
276 | static int color_gray(const struct kmemleak_object *object) | ||
277 | { | ||
278 | return object->min_count != -1 && object->count >= object->min_count; | ||
279 | } | ||
280 | |||
281 | /* | ||
282 | * Objects are considered referenced if their color is gray and they have not | ||
283 | * been deleted. | ||
284 | */ | ||
285 | static int referenced_object(struct kmemleak_object *object) | ||
286 | { | ||
287 | return (object->flags & OBJECT_ALLOCATED) && color_gray(object); | ||
288 | } | ||
289 | |||
290 | /* | ||
291 | * Objects are considered unreferenced only if their color is white, they have | ||
292 | * not be deleted and have a minimum age to avoid false positives caused by | ||
293 | * pointers temporarily stored in CPU registers. | ||
294 | */ | ||
295 | static int unreferenced_object(struct kmemleak_object *object) | ||
296 | { | ||
297 | return (object->flags & OBJECT_ALLOCATED) && color_white(object) && | ||
298 | time_is_before_eq_jiffies(object->jiffies + jiffies_min_age); | ||
299 | } | ||
300 | |||
/*
 * Printing of the (un)referenced objects information, either to the seq file
 * or to the kernel log. The print_referenced/print_unreferenced functions
 * must be called with the object->lock held.
 *
 * print_helper() sends the printk-style arguments to the given seq_file when
 * it is non-NULL and to pr_info() otherwise. The seq argument is evaluated
 * exactly once.
 */
#define print_helper(seq, args...)	do {		\
	struct seq_file *out_seq = (seq);		\
	if (out_seq) {					\
		seq_printf(out_seq, args);		\
	} else {					\
		pr_info(args);				\
	}						\
} while (0)
313 | |||
314 | static void print_referenced(struct kmemleak_object *object) | ||
315 | { | ||
316 | pr_info("referenced object 0x%08lx (size %zu)\n", | ||
317 | object->pointer, object->size); | ||
318 | } | ||
319 | |||
320 | static void print_unreferenced(struct seq_file *seq, | ||
321 | struct kmemleak_object *object) | ||
322 | { | ||
323 | int i; | ||
324 | |||
325 | print_helper(seq, "unreferenced object 0x%08lx (size %zu):\n", | ||
326 | object->pointer, object->size); | ||
327 | print_helper(seq, " comm \"%s\", pid %d, jiffies %lu\n", | ||
328 | object->comm, object->pid, object->jiffies); | ||
329 | print_helper(seq, " backtrace:\n"); | ||
330 | |||
331 | for (i = 0; i < object->trace_len; i++) { | ||
332 | void *ptr = (void *)object->trace[i]; | ||
333 | print_helper(seq, " [<%p>] %pS\n", ptr, ptr); | ||
334 | } | ||
335 | } | ||
336 | |||
337 | /* | ||
338 | * Print the kmemleak_object information. This function is used mainly for | ||
339 | * debugging special cases when kmemleak operations. It must be called with | ||
340 | * the object->lock held. | ||
341 | */ | ||
342 | static void dump_object_info(struct kmemleak_object *object) | ||
343 | { | ||
344 | struct stack_trace trace; | ||
345 | |||
346 | trace.nr_entries = object->trace_len; | ||
347 | trace.entries = object->trace; | ||
348 | |||
349 | pr_notice("Object 0x%08lx (size %zu):\n", | ||
350 | object->tree_node.start, object->size); | ||
351 | pr_notice(" comm \"%s\", pid %d, jiffies %lu\n", | ||
352 | object->comm, object->pid, object->jiffies); | ||
353 | pr_notice(" min_count = %d\n", object->min_count); | ||
354 | pr_notice(" count = %d\n", object->count); | ||
355 | pr_notice(" backtrace:\n"); | ||
356 | print_stack_trace(&trace, 4); | ||
357 | } | ||
358 | |||
359 | /* | ||
360 | * Look-up a memory block metadata (kmemleak_object) in the priority search | ||
361 | * tree based on a pointer value. If alias is 0, only values pointing to the | ||
362 | * beginning of the memory block are allowed. The kmemleak_lock must be held | ||
363 | * when calling this function. | ||
364 | */ | ||
365 | static struct kmemleak_object *lookup_object(unsigned long ptr, int alias) | ||
366 | { | ||
367 | struct prio_tree_node *node; | ||
368 | struct prio_tree_iter iter; | ||
369 | struct kmemleak_object *object; | ||
370 | |||
371 | prio_tree_iter_init(&iter, &object_tree_root, ptr, ptr); | ||
372 | node = prio_tree_next(&iter); | ||
373 | if (node) { | ||
374 | object = prio_tree_entry(node, struct kmemleak_object, | ||
375 | tree_node); | ||
376 | if (!alias && object->pointer != ptr) { | ||
377 | kmemleak_warn("Found object by alias"); | ||
378 | object = NULL; | ||
379 | } | ||
380 | } else | ||
381 | object = NULL; | ||
382 | |||
383 | return object; | ||
384 | } | ||
385 | |||
386 | /* | ||
387 | * Increment the object use_count. Return 1 if successful or 0 otherwise. Note | ||
388 | * that once an object's use_count reached 0, the RCU freeing was already | ||
389 | * registered and the object should no longer be used. This function must be | ||
390 | * called under the protection of rcu_read_lock(). | ||
391 | */ | ||
392 | static int get_object(struct kmemleak_object *object) | ||
393 | { | ||
394 | return atomic_inc_not_zero(&object->use_count); | ||
395 | } | ||
396 | |||
397 | /* | ||
398 | * RCU callback to free a kmemleak_object. | ||
399 | */ | ||
400 | static void free_object_rcu(struct rcu_head *rcu) | ||
401 | { | ||
402 | struct hlist_node *elem, *tmp; | ||
403 | struct kmemleak_scan_area *area; | ||
404 | struct kmemleak_object *object = | ||
405 | container_of(rcu, struct kmemleak_object, rcu); | ||
406 | |||
407 | /* | ||
408 | * Once use_count is 0 (guaranteed by put_object), there is no other | ||
409 | * code accessing this object, hence no need for locking. | ||
410 | */ | ||
411 | hlist_for_each_entry_safe(area, elem, tmp, &object->area_list, node) { | ||
412 | hlist_del(elem); | ||
413 | kmem_cache_free(scan_area_cache, area); | ||
414 | } | ||
415 | kmem_cache_free(object_cache, object); | ||
416 | } | ||
417 | |||
418 | /* | ||
419 | * Decrement the object use_count. Once the count is 0, free the object using | ||
420 | * an RCU callback. Since put_object() may be called via the kmemleak_free() -> | ||
421 | * delete_object() path, the delayed RCU freeing ensures that there is no | ||
422 | * recursive call to the kernel allocator. Lock-less RCU object_list traversal | ||
423 | * is also possible. | ||
424 | */ | ||
425 | static void put_object(struct kmemleak_object *object) | ||
426 | { | ||
427 | if (!atomic_dec_and_test(&object->use_count)) | ||
428 | return; | ||
429 | |||
430 | /* should only get here after delete_object was called */ | ||
431 | WARN_ON(object->flags & OBJECT_ALLOCATED); | ||
432 | |||
433 | call_rcu(&object->rcu, free_object_rcu); | ||
434 | } | ||
435 | |||
436 | /* | ||
437 | * Look up an object in the prio search tree and increase its use_count. | ||
438 | */ | ||
439 | static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias) | ||
440 | { | ||
441 | unsigned long flags; | ||
442 | struct kmemleak_object *object = NULL; | ||
443 | |||
444 | rcu_read_lock(); | ||
445 | read_lock_irqsave(&kmemleak_lock, flags); | ||
446 | if (ptr >= min_addr && ptr < max_addr) | ||
447 | object = lookup_object(ptr, alias); | ||
448 | read_unlock_irqrestore(&kmemleak_lock, flags); | ||
449 | |||
450 | /* check whether the object is still available */ | ||
451 | if (object && !get_object(object)) | ||
452 | object = NULL; | ||
453 | rcu_read_unlock(); | ||
454 | |||
455 | return object; | ||
456 | } | ||
457 | |||
458 | /* | ||
459 | * Create the metadata (struct kmemleak_object) corresponding to an allocated | ||
460 | * memory block and add it to the object_list and object_tree_root. | ||
461 | */ | ||
462 | static void create_object(unsigned long ptr, size_t size, int min_count, | ||
463 | gfp_t gfp) | ||
464 | { | ||
465 | unsigned long flags; | ||
466 | struct kmemleak_object *object; | ||
467 | struct prio_tree_node *node; | ||
468 | struct stack_trace trace; | ||
469 | |||
470 | object = kmem_cache_alloc(object_cache, gfp & GFP_KMEMLEAK_MASK); | ||
471 | if (!object) { | ||
472 | kmemleak_stop("Cannot allocate a kmemleak_object structure\n"); | ||
473 | return; | ||
474 | } | ||
475 | |||
476 | INIT_LIST_HEAD(&object->object_list); | ||
477 | INIT_LIST_HEAD(&object->gray_list); | ||
478 | INIT_HLIST_HEAD(&object->area_list); | ||
479 | spin_lock_init(&object->lock); | ||
480 | atomic_set(&object->use_count, 1); | ||
481 | object->flags = OBJECT_ALLOCATED; | ||
482 | object->pointer = ptr; | ||
483 | object->size = size; | ||
484 | object->min_count = min_count; | ||
485 | object->count = -1; /* no color initially */ | ||
486 | object->jiffies = jiffies; | ||
487 | |||
488 | /* task information */ | ||
489 | if (in_irq()) { | ||
490 | object->pid = 0; | ||
491 | strncpy(object->comm, "hardirq", sizeof(object->comm)); | ||
492 | } else if (in_softirq()) { | ||
493 | object->pid = 0; | ||
494 | strncpy(object->comm, "softirq", sizeof(object->comm)); | ||
495 | } else { | ||
496 | object->pid = current->pid; | ||
497 | /* | ||
498 | * There is a small chance of a race with set_task_comm(), | ||
499 | * however using get_task_comm() here may cause locking | ||
500 | * dependency issues with current->alloc_lock. In the worst | ||
501 | * case, the command line is not correct. | ||
502 | */ | ||
503 | strncpy(object->comm, current->comm, sizeof(object->comm)); | ||
504 | } | ||
505 | |||
506 | /* kernel backtrace */ | ||
507 | trace.max_entries = MAX_TRACE; | ||
508 | trace.nr_entries = 0; | ||
509 | trace.entries = object->trace; | ||
510 | trace.skip = 1; | ||
511 | save_stack_trace(&trace); | ||
512 | object->trace_len = trace.nr_entries; | ||
513 | |||
514 | INIT_PRIO_TREE_NODE(&object->tree_node); | ||
515 | object->tree_node.start = ptr; | ||
516 | object->tree_node.last = ptr + size - 1; | ||
517 | |||
518 | write_lock_irqsave(&kmemleak_lock, flags); | ||
519 | min_addr = min(min_addr, ptr); | ||
520 | max_addr = max(max_addr, ptr + size); | ||
521 | node = prio_tree_insert(&object_tree_root, &object->tree_node); | ||
522 | /* | ||
523 | * The code calling the kernel does not yet have the pointer to the | ||
524 | * memory block to be able to free it. However, we still hold the | ||
525 | * kmemleak_lock here in case parts of the kernel started freeing | ||
526 | * random memory blocks. | ||
527 | */ | ||
528 | if (node != &object->tree_node) { | ||
529 | unsigned long flags; | ||
530 | |||
531 | kmemleak_stop("Cannot insert 0x%lx into the object search tree " | ||
532 | "(already existing)\n", ptr); | ||
533 | object = lookup_object(ptr, 1); | ||
534 | spin_lock_irqsave(&object->lock, flags); | ||
535 | dump_object_info(object); | ||
536 | spin_unlock_irqrestore(&object->lock, flags); | ||
537 | |||
538 | goto out; | ||
539 | } | ||
540 | list_add_tail_rcu(&object->object_list, &object_list); | ||
541 | out: | ||
542 | write_unlock_irqrestore(&kmemleak_lock, flags); | ||
543 | } | ||
544 | |||
545 | /* | ||
546 | * Remove the metadata (struct kmemleak_object) for a memory block from the | ||
547 | * object_list and object_tree_root and decrement its use_count. | ||
548 | */ | ||
549 | static void delete_object(unsigned long ptr) | ||
550 | { | ||
551 | unsigned long flags; | ||
552 | struct kmemleak_object *object; | ||
553 | |||
554 | write_lock_irqsave(&kmemleak_lock, flags); | ||
555 | object = lookup_object(ptr, 0); | ||
556 | if (!object) { | ||
557 | kmemleak_warn("Freeing unknown object at 0x%08lx\n", | ||
558 | ptr); | ||
559 | write_unlock_irqrestore(&kmemleak_lock, flags); | ||
560 | return; | ||
561 | } | ||
562 | prio_tree_remove(&object_tree_root, &object->tree_node); | ||
563 | list_del_rcu(&object->object_list); | ||
564 | write_unlock_irqrestore(&kmemleak_lock, flags); | ||
565 | |||
566 | WARN_ON(!(object->flags & OBJECT_ALLOCATED)); | ||
567 | WARN_ON(atomic_read(&object->use_count) < 1); | ||
568 | |||
569 | /* | ||
570 | * Locking here also ensures that the corresponding memory block | ||
571 | * cannot be freed when it is being scanned. | ||
572 | */ | ||
573 | spin_lock_irqsave(&object->lock, flags); | ||
574 | if (object->flags & OBJECT_REPORTED) | ||
575 | print_referenced(object); | ||
576 | object->flags &= ~OBJECT_ALLOCATED; | ||
577 | spin_unlock_irqrestore(&object->lock, flags); | ||
578 | put_object(object); | ||
579 | } | ||
580 | |||
581 | /* | ||
582 | * Make a object permanently as gray-colored so that it can no longer be | ||
583 | * reported as a leak. This is used in general to mark a false positive. | ||
584 | */ | ||
585 | static void make_gray_object(unsigned long ptr) | ||
586 | { | ||
587 | unsigned long flags; | ||
588 | struct kmemleak_object *object; | ||
589 | |||
590 | object = find_and_get_object(ptr, 0); | ||
591 | if (!object) { | ||
592 | kmemleak_warn("Graying unknown object at 0x%08lx\n", ptr); | ||
593 | return; | ||
594 | } | ||
595 | |||
596 | spin_lock_irqsave(&object->lock, flags); | ||
597 | object->min_count = 0; | ||
598 | spin_unlock_irqrestore(&object->lock, flags); | ||
599 | put_object(object); | ||
600 | } | ||
601 | |||
602 | /* | ||
603 | * Mark the object as black-colored so that it is ignored from scans and | ||
604 | * reporting. | ||
605 | */ | ||
606 | static void make_black_object(unsigned long ptr) | ||
607 | { | ||
608 | unsigned long flags; | ||
609 | struct kmemleak_object *object; | ||
610 | |||
611 | object = find_and_get_object(ptr, 0); | ||
612 | if (!object) { | ||
613 | kmemleak_warn("Blacking unknown object at 0x%08lx\n", ptr); | ||
614 | return; | ||
615 | } | ||
616 | |||
617 | spin_lock_irqsave(&object->lock, flags); | ||
618 | object->min_count = -1; | ||
619 | spin_unlock_irqrestore(&object->lock, flags); | ||
620 | put_object(object); | ||
621 | } | ||
622 | |||
623 | /* | ||
624 | * Add a scanning area to the object. If at least one such area is added, | ||
625 | * kmemleak will only scan these ranges rather than the whole memory block. | ||
626 | */ | ||
627 | static void add_scan_area(unsigned long ptr, unsigned long offset, | ||
628 | size_t length, gfp_t gfp) | ||
629 | { | ||
630 | unsigned long flags; | ||
631 | struct kmemleak_object *object; | ||
632 | struct kmemleak_scan_area *area; | ||
633 | |||
634 | object = find_and_get_object(ptr, 0); | ||
635 | if (!object) { | ||
636 | kmemleak_warn("Adding scan area to unknown object at 0x%08lx\n", | ||
637 | ptr); | ||
638 | return; | ||
639 | } | ||
640 | |||
641 | area = kmem_cache_alloc(scan_area_cache, gfp & GFP_KMEMLEAK_MASK); | ||
642 | if (!area) { | ||
643 | kmemleak_warn("Cannot allocate a scan area\n"); | ||
644 | goto out; | ||
645 | } | ||
646 | |||
647 | spin_lock_irqsave(&object->lock, flags); | ||
648 | if (offset + length > object->size) { | ||
649 | kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr); | ||
650 | dump_object_info(object); | ||
651 | kmem_cache_free(scan_area_cache, area); | ||
652 | goto out_unlock; | ||
653 | } | ||
654 | |||
655 | INIT_HLIST_NODE(&area->node); | ||
656 | area->offset = offset; | ||
657 | area->length = length; | ||
658 | |||
659 | hlist_add_head(&area->node, &object->area_list); | ||
660 | out_unlock: | ||
661 | spin_unlock_irqrestore(&object->lock, flags); | ||
662 | out: | ||
663 | put_object(object); | ||
664 | } | ||
665 | |||
666 | /* | ||
667 | * Set the OBJECT_NO_SCAN flag for the object corresponding to the give | ||
668 | * pointer. Such object will not be scanned by kmemleak but references to it | ||
669 | * are searched. | ||
670 | */ | ||
671 | static void object_no_scan(unsigned long ptr) | ||
672 | { | ||
673 | unsigned long flags; | ||
674 | struct kmemleak_object *object; | ||
675 | |||
676 | object = find_and_get_object(ptr, 0); | ||
677 | if (!object) { | ||
678 | kmemleak_warn("Not scanning unknown object at 0x%08lx\n", ptr); | ||
679 | return; | ||
680 | } | ||
681 | |||
682 | spin_lock_irqsave(&object->lock, flags); | ||
683 | object->flags |= OBJECT_NO_SCAN; | ||
684 | spin_unlock_irqrestore(&object->lock, flags); | ||
685 | put_object(object); | ||
686 | } | ||
687 | |||
688 | /* | ||
689 | * Log an early kmemleak_* call to the early_log buffer. These calls will be | ||
690 | * processed later once kmemleak is fully initialized. | ||
691 | */ | ||
692 | static void log_early(int op_type, const void *ptr, size_t size, | ||
693 | int min_count, unsigned long offset, size_t length) | ||
694 | { | ||
695 | unsigned long flags; | ||
696 | struct early_log *log; | ||
697 | |||
698 | if (crt_early_log >= ARRAY_SIZE(early_log)) { | ||
699 | kmemleak_stop("Early log buffer exceeded\n"); | ||
700 | return; | ||
701 | } | ||
702 | |||
703 | /* | ||
704 | * There is no need for locking since the kernel is still in UP mode | ||
705 | * at this stage. Disabling the IRQs is enough. | ||
706 | */ | ||
707 | local_irq_save(flags); | ||
708 | log = &early_log[crt_early_log]; | ||
709 | log->op_type = op_type; | ||
710 | log->ptr = ptr; | ||
711 | log->size = size; | ||
712 | log->min_count = min_count; | ||
713 | log->offset = offset; | ||
714 | log->length = length; | ||
715 | crt_early_log++; | ||
716 | local_irq_restore(flags); | ||
717 | } | ||
718 | |||
719 | /* | ||
720 | * Memory allocation function callback. This function is called from the | ||
721 | * kernel allocators when a new block is allocated (kmem_cache_alloc, kmalloc, | ||
722 | * vmalloc etc.). | ||
723 | */ | ||
724 | void kmemleak_alloc(const void *ptr, size_t size, int min_count, gfp_t gfp) | ||
725 | { | ||
726 | pr_debug("%s(0x%p, %zu, %d)\n", __func__, ptr, size, min_count); | ||
727 | |||
728 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | ||
729 | create_object((unsigned long)ptr, size, min_count, gfp); | ||
730 | else if (atomic_read(&kmemleak_early_log)) | ||
731 | log_early(KMEMLEAK_ALLOC, ptr, size, min_count, 0, 0); | ||
732 | } | ||
733 | EXPORT_SYMBOL_GPL(kmemleak_alloc); | ||
734 | |||
735 | /* | ||
736 | * Memory freeing function callback. This function is called from the kernel | ||
737 | * allocators when a block is freed (kmem_cache_free, kfree, vfree etc.). | ||
738 | */ | ||
739 | void kmemleak_free(const void *ptr) | ||
740 | { | ||
741 | pr_debug("%s(0x%p)\n", __func__, ptr); | ||
742 | |||
743 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | ||
744 | delete_object((unsigned long)ptr); | ||
745 | else if (atomic_read(&kmemleak_early_log)) | ||
746 | log_early(KMEMLEAK_FREE, ptr, 0, 0, 0, 0); | ||
747 | } | ||
748 | EXPORT_SYMBOL_GPL(kmemleak_free); | ||
749 | |||
750 | /* | ||
751 | * Mark an already allocated memory block as a false positive. This will cause | ||
752 | * the block to no longer be reported as leak and always be scanned. | ||
753 | */ | ||
754 | void kmemleak_not_leak(const void *ptr) | ||
755 | { | ||
756 | pr_debug("%s(0x%p)\n", __func__, ptr); | ||
757 | |||
758 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | ||
759 | make_gray_object((unsigned long)ptr); | ||
760 | else if (atomic_read(&kmemleak_early_log)) | ||
761 | log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0, 0, 0); | ||
762 | } | ||
763 | EXPORT_SYMBOL(kmemleak_not_leak); | ||
764 | |||
765 | /* | ||
766 | * Ignore a memory block. This is usually done when it is known that the | ||
767 | * corresponding block is not a leak and does not contain any references to | ||
768 | * other allocated memory blocks. | ||
769 | */ | ||
770 | void kmemleak_ignore(const void *ptr) | ||
771 | { | ||
772 | pr_debug("%s(0x%p)\n", __func__, ptr); | ||
773 | |||
774 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | ||
775 | make_black_object((unsigned long)ptr); | ||
776 | else if (atomic_read(&kmemleak_early_log)) | ||
777 | log_early(KMEMLEAK_IGNORE, ptr, 0, 0, 0, 0); | ||
778 | } | ||
779 | EXPORT_SYMBOL(kmemleak_ignore); | ||
780 | |||
781 | /* | ||
782 | * Limit the range to be scanned in an allocated memory block. | ||
783 | */ | ||
784 | void kmemleak_scan_area(const void *ptr, unsigned long offset, size_t length, | ||
785 | gfp_t gfp) | ||
786 | { | ||
787 | pr_debug("%s(0x%p)\n", __func__, ptr); | ||
788 | |||
789 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | ||
790 | add_scan_area((unsigned long)ptr, offset, length, gfp); | ||
791 | else if (atomic_read(&kmemleak_early_log)) | ||
792 | log_early(KMEMLEAK_SCAN_AREA, ptr, 0, 0, offset, length); | ||
793 | } | ||
794 | EXPORT_SYMBOL(kmemleak_scan_area); | ||
795 | |||
796 | /* | ||
797 | * Inform kmemleak not to scan the given memory block. | ||
798 | */ | ||
799 | void kmemleak_no_scan(const void *ptr) | ||
800 | { | ||
801 | pr_debug("%s(0x%p)\n", __func__, ptr); | ||
802 | |||
803 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | ||
804 | object_no_scan((unsigned long)ptr); | ||
805 | else if (atomic_read(&kmemleak_early_log)) | ||
806 | log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0, 0, 0); | ||
807 | } | ||
808 | EXPORT_SYMBOL(kmemleak_no_scan); | ||
809 | |||
810 | /* | ||
811 | * Yield the CPU so that other tasks get a chance to run. The yielding is | ||
812 | * rate-limited to avoid excessive number of calls to the schedule() function | ||
813 | * during memory scanning. | ||
814 | */ | ||
815 | static void scan_yield(void) | ||
816 | { | ||
817 | might_sleep(); | ||
818 | |||
819 | if (time_is_before_eq_jiffies(next_scan_yield)) { | ||
820 | schedule(); | ||
821 | next_scan_yield = jiffies + jiffies_scan_yield; | ||
822 | } | ||
823 | } | ||
824 | |||
825 | /* | ||
826 | * Memory scanning is a long process and it needs to be interruptable. This | ||
827 | * function checks whether such interrupt condition occured. | ||
828 | */ | ||
829 | static int scan_should_stop(void) | ||
830 | { | ||
831 | if (!atomic_read(&kmemleak_enabled)) | ||
832 | return 1; | ||
833 | |||
834 | /* | ||
835 | * This function may be called from either process or kthread context, | ||
836 | * hence the need to check for both stop conditions. | ||
837 | */ | ||
838 | if (current->mm) | ||
839 | return signal_pending(current); | ||
840 | else | ||
841 | return kthread_should_stop(); | ||
842 | |||
843 | return 0; | ||
844 | } | ||
845 | |||
846 | /* | ||
847 | * Scan a memory block (exclusive range) for valid pointers and add those | ||
848 | * found to the gray list. | ||
849 | */ | ||
850 | static void scan_block(void *_start, void *_end, | ||
851 | struct kmemleak_object *scanned) | ||
852 | { | ||
853 | unsigned long *ptr; | ||
854 | unsigned long *start = PTR_ALIGN(_start, BYTES_PER_POINTER); | ||
855 | unsigned long *end = _end - (BYTES_PER_POINTER - 1); | ||
856 | |||
857 | for (ptr = start; ptr < end; ptr++) { | ||
858 | unsigned long flags; | ||
859 | unsigned long pointer = *ptr; | ||
860 | struct kmemleak_object *object; | ||
861 | |||
862 | if (scan_should_stop()) | ||
863 | break; | ||
864 | |||
865 | /* | ||
866 | * When scanning a memory block with a corresponding | ||
867 | * kmemleak_object, the CPU yielding is handled in the calling | ||
868 | * code since it holds the object->lock to avoid the block | ||
869 | * freeing. | ||
870 | */ | ||
871 | if (!scanned) | ||
872 | scan_yield(); | ||
873 | |||
874 | object = find_and_get_object(pointer, 1); | ||
875 | if (!object) | ||
876 | continue; | ||
877 | if (object == scanned) { | ||
878 | /* self referenced, ignore */ | ||
879 | put_object(object); | ||
880 | continue; | ||
881 | } | ||
882 | |||
883 | /* | ||
884 | * Avoid the lockdep recursive warning on object->lock being | ||
885 | * previously acquired in scan_object(). These locks are | ||
886 | * enclosed by scan_mutex. | ||
887 | */ | ||
888 | spin_lock_irqsave_nested(&object->lock, flags, | ||
889 | SINGLE_DEPTH_NESTING); | ||
890 | if (!color_white(object)) { | ||
891 | /* non-orphan, ignored or new */ | ||
892 | spin_unlock_irqrestore(&object->lock, flags); | ||
893 | put_object(object); | ||
894 | continue; | ||
895 | } | ||
896 | |||
897 | /* | ||
898 | * Increase the object's reference count (number of pointers | ||
899 | * to the memory block). If this count reaches the required | ||
900 | * minimum, the object's color will become gray and it will be | ||
901 | * added to the gray_list. | ||
902 | */ | ||
903 | object->count++; | ||
904 | if (color_gray(object)) | ||
905 | list_add_tail(&object->gray_list, &gray_list); | ||
906 | else | ||
907 | put_object(object); | ||
908 | spin_unlock_irqrestore(&object->lock, flags); | ||
909 | } | ||
910 | } | ||
911 | |||
912 | /* | ||
913 | * Scan a memory block corresponding to a kmemleak_object. A condition is | ||
914 | * that object->use_count >= 1. | ||
915 | */ | ||
916 | static void scan_object(struct kmemleak_object *object) | ||
917 | { | ||
918 | struct kmemleak_scan_area *area; | ||
919 | struct hlist_node *elem; | ||
920 | unsigned long flags; | ||
921 | |||
922 | /* | ||
923 | * Once the object->lock is aquired, the corresponding memory block | ||
924 | * cannot be freed (the same lock is aquired in delete_object). | ||
925 | */ | ||
926 | spin_lock_irqsave(&object->lock, flags); | ||
927 | if (object->flags & OBJECT_NO_SCAN) | ||
928 | goto out; | ||
929 | if (!(object->flags & OBJECT_ALLOCATED)) | ||
930 | /* already freed object */ | ||
931 | goto out; | ||
932 | if (hlist_empty(&object->area_list)) | ||
933 | scan_block((void *)object->pointer, | ||
934 | (void *)(object->pointer + object->size), object); | ||
935 | else | ||
936 | hlist_for_each_entry(area, elem, &object->area_list, node) | ||
937 | scan_block((void *)(object->pointer + area->offset), | ||
938 | (void *)(object->pointer + area->offset | ||
939 | + area->length), object); | ||
940 | out: | ||
941 | spin_unlock_irqrestore(&object->lock, flags); | ||
942 | } | ||
943 | |||
944 | /* | ||
945 | * Scan data sections and all the referenced memory blocks allocated via the | ||
946 | * kernel's standard allocators. This function must be called with the | ||
947 | * scan_mutex held. | ||
948 | */ | ||
949 | static void kmemleak_scan(void) | ||
950 | { | ||
951 | unsigned long flags; | ||
952 | struct kmemleak_object *object, *tmp; | ||
953 | struct task_struct *task; | ||
954 | int i; | ||
955 | |||
956 | /* prepare the kmemleak_object's */ | ||
957 | rcu_read_lock(); | ||
958 | list_for_each_entry_rcu(object, &object_list, object_list) { | ||
959 | spin_lock_irqsave(&object->lock, flags); | ||
960 | #ifdef DEBUG | ||
961 | /* | ||
962 | * With a few exceptions there should be a maximum of | ||
963 | * 1 reference to any object at this point. | ||
964 | */ | ||
965 | if (atomic_read(&object->use_count) > 1) { | ||
966 | pr_debug("object->use_count = %d\n", | ||
967 | atomic_read(&object->use_count)); | ||
968 | dump_object_info(object); | ||
969 | } | ||
970 | #endif | ||
971 | /* reset the reference count (whiten the object) */ | ||
972 | object->count = 0; | ||
973 | if (color_gray(object) && get_object(object)) | ||
974 | list_add_tail(&object->gray_list, &gray_list); | ||
975 | |||
976 | spin_unlock_irqrestore(&object->lock, flags); | ||
977 | } | ||
978 | rcu_read_unlock(); | ||
979 | |||
980 | /* data/bss scanning */ | ||
981 | scan_block(_sdata, _edata, NULL); | ||
982 | scan_block(__bss_start, __bss_stop, NULL); | ||
983 | |||
984 | #ifdef CONFIG_SMP | ||
985 | /* per-cpu sections scanning */ | ||
986 | for_each_possible_cpu(i) | ||
987 | scan_block(__per_cpu_start + per_cpu_offset(i), | ||
988 | __per_cpu_end + per_cpu_offset(i), NULL); | ||
989 | #endif | ||
990 | |||
991 | /* | ||
992 | * Struct page scanning for each node. The code below is not yet safe | ||
993 | * with MEMORY_HOTPLUG. | ||
994 | */ | ||
995 | for_each_online_node(i) { | ||
996 | pg_data_t *pgdat = NODE_DATA(i); | ||
997 | unsigned long start_pfn = pgdat->node_start_pfn; | ||
998 | unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages; | ||
999 | unsigned long pfn; | ||
1000 | |||
1001 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { | ||
1002 | struct page *page; | ||
1003 | |||
1004 | if (!pfn_valid(pfn)) | ||
1005 | continue; | ||
1006 | page = pfn_to_page(pfn); | ||
1007 | /* only scan if page is in use */ | ||
1008 | if (page_count(page) == 0) | ||
1009 | continue; | ||
1010 | scan_block(page, page + 1, NULL); | ||
1011 | } | ||
1012 | } | ||
1013 | |||
1014 | /* | ||
1015 | * Scanning the task stacks may introduce false negatives and it is | ||
1016 | * not enabled by default. | ||
1017 | */ | ||
1018 | if (kmemleak_stack_scan) { | ||
1019 | read_lock(&tasklist_lock); | ||
1020 | for_each_process(task) | ||
1021 | scan_block(task_stack_page(task), | ||
1022 | task_stack_page(task) + THREAD_SIZE, NULL); | ||
1023 | read_unlock(&tasklist_lock); | ||
1024 | } | ||
1025 | |||
1026 | /* | ||
1027 | * Scan the objects already referenced from the sections scanned | ||
1028 | * above. More objects will be referenced and, if there are no memory | ||
1029 | * leaks, all the objects will be scanned. The list traversal is safe | ||
1030 | * for both tail additions and removals from inside the loop. The | ||
1031 | * kmemleak objects cannot be freed from outside the loop because their | ||
1032 | * use_count was increased. | ||
1033 | */ | ||
1034 | object = list_entry(gray_list.next, typeof(*object), gray_list); | ||
1035 | while (&object->gray_list != &gray_list) { | ||
1036 | scan_yield(); | ||
1037 | |||
1038 | /* may add new objects to the list */ | ||
1039 | if (!scan_should_stop()) | ||
1040 | scan_object(object); | ||
1041 | |||
1042 | tmp = list_entry(object->gray_list.next, typeof(*object), | ||
1043 | gray_list); | ||
1044 | |||
1045 | /* remove the object from the list and release it */ | ||
1046 | list_del(&object->gray_list); | ||
1047 | put_object(object); | ||
1048 | |||
1049 | object = tmp; | ||
1050 | } | ||
1051 | WARN_ON(!list_empty(&gray_list)); | ||
1052 | } | ||
1053 | |||
1054 | /* | ||
1055 | * Thread function performing automatic memory scanning. Unreferenced objects | ||
1056 | * at the end of a memory scan are reported but only the first time. | ||
1057 | */ | ||
1058 | static int kmemleak_scan_thread(void *arg) | ||
1059 | { | ||
1060 | static int first_run = 1; | ||
1061 | |||
1062 | pr_info("Automatic memory scanning thread started\n"); | ||
1063 | |||
1064 | /* | ||
1065 | * Wait before the first scan to allow the system to fully initialize. | ||
1066 | */ | ||
1067 | if (first_run) { | ||
1068 | first_run = 0; | ||
1069 | ssleep(SECS_FIRST_SCAN); | ||
1070 | } | ||
1071 | |||
1072 | while (!kthread_should_stop()) { | ||
1073 | struct kmemleak_object *object; | ||
1074 | signed long timeout = jiffies_scan_wait; | ||
1075 | |||
1076 | mutex_lock(&scan_mutex); | ||
1077 | |||
1078 | kmemleak_scan(); | ||
1079 | reported_leaks = 0; | ||
1080 | |||
1081 | rcu_read_lock(); | ||
1082 | list_for_each_entry_rcu(object, &object_list, object_list) { | ||
1083 | unsigned long flags; | ||
1084 | |||
1085 | if (reported_leaks >= REPORTS_NR) | ||
1086 | break; | ||
1087 | spin_lock_irqsave(&object->lock, flags); | ||
1088 | if (!(object->flags & OBJECT_REPORTED) && | ||
1089 | unreferenced_object(object)) { | ||
1090 | print_unreferenced(NULL, object); | ||
1091 | object->flags |= OBJECT_REPORTED; | ||
1092 | reported_leaks++; | ||
1093 | } else if ((object->flags & OBJECT_REPORTED) && | ||
1094 | referenced_object(object)) { | ||
1095 | print_referenced(object); | ||
1096 | object->flags &= ~OBJECT_REPORTED; | ||
1097 | } | ||
1098 | spin_unlock_irqrestore(&object->lock, flags); | ||
1099 | } | ||
1100 | rcu_read_unlock(); | ||
1101 | |||
1102 | mutex_unlock(&scan_mutex); | ||
1103 | /* wait before the next scan */ | ||
1104 | while (timeout && !kthread_should_stop()) | ||
1105 | timeout = schedule_timeout_interruptible(timeout); | ||
1106 | } | ||
1107 | |||
1108 | pr_info("Automatic memory scanning thread ended\n"); | ||
1109 | |||
1110 | return 0; | ||
1111 | } | ||
1112 | |||
1113 | /* | ||
1114 | * Start the automatic memory scanning thread. This function must be called | ||
1115 | * with the kmemleak_mutex held. | ||
1116 | */ | ||
1117 | void start_scan_thread(void) | ||
1118 | { | ||
1119 | if (scan_thread) | ||
1120 | return; | ||
1121 | scan_thread = kthread_run(kmemleak_scan_thread, NULL, "kmemleak"); | ||
1122 | if (IS_ERR(scan_thread)) { | ||
1123 | pr_warning("Failed to create the scan thread\n"); | ||
1124 | scan_thread = NULL; | ||
1125 | } | ||
1126 | } | ||
1127 | |||
1128 | /* | ||
1129 | * Stop the automatic memory scanning thread. This function must be called | ||
1130 | * with the kmemleak_mutex held. | ||
1131 | */ | ||
1132 | void stop_scan_thread(void) | ||
1133 | { | ||
1134 | if (scan_thread) { | ||
1135 | kthread_stop(scan_thread); | ||
1136 | scan_thread = NULL; | ||
1137 | } | ||
1138 | } | ||
1139 | |||
1140 | /* | ||
1141 | * Iterate over the object_list and return the first valid object at or after | ||
1142 | * the required position with its use_count incremented. The function triggers | ||
1143 | * a memory scanning when the pos argument points to the first position. | ||
1144 | */ | ||
1145 | static void *kmemleak_seq_start(struct seq_file *seq, loff_t *pos) | ||
1146 | { | ||
1147 | struct kmemleak_object *object; | ||
1148 | loff_t n = *pos; | ||
1149 | |||
1150 | if (!n) { | ||
1151 | kmemleak_scan(); | ||
1152 | reported_leaks = 0; | ||
1153 | } | ||
1154 | if (reported_leaks >= REPORTS_NR) | ||
1155 | return NULL; | ||
1156 | |||
1157 | rcu_read_lock(); | ||
1158 | list_for_each_entry_rcu(object, &object_list, object_list) { | ||
1159 | if (n-- > 0) | ||
1160 | continue; | ||
1161 | if (get_object(object)) | ||
1162 | goto out; | ||
1163 | } | ||
1164 | object = NULL; | ||
1165 | out: | ||
1166 | rcu_read_unlock(); | ||
1167 | return object; | ||
1168 | } | ||
1169 | |||
1170 | /* | ||
1171 | * Return the next object in the object_list. The function decrements the | ||
1172 | * use_count of the previous object and increases that of the next one. | ||
1173 | */ | ||
1174 | static void *kmemleak_seq_next(struct seq_file *seq, void *v, loff_t *pos) | ||
1175 | { | ||
1176 | struct kmemleak_object *prev_obj = v; | ||
1177 | struct kmemleak_object *next_obj = NULL; | ||
1178 | struct list_head *n = &prev_obj->object_list; | ||
1179 | |||
1180 | ++(*pos); | ||
1181 | if (reported_leaks >= REPORTS_NR) | ||
1182 | goto out; | ||
1183 | |||
1184 | rcu_read_lock(); | ||
1185 | list_for_each_continue_rcu(n, &object_list) { | ||
1186 | next_obj = list_entry(n, struct kmemleak_object, object_list); | ||
1187 | if (get_object(next_obj)) | ||
1188 | break; | ||
1189 | } | ||
1190 | rcu_read_unlock(); | ||
1191 | out: | ||
1192 | put_object(prev_obj); | ||
1193 | return next_obj; | ||
1194 | } | ||
1195 | |||
/*
 * seq_file ->stop callback. Releases the reference on the last object handed
 * out by the iterator; @v may be NULL, in which case there is nothing to do.
 */
static void kmemleak_seq_stop(struct seq_file *seq, void *v)
{
	if (!v)
		return;

	put_object(v);
}
1204 | |||
1205 | /* | ||
1206 | * Print the information for an unreferenced object to the seq file. | ||
1207 | */ | ||
1208 | static int kmemleak_seq_show(struct seq_file *seq, void *v) | ||
1209 | { | ||
1210 | struct kmemleak_object *object = v; | ||
1211 | unsigned long flags; | ||
1212 | |||
1213 | spin_lock_irqsave(&object->lock, flags); | ||
1214 | if (!unreferenced_object(object)) | ||
1215 | goto out; | ||
1216 | print_unreferenced(seq, object); | ||
1217 | reported_leaks++; | ||
1218 | out: | ||
1219 | spin_unlock_irqrestore(&object->lock, flags); | ||
1220 | return 0; | ||
1221 | } | ||
1222 | |||
1223 | static const struct seq_operations kmemleak_seq_ops = { | ||
1224 | .start = kmemleak_seq_start, | ||
1225 | .next = kmemleak_seq_next, | ||
1226 | .stop = kmemleak_seq_stop, | ||
1227 | .show = kmemleak_seq_show, | ||
1228 | }; | ||
1229 | |||
1230 | static int kmemleak_open(struct inode *inode, struct file *file) | ||
1231 | { | ||
1232 | int ret = 0; | ||
1233 | |||
1234 | if (!atomic_read(&kmemleak_enabled)) | ||
1235 | return -EBUSY; | ||
1236 | |||
1237 | ret = mutex_lock_interruptible(&kmemleak_mutex); | ||
1238 | if (ret < 0) | ||
1239 | goto out; | ||
1240 | if (file->f_mode & FMODE_READ) { | ||
1241 | ret = mutex_lock_interruptible(&scan_mutex); | ||
1242 | if (ret < 0) | ||
1243 | goto kmemleak_unlock; | ||
1244 | ret = seq_open(file, &kmemleak_seq_ops); | ||
1245 | if (ret < 0) | ||
1246 | goto scan_unlock; | ||
1247 | } | ||
1248 | return ret; | ||
1249 | |||
1250 | scan_unlock: | ||
1251 | mutex_unlock(&scan_mutex); | ||
1252 | kmemleak_unlock: | ||
1253 | mutex_unlock(&kmemleak_mutex); | ||
1254 | out: | ||
1255 | return ret; | ||
1256 | } | ||
1257 | |||
1258 | static int kmemleak_release(struct inode *inode, struct file *file) | ||
1259 | { | ||
1260 | int ret = 0; | ||
1261 | |||
1262 | if (file->f_mode & FMODE_READ) { | ||
1263 | seq_release(inode, file); | ||
1264 | mutex_unlock(&scan_mutex); | ||
1265 | } | ||
1266 | mutex_unlock(&kmemleak_mutex); | ||
1267 | |||
1268 | return ret; | ||
1269 | } | ||
1270 | |||
1271 | /* | ||
1272 | * File write operation to configure kmemleak at run-time. The following | ||
1273 | * commands can be written to the /sys/kernel/debug/kmemleak file: | ||
1274 | * off - disable kmemleak (irreversible) | ||
1275 | * stack=on - enable the task stacks scanning | ||
1276 | * stack=off - disable the tasks stacks scanning | ||
1277 | * scan=on - start the automatic memory scanning thread | ||
1278 | * scan=off - stop the automatic memory scanning thread | ||
1279 | * scan=... - set the automatic memory scanning period in seconds (0 to | ||
1280 | * disable it) | ||
1281 | */ | ||
1282 | static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, | ||
1283 | size_t size, loff_t *ppos) | ||
1284 | { | ||
1285 | char buf[64]; | ||
1286 | int buf_size; | ||
1287 | |||
1288 | if (!atomic_read(&kmemleak_enabled)) | ||
1289 | return -EBUSY; | ||
1290 | |||
1291 | buf_size = min(size, (sizeof(buf) - 1)); | ||
1292 | if (strncpy_from_user(buf, user_buf, buf_size) < 0) | ||
1293 | return -EFAULT; | ||
1294 | buf[buf_size] = 0; | ||
1295 | |||
1296 | if (strncmp(buf, "off", 3) == 0) | ||
1297 | kmemleak_disable(); | ||
1298 | else if (strncmp(buf, "stack=on", 8) == 0) | ||
1299 | kmemleak_stack_scan = 1; | ||
1300 | else if (strncmp(buf, "stack=off", 9) == 0) | ||
1301 | kmemleak_stack_scan = 0; | ||
1302 | else if (strncmp(buf, "scan=on", 7) == 0) | ||
1303 | start_scan_thread(); | ||
1304 | else if (strncmp(buf, "scan=off", 8) == 0) | ||
1305 | stop_scan_thread(); | ||
1306 | else if (strncmp(buf, "scan=", 5) == 0) { | ||
1307 | unsigned long secs; | ||
1308 | int err; | ||
1309 | |||
1310 | err = strict_strtoul(buf + 5, 0, &secs); | ||
1311 | if (err < 0) | ||
1312 | return err; | ||
1313 | stop_scan_thread(); | ||
1314 | if (secs) { | ||
1315 | jiffies_scan_wait = msecs_to_jiffies(secs * 1000); | ||
1316 | start_scan_thread(); | ||
1317 | } | ||
1318 | } else | ||
1319 | return -EINVAL; | ||
1320 | |||
1321 | /* ignore the rest of the buffer, only one command at a time */ | ||
1322 | *ppos += size; | ||
1323 | return size; | ||
1324 | } | ||
1325 | |||
1326 | static const struct file_operations kmemleak_fops = { | ||
1327 | .owner = THIS_MODULE, | ||
1328 | .open = kmemleak_open, | ||
1329 | .read = seq_read, | ||
1330 | .write = kmemleak_write, | ||
1331 | .llseek = seq_lseek, | ||
1332 | .release = kmemleak_release, | ||
1333 | }; | ||
1334 | |||
1335 | /* | ||
1336 | * Perform the freeing of the kmemleak internal objects after waiting for any | ||
1337 | * current memory scan to complete. | ||
1338 | */ | ||
1339 | static int kmemleak_cleanup_thread(void *arg) | ||
1340 | { | ||
1341 | struct kmemleak_object *object; | ||
1342 | |||
1343 | mutex_lock(&kmemleak_mutex); | ||
1344 | stop_scan_thread(); | ||
1345 | mutex_unlock(&kmemleak_mutex); | ||
1346 | |||
1347 | mutex_lock(&scan_mutex); | ||
1348 | rcu_read_lock(); | ||
1349 | list_for_each_entry_rcu(object, &object_list, object_list) | ||
1350 | delete_object(object->pointer); | ||
1351 | rcu_read_unlock(); | ||
1352 | mutex_unlock(&scan_mutex); | ||
1353 | |||
1354 | return 0; | ||
1355 | } | ||
1356 | |||
1357 | /* | ||
1358 | * Start the clean-up thread. | ||
1359 | */ | ||
1360 | static void kmemleak_cleanup(void) | ||
1361 | { | ||
1362 | struct task_struct *cleanup_thread; | ||
1363 | |||
1364 | cleanup_thread = kthread_run(kmemleak_cleanup_thread, NULL, | ||
1365 | "kmemleak-clean"); | ||
1366 | if (IS_ERR(cleanup_thread)) | ||
1367 | pr_warning("Failed to create the clean-up thread\n"); | ||
1368 | } | ||
1369 | |||
1370 | /* | ||
1371 | * Disable kmemleak. No memory allocation/freeing will be traced once this | ||
1372 | * function is called. Disabling kmemleak is an irreversible operation. | ||
1373 | */ | ||
1374 | static void kmemleak_disable(void) | ||
1375 | { | ||
1376 | /* atomically check whether it was already invoked */ | ||
1377 | if (atomic_cmpxchg(&kmemleak_error, 0, 1)) | ||
1378 | return; | ||
1379 | |||
1380 | /* stop any memory operation tracing */ | ||
1381 | atomic_set(&kmemleak_early_log, 0); | ||
1382 | atomic_set(&kmemleak_enabled, 0); | ||
1383 | |||
1384 | /* check whether it is too early for a kernel thread */ | ||
1385 | if (atomic_read(&kmemleak_initialized)) | ||
1386 | kmemleak_cleanup(); | ||
1387 | |||
1388 | pr_info("Kernel memory leak detector disabled\n"); | ||
1389 | } | ||
1390 | |||
1391 | /* | ||
1392 | * Allow boot-time kmemleak disabling (enabled by default). | ||
1393 | */ | ||
1394 | static int kmemleak_boot_config(char *str) | ||
1395 | { | ||
1396 | if (!str) | ||
1397 | return -EINVAL; | ||
1398 | if (strcmp(str, "off") == 0) | ||
1399 | kmemleak_disable(); | ||
1400 | else if (strcmp(str, "on") != 0) | ||
1401 | return -EINVAL; | ||
1402 | return 0; | ||
1403 | } | ||
1404 | early_param("kmemleak", kmemleak_boot_config); | ||
1405 | |||
1406 | /* | ||
1407 | * Kmemleak initialization. | ||
1408 | */ | ||
1409 | void __init kmemleak_init(void) | ||
1410 | { | ||
1411 | int i; | ||
1412 | unsigned long flags; | ||
1413 | |||
1414 | jiffies_scan_yield = msecs_to_jiffies(MSECS_SCAN_YIELD); | ||
1415 | jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE); | ||
1416 | jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000); | ||
1417 | |||
1418 | object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE); | ||
1419 | scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE); | ||
1420 | INIT_PRIO_TREE_ROOT(&object_tree_root); | ||
1421 | |||
1422 | /* the kernel is still in UP mode, so disabling the IRQs is enough */ | ||
1423 | local_irq_save(flags); | ||
1424 | if (!atomic_read(&kmemleak_error)) { | ||
1425 | atomic_set(&kmemleak_enabled, 1); | ||
1426 | atomic_set(&kmemleak_early_log, 0); | ||
1427 | } | ||
1428 | local_irq_restore(flags); | ||
1429 | |||
1430 | /* | ||
1431 | * This is the point where tracking allocations is safe. Automatic | ||
1432 | * scanning is started during the late initcall. Add the early logged | ||
1433 | * callbacks to the kmemleak infrastructure. | ||
1434 | */ | ||
1435 | for (i = 0; i < crt_early_log; i++) { | ||
1436 | struct early_log *log = &early_log[i]; | ||
1437 | |||
1438 | switch (log->op_type) { | ||
1439 | case KMEMLEAK_ALLOC: | ||
1440 | kmemleak_alloc(log->ptr, log->size, log->min_count, | ||
1441 | GFP_KERNEL); | ||
1442 | break; | ||
1443 | case KMEMLEAK_FREE: | ||
1444 | kmemleak_free(log->ptr); | ||
1445 | break; | ||
1446 | case KMEMLEAK_NOT_LEAK: | ||
1447 | kmemleak_not_leak(log->ptr); | ||
1448 | break; | ||
1449 | case KMEMLEAK_IGNORE: | ||
1450 | kmemleak_ignore(log->ptr); | ||
1451 | break; | ||
1452 | case KMEMLEAK_SCAN_AREA: | ||
1453 | kmemleak_scan_area(log->ptr, log->offset, log->length, | ||
1454 | GFP_KERNEL); | ||
1455 | break; | ||
1456 | case KMEMLEAK_NO_SCAN: | ||
1457 | kmemleak_no_scan(log->ptr); | ||
1458 | break; | ||
1459 | default: | ||
1460 | WARN_ON(1); | ||
1461 | } | ||
1462 | } | ||
1463 | } | ||
1464 | |||
1465 | /* | ||
1466 | * Late initialization function. | ||
1467 | */ | ||
1468 | static int __init kmemleak_late_init(void) | ||
1469 | { | ||
1470 | struct dentry *dentry; | ||
1471 | |||
1472 | atomic_set(&kmemleak_initialized, 1); | ||
1473 | |||
1474 | if (atomic_read(&kmemleak_error)) { | ||
1475 | /* | ||
1476 | * Some error occured and kmemleak was disabled. There is a | ||
1477 | * small chance that kmemleak_disable() was called immediately | ||
1478 | * after setting kmemleak_initialized and we may end up with | ||
1479 | * two clean-up threads but serialized by scan_mutex. | ||
1480 | */ | ||
1481 | kmemleak_cleanup(); | ||
1482 | return -ENOMEM; | ||
1483 | } | ||
1484 | |||
1485 | dentry = debugfs_create_file("kmemleak", S_IRUGO, NULL, NULL, | ||
1486 | &kmemleak_fops); | ||
1487 | if (!dentry) | ||
1488 | pr_warning("Failed to create the debugfs kmemleak file\n"); | ||
1489 | mutex_lock(&kmemleak_mutex); | ||
1490 | start_scan_thread(); | ||
1491 | mutex_unlock(&kmemleak_mutex); | ||
1492 | |||
1493 | pr_info("Kernel memory leak detector initialized\n"); | ||
1494 | |||
1495 | return 0; | ||
1496 | } | ||
1497 | late_initcall(kmemleak_late_init); | ||
diff --git a/mm/maccess.c b/mm/maccess.c index ac40796cfb15..9073695ff25f 100644 --- a/mm/maccess.c +++ b/mm/maccess.c | |||
@@ -39,7 +39,7 @@ EXPORT_SYMBOL_GPL(probe_kernel_read); | |||
39 | * Safely write to address @dst from the buffer at @src. If a kernel fault | 39 | * Safely write to address @dst from the buffer at @src. If a kernel fault |
40 | * happens, handle that and return -EFAULT. | 40 | * happens, handle that and return -EFAULT. |
41 | */ | 41 | */ |
42 | long probe_kernel_write(void *dst, void *src, size_t size) | 42 | long notrace __weak probe_kernel_write(void *dst, void *src, size_t size) |
43 | { | 43 | { |
44 | long ret; | 44 | long ret; |
45 | mm_segment_t old_fs = get_fs(); | 45 | mm_segment_t old_fs = get_fs(); |
diff --git a/mm/madvise.c b/mm/madvise.c index b9ce574827c8..76eb4193acdd 100644 --- a/mm/madvise.c +++ b/mm/madvise.c | |||
@@ -123,8 +123,7 @@ static long madvise_willneed(struct vm_area_struct * vma, | |||
123 | end = vma->vm_end; | 123 | end = vma->vm_end; |
124 | end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; | 124 | end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; |
125 | 125 | ||
126 | force_page_cache_readahead(file->f_mapping, | 126 | force_page_cache_readahead(file->f_mapping, file, start, end - start); |
127 | file, start, max_sane_readahead(end - start)); | ||
128 | return 0; | 127 | return 0; |
129 | } | 128 | } |
130 | 129 | ||
@@ -239,12 +238,30 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, | |||
239 | break; | 238 | break; |
240 | 239 | ||
241 | default: | 240 | default: |
242 | error = -EINVAL; | 241 | BUG(); |
243 | break; | 242 | break; |
244 | } | 243 | } |
245 | return error; | 244 | return error; |
246 | } | 245 | } |
247 | 246 | ||
247 | static int | ||
248 | madvise_behavior_valid(int behavior) | ||
249 | { | ||
250 | switch (behavior) { | ||
251 | case MADV_DOFORK: | ||
252 | case MADV_DONTFORK: | ||
253 | case MADV_NORMAL: | ||
254 | case MADV_SEQUENTIAL: | ||
255 | case MADV_RANDOM: | ||
256 | case MADV_REMOVE: | ||
257 | case MADV_WILLNEED: | ||
258 | case MADV_DONTNEED: | ||
259 | return 1; | ||
260 | |||
261 | default: | ||
262 | return 0; | ||
263 | } | ||
264 | } | ||
248 | /* | 265 | /* |
249 | * The madvise(2) system call. | 266 | * The madvise(2) system call. |
250 | * | 267 | * |
@@ -290,6 +307,9 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) | |||
290 | int write; | 307 | int write; |
291 | size_t len; | 308 | size_t len; |
292 | 309 | ||
310 | if (!madvise_behavior_valid(behavior)) | ||
311 | return error; | ||
312 | |||
293 | write = madvise_need_mmap_write(behavior); | 313 | write = madvise_need_mmap_write(behavior); |
294 | if (write) | 314 | if (write) |
295 | down_write(&current->mm->mmap_sem); | 315 | down_write(&current->mm->mmap_sem); |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 78eb8552818b..e2fa20dadf40 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -45,7 +45,7 @@ struct cgroup_subsys mem_cgroup_subsys __read_mostly; | |||
45 | #define MEM_CGROUP_RECLAIM_RETRIES 5 | 45 | #define MEM_CGROUP_RECLAIM_RETRIES 5 |
46 | 46 | ||
47 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 47 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP |
48 | /* Turned on only when memory cgroup is enabled && really_do_swap_account = 0 */ | 48 | /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ |
49 | int do_swap_account __read_mostly; | 49 | int do_swap_account __read_mostly; |
50 | static int really_do_swap_account __initdata = 1; /* for remember boot option*/ | 50 | static int really_do_swap_account __initdata = 1; /* for remember boot option*/ |
51 | #else | 51 | #else |
@@ -62,7 +62,8 @@ enum mem_cgroup_stat_index { | |||
62 | * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. | 62 | * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. |
63 | */ | 63 | */ |
64 | MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ | 64 | MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ |
65 | MEM_CGROUP_STAT_RSS, /* # of pages charged as rss */ | 65 | MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ |
66 | MEM_CGROUP_STAT_MAPPED_FILE, /* # of pages charged as file rss */ | ||
66 | MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ | 67 | MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ |
67 | MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ | 68 | MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ |
68 | 69 | ||
@@ -176,6 +177,9 @@ struct mem_cgroup { | |||
176 | 177 | ||
177 | unsigned int swappiness; | 178 | unsigned int swappiness; |
178 | 179 | ||
180 | /* set when res.limit == memsw.limit */ | ||
181 | bool memsw_is_minimum; | ||
182 | |||
179 | /* | 183 | /* |
180 | * statistics. This must be placed at the end of memcg. | 184 | * statistics. This must be placed at the end of memcg. |
181 | */ | 185 | */ |
@@ -188,6 +192,7 @@ enum charge_type { | |||
188 | MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */ | 192 | MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */ |
189 | MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ | 193 | MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ |
190 | MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ | 194 | MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ |
195 | MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */ | ||
191 | NR_CHARGE_TYPE, | 196 | NR_CHARGE_TYPE, |
192 | }; | 197 | }; |
193 | 198 | ||
@@ -570,6 +575,17 @@ int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg) | |||
570 | return 0; | 575 | return 0; |
571 | } | 576 | } |
572 | 577 | ||
578 | int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg) | ||
579 | { | ||
580 | unsigned long active; | ||
581 | unsigned long inactive; | ||
582 | |||
583 | inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE); | ||
584 | active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE); | ||
585 | |||
586 | return (active > inactive); | ||
587 | } | ||
588 | |||
573 | unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, | 589 | unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, |
574 | struct zone *zone, | 590 | struct zone *zone, |
575 | enum lru_list lru) | 591 | enum lru_list lru) |
@@ -633,6 +649,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, | |||
633 | int zid = zone_idx(z); | 649 | int zid = zone_idx(z); |
634 | struct mem_cgroup_per_zone *mz; | 650 | struct mem_cgroup_per_zone *mz; |
635 | int lru = LRU_FILE * !!file + !!active; | 651 | int lru = LRU_FILE * !!file + !!active; |
652 | int ret; | ||
636 | 653 | ||
637 | BUG_ON(!mem_cont); | 654 | BUG_ON(!mem_cont); |
638 | mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); | 655 | mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); |
@@ -650,9 +667,19 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, | |||
650 | continue; | 667 | continue; |
651 | 668 | ||
652 | scan++; | 669 | scan++; |
653 | if (__isolate_lru_page(page, mode, file) == 0) { | 670 | ret = __isolate_lru_page(page, mode, file); |
671 | switch (ret) { | ||
672 | case 0: | ||
654 | list_move(&page->lru, dst); | 673 | list_move(&page->lru, dst); |
674 | mem_cgroup_del_lru(page); | ||
655 | nr_taken++; | 675 | nr_taken++; |
676 | break; | ||
677 | case -EBUSY: | ||
678 | /* we don't affect global LRU but rotate in our LRU */ | ||
679 | mem_cgroup_rotate_lru_list(page, page_lru(page)); | ||
680 | break; | ||
681 | default: | ||
682 | break; | ||
656 | } | 683 | } |
657 | } | 684 | } |
658 | 685 | ||
@@ -834,6 +861,10 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
834 | int ret, total = 0; | 861 | int ret, total = 0; |
835 | int loop = 0; | 862 | int loop = 0; |
836 | 863 | ||
864 | /* If memsw_is_minimum==1, swap-out is of-no-use. */ | ||
865 | if (root_mem->memsw_is_minimum) | ||
866 | noswap = true; | ||
867 | |||
837 | while (loop < 2) { | 868 | while (loop < 2) { |
838 | victim = mem_cgroup_select_victim(root_mem); | 869 | victim = mem_cgroup_select_victim(root_mem); |
839 | if (victim == root_mem) | 870 | if (victim == root_mem) |
@@ -889,6 +920,44 @@ static void record_last_oom(struct mem_cgroup *mem) | |||
889 | mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb); | 920 | mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb); |
890 | } | 921 | } |
891 | 922 | ||
923 | /* | ||
924 | * Currently used to update mapped file statistics, but the routine can be | ||
925 | * generalized to update other statistics as well. | ||
926 | */ | ||
927 | void mem_cgroup_update_mapped_file_stat(struct page *page, int val) | ||
928 | { | ||
929 | struct mem_cgroup *mem; | ||
930 | struct mem_cgroup_stat *stat; | ||
931 | struct mem_cgroup_stat_cpu *cpustat; | ||
932 | int cpu; | ||
933 | struct page_cgroup *pc; | ||
934 | |||
935 | if (!page_is_file_cache(page)) | ||
936 | return; | ||
937 | |||
938 | pc = lookup_page_cgroup(page); | ||
939 | if (unlikely(!pc)) | ||
940 | return; | ||
941 | |||
942 | lock_page_cgroup(pc); | ||
943 | mem = pc->mem_cgroup; | ||
944 | if (!mem) | ||
945 | goto done; | ||
946 | |||
947 | if (!PageCgroupUsed(pc)) | ||
948 | goto done; | ||
949 | |||
950 | /* | ||
951 | * Preemption is already disabled, we don't need get_cpu() | ||
952 | */ | ||
953 | cpu = smp_processor_id(); | ||
954 | stat = &mem->stat; | ||
955 | cpustat = &stat->cpustat[cpu]; | ||
956 | |||
957 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, val); | ||
958 | done: | ||
959 | unlock_page_cgroup(pc); | ||
960 | } | ||
892 | 961 | ||
893 | /* | 962 | /* |
894 | * Unlike exported interface, "oom" parameter is added. if oom==true, | 963 | * Unlike exported interface, "oom" parameter is added. if oom==true, |
@@ -1087,6 +1156,10 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, | |||
1087 | struct mem_cgroup_per_zone *from_mz, *to_mz; | 1156 | struct mem_cgroup_per_zone *from_mz, *to_mz; |
1088 | int nid, zid; | 1157 | int nid, zid; |
1089 | int ret = -EBUSY; | 1158 | int ret = -EBUSY; |
1159 | struct page *page; | ||
1160 | int cpu; | ||
1161 | struct mem_cgroup_stat *stat; | ||
1162 | struct mem_cgroup_stat_cpu *cpustat; | ||
1090 | 1163 | ||
1091 | VM_BUG_ON(from == to); | 1164 | VM_BUG_ON(from == to); |
1092 | VM_BUG_ON(PageLRU(pc->page)); | 1165 | VM_BUG_ON(PageLRU(pc->page)); |
@@ -1107,6 +1180,23 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, | |||
1107 | 1180 | ||
1108 | res_counter_uncharge(&from->res, PAGE_SIZE); | 1181 | res_counter_uncharge(&from->res, PAGE_SIZE); |
1109 | mem_cgroup_charge_statistics(from, pc, false); | 1182 | mem_cgroup_charge_statistics(from, pc, false); |
1183 | |||
1184 | page = pc->page; | ||
1185 | if (page_is_file_cache(page) && page_mapped(page)) { | ||
1186 | cpu = smp_processor_id(); | ||
1187 | /* Update mapped_file data for mem_cgroup "from" */ | ||
1188 | stat = &from->stat; | ||
1189 | cpustat = &stat->cpustat[cpu]; | ||
1190 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, | ||
1191 | -1); | ||
1192 | |||
1193 | /* Update mapped_file data for mem_cgroup "to" */ | ||
1194 | stat = &to->stat; | ||
1195 | cpustat = &stat->cpustat[cpu]; | ||
1196 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, | ||
1197 | 1); | ||
1198 | } | ||
1199 | |||
1110 | if (do_swap_account) | 1200 | if (do_swap_account) |
1111 | res_counter_uncharge(&from->memsw, PAGE_SIZE); | 1201 | res_counter_uncharge(&from->memsw, PAGE_SIZE); |
1112 | css_put(&from->css); | 1202 | css_put(&from->css); |
@@ -1422,6 +1512,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
1422 | 1512 | ||
1423 | switch (ctype) { | 1513 | switch (ctype) { |
1424 | case MEM_CGROUP_CHARGE_TYPE_MAPPED: | 1514 | case MEM_CGROUP_CHARGE_TYPE_MAPPED: |
1515 | case MEM_CGROUP_CHARGE_TYPE_DROP: | ||
1425 | if (page_mapped(page)) | 1516 | if (page_mapped(page)) |
1426 | goto unlock_out; | 1517 | goto unlock_out; |
1427 | break; | 1518 | break; |
@@ -1485,18 +1576,23 @@ void mem_cgroup_uncharge_cache_page(struct page *page) | |||
1485 | * called after __delete_from_swap_cache() and drop "page" account. | 1576 | * called after __delete_from_swap_cache() and drop "page" account. |
1486 | * memcg information is recorded to swap_cgroup of "ent" | 1577 | * memcg information is recorded to swap_cgroup of "ent" |
1487 | */ | 1578 | */ |
1488 | void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent) | 1579 | void |
1580 | mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) | ||
1489 | { | 1581 | { |
1490 | struct mem_cgroup *memcg; | 1582 | struct mem_cgroup *memcg; |
1583 | int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT; | ||
1584 | |||
1585 | if (!swapout) /* this was a swap cache but the swap is unused ! */ | ||
1586 | ctype = MEM_CGROUP_CHARGE_TYPE_DROP; | ||
1587 | |||
1588 | memcg = __mem_cgroup_uncharge_common(page, ctype); | ||
1491 | 1589 | ||
1492 | memcg = __mem_cgroup_uncharge_common(page, | ||
1493 | MEM_CGROUP_CHARGE_TYPE_SWAPOUT); | ||
1494 | /* record memcg information */ | 1590 | /* record memcg information */ |
1495 | if (do_swap_account && memcg) { | 1591 | if (do_swap_account && swapout && memcg) { |
1496 | swap_cgroup_record(ent, css_id(&memcg->css)); | 1592 | swap_cgroup_record(ent, css_id(&memcg->css)); |
1497 | mem_cgroup_get(memcg); | 1593 | mem_cgroup_get(memcg); |
1498 | } | 1594 | } |
1499 | if (memcg) | 1595 | if (swapout && memcg) |
1500 | css_put(&memcg->css); | 1596 | css_put(&memcg->css); |
1501 | } | 1597 | } |
1502 | #endif | 1598 | #endif |
@@ -1674,6 +1770,12 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
1674 | break; | 1770 | break; |
1675 | } | 1771 | } |
1676 | ret = res_counter_set_limit(&memcg->res, val); | 1772 | ret = res_counter_set_limit(&memcg->res, val); |
1773 | if (!ret) { | ||
1774 | if (memswlimit == val) | ||
1775 | memcg->memsw_is_minimum = true; | ||
1776 | else | ||
1777 | memcg->memsw_is_minimum = false; | ||
1778 | } | ||
1677 | mutex_unlock(&set_limit_mutex); | 1779 | mutex_unlock(&set_limit_mutex); |
1678 | 1780 | ||
1679 | if (!ret) | 1781 | if (!ret) |
@@ -1692,16 +1794,14 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
1692 | return ret; | 1794 | return ret; |
1693 | } | 1795 | } |
1694 | 1796 | ||
1695 | int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | 1797 | static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, |
1696 | unsigned long long val) | 1798 | unsigned long long val) |
1697 | { | 1799 | { |
1698 | int retry_count; | 1800 | int retry_count; |
1699 | u64 memlimit, oldusage, curusage; | 1801 | u64 memlimit, oldusage, curusage; |
1700 | int children = mem_cgroup_count_children(memcg); | 1802 | int children = mem_cgroup_count_children(memcg); |
1701 | int ret = -EBUSY; | 1803 | int ret = -EBUSY; |
1702 | 1804 | ||
1703 | if (!do_swap_account) | ||
1704 | return -EINVAL; | ||
1705 | /* see mem_cgroup_resize_res_limit */ | 1805 | /* see mem_cgroup_resize_res_limit */ |
1706 | retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; | 1806 | retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; |
1707 | oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); | 1807 | oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); |
@@ -1723,6 +1823,12 @@ int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | |||
1723 | break; | 1823 | break; |
1724 | } | 1824 | } |
1725 | ret = res_counter_set_limit(&memcg->memsw, val); | 1825 | ret = res_counter_set_limit(&memcg->memsw, val); |
1826 | if (!ret) { | ||
1827 | if (memlimit == val) | ||
1828 | memcg->memsw_is_minimum = true; | ||
1829 | else | ||
1830 | memcg->memsw_is_minimum = false; | ||
1831 | } | ||
1726 | mutex_unlock(&set_limit_mutex); | 1832 | mutex_unlock(&set_limit_mutex); |
1727 | 1833 | ||
1728 | if (!ret) | 1834 | if (!ret) |
@@ -1936,8 +2042,7 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) | |||
1936 | val = res_counter_read_u64(&mem->res, name); | 2042 | val = res_counter_read_u64(&mem->res, name); |
1937 | break; | 2043 | break; |
1938 | case _MEMSWAP: | 2044 | case _MEMSWAP: |
1939 | if (do_swap_account) | 2045 | val = res_counter_read_u64(&mem->memsw, name); |
1940 | val = res_counter_read_u64(&mem->memsw, name); | ||
1941 | break; | 2046 | break; |
1942 | default: | 2047 | default: |
1943 | BUG(); | 2048 | BUG(); |
@@ -2035,6 +2140,7 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) | |||
2035 | enum { | 2140 | enum { |
2036 | MCS_CACHE, | 2141 | MCS_CACHE, |
2037 | MCS_RSS, | 2142 | MCS_RSS, |
2143 | MCS_MAPPED_FILE, | ||
2038 | MCS_PGPGIN, | 2144 | MCS_PGPGIN, |
2039 | MCS_PGPGOUT, | 2145 | MCS_PGPGOUT, |
2040 | MCS_INACTIVE_ANON, | 2146 | MCS_INACTIVE_ANON, |
@@ -2055,6 +2161,7 @@ struct { | |||
2055 | } memcg_stat_strings[NR_MCS_STAT] = { | 2161 | } memcg_stat_strings[NR_MCS_STAT] = { |
2056 | {"cache", "total_cache"}, | 2162 | {"cache", "total_cache"}, |
2057 | {"rss", "total_rss"}, | 2163 | {"rss", "total_rss"}, |
2164 | {"mapped_file", "total_mapped_file"}, | ||
2058 | {"pgpgin", "total_pgpgin"}, | 2165 | {"pgpgin", "total_pgpgin"}, |
2059 | {"pgpgout", "total_pgpgout"}, | 2166 | {"pgpgout", "total_pgpgout"}, |
2060 | {"inactive_anon", "total_inactive_anon"}, | 2167 | {"inactive_anon", "total_inactive_anon"}, |
@@ -2075,6 +2182,8 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data) | |||
2075 | s->stat[MCS_CACHE] += val * PAGE_SIZE; | 2182 | s->stat[MCS_CACHE] += val * PAGE_SIZE; |
2076 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); | 2183 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); |
2077 | s->stat[MCS_RSS] += val * PAGE_SIZE; | 2184 | s->stat[MCS_RSS] += val * PAGE_SIZE; |
2185 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_MAPPED_FILE); | ||
2186 | s->stat[MCS_MAPPED_FILE] += val * PAGE_SIZE; | ||
2078 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT); | 2187 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT); |
2079 | s->stat[MCS_PGPGIN] += val; | 2188 | s->stat[MCS_PGPGIN] += val; |
2080 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT); | 2189 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT); |
diff --git a/mm/memory.c b/mm/memory.c index 4126dd16778c..f46ac18ba231 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -1310,8 +1310,11 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1310 | cond_resched(); | 1310 | cond_resched(); |
1311 | while (!(page = follow_page(vma, start, foll_flags))) { | 1311 | while (!(page = follow_page(vma, start, foll_flags))) { |
1312 | int ret; | 1312 | int ret; |
1313 | |||
1313 | ret = handle_mm_fault(mm, vma, start, | 1314 | ret = handle_mm_fault(mm, vma, start, |
1314 | foll_flags & FOLL_WRITE); | 1315 | (foll_flags & FOLL_WRITE) ? |
1316 | FAULT_FLAG_WRITE : 0); | ||
1317 | |||
1315 | if (ret & VM_FAULT_ERROR) { | 1318 | if (ret & VM_FAULT_ERROR) { |
1316 | if (ret & VM_FAULT_OOM) | 1319 | if (ret & VM_FAULT_OOM) |
1317 | return i ? i : -ENOMEM; | 1320 | return i ? i : -ENOMEM; |
@@ -1360,6 +1363,56 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1360 | return i; | 1363 | return i; |
1361 | } | 1364 | } |
1362 | 1365 | ||
1366 | /** | ||
1367 | * get_user_pages() - pin user pages in memory | ||
1368 | * @tsk: task_struct of target task | ||
1369 | * @mm: mm_struct of target mm | ||
1370 | * @start: starting user address | ||
1371 | * @len: number of pages from start to pin | ||
1372 | * @write: whether pages will be written to by the caller | ||
1373 | * @force: whether to force write access even if user mapping is | ||
1374 | * readonly. This will result in the page being COWed even | ||
1375 | * in MAP_SHARED mappings. You do not want this. | ||
1376 | * @pages: array that receives pointers to the pages pinned. | ||
1377 | * Should be at least len entries long. Or NULL, if caller | ||
1378 | * only intends to ensure the pages are faulted in. | ||
1379 | * @vmas: array of pointers to vmas corresponding to each page. | ||
1380 | * Or NULL if the caller does not require them. | ||
1381 | * | ||
1382 | * Returns number of pages pinned. This may be fewer than the number | ||
1383 | * requested. If len is 0 or negative, returns 0. If no pages | ||
1384 | * were pinned, returns -errno. Each page returned must be released | ||
1385 | * with a put_page() call when it is finished with. vmas will only | ||
1386 | * remain valid while mmap_sem is held. | ||
1387 | * | ||
1388 | * Must be called with mmap_sem held for read or write. | ||
1389 | * | ||
1390 | * get_user_pages walks a process's page tables and takes a reference to | ||
1391 | * each struct page that each user address corresponds to at a given | ||
1392 | * instant. That is, it takes the page that would be accessed if a user | ||
1393 | * thread accesses the given user virtual address at that instant. | ||
1394 | * | ||
1395 | * This does not guarantee that the page exists in the user mappings when | ||
1396 | * get_user_pages returns, and there may even be a completely different | ||
1397 | * page there in some cases (eg. if mmapped pagecache has been invalidated | ||
1398 | * and subsequently re-faulted). However it does guarantee that the page | ||
1399 | * won't be freed completely. And mostly callers simply care that the page | ||
1400 | * contains data that was valid *at some point in time*. Typically, an IO | ||
1401 | * or similar operation cannot guarantee anything stronger anyway because | ||
1402 | * locks can't be held over the syscall boundary. | ||
1403 | * | ||
1404 | * If write=0, the page must not be written to. If the page is written to, | ||
1405 | * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called | ||
1406 | * after the page is finished with, and before put_page is called. | ||
1407 | * | ||
1408 | * get_user_pages is typically used for fewer-copy IO operations, to get a | ||
1409 | * handle on the memory by some means other than accesses via the user virtual | ||
1410 | * addresses. The pages may be submitted for DMA to devices or accessed via | ||
1411 | * their kernel linear mapping (via the kmap APIs). Care should be taken to | ||
1412 | * use the correct cache flushing APIs. | ||
1413 | * | ||
1414 | * See also get_user_pages_fast, for performance critical applications. | ||
1415 | */ | ||
1363 | int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | 1416 | int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, |
1364 | unsigned long start, int len, int write, int force, | 1417 | unsigned long start, int len, int write, int force, |
1365 | struct page **pages, struct vm_area_struct **vmas) | 1418 | struct page **pages, struct vm_area_struct **vmas) |
@@ -2446,7 +2499,7 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) | |||
2446 | */ | 2499 | */ |
2447 | static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | 2500 | static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, |
2448 | unsigned long address, pte_t *page_table, pmd_t *pmd, | 2501 | unsigned long address, pte_t *page_table, pmd_t *pmd, |
2449 | int write_access, pte_t orig_pte) | 2502 | unsigned int flags, pte_t orig_pte) |
2450 | { | 2503 | { |
2451 | spinlock_t *ptl; | 2504 | spinlock_t *ptl; |
2452 | struct page *page; | 2505 | struct page *page; |
@@ -2466,7 +2519,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2466 | delayacct_set_flag(DELAYACCT_PF_SWAPIN); | 2519 | delayacct_set_flag(DELAYACCT_PF_SWAPIN); |
2467 | page = lookup_swap_cache(entry); | 2520 | page = lookup_swap_cache(entry); |
2468 | if (!page) { | 2521 | if (!page) { |
2469 | grab_swap_token(); /* Contend for token _before_ read-in */ | 2522 | grab_swap_token(mm); /* Contend for token _before_ read-in */ |
2470 | page = swapin_readahead(entry, | 2523 | page = swapin_readahead(entry, |
2471 | GFP_HIGHUSER_MOVABLE, vma, address); | 2524 | GFP_HIGHUSER_MOVABLE, vma, address); |
2472 | if (!page) { | 2525 | if (!page) { |
@@ -2522,9 +2575,9 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2522 | 2575 | ||
2523 | inc_mm_counter(mm, anon_rss); | 2576 | inc_mm_counter(mm, anon_rss); |
2524 | pte = mk_pte(page, vma->vm_page_prot); | 2577 | pte = mk_pte(page, vma->vm_page_prot); |
2525 | if (write_access && reuse_swap_page(page)) { | 2578 | if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { |
2526 | pte = maybe_mkwrite(pte_mkdirty(pte), vma); | 2579 | pte = maybe_mkwrite(pte_mkdirty(pte), vma); |
2527 | write_access = 0; | 2580 | flags &= ~FAULT_FLAG_WRITE; |
2528 | } | 2581 | } |
2529 | flush_icache_page(vma, page); | 2582 | flush_icache_page(vma, page); |
2530 | set_pte_at(mm, address, page_table, pte); | 2583 | set_pte_at(mm, address, page_table, pte); |
@@ -2537,7 +2590,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2537 | try_to_free_swap(page); | 2590 | try_to_free_swap(page); |
2538 | unlock_page(page); | 2591 | unlock_page(page); |
2539 | 2592 | ||
2540 | if (write_access) { | 2593 | if (flags & FAULT_FLAG_WRITE) { |
2541 | ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte); | 2594 | ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte); |
2542 | if (ret & VM_FAULT_ERROR) | 2595 | if (ret & VM_FAULT_ERROR) |
2543 | ret &= VM_FAULT_ERROR; | 2596 | ret &= VM_FAULT_ERROR; |
@@ -2566,7 +2619,7 @@ out_page: | |||
2566 | */ | 2619 | */ |
2567 | static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | 2620 | static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, |
2568 | unsigned long address, pte_t *page_table, pmd_t *pmd, | 2621 | unsigned long address, pte_t *page_table, pmd_t *pmd, |
2569 | int write_access) | 2622 | unsigned int flags) |
2570 | { | 2623 | { |
2571 | struct page *page; | 2624 | struct page *page; |
2572 | spinlock_t *ptl; | 2625 | spinlock_t *ptl; |
@@ -2726,7 +2779,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2726 | * due to the bad i386 page protection. But it's valid | 2779 | * due to the bad i386 page protection. But it's valid |
2727 | * for other architectures too. | 2780 | * for other architectures too. |
2728 | * | 2781 | * |
2729 | * Note that if write_access is true, we either now have | 2782 | * Note that if FAULT_FLAG_WRITE is set, we either now have |
2730 | * an exclusive copy of the page, or this is a shared mapping, | 2783 | * an exclusive copy of the page, or this is a shared mapping, |
2731 | * so we can make it writable and dirty to avoid having to | 2784 | * so we can make it writable and dirty to avoid having to |
2732 | * handle that later. | 2785 | * handle that later. |
@@ -2797,11 +2850,10 @@ unwritable_page: | |||
2797 | 2850 | ||
2798 | static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 2851 | static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, |
2799 | unsigned long address, pte_t *page_table, pmd_t *pmd, | 2852 | unsigned long address, pte_t *page_table, pmd_t *pmd, |
2800 | int write_access, pte_t orig_pte) | 2853 | unsigned int flags, pte_t orig_pte) |
2801 | { | 2854 | { |
2802 | pgoff_t pgoff = (((address & PAGE_MASK) | 2855 | pgoff_t pgoff = (((address & PAGE_MASK) |
2803 | - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; | 2856 | - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; |
2804 | unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0); | ||
2805 | 2857 | ||
2806 | pte_unmap(page_table); | 2858 | pte_unmap(page_table); |
2807 | return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); | 2859 | return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); |
@@ -2818,12 +2870,12 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2818 | */ | 2870 | */ |
2819 | static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 2871 | static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, |
2820 | unsigned long address, pte_t *page_table, pmd_t *pmd, | 2872 | unsigned long address, pte_t *page_table, pmd_t *pmd, |
2821 | int write_access, pte_t orig_pte) | 2873 | unsigned int flags, pte_t orig_pte) |
2822 | { | 2874 | { |
2823 | unsigned int flags = FAULT_FLAG_NONLINEAR | | ||
2824 | (write_access ? FAULT_FLAG_WRITE : 0); | ||
2825 | pgoff_t pgoff; | 2875 | pgoff_t pgoff; |
2826 | 2876 | ||
2877 | flags |= FAULT_FLAG_NONLINEAR; | ||
2878 | |||
2827 | if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) | 2879 | if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) |
2828 | return 0; | 2880 | return 0; |
2829 | 2881 | ||
@@ -2854,7 +2906,7 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2854 | */ | 2906 | */ |
2855 | static inline int handle_pte_fault(struct mm_struct *mm, | 2907 | static inline int handle_pte_fault(struct mm_struct *mm, |
2856 | struct vm_area_struct *vma, unsigned long address, | 2908 | struct vm_area_struct *vma, unsigned long address, |
2857 | pte_t *pte, pmd_t *pmd, int write_access) | 2909 | pte_t *pte, pmd_t *pmd, unsigned int flags) |
2858 | { | 2910 | { |
2859 | pte_t entry; | 2911 | pte_t entry; |
2860 | spinlock_t *ptl; | 2912 | spinlock_t *ptl; |
@@ -2865,30 +2917,30 @@ static inline int handle_pte_fault(struct mm_struct *mm, | |||
2865 | if (vma->vm_ops) { | 2917 | if (vma->vm_ops) { |
2866 | if (likely(vma->vm_ops->fault)) | 2918 | if (likely(vma->vm_ops->fault)) |
2867 | return do_linear_fault(mm, vma, address, | 2919 | return do_linear_fault(mm, vma, address, |
2868 | pte, pmd, write_access, entry); | 2920 | pte, pmd, flags, entry); |
2869 | } | 2921 | } |
2870 | return do_anonymous_page(mm, vma, address, | 2922 | return do_anonymous_page(mm, vma, address, |
2871 | pte, pmd, write_access); | 2923 | pte, pmd, flags); |
2872 | } | 2924 | } |
2873 | if (pte_file(entry)) | 2925 | if (pte_file(entry)) |
2874 | return do_nonlinear_fault(mm, vma, address, | 2926 | return do_nonlinear_fault(mm, vma, address, |
2875 | pte, pmd, write_access, entry); | 2927 | pte, pmd, flags, entry); |
2876 | return do_swap_page(mm, vma, address, | 2928 | return do_swap_page(mm, vma, address, |
2877 | pte, pmd, write_access, entry); | 2929 | pte, pmd, flags, entry); |
2878 | } | 2930 | } |
2879 | 2931 | ||
2880 | ptl = pte_lockptr(mm, pmd); | 2932 | ptl = pte_lockptr(mm, pmd); |
2881 | spin_lock(ptl); | 2933 | spin_lock(ptl); |
2882 | if (unlikely(!pte_same(*pte, entry))) | 2934 | if (unlikely(!pte_same(*pte, entry))) |
2883 | goto unlock; | 2935 | goto unlock; |
2884 | if (write_access) { | 2936 | if (flags & FAULT_FLAG_WRITE) { |
2885 | if (!pte_write(entry)) | 2937 | if (!pte_write(entry)) |
2886 | return do_wp_page(mm, vma, address, | 2938 | return do_wp_page(mm, vma, address, |
2887 | pte, pmd, ptl, entry); | 2939 | pte, pmd, ptl, entry); |
2888 | entry = pte_mkdirty(entry); | 2940 | entry = pte_mkdirty(entry); |
2889 | } | 2941 | } |
2890 | entry = pte_mkyoung(entry); | 2942 | entry = pte_mkyoung(entry); |
2891 | if (ptep_set_access_flags(vma, address, pte, entry, write_access)) { | 2943 | if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) { |
2892 | update_mmu_cache(vma, address, entry); | 2944 | update_mmu_cache(vma, address, entry); |
2893 | } else { | 2945 | } else { |
2894 | /* | 2946 | /* |
@@ -2897,7 +2949,7 @@ static inline int handle_pte_fault(struct mm_struct *mm, | |||
2897 | * This still avoids useless tlb flushes for .text page faults | 2949 | * This still avoids useless tlb flushes for .text page faults |
2898 | * with threads. | 2950 | * with threads. |
2899 | */ | 2951 | */ |
2900 | if (write_access) | 2952 | if (flags & FAULT_FLAG_WRITE) |
2901 | flush_tlb_page(vma, address); | 2953 | flush_tlb_page(vma, address); |
2902 | } | 2954 | } |
2903 | unlock: | 2955 | unlock: |
@@ -2909,7 +2961,7 @@ unlock: | |||
2909 | * By the time we get here, we already hold the mm semaphore | 2961 | * By the time we get here, we already hold the mm semaphore |
2910 | */ | 2962 | */ |
2911 | int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 2963 | int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, |
2912 | unsigned long address, int write_access) | 2964 | unsigned long address, unsigned int flags) |
2913 | { | 2965 | { |
2914 | pgd_t *pgd; | 2966 | pgd_t *pgd; |
2915 | pud_t *pud; | 2967 | pud_t *pud; |
@@ -2921,7 +2973,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2921 | count_vm_event(PGFAULT); | 2973 | count_vm_event(PGFAULT); |
2922 | 2974 | ||
2923 | if (unlikely(is_vm_hugetlb_page(vma))) | 2975 | if (unlikely(is_vm_hugetlb_page(vma))) |
2924 | return hugetlb_fault(mm, vma, address, write_access); | 2976 | return hugetlb_fault(mm, vma, address, flags); |
2925 | 2977 | ||
2926 | pgd = pgd_offset(mm, address); | 2978 | pgd = pgd_offset(mm, address); |
2927 | pud = pud_alloc(mm, pgd, address); | 2979 | pud = pud_alloc(mm, pgd, address); |
@@ -2934,7 +2986,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2934 | if (!pte) | 2986 | if (!pte) |
2935 | return VM_FAULT_OOM; | 2987 | return VM_FAULT_OOM; |
2936 | 2988 | ||
2937 | return handle_pte_fault(mm, vma, address, pte, pmd, write_access); | 2989 | return handle_pte_fault(mm, vma, address, pte, pmd, flags); |
2938 | } | 2990 | } |
2939 | 2991 | ||
2940 | #ifndef __PAGETABLE_PUD_FOLDED | 2992 | #ifndef __PAGETABLE_PUD_FOLDED |
@@ -3053,22 +3105,13 @@ int in_gate_area_no_task(unsigned long addr) | |||
3053 | 3105 | ||
3054 | #endif /* __HAVE_ARCH_GATE_AREA */ | 3106 | #endif /* __HAVE_ARCH_GATE_AREA */ |
3055 | 3107 | ||
3056 | #ifdef CONFIG_HAVE_IOREMAP_PROT | 3108 | static int follow_pte(struct mm_struct *mm, unsigned long address, |
3057 | int follow_phys(struct vm_area_struct *vma, | 3109 | pte_t **ptepp, spinlock_t **ptlp) |
3058 | unsigned long address, unsigned int flags, | ||
3059 | unsigned long *prot, resource_size_t *phys) | ||
3060 | { | 3110 | { |
3061 | pgd_t *pgd; | 3111 | pgd_t *pgd; |
3062 | pud_t *pud; | 3112 | pud_t *pud; |
3063 | pmd_t *pmd; | 3113 | pmd_t *pmd; |
3064 | pte_t *ptep, pte; | 3114 | pte_t *ptep; |
3065 | spinlock_t *ptl; | ||
3066 | resource_size_t phys_addr = 0; | ||
3067 | struct mm_struct *mm = vma->vm_mm; | ||
3068 | int ret = -EINVAL; | ||
3069 | |||
3070 | if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) | ||
3071 | goto out; | ||
3072 | 3115 | ||
3073 | pgd = pgd_offset(mm, address); | 3116 | pgd = pgd_offset(mm, address); |
3074 | if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) | 3117 | if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) |
@@ -3086,22 +3129,71 @@ int follow_phys(struct vm_area_struct *vma, | |||
3086 | if (pmd_huge(*pmd)) | 3129 | if (pmd_huge(*pmd)) |
3087 | goto out; | 3130 | goto out; |
3088 | 3131 | ||
3089 | ptep = pte_offset_map_lock(mm, pmd, address, &ptl); | 3132 | ptep = pte_offset_map_lock(mm, pmd, address, ptlp); |
3090 | if (!ptep) | 3133 | if (!ptep) |
3091 | goto out; | 3134 | goto out; |
3135 | if (!pte_present(*ptep)) | ||
3136 | goto unlock; | ||
3137 | *ptepp = ptep; | ||
3138 | return 0; | ||
3139 | unlock: | ||
3140 | pte_unmap_unlock(ptep, *ptlp); | ||
3141 | out: | ||
3142 | return -EINVAL; | ||
3143 | } | ||
3092 | 3144 | ||
3145 | /** | ||
3146 | * follow_pfn - look up PFN at a user virtual address | ||
3147 | * @vma: memory mapping | ||
3148 | * @address: user virtual address | ||
3149 | * @pfn: location to store found PFN | ||
3150 | * | ||
3151 | * Only IO mappings and raw PFN mappings are allowed. | ||
3152 | * | ||
3153 | * Returns zero and stores the PFN at @pfn on success, a negative errno otherwise. | ||
3154 | */ | ||
3155 | int follow_pfn(struct vm_area_struct *vma, unsigned long address, | ||
3156 | unsigned long *pfn) | ||
3157 | { | ||
3158 | int ret = -EINVAL; | ||
3159 | spinlock_t *ptl; | ||
3160 | pte_t *ptep; | ||
3161 | |||
3162 | if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) | ||
3163 | return ret; | ||
3164 | |||
3165 | ret = follow_pte(vma->vm_mm, address, &ptep, &ptl); | ||
3166 | if (ret) | ||
3167 | return ret; | ||
3168 | *pfn = pte_pfn(*ptep); | ||
3169 | pte_unmap_unlock(ptep, ptl); | ||
3170 | return 0; | ||
3171 | } | ||
3172 | EXPORT_SYMBOL(follow_pfn); | ||
3173 | |||
3174 | #ifdef CONFIG_HAVE_IOREMAP_PROT | ||
3175 | int follow_phys(struct vm_area_struct *vma, | ||
3176 | unsigned long address, unsigned int flags, | ||
3177 | unsigned long *prot, resource_size_t *phys) | ||
3178 | { | ||
3179 | int ret = -EINVAL; | ||
3180 | pte_t *ptep, pte; | ||
3181 | spinlock_t *ptl; | ||
3182 | |||
3183 | if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) | ||
3184 | goto out; | ||
3185 | |||
3186 | if (follow_pte(vma->vm_mm, address, &ptep, &ptl)) | ||
3187 | goto out; | ||
3093 | pte = *ptep; | 3188 | pte = *ptep; |
3094 | if (!pte_present(pte)) | 3189 | |
3095 | goto unlock; | ||
3096 | if ((flags & FOLL_WRITE) && !pte_write(pte)) | 3190 | if ((flags & FOLL_WRITE) && !pte_write(pte)) |
3097 | goto unlock; | 3191 | goto unlock; |
3098 | phys_addr = pte_pfn(pte); | ||
3099 | phys_addr <<= PAGE_SHIFT; /* Shift here to avoid overflow on PAE */ | ||
3100 | 3192 | ||
3101 | *prot = pgprot_val(pte_pgprot(pte)); | 3193 | *prot = pgprot_val(pte_pgprot(pte)); |
3102 | *phys = phys_addr; | 3194 | *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT; |
3103 | ret = 0; | ||
3104 | 3195 | ||
3196 | ret = 0; | ||
3105 | unlock: | 3197 | unlock: |
3106 | pte_unmap_unlock(ptep, ptl); | 3198 | pte_unmap_unlock(ptep, ptl); |
3107 | out: | 3199 | out: |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c083cf5fd6df..e4412a676c88 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -422,7 +422,8 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) | |||
422 | zone->present_pages += onlined_pages; | 422 | zone->present_pages += onlined_pages; |
423 | zone->zone_pgdat->node_present_pages += onlined_pages; | 423 | zone->zone_pgdat->node_present_pages += onlined_pages; |
424 | 424 | ||
425 | setup_per_zone_pages_min(); | 425 | setup_per_zone_wmarks(); |
426 | calculate_zone_inactive_ratio(zone); | ||
426 | if (onlined_pages) { | 427 | if (onlined_pages) { |
427 | kswapd_run(zone_to_nid(zone)); | 428 | kswapd_run(zone_to_nid(zone)); |
428 | node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); | 429 | node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); |
@@ -832,6 +833,9 @@ repeat: | |||
832 | totalram_pages -= offlined_pages; | 833 | totalram_pages -= offlined_pages; |
833 | num_physpages -= offlined_pages; | 834 | num_physpages -= offlined_pages; |
834 | 835 | ||
836 | setup_per_zone_wmarks(); | ||
837 | calculate_zone_inactive_ratio(zone); | ||
838 | |||
835 | vm_total_pages = nr_free_pagecache_pages(); | 839 | vm_total_pages = nr_free_pagecache_pages(); |
836 | writeback_set_ratelimit(); | 840 | writeback_set_ratelimit(); |
837 | 841 | ||
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 3eb4a6fdc043..e08e2c4da63a 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -182,13 +182,54 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes) | |||
182 | return 0; | 182 | return 0; |
183 | } | 183 | } |
184 | 184 | ||
185 | /* Create a new policy */ | 185 | /* |
186 | * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if | ||
187 | * any, for the new policy. mpol_new() has already validated the nodes | ||
188 | * parameter with respect to the policy mode and flags. But, we need to | ||
189 | * handle an empty nodemask with MPOL_PREFERRED here. | ||
190 | * | ||
191 | * Must be called holding task's alloc_lock to protect task's mems_allowed | ||
192 | * and mempolicy. May also be called holding the mmap_semaphore for write. | ||
193 | */ | ||
194 | static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes) | ||
195 | { | ||
196 | nodemask_t cpuset_context_nmask; | ||
197 | int ret; | ||
198 | |||
199 | /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */ | ||
200 | if (pol == NULL) | ||
201 | return 0; | ||
202 | |||
203 | VM_BUG_ON(!nodes); | ||
204 | if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes)) | ||
205 | nodes = NULL; /* explicit local allocation */ | ||
206 | else { | ||
207 | if (pol->flags & MPOL_F_RELATIVE_NODES) | ||
208 | mpol_relative_nodemask(&cpuset_context_nmask, nodes, | ||
209 | &cpuset_current_mems_allowed); | ||
210 | else | ||
211 | nodes_and(cpuset_context_nmask, *nodes, | ||
212 | cpuset_current_mems_allowed); | ||
213 | if (mpol_store_user_nodemask(pol)) | ||
214 | pol->w.user_nodemask = *nodes; | ||
215 | else | ||
216 | pol->w.cpuset_mems_allowed = | ||
217 | cpuset_current_mems_allowed; | ||
218 | } | ||
219 | |||
220 | ret = mpol_ops[pol->mode].create(pol, | ||
221 | nodes ? &cpuset_context_nmask : NULL); | ||
222 | return ret; | ||
223 | } | ||
224 | |||
225 | /* | ||
226 | * This function just creates a new policy, performs some checks and simple | ||
227 | * initialization. You must invoke mpol_set_nodemask() to set nodes. | ||
228 | */ | ||
186 | static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, | 229 | static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, |
187 | nodemask_t *nodes) | 230 | nodemask_t *nodes) |
188 | { | 231 | { |
189 | struct mempolicy *policy; | 232 | struct mempolicy *policy; |
190 | nodemask_t cpuset_context_nmask; | ||
191 | int ret; | ||
192 | 233 | ||
193 | pr_debug("setting mode %d flags %d nodes[0] %lx\n", | 234 | pr_debug("setting mode %d flags %d nodes[0] %lx\n", |
194 | mode, flags, nodes ? nodes_addr(*nodes)[0] : -1); | 235 | mode, flags, nodes ? nodes_addr(*nodes)[0] : -1); |
@@ -210,7 +251,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, | |||
210 | if (((flags & MPOL_F_STATIC_NODES) || | 251 | if (((flags & MPOL_F_STATIC_NODES) || |
211 | (flags & MPOL_F_RELATIVE_NODES))) | 252 | (flags & MPOL_F_RELATIVE_NODES))) |
212 | return ERR_PTR(-EINVAL); | 253 | return ERR_PTR(-EINVAL); |
213 | nodes = NULL; /* flag local alloc */ | ||
214 | } | 254 | } |
215 | } else if (nodes_empty(*nodes)) | 255 | } else if (nodes_empty(*nodes)) |
216 | return ERR_PTR(-EINVAL); | 256 | return ERR_PTR(-EINVAL); |
@@ -221,30 +261,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, | |||
221 | policy->mode = mode; | 261 | policy->mode = mode; |
222 | policy->flags = flags; | 262 | policy->flags = flags; |
223 | 263 | ||
224 | if (nodes) { | ||
225 | /* | ||
226 | * cpuset related setup doesn't apply to local allocation | ||
227 | */ | ||
228 | cpuset_update_task_memory_state(); | ||
229 | if (flags & MPOL_F_RELATIVE_NODES) | ||
230 | mpol_relative_nodemask(&cpuset_context_nmask, nodes, | ||
231 | &cpuset_current_mems_allowed); | ||
232 | else | ||
233 | nodes_and(cpuset_context_nmask, *nodes, | ||
234 | cpuset_current_mems_allowed); | ||
235 | if (mpol_store_user_nodemask(policy)) | ||
236 | policy->w.user_nodemask = *nodes; | ||
237 | else | ||
238 | policy->w.cpuset_mems_allowed = | ||
239 | cpuset_mems_allowed(current); | ||
240 | } | ||
241 | |||
242 | ret = mpol_ops[mode].create(policy, | ||
243 | nodes ? &cpuset_context_nmask : NULL); | ||
244 | if (ret < 0) { | ||
245 | kmem_cache_free(policy_cache, policy); | ||
246 | return ERR_PTR(ret); | ||
247 | } | ||
248 | return policy; | 264 | return policy; |
249 | } | 265 | } |
250 | 266 | ||
@@ -324,6 +340,8 @@ static void mpol_rebind_policy(struct mempolicy *pol, | |||
324 | /* | 340 | /* |
325 | * Wrapper for mpol_rebind_policy() that just requires task | 341 | * Wrapper for mpol_rebind_policy() that just requires task |
326 | * pointer, and updates task mempolicy. | 342 | * pointer, and updates task mempolicy. |
343 | * | ||
344 | * Called with task's alloc_lock held. | ||
327 | */ | 345 | */ |
328 | 346 | ||
329 | void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) | 347 | void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) |
@@ -600,8 +618,9 @@ static void mpol_set_task_struct_flag(void) | |||
600 | static long do_set_mempolicy(unsigned short mode, unsigned short flags, | 618 | static long do_set_mempolicy(unsigned short mode, unsigned short flags, |
601 | nodemask_t *nodes) | 619 | nodemask_t *nodes) |
602 | { | 620 | { |
603 | struct mempolicy *new; | 621 | struct mempolicy *new, *old; |
604 | struct mm_struct *mm = current->mm; | 622 | struct mm_struct *mm = current->mm; |
623 | int ret; | ||
605 | 624 | ||
606 | new = mpol_new(mode, flags, nodes); | 625 | new = mpol_new(mode, flags, nodes); |
607 | if (IS_ERR(new)) | 626 | if (IS_ERR(new)) |
@@ -615,20 +634,33 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, | |||
615 | */ | 634 | */ |
616 | if (mm) | 635 | if (mm) |
617 | down_write(&mm->mmap_sem); | 636 | down_write(&mm->mmap_sem); |
618 | mpol_put(current->mempolicy); | 637 | task_lock(current); |
638 | ret = mpol_set_nodemask(new, nodes); | ||
639 | if (ret) { | ||
640 | task_unlock(current); | ||
641 | if (mm) | ||
642 | up_write(&mm->mmap_sem); | ||
643 | mpol_put(new); | ||
644 | return ret; | ||
645 | } | ||
646 | old = current->mempolicy; | ||
619 | current->mempolicy = new; | 647 | current->mempolicy = new; |
620 | mpol_set_task_struct_flag(); | 648 | mpol_set_task_struct_flag(); |
621 | if (new && new->mode == MPOL_INTERLEAVE && | 649 | if (new && new->mode == MPOL_INTERLEAVE && |
622 | nodes_weight(new->v.nodes)) | 650 | nodes_weight(new->v.nodes)) |
623 | current->il_next = first_node(new->v.nodes); | 651 | current->il_next = first_node(new->v.nodes); |
652 | task_unlock(current); | ||
624 | if (mm) | 653 | if (mm) |
625 | up_write(&mm->mmap_sem); | 654 | up_write(&mm->mmap_sem); |
626 | 655 | ||
656 | mpol_put(old); | ||
627 | return 0; | 657 | return 0; |
628 | } | 658 | } |
629 | 659 | ||
630 | /* | 660 | /* |
631 | * Return nodemask for policy for get_mempolicy() query | 661 | * Return nodemask for policy for get_mempolicy() query |
662 | * | ||
663 | * Called with task's alloc_lock held | ||
632 | */ | 664 | */ |
633 | static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) | 665 | static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) |
634 | { | 666 | { |
@@ -674,7 +706,6 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, | |||
674 | struct vm_area_struct *vma = NULL; | 706 | struct vm_area_struct *vma = NULL; |
675 | struct mempolicy *pol = current->mempolicy; | 707 | struct mempolicy *pol = current->mempolicy; |
676 | 708 | ||
677 | cpuset_update_task_memory_state(); | ||
678 | if (flags & | 709 | if (flags & |
679 | ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) | 710 | ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) |
680 | return -EINVAL; | 711 | return -EINVAL; |
@@ -683,7 +714,9 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, | |||
683 | if (flags & (MPOL_F_NODE|MPOL_F_ADDR)) | 714 | if (flags & (MPOL_F_NODE|MPOL_F_ADDR)) |
684 | return -EINVAL; | 715 | return -EINVAL; |
685 | *policy = 0; /* just so it's initialized */ | 716 | *policy = 0; /* just so it's initialized */ |
717 | task_lock(current); | ||
686 | *nmask = cpuset_current_mems_allowed; | 718 | *nmask = cpuset_current_mems_allowed; |
719 | task_unlock(current); | ||
687 | return 0; | 720 | return 0; |
688 | } | 721 | } |
689 | 722 | ||
@@ -738,8 +771,11 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, | |||
738 | } | 771 | } |
739 | 772 | ||
740 | err = 0; | 773 | err = 0; |
741 | if (nmask) | 774 | if (nmask) { |
775 | task_lock(current); | ||
742 | get_policy_nodemask(pol, nmask); | 776 | get_policy_nodemask(pol, nmask); |
777 | task_unlock(current); | ||
778 | } | ||
743 | 779 | ||
744 | out: | 780 | out: |
745 | mpol_cond_put(pol); | 781 | mpol_cond_put(pol); |
@@ -767,7 +803,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, | |||
767 | 803 | ||
768 | static struct page *new_node_page(struct page *page, unsigned long node, int **x) | 804 | static struct page *new_node_page(struct page *page, unsigned long node, int **x) |
769 | { | 805 | { |
770 | return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0); | 806 | return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0); |
771 | } | 807 | } |
772 | 808 | ||
773 | /* | 809 | /* |
@@ -979,6 +1015,14 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
979 | return err; | 1015 | return err; |
980 | } | 1016 | } |
981 | down_write(&mm->mmap_sem); | 1017 | down_write(&mm->mmap_sem); |
1018 | task_lock(current); | ||
1019 | err = mpol_set_nodemask(new, nmask); | ||
1020 | task_unlock(current); | ||
1021 | if (err) { | ||
1022 | up_write(&mm->mmap_sem); | ||
1023 | mpol_put(new); | ||
1024 | return err; | ||
1025 | } | ||
982 | vma = check_range(mm, start, end, nmask, | 1026 | vma = check_range(mm, start, end, nmask, |
983 | flags | MPOL_MF_INVERT, &pagelist); | 1027 | flags | MPOL_MF_INVERT, &pagelist); |
984 | 1028 | ||
@@ -1545,8 +1589,6 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) | |||
1545 | struct mempolicy *pol = get_vma_policy(current, vma, addr); | 1589 | struct mempolicy *pol = get_vma_policy(current, vma, addr); |
1546 | struct zonelist *zl; | 1590 | struct zonelist *zl; |
1547 | 1591 | ||
1548 | cpuset_update_task_memory_state(); | ||
1549 | |||
1550 | if (unlikely(pol->mode == MPOL_INTERLEAVE)) { | 1592 | if (unlikely(pol->mode == MPOL_INTERLEAVE)) { |
1551 | unsigned nid; | 1593 | unsigned nid; |
1552 | 1594 | ||
@@ -1593,8 +1635,6 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) | |||
1593 | { | 1635 | { |
1594 | struct mempolicy *pol = current->mempolicy; | 1636 | struct mempolicy *pol = current->mempolicy; |
1595 | 1637 | ||
1596 | if ((gfp & __GFP_WAIT) && !in_interrupt()) | ||
1597 | cpuset_update_task_memory_state(); | ||
1598 | if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) | 1638 | if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) |
1599 | pol = &default_policy; | 1639 | pol = &default_policy; |
1600 | 1640 | ||
@@ -1854,6 +1894,8 @@ restart: | |||
1854 | */ | 1894 | */ |
1855 | void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) | 1895 | void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) |
1856 | { | 1896 | { |
1897 | int ret; | ||
1898 | |||
1857 | sp->root = RB_ROOT; /* empty tree == default mempolicy */ | 1899 | sp->root = RB_ROOT; /* empty tree == default mempolicy */ |
1858 | spin_lock_init(&sp->lock); | 1900 | spin_lock_init(&sp->lock); |
1859 | 1901 | ||
@@ -1863,9 +1905,19 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) | |||
1863 | 1905 | ||
1864 | /* contextualize the tmpfs mount point mempolicy */ | 1906 | /* contextualize the tmpfs mount point mempolicy */ |
1865 | new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); | 1907 | new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); |
1866 | mpol_put(mpol); /* drop our ref on sb mpol */ | 1908 | if (IS_ERR(new)) { |
1867 | if (IS_ERR(new)) | 1909 | mpol_put(mpol); /* drop our ref on sb mpol */ |
1868 | return; /* no valid nodemask intersection */ | 1910 | return; /* no valid nodemask intersection */ |
1911 | } | ||
1912 | |||
1913 | task_lock(current); | ||
1914 | ret = mpol_set_nodemask(new, &mpol->w.user_nodemask); | ||
1915 | task_unlock(current); | ||
1916 | mpol_put(mpol); /* drop our ref on sb mpol */ | ||
1917 | if (ret) { | ||
1918 | mpol_put(new); | ||
1919 | return; | ||
1920 | } | ||
1869 | 1921 | ||
1870 | /* Create pseudo-vma that contains just the policy */ | 1922 | /* Create pseudo-vma that contains just the policy */ |
1871 | memset(&pvma, 0, sizeof(struct vm_area_struct)); | 1923 | memset(&pvma, 0, sizeof(struct vm_area_struct)); |
@@ -2086,8 +2138,19 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) | |||
2086 | new = mpol_new(mode, mode_flags, &nodes); | 2138 | new = mpol_new(mode, mode_flags, &nodes); |
2087 | if (IS_ERR(new)) | 2139 | if (IS_ERR(new)) |
2088 | err = 1; | 2140 | err = 1; |
2089 | else if (no_context) | 2141 | else { |
2090 | new->w.user_nodemask = nodes; /* save for contextualization */ | 2142 | int ret; |
2143 | |||
2144 | task_lock(current); | ||
2145 | ret = mpol_set_nodemask(new, &nodes); | ||
2146 | task_unlock(current); | ||
2147 | if (ret) | ||
2148 | err = 1; | ||
2149 | else if (no_context) { | ||
2150 | /* save for contextualization */ | ||
2151 | new->w.user_nodemask = nodes; | ||
2152 | } | ||
2153 | } | ||
2091 | 2154 | ||
2092 | out: | 2155 | out: |
2093 | /* Restore string for error message */ | 2156 | /* Restore string for error message */ |
diff --git a/mm/migrate.c b/mm/migrate.c index 068655d8f883..939888f9ddab 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -802,7 +802,7 @@ static struct page *new_page_node(struct page *p, unsigned long private, | |||
802 | 802 | ||
803 | *result = &pm->status; | 803 | *result = &pm->status; |
804 | 804 | ||
805 | return alloc_pages_node(pm->node, | 805 | return alloc_pages_exact_node(pm->node, |
806 | GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0); | 806 | GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0); |
807 | } | 807 | } |
808 | 808 | ||
@@ -820,7 +820,6 @@ static int do_move_page_to_node_array(struct mm_struct *mm, | |||
820 | struct page_to_node *pp; | 820 | struct page_to_node *pp; |
821 | LIST_HEAD(pagelist); | 821 | LIST_HEAD(pagelist); |
822 | 822 | ||
823 | migrate_prep(); | ||
824 | down_read(&mm->mmap_sem); | 823 | down_read(&mm->mmap_sem); |
825 | 824 | ||
826 | /* | 825 | /* |
@@ -907,6 +906,9 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task, | |||
907 | pm = (struct page_to_node *)__get_free_page(GFP_KERNEL); | 906 | pm = (struct page_to_node *)__get_free_page(GFP_KERNEL); |
908 | if (!pm) | 907 | if (!pm) |
909 | goto out; | 908 | goto out; |
909 | |||
910 | migrate_prep(); | ||
911 | |||
910 | /* | 912 | /* |
911 | * Store a chunk of page_to_node array in a page, | 913 | * Store a chunk of page_to_node array in a page, |
912 | * but keep the last one as a marker | 914 | * but keep the last one as a marker |
diff --git a/mm/mlock.c b/mm/mlock.c index cbe9e0581b75..45eb650b9654 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
@@ -31,7 +31,6 @@ int can_do_mlock(void) | |||
31 | } | 31 | } |
32 | EXPORT_SYMBOL(can_do_mlock); | 32 | EXPORT_SYMBOL(can_do_mlock); |
33 | 33 | ||
34 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
35 | /* | 34 | /* |
36 | * Mlocked pages are marked with PageMlocked() flag for efficient testing | 35 | * Mlocked pages are marked with PageMlocked() flag for efficient testing |
37 | * in vmscan and, possibly, the fault path; and to support semi-accurate | 36 | * in vmscan and, possibly, the fault path; and to support semi-accurate |
@@ -261,27 +260,6 @@ static int __mlock_posix_error_return(long retval) | |||
261 | return retval; | 260 | return retval; |
262 | } | 261 | } |
263 | 262 | ||
264 | #else /* CONFIG_UNEVICTABLE_LRU */ | ||
265 | |||
266 | /* | ||
267 | * Just make pages present if VM_LOCKED. No-op if unlocking. | ||
268 | */ | ||
269 | static long __mlock_vma_pages_range(struct vm_area_struct *vma, | ||
270 | unsigned long start, unsigned long end, | ||
271 | int mlock) | ||
272 | { | ||
273 | if (mlock && (vma->vm_flags & VM_LOCKED)) | ||
274 | return make_pages_present(start, end); | ||
275 | return 0; | ||
276 | } | ||
277 | |||
278 | static inline int __mlock_posix_error_return(long retval) | ||
279 | { | ||
280 | return 0; | ||
281 | } | ||
282 | |||
283 | #endif /* CONFIG_UNEVICTABLE_LRU */ | ||
284 | |||
285 | /** | 263 | /** |
286 | * mlock_vma_pages_range() - mlock pages in specified vma range. | 264 | * mlock_vma_pages_range() - mlock pages in specified vma range. |
287 | * @vma - the vma containing the specfied address range | 265 | * @vma - the vma containing the specfied address range |
@@ -629,52 +607,43 @@ void user_shm_unlock(size_t size, struct user_struct *user) | |||
629 | free_uid(user); | 607 | free_uid(user); |
630 | } | 608 | } |
631 | 609 | ||
632 | void *alloc_locked_buffer(size_t size) | 610 | int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim, |
611 | size_t size) | ||
633 | { | 612 | { |
634 | unsigned long rlim, vm, pgsz; | 613 | unsigned long lim, vm, pgsz; |
635 | void *buffer = NULL; | 614 | int error = -ENOMEM; |
636 | 615 | ||
637 | pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; | 616 | pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; |
638 | 617 | ||
639 | down_write(&current->mm->mmap_sem); | 618 | down_write(&mm->mmap_sem); |
640 | 619 | ||
641 | rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; | 620 | lim = rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; |
642 | vm = current->mm->total_vm + pgsz; | 621 | vm = mm->total_vm + pgsz; |
643 | if (rlim < vm) | 622 | if (lim < vm) |
644 | goto out; | 623 | goto out; |
645 | 624 | ||
646 | rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; | 625 | lim = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; |
647 | vm = current->mm->locked_vm + pgsz; | 626 | vm = mm->locked_vm + pgsz; |
648 | if (rlim < vm) | 627 | if (lim < vm) |
649 | goto out; | 628 | goto out; |
650 | 629 | ||
651 | buffer = kzalloc(size, GFP_KERNEL); | 630 | mm->total_vm += pgsz; |
652 | if (!buffer) | 631 | mm->locked_vm += pgsz; |
653 | goto out; | ||
654 | |||
655 | current->mm->total_vm += pgsz; | ||
656 | current->mm->locked_vm += pgsz; | ||
657 | 632 | ||
633 | error = 0; | ||
658 | out: | 634 | out: |
659 | up_write(&current->mm->mmap_sem); | 635 | up_write(&mm->mmap_sem); |
660 | return buffer; | 636 | return error; |
661 | } | 637 | } |
662 | 638 | ||
663 | void release_locked_buffer(void *buffer, size_t size) | 639 | void refund_locked_memory(struct mm_struct *mm, size_t size) |
664 | { | 640 | { |
665 | unsigned long pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; | 641 | unsigned long pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; |
666 | 642 | ||
667 | down_write(&current->mm->mmap_sem); | 643 | down_write(&mm->mmap_sem); |
668 | |||
669 | current->mm->total_vm -= pgsz; | ||
670 | current->mm->locked_vm -= pgsz; | ||
671 | |||
672 | up_write(&current->mm->mmap_sem); | ||
673 | } | ||
674 | 644 | ||
675 | void free_locked_buffer(void *buffer, size_t size) | 645 | mm->total_vm -= pgsz; |
676 | { | 646 | mm->locked_vm -= pgsz; |
677 | release_locked_buffer(buffer, size); | ||
678 | 647 | ||
679 | kfree(buffer); | 648 | up_write(&mm->mmap_sem); |
680 | } | 649 | } |
@@ -28,6 +28,7 @@ | |||
28 | #include <linux/mempolicy.h> | 28 | #include <linux/mempolicy.h> |
29 | #include <linux/rmap.h> | 29 | #include <linux/rmap.h> |
30 | #include <linux/mmu_notifier.h> | 30 | #include <linux/mmu_notifier.h> |
31 | #include <linux/perf_counter.h> | ||
31 | 32 | ||
32 | #include <asm/uaccess.h> | 33 | #include <asm/uaccess.h> |
33 | #include <asm/cacheflush.h> | 34 | #include <asm/cacheflush.h> |
@@ -87,6 +88,9 @@ int sysctl_overcommit_ratio = 50; /* default is 50% */ | |||
87 | int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; | 88 | int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; |
88 | struct percpu_counter vm_committed_as; | 89 | struct percpu_counter vm_committed_as; |
89 | 90 | ||
91 | /* amount of vm to protect from userspace access */ | ||
92 | unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR; | ||
93 | |||
90 | /* | 94 | /* |
91 | * Check that a process has enough memory to allocate a new virtual | 95 | * Check that a process has enough memory to allocate a new virtual |
92 | * mapping. 0 means there is enough memory for the allocation to | 96 | * mapping. 0 means there is enough memory for the allocation to |
@@ -1219,6 +1223,8 @@ munmap_back: | |||
1219 | if (correct_wcount) | 1223 | if (correct_wcount) |
1220 | atomic_inc(&inode->i_writecount); | 1224 | atomic_inc(&inode->i_writecount); |
1221 | out: | 1225 | out: |
1226 | perf_counter_mmap(vma); | ||
1227 | |||
1222 | mm->total_vm += len >> PAGE_SHIFT; | 1228 | mm->total_vm += len >> PAGE_SHIFT; |
1223 | vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); | 1229 | vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); |
1224 | if (vm_flags & VM_LOCKED) { | 1230 | if (vm_flags & VM_LOCKED) { |
@@ -2305,6 +2311,8 @@ int install_special_mapping(struct mm_struct *mm, | |||
2305 | 2311 | ||
2306 | mm->total_vm += len >> PAGE_SHIFT; | 2312 | mm->total_vm += len >> PAGE_SHIFT; |
2307 | 2313 | ||
2314 | perf_counter_mmap(vma); | ||
2315 | |||
2308 | return 0; | 2316 | return 0; |
2309 | } | 2317 | } |
2310 | 2318 | ||
diff --git a/mm/mprotect.c b/mm/mprotect.c index 258197b76fb4..d80311baeb2d 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
@@ -23,6 +23,7 @@ | |||
23 | #include <linux/swapops.h> | 23 | #include <linux/swapops.h> |
24 | #include <linux/mmu_notifier.h> | 24 | #include <linux/mmu_notifier.h> |
25 | #include <linux/migrate.h> | 25 | #include <linux/migrate.h> |
26 | #include <linux/perf_counter.h> | ||
26 | #include <asm/uaccess.h> | 27 | #include <asm/uaccess.h> |
27 | #include <asm/pgtable.h> | 28 | #include <asm/pgtable.h> |
28 | #include <asm/cacheflush.h> | 29 | #include <asm/cacheflush.h> |
@@ -299,6 +300,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, | |||
299 | error = mprotect_fixup(vma, &prev, nstart, tmp, newflags); | 300 | error = mprotect_fixup(vma, &prev, nstart, tmp, newflags); |
300 | if (error) | 301 | if (error) |
301 | goto out; | 302 | goto out; |
303 | perf_counter_mmap(vma); | ||
302 | nstart = tmp; | 304 | nstart = tmp; |
303 | 305 | ||
304 | if (nstart < prev->vm_end) | 306 | if (nstart < prev->vm_end) |
diff --git a/mm/nommu.c b/mm/nommu.c index b571ef707428..2fd2ad5da98e 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -69,6 +69,9 @@ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; | |||
69 | int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; | 69 | int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; |
70 | int heap_stack_gap = 0; | 70 | int heap_stack_gap = 0; |
71 | 71 | ||
72 | /* amount of vm to protect from userspace access */ | ||
73 | unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR; | ||
74 | |||
72 | atomic_long_t mmap_pages_allocated; | 75 | atomic_long_t mmap_pages_allocated; |
73 | 76 | ||
74 | EXPORT_SYMBOL(mem_map); | 77 | EXPORT_SYMBOL(mem_map); |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index a7b2460e922b..175a67a78a99 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -58,6 +58,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) | |||
58 | unsigned long points, cpu_time, run_time; | 58 | unsigned long points, cpu_time, run_time; |
59 | struct mm_struct *mm; | 59 | struct mm_struct *mm; |
60 | struct task_struct *child; | 60 | struct task_struct *child; |
61 | int oom_adj; | ||
61 | 62 | ||
62 | task_lock(p); | 63 | task_lock(p); |
63 | mm = p->mm; | 64 | mm = p->mm; |
@@ -65,6 +66,11 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) | |||
65 | task_unlock(p); | 66 | task_unlock(p); |
66 | return 0; | 67 | return 0; |
67 | } | 68 | } |
69 | oom_adj = mm->oom_adj; | ||
70 | if (oom_adj == OOM_DISABLE) { | ||
71 | task_unlock(p); | ||
72 | return 0; | ||
73 | } | ||
68 | 74 | ||
69 | /* | 75 | /* |
70 | * The memory size of the process is the basis for the badness. | 76 | * The memory size of the process is the basis for the badness. |
@@ -148,15 +154,15 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) | |||
148 | points /= 8; | 154 | points /= 8; |
149 | 155 | ||
150 | /* | 156 | /* |
151 | * Adjust the score by oomkilladj. | 157 | * Adjust the score by oom_adj. |
152 | */ | 158 | */ |
153 | if (p->oomkilladj) { | 159 | if (oom_adj) { |
154 | if (p->oomkilladj > 0) { | 160 | if (oom_adj > 0) { |
155 | if (!points) | 161 | if (!points) |
156 | points = 1; | 162 | points = 1; |
157 | points <<= p->oomkilladj; | 163 | points <<= oom_adj; |
158 | } else | 164 | } else |
159 | points >>= -(p->oomkilladj); | 165 | points >>= -(oom_adj); |
160 | } | 166 | } |
161 | 167 | ||
162 | #ifdef DEBUG | 168 | #ifdef DEBUG |
@@ -251,11 +257,8 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, | |||
251 | *ppoints = ULONG_MAX; | 257 | *ppoints = ULONG_MAX; |
252 | } | 258 | } |
253 | 259 | ||
254 | if (p->oomkilladj == OOM_DISABLE) | ||
255 | continue; | ||
256 | |||
257 | points = badness(p, uptime.tv_sec); | 260 | points = badness(p, uptime.tv_sec); |
258 | if (points > *ppoints || !chosen) { | 261 | if (points > *ppoints) { |
259 | chosen = p; | 262 | chosen = p; |
260 | *ppoints = points; | 263 | *ppoints = points; |
261 | } | 264 | } |
@@ -304,8 +307,7 @@ static void dump_tasks(const struct mem_cgroup *mem) | |||
304 | } | 307 | } |
305 | printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", | 308 | printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", |
306 | p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, | 309 | p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, |
307 | get_mm_rss(mm), (int)task_cpu(p), p->oomkilladj, | 310 | get_mm_rss(mm), (int)task_cpu(p), mm->oom_adj, p->comm); |
308 | p->comm); | ||
309 | task_unlock(p); | 311 | task_unlock(p); |
310 | } while_each_thread(g, p); | 312 | } while_each_thread(g, p); |
311 | } | 313 | } |
@@ -323,11 +325,8 @@ static void __oom_kill_task(struct task_struct *p, int verbose) | |||
323 | return; | 325 | return; |
324 | } | 326 | } |
325 | 327 | ||
326 | if (!p->mm) { | 328 | if (!p->mm) |
327 | WARN_ON(1); | ||
328 | printk(KERN_WARNING "tried to kill an mm-less task!\n"); | ||
329 | return; | 329 | return; |
330 | } | ||
331 | 330 | ||
332 | if (verbose) | 331 | if (verbose) |
333 | printk(KERN_ERR "Killed process %d (%s)\n", | 332 | printk(KERN_ERR "Killed process %d (%s)\n", |
@@ -349,28 +348,13 @@ static int oom_kill_task(struct task_struct *p) | |||
349 | struct mm_struct *mm; | 348 | struct mm_struct *mm; |
350 | struct task_struct *g, *q; | 349 | struct task_struct *g, *q; |
351 | 350 | ||
351 | task_lock(p); | ||
352 | mm = p->mm; | 352 | mm = p->mm; |
353 | 353 | if (!mm || mm->oom_adj == OOM_DISABLE) { | |
354 | /* WARNING: mm may not be dereferenced since we did not obtain its | 354 | task_unlock(p); |
355 | * value from get_task_mm(p). This is OK since all we need to do is | ||
356 | * compare mm to q->mm below. | ||
357 | * | ||
358 | * Furthermore, even if mm contains a non-NULL value, p->mm may | ||
359 | * change to NULL at any time since we do not hold task_lock(p). | ||
360 | * However, this is of no concern to us. | ||
361 | */ | ||
362 | |||
363 | if (mm == NULL) | ||
364 | return 1; | 355 | return 1; |
365 | 356 | } | |
366 | /* | 357 | task_unlock(p); |
367 | * Don't kill the process if any threads are set to OOM_DISABLE | ||
368 | */ | ||
369 | do_each_thread(g, q) { | ||
370 | if (q->mm == mm && q->oomkilladj == OOM_DISABLE) | ||
371 | return 1; | ||
372 | } while_each_thread(g, q); | ||
373 | |||
374 | __oom_kill_task(p, 1); | 358 | __oom_kill_task(p, 1); |
375 | 359 | ||
376 | /* | 360 | /* |
@@ -393,10 +377,11 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
393 | struct task_struct *c; | 377 | struct task_struct *c; |
394 | 378 | ||
395 | if (printk_ratelimit()) { | 379 | if (printk_ratelimit()) { |
396 | printk(KERN_WARNING "%s invoked oom-killer: " | ||
397 | "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", | ||
398 | current->comm, gfp_mask, order, current->oomkilladj); | ||
399 | task_lock(current); | 380 | task_lock(current); |
381 | printk(KERN_WARNING "%s invoked oom-killer: " | ||
382 | "gfp_mask=0x%x, order=%d, oom_adj=%d\n", | ||
383 | current->comm, gfp_mask, order, | ||
384 | current->mm ? current->mm->oom_adj : OOM_DISABLE); | ||
400 | cpuset_print_task_mems_allowed(current); | 385 | cpuset_print_task_mems_allowed(current); |
401 | task_unlock(current); | 386 | task_unlock(current); |
402 | dump_stack(); | 387 | dump_stack(); |
@@ -409,8 +394,9 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
409 | /* | 394 | /* |
410 | * If the task is already exiting, don't alarm the sysadmin or kill | 395 | * If the task is already exiting, don't alarm the sysadmin or kill |
411 | * its children or threads, just set TIF_MEMDIE so it can die quickly | 396 | * its children or threads, just set TIF_MEMDIE so it can die quickly |
397 | * if its mm is still attached. | ||
412 | */ | 398 | */ |
413 | if (p->flags & PF_EXITING) { | 399 | if (p->mm && (p->flags & PF_EXITING)) { |
414 | __oom_kill_task(p, 0); | 400 | __oom_kill_task(p, 0); |
415 | return 0; | 401 | return 0; |
416 | } | 402 | } |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index bb553c3e955d..7b0dcea4935b 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -265,18 +265,19 @@ static void bdi_writeout_fraction(struct backing_dev_info *bdi, | |||
265 | * This avoids exceeding the total dirty_limit when the floating averages | 265 | * This avoids exceeding the total dirty_limit when the floating averages |
266 | * fluctuate too quickly. | 266 | * fluctuate too quickly. |
267 | */ | 267 | */ |
268 | static void | 268 | static void clip_bdi_dirty_limit(struct backing_dev_info *bdi, |
269 | clip_bdi_dirty_limit(struct backing_dev_info *bdi, long dirty, long *pbdi_dirty) | 269 | unsigned long dirty, unsigned long *pbdi_dirty) |
270 | { | 270 | { |
271 | long avail_dirty; | 271 | unsigned long avail_dirty; |
272 | 272 | ||
273 | avail_dirty = dirty - | 273 | avail_dirty = global_page_state(NR_FILE_DIRTY) + |
274 | (global_page_state(NR_FILE_DIRTY) + | ||
275 | global_page_state(NR_WRITEBACK) + | 274 | global_page_state(NR_WRITEBACK) + |
276 | global_page_state(NR_UNSTABLE_NFS) + | 275 | global_page_state(NR_UNSTABLE_NFS) + |
277 | global_page_state(NR_WRITEBACK_TEMP)); | 276 | global_page_state(NR_WRITEBACK_TEMP); |
278 | 277 | ||
279 | if (avail_dirty < 0) | 278 | if (avail_dirty < dirty) |
279 | avail_dirty = dirty - avail_dirty; | ||
280 | else | ||
280 | avail_dirty = 0; | 281 | avail_dirty = 0; |
281 | 282 | ||
282 | avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) + | 283 | avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) + |
@@ -299,10 +300,10 @@ static inline void task_dirties_fraction(struct task_struct *tsk, | |||
299 | * | 300 | * |
300 | * dirty -= (dirty/8) * p_{t} | 301 | * dirty -= (dirty/8) * p_{t} |
301 | */ | 302 | */ |
302 | static void task_dirty_limit(struct task_struct *tsk, long *pdirty) | 303 | static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty) |
303 | { | 304 | { |
304 | long numerator, denominator; | 305 | long numerator, denominator; |
305 | long dirty = *pdirty; | 306 | unsigned long dirty = *pdirty; |
306 | u64 inv = dirty >> 3; | 307 | u64 inv = dirty >> 3; |
307 | 308 | ||
308 | task_dirties_fraction(tsk, &numerator, &denominator); | 309 | task_dirties_fraction(tsk, &numerator, &denominator); |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fe753ecf2aa5..5d714f8fb303 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -23,6 +23,7 @@ | |||
23 | #include <linux/bootmem.h> | 23 | #include <linux/bootmem.h> |
24 | #include <linux/compiler.h> | 24 | #include <linux/compiler.h> |
25 | #include <linux/kernel.h> | 25 | #include <linux/kernel.h> |
26 | #include <linux/kmemcheck.h> | ||
26 | #include <linux/module.h> | 27 | #include <linux/module.h> |
27 | #include <linux/suspend.h> | 28 | #include <linux/suspend.h> |
28 | #include <linux/pagevec.h> | 29 | #include <linux/pagevec.h> |
@@ -46,6 +47,7 @@ | |||
46 | #include <linux/page-isolation.h> | 47 | #include <linux/page-isolation.h> |
47 | #include <linux/page_cgroup.h> | 48 | #include <linux/page_cgroup.h> |
48 | #include <linux/debugobjects.h> | 49 | #include <linux/debugobjects.h> |
50 | #include <linux/kmemleak.h> | ||
49 | 51 | ||
50 | #include <asm/tlbflush.h> | 52 | #include <asm/tlbflush.h> |
51 | #include <asm/div64.h> | 53 | #include <asm/div64.h> |
@@ -71,6 +73,7 @@ unsigned long totalram_pages __read_mostly; | |||
71 | unsigned long totalreserve_pages __read_mostly; | 73 | unsigned long totalreserve_pages __read_mostly; |
72 | unsigned long highest_memmap_pfn __read_mostly; | 74 | unsigned long highest_memmap_pfn __read_mostly; |
73 | int percpu_pagelist_fraction; | 75 | int percpu_pagelist_fraction; |
76 | gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; | ||
74 | 77 | ||
75 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE | 78 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE |
76 | int pageblock_order __read_mostly; | 79 | int pageblock_order __read_mostly; |
@@ -149,10 +152,6 @@ static unsigned long __meminitdata dma_reserve; | |||
149 | static int __meminitdata nr_nodemap_entries; | 152 | static int __meminitdata nr_nodemap_entries; |
150 | static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; | 153 | static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; |
151 | static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; | 154 | static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; |
152 | #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE | ||
153 | static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES]; | ||
154 | static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES]; | ||
155 | #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ | ||
156 | static unsigned long __initdata required_kernelcore; | 155 | static unsigned long __initdata required_kernelcore; |
157 | static unsigned long __initdata required_movablecore; | 156 | static unsigned long __initdata required_movablecore; |
158 | static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; | 157 | static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; |
@@ -164,17 +163,25 @@ static unsigned long __meminitdata dma_reserve; | |||
164 | 163 | ||
165 | #if MAX_NUMNODES > 1 | 164 | #if MAX_NUMNODES > 1 |
166 | int nr_node_ids __read_mostly = MAX_NUMNODES; | 165 | int nr_node_ids __read_mostly = MAX_NUMNODES; |
166 | int nr_online_nodes __read_mostly = 1; | ||
167 | EXPORT_SYMBOL(nr_node_ids); | 167 | EXPORT_SYMBOL(nr_node_ids); |
168 | EXPORT_SYMBOL(nr_online_nodes); | ||
168 | #endif | 169 | #endif |
169 | 170 | ||
170 | int page_group_by_mobility_disabled __read_mostly; | 171 | int page_group_by_mobility_disabled __read_mostly; |
171 | 172 | ||
172 | static void set_pageblock_migratetype(struct page *page, int migratetype) | 173 | static void set_pageblock_migratetype(struct page *page, int migratetype) |
173 | { | 174 | { |
175 | |||
176 | if (unlikely(page_group_by_mobility_disabled)) | ||
177 | migratetype = MIGRATE_UNMOVABLE; | ||
178 | |||
174 | set_pageblock_flags_group(page, (unsigned long)migratetype, | 179 | set_pageblock_flags_group(page, (unsigned long)migratetype, |
175 | PB_migrate, PB_migrate_end); | 180 | PB_migrate, PB_migrate_end); |
176 | } | 181 | } |
177 | 182 | ||
183 | bool oom_killer_disabled __read_mostly; | ||
184 | |||
178 | #ifdef CONFIG_DEBUG_VM | 185 | #ifdef CONFIG_DEBUG_VM |
179 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) | 186 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) |
180 | { | 187 | { |
@@ -297,23 +304,6 @@ void prep_compound_page(struct page *page, unsigned long order) | |||
297 | } | 304 | } |
298 | } | 305 | } |
299 | 306 | ||
300 | #ifdef CONFIG_HUGETLBFS | ||
301 | void prep_compound_gigantic_page(struct page *page, unsigned long order) | ||
302 | { | ||
303 | int i; | ||
304 | int nr_pages = 1 << order; | ||
305 | struct page *p = page + 1; | ||
306 | |||
307 | set_compound_page_dtor(page, free_compound_page); | ||
308 | set_compound_order(page, order); | ||
309 | __SetPageHead(page); | ||
310 | for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { | ||
311 | __SetPageTail(p); | ||
312 | p->first_page = page; | ||
313 | } | ||
314 | } | ||
315 | #endif | ||
316 | |||
317 | static int destroy_compound_page(struct page *page, unsigned long order) | 307 | static int destroy_compound_page(struct page *page, unsigned long order) |
318 | { | 308 | { |
319 | int i; | 309 | int i; |
@@ -420,7 +410,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, | |||
420 | return 0; | 410 | return 0; |
421 | 411 | ||
422 | if (PageBuddy(buddy) && page_order(buddy) == order) { | 412 | if (PageBuddy(buddy) && page_order(buddy) == order) { |
423 | BUG_ON(page_count(buddy) != 0); | 413 | VM_BUG_ON(page_count(buddy) != 0); |
424 | return 1; | 414 | return 1; |
425 | } | 415 | } |
426 | return 0; | 416 | return 0; |
@@ -451,22 +441,22 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, | |||
451 | */ | 441 | */ |
452 | 442 | ||
453 | static inline void __free_one_page(struct page *page, | 443 | static inline void __free_one_page(struct page *page, |
454 | struct zone *zone, unsigned int order) | 444 | struct zone *zone, unsigned int order, |
445 | int migratetype) | ||
455 | { | 446 | { |
456 | unsigned long page_idx; | 447 | unsigned long page_idx; |
457 | int order_size = 1 << order; | ||
458 | int migratetype = get_pageblock_migratetype(page); | ||
459 | 448 | ||
460 | if (unlikely(PageCompound(page))) | 449 | if (unlikely(PageCompound(page))) |
461 | if (unlikely(destroy_compound_page(page, order))) | 450 | if (unlikely(destroy_compound_page(page, order))) |
462 | return; | 451 | return; |
463 | 452 | ||
453 | VM_BUG_ON(migratetype == -1); | ||
454 | |||
464 | page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); | 455 | page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); |
465 | 456 | ||
466 | VM_BUG_ON(page_idx & (order_size - 1)); | 457 | VM_BUG_ON(page_idx & ((1 << order) - 1)); |
467 | VM_BUG_ON(bad_range(zone, page)); | 458 | VM_BUG_ON(bad_range(zone, page)); |
468 | 459 | ||
469 | __mod_zone_page_state(zone, NR_FREE_PAGES, order_size); | ||
470 | while (order < MAX_ORDER-1) { | 460 | while (order < MAX_ORDER-1) { |
471 | unsigned long combined_idx; | 461 | unsigned long combined_idx; |
472 | struct page *buddy; | 462 | struct page *buddy; |
@@ -490,12 +480,26 @@ static inline void __free_one_page(struct page *page, | |||
490 | zone->free_area[order].nr_free++; | 480 | zone->free_area[order].nr_free++; |
491 | } | 481 | } |
492 | 482 | ||
483 | #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT | ||
484 | /* | ||
485 | * free_page_mlock() -- clean up attempts to free and mlocked() page. | ||
486 | * Page should not be on lru, so no need to fix that up. | ||
487 | * free_pages_check() will verify... | ||
488 | */ | ||
489 | static inline void free_page_mlock(struct page *page) | ||
490 | { | ||
491 | __dec_zone_page_state(page, NR_MLOCK); | ||
492 | __count_vm_event(UNEVICTABLE_MLOCKFREED); | ||
493 | } | ||
494 | #else | ||
495 | static void free_page_mlock(struct page *page) { } | ||
496 | #endif | ||
497 | |||
493 | static inline int free_pages_check(struct page *page) | 498 | static inline int free_pages_check(struct page *page) |
494 | { | 499 | { |
495 | free_page_mlock(page); | ||
496 | if (unlikely(page_mapcount(page) | | 500 | if (unlikely(page_mapcount(page) | |
497 | (page->mapping != NULL) | | 501 | (page->mapping != NULL) | |
498 | (page_count(page) != 0) | | 502 | (atomic_read(&page->_count) != 0) | |
499 | (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) { | 503 | (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) { |
500 | bad_page(page); | 504 | bad_page(page); |
501 | return 1; | 505 | return 1; |
@@ -522,6 +526,8 @@ static void free_pages_bulk(struct zone *zone, int count, | |||
522 | spin_lock(&zone->lock); | 526 | spin_lock(&zone->lock); |
523 | zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); | 527 | zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); |
524 | zone->pages_scanned = 0; | 528 | zone->pages_scanned = 0; |
529 | |||
530 | __mod_zone_page_state(zone, NR_FREE_PAGES, count << order); | ||
525 | while (count--) { | 531 | while (count--) { |
526 | struct page *page; | 532 | struct page *page; |
527 | 533 | ||
@@ -529,17 +535,20 @@ static void free_pages_bulk(struct zone *zone, int count, | |||
529 | page = list_entry(list->prev, struct page, lru); | 535 | page = list_entry(list->prev, struct page, lru); |
530 | /* have to delete it as __free_one_page list manipulates */ | 536 | /* have to delete it as __free_one_page list manipulates */ |
531 | list_del(&page->lru); | 537 | list_del(&page->lru); |
532 | __free_one_page(page, zone, order); | 538 | __free_one_page(page, zone, order, page_private(page)); |
533 | } | 539 | } |
534 | spin_unlock(&zone->lock); | 540 | spin_unlock(&zone->lock); |
535 | } | 541 | } |
536 | 542 | ||
537 | static void free_one_page(struct zone *zone, struct page *page, int order) | 543 | static void free_one_page(struct zone *zone, struct page *page, int order, |
544 | int migratetype) | ||
538 | { | 545 | { |
539 | spin_lock(&zone->lock); | 546 | spin_lock(&zone->lock); |
540 | zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); | 547 | zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); |
541 | zone->pages_scanned = 0; | 548 | zone->pages_scanned = 0; |
542 | __free_one_page(page, zone, order); | 549 | |
550 | __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); | ||
551 | __free_one_page(page, zone, order, migratetype); | ||
543 | spin_unlock(&zone->lock); | 552 | spin_unlock(&zone->lock); |
544 | } | 553 | } |
545 | 554 | ||
@@ -548,6 +557,9 @@ static void __free_pages_ok(struct page *page, unsigned int order) | |||
548 | unsigned long flags; | 557 | unsigned long flags; |
549 | int i; | 558 | int i; |
550 | int bad = 0; | 559 | int bad = 0; |
560 | int wasMlocked = TestClearPageMlocked(page); | ||
561 | |||
562 | kmemcheck_free_shadow(page, order); | ||
551 | 563 | ||
552 | for (i = 0 ; i < (1 << order) ; ++i) | 564 | for (i = 0 ; i < (1 << order) ; ++i) |
553 | bad += free_pages_check(page + i); | 565 | bad += free_pages_check(page + i); |
@@ -563,8 +575,11 @@ static void __free_pages_ok(struct page *page, unsigned int order) | |||
563 | kernel_map_pages(page, 1 << order, 0); | 575 | kernel_map_pages(page, 1 << order, 0); |
564 | 576 | ||
565 | local_irq_save(flags); | 577 | local_irq_save(flags); |
578 | if (unlikely(wasMlocked)) | ||
579 | free_page_mlock(page); | ||
566 | __count_vm_events(PGFREE, 1 << order); | 580 | __count_vm_events(PGFREE, 1 << order); |
567 | free_one_page(page_zone(page), page, order); | 581 | free_one_page(page_zone(page), page, order, |
582 | get_pageblock_migratetype(page)); | ||
568 | local_irq_restore(flags); | 583 | local_irq_restore(flags); |
569 | } | 584 | } |
570 | 585 | ||
@@ -635,7 +650,7 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) | |||
635 | { | 650 | { |
636 | if (unlikely(page_mapcount(page) | | 651 | if (unlikely(page_mapcount(page) | |
637 | (page->mapping != NULL) | | 652 | (page->mapping != NULL) | |
638 | (page_count(page) != 0) | | 653 | (atomic_read(&page->_count) != 0) | |
639 | (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) { | 654 | (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) { |
640 | bad_page(page); | 655 | bad_page(page); |
641 | return 1; | 656 | return 1; |
@@ -660,7 +675,8 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) | |||
660 | * Go through the free lists for the given migratetype and remove | 675 | * Go through the free lists for the given migratetype and remove |
661 | * the smallest available page from the freelists | 676 | * the smallest available page from the freelists |
662 | */ | 677 | */ |
663 | static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, | 678 | static inline |
679 | struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, | ||
664 | int migratetype) | 680 | int migratetype) |
665 | { | 681 | { |
666 | unsigned int current_order; | 682 | unsigned int current_order; |
@@ -678,7 +694,6 @@ static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, | |||
678 | list_del(&page->lru); | 694 | list_del(&page->lru); |
679 | rmv_page_order(page); | 695 | rmv_page_order(page); |
680 | area->nr_free--; | 696 | area->nr_free--; |
681 | __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order)); | ||
682 | expand(zone, page, order, current_order, area, migratetype); | 697 | expand(zone, page, order, current_order, area, migratetype); |
683 | return page; | 698 | return page; |
684 | } | 699 | } |
@@ -769,8 +784,8 @@ static int move_freepages_block(struct zone *zone, struct page *page, | |||
769 | } | 784 | } |
770 | 785 | ||
771 | /* Remove an element from the buddy allocator from the fallback list */ | 786 | /* Remove an element from the buddy allocator from the fallback list */ |
772 | static struct page *__rmqueue_fallback(struct zone *zone, int order, | 787 | static inline struct page * |
773 | int start_migratetype) | 788 | __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) |
774 | { | 789 | { |
775 | struct free_area * area; | 790 | struct free_area * area; |
776 | int current_order; | 791 | int current_order; |
@@ -818,8 +833,6 @@ static struct page *__rmqueue_fallback(struct zone *zone, int order, | |||
818 | /* Remove the page from the freelists */ | 833 | /* Remove the page from the freelists */ |
819 | list_del(&page->lru); | 834 | list_del(&page->lru); |
820 | rmv_page_order(page); | 835 | rmv_page_order(page); |
821 | __mod_zone_page_state(zone, NR_FREE_PAGES, | ||
822 | -(1UL << order)); | ||
823 | 836 | ||
824 | if (current_order == pageblock_order) | 837 | if (current_order == pageblock_order) |
825 | set_pageblock_migratetype(page, | 838 | set_pageblock_migratetype(page, |
@@ -830,8 +843,7 @@ static struct page *__rmqueue_fallback(struct zone *zone, int order, | |||
830 | } | 843 | } |
831 | } | 844 | } |
832 | 845 | ||
833 | /* Use MIGRATE_RESERVE rather than fail an allocation */ | 846 | return NULL; |
834 | return __rmqueue_smallest(zone, order, MIGRATE_RESERVE); | ||
835 | } | 847 | } |
836 | 848 | ||
837 | /* | 849 | /* |
@@ -843,11 +855,23 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order, | |||
843 | { | 855 | { |
844 | struct page *page; | 856 | struct page *page; |
845 | 857 | ||
858 | retry_reserve: | ||
846 | page = __rmqueue_smallest(zone, order, migratetype); | 859 | page = __rmqueue_smallest(zone, order, migratetype); |
847 | 860 | ||
848 | if (unlikely(!page)) | 861 | if (unlikely(!page) && migratetype != MIGRATE_RESERVE) { |
849 | page = __rmqueue_fallback(zone, order, migratetype); | 862 | page = __rmqueue_fallback(zone, order, migratetype); |
850 | 863 | ||
864 | /* | ||
865 | * Use MIGRATE_RESERVE rather than fail an allocation. goto | ||
866 | * is used because __rmqueue_smallest is an inline function | ||
867 | * and we want just one call site | ||
868 | */ | ||
869 | if (!page) { | ||
870 | migratetype = MIGRATE_RESERVE; | ||
871 | goto retry_reserve; | ||
872 | } | ||
873 | } | ||
874 | |||
851 | return page; | 875 | return page; |
852 | } | 876 | } |
853 | 877 | ||
@@ -881,6 +905,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, | |||
881 | set_page_private(page, migratetype); | 905 | set_page_private(page, migratetype); |
882 | list = &page->lru; | 906 | list = &page->lru; |
883 | } | 907 | } |
908 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); | ||
884 | spin_unlock(&zone->lock); | 909 | spin_unlock(&zone->lock); |
885 | return i; | 910 | return i; |
886 | } | 911 | } |
@@ -996,6 +1021,9 @@ static void free_hot_cold_page(struct page *page, int cold) | |||
996 | struct zone *zone = page_zone(page); | 1021 | struct zone *zone = page_zone(page); |
997 | struct per_cpu_pages *pcp; | 1022 | struct per_cpu_pages *pcp; |
998 | unsigned long flags; | 1023 | unsigned long flags; |
1024 | int wasMlocked = TestClearPageMlocked(page); | ||
1025 | |||
1026 | kmemcheck_free_shadow(page, 0); | ||
999 | 1027 | ||
1000 | if (PageAnon(page)) | 1028 | if (PageAnon(page)) |
1001 | page->mapping = NULL; | 1029 | page->mapping = NULL; |
@@ -1010,13 +1038,16 @@ static void free_hot_cold_page(struct page *page, int cold) | |||
1010 | kernel_map_pages(page, 1, 0); | 1038 | kernel_map_pages(page, 1, 0); |
1011 | 1039 | ||
1012 | pcp = &zone_pcp(zone, get_cpu())->pcp; | 1040 | pcp = &zone_pcp(zone, get_cpu())->pcp; |
1041 | set_page_private(page, get_pageblock_migratetype(page)); | ||
1013 | local_irq_save(flags); | 1042 | local_irq_save(flags); |
1043 | if (unlikely(wasMlocked)) | ||
1044 | free_page_mlock(page); | ||
1014 | __count_vm_event(PGFREE); | 1045 | __count_vm_event(PGFREE); |
1046 | |||
1015 | if (cold) | 1047 | if (cold) |
1016 | list_add_tail(&page->lru, &pcp->list); | 1048 | list_add_tail(&page->lru, &pcp->list); |
1017 | else | 1049 | else |
1018 | list_add(&page->lru, &pcp->list); | 1050 | list_add(&page->lru, &pcp->list); |
1019 | set_page_private(page, get_pageblock_migratetype(page)); | ||
1020 | pcp->count++; | 1051 | pcp->count++; |
1021 | if (pcp->count >= pcp->high) { | 1052 | if (pcp->count >= pcp->high) { |
1022 | free_pages_bulk(zone, pcp->batch, &pcp->list, 0); | 1053 | free_pages_bulk(zone, pcp->batch, &pcp->list, 0); |
@@ -1050,6 +1081,16 @@ void split_page(struct page *page, unsigned int order) | |||
1050 | 1081 | ||
1051 | VM_BUG_ON(PageCompound(page)); | 1082 | VM_BUG_ON(PageCompound(page)); |
1052 | VM_BUG_ON(!page_count(page)); | 1083 | VM_BUG_ON(!page_count(page)); |
1084 | |||
1085 | #ifdef CONFIG_KMEMCHECK | ||
1086 | /* | ||
1087 | * Split shadow pages too, because free(page[0]) would | ||
1088 | * otherwise free the whole shadow. | ||
1089 | */ | ||
1090 | if (kmemcheck_page_is_tracked(page)) | ||
1091 | split_page(virt_to_page(page[0].shadow), order); | ||
1092 | #endif | ||
1093 | |||
1053 | for (i = 1; i < (1 << order); i++) | 1094 | for (i = 1; i < (1 << order); i++) |
1054 | set_page_refcounted(page + i); | 1095 | set_page_refcounted(page + i); |
1055 | } | 1096 | } |
@@ -1059,14 +1100,15 @@ void split_page(struct page *page, unsigned int order) | |||
1059 | * we cheat by calling it from here, in the order > 0 path. Saves a branch | 1100 | * we cheat by calling it from here, in the order > 0 path. Saves a branch |
1060 | * or two. | 1101 | * or two. |
1061 | */ | 1102 | */ |
1062 | static struct page *buffered_rmqueue(struct zone *preferred_zone, | 1103 | static inline |
1063 | struct zone *zone, int order, gfp_t gfp_flags) | 1104 | struct page *buffered_rmqueue(struct zone *preferred_zone, |
1105 | struct zone *zone, int order, gfp_t gfp_flags, | ||
1106 | int migratetype) | ||
1064 | { | 1107 | { |
1065 | unsigned long flags; | 1108 | unsigned long flags; |
1066 | struct page *page; | 1109 | struct page *page; |
1067 | int cold = !!(gfp_flags & __GFP_COLD); | 1110 | int cold = !!(gfp_flags & __GFP_COLD); |
1068 | int cpu; | 1111 | int cpu; |
1069 | int migratetype = allocflags_to_migratetype(gfp_flags); | ||
1070 | 1112 | ||
1071 | again: | 1113 | again: |
1072 | cpu = get_cpu(); | 1114 | cpu = get_cpu(); |
@@ -1103,8 +1145,22 @@ again: | |||
1103 | list_del(&page->lru); | 1145 | list_del(&page->lru); |
1104 | pcp->count--; | 1146 | pcp->count--; |
1105 | } else { | 1147 | } else { |
1148 | if (unlikely(gfp_flags & __GFP_NOFAIL)) { | ||
1149 | /* | ||
1150 | * __GFP_NOFAIL is not to be used in new code. | ||
1151 | * | ||
1152 | * All __GFP_NOFAIL callers should be fixed so that they | ||
1153 | * properly detect and handle allocation failures. | ||
1154 | * | ||
1155 | * We most definitely don't want callers attempting to | ||
1156 | * allocate greater than order-1 page units with | ||
1157 | * __GFP_NOFAIL. | ||
1158 | */ | ||
1159 | WARN_ON_ONCE(order > 1); | ||
1160 | } | ||
1106 | spin_lock_irqsave(&zone->lock, flags); | 1161 | spin_lock_irqsave(&zone->lock, flags); |
1107 | page = __rmqueue(zone, order, migratetype); | 1162 | page = __rmqueue(zone, order, migratetype); |
1163 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order)); | ||
1108 | spin_unlock(&zone->lock); | 1164 | spin_unlock(&zone->lock); |
1109 | if (!page) | 1165 | if (!page) |
1110 | goto failed; | 1166 | goto failed; |
@@ -1126,10 +1182,15 @@ failed: | |||
1126 | return NULL; | 1182 | return NULL; |
1127 | } | 1183 | } |
1128 | 1184 | ||
1129 | #define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ | 1185 | /* The ALLOC_WMARK bits are used as an index to zone->watermark */ |
1130 | #define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */ | 1186 | #define ALLOC_WMARK_MIN WMARK_MIN |
1131 | #define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */ | 1187 | #define ALLOC_WMARK_LOW WMARK_LOW |
1132 | #define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */ | 1188 | #define ALLOC_WMARK_HIGH WMARK_HIGH |
1189 | #define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */ | ||
1190 | |||
1191 | /* Mask to get the watermark bits */ | ||
1192 | #define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1) | ||
1193 | |||
1133 | #define ALLOC_HARDER 0x10 /* try to alloc harder */ | 1194 | #define ALLOC_HARDER 0x10 /* try to alloc harder */ |
1134 | #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ | 1195 | #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ |
1135 | #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ | 1196 | #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ |
@@ -1387,23 +1448,18 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) | |||
1387 | */ | 1448 | */ |
1388 | static struct page * | 1449 | static struct page * |
1389 | get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, | 1450 | get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, |
1390 | struct zonelist *zonelist, int high_zoneidx, int alloc_flags) | 1451 | struct zonelist *zonelist, int high_zoneidx, int alloc_flags, |
1452 | struct zone *preferred_zone, int migratetype) | ||
1391 | { | 1453 | { |
1392 | struct zoneref *z; | 1454 | struct zoneref *z; |
1393 | struct page *page = NULL; | 1455 | struct page *page = NULL; |
1394 | int classzone_idx; | 1456 | int classzone_idx; |
1395 | struct zone *zone, *preferred_zone; | 1457 | struct zone *zone; |
1396 | nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ | 1458 | nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ |
1397 | int zlc_active = 0; /* set if using zonelist_cache */ | 1459 | int zlc_active = 0; /* set if using zonelist_cache */ |
1398 | int did_zlc_setup = 0; /* just call zlc_setup() one time */ | 1460 | int did_zlc_setup = 0; /* just call zlc_setup() one time */ |
1399 | 1461 | ||
1400 | (void)first_zones_zonelist(zonelist, high_zoneidx, nodemask, | ||
1401 | &preferred_zone); | ||
1402 | if (!preferred_zone) | ||
1403 | return NULL; | ||
1404 | |||
1405 | classzone_idx = zone_idx(preferred_zone); | 1462 | classzone_idx = zone_idx(preferred_zone); |
1406 | |||
1407 | zonelist_scan: | 1463 | zonelist_scan: |
1408 | /* | 1464 | /* |
1409 | * Scan zonelist, looking for a zone with enough free. | 1465 | * Scan zonelist, looking for a zone with enough free. |
@@ -1418,31 +1474,49 @@ zonelist_scan: | |||
1418 | !cpuset_zone_allowed_softwall(zone, gfp_mask)) | 1474 | !cpuset_zone_allowed_softwall(zone, gfp_mask)) |
1419 | goto try_next_zone; | 1475 | goto try_next_zone; |
1420 | 1476 | ||
1477 | BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); | ||
1421 | if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { | 1478 | if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { |
1422 | unsigned long mark; | 1479 | unsigned long mark; |
1423 | if (alloc_flags & ALLOC_WMARK_MIN) | 1480 | int ret; |
1424 | mark = zone->pages_min; | 1481 | |
1425 | else if (alloc_flags & ALLOC_WMARK_LOW) | 1482 | mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; |
1426 | mark = zone->pages_low; | 1483 | if (zone_watermark_ok(zone, order, mark, |
1427 | else | 1484 | classzone_idx, alloc_flags)) |
1428 | mark = zone->pages_high; | 1485 | goto try_this_zone; |
1429 | if (!zone_watermark_ok(zone, order, mark, | 1486 | |
1430 | classzone_idx, alloc_flags)) { | 1487 | if (zone_reclaim_mode == 0) |
1431 | if (!zone_reclaim_mode || | 1488 | goto this_zone_full; |
1432 | !zone_reclaim(zone, gfp_mask, order)) | 1489 | |
1490 | ret = zone_reclaim(zone, gfp_mask, order); | ||
1491 | switch (ret) { | ||
1492 | case ZONE_RECLAIM_NOSCAN: | ||
1493 | /* did not scan */ | ||
1494 | goto try_next_zone; | ||
1495 | case ZONE_RECLAIM_FULL: | ||
1496 | /* scanned but unreclaimable */ | ||
1497 | goto this_zone_full; | ||
1498 | default: | ||
1499 | /* did we reclaim enough */ | ||
1500 | if (!zone_watermark_ok(zone, order, mark, | ||
1501 | classzone_idx, alloc_flags)) | ||
1433 | goto this_zone_full; | 1502 | goto this_zone_full; |
1434 | } | 1503 | } |
1435 | } | 1504 | } |
1436 | 1505 | ||
1437 | page = buffered_rmqueue(preferred_zone, zone, order, gfp_mask); | 1506 | try_this_zone: |
1507 | page = buffered_rmqueue(preferred_zone, zone, order, | ||
1508 | gfp_mask, migratetype); | ||
1438 | if (page) | 1509 | if (page) |
1439 | break; | 1510 | break; |
1440 | this_zone_full: | 1511 | this_zone_full: |
1441 | if (NUMA_BUILD) | 1512 | if (NUMA_BUILD) |
1442 | zlc_mark_zone_full(zonelist, z); | 1513 | zlc_mark_zone_full(zonelist, z); |
1443 | try_next_zone: | 1514 | try_next_zone: |
1444 | if (NUMA_BUILD && !did_zlc_setup) { | 1515 | if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) { |
1445 | /* we do zlc_setup after the first zone is tried */ | 1516 | /* |
1517 | * we do zlc_setup after the first zone is tried but only | ||
1518 | * if there are multiple nodes make it worthwhile | ||
1519 | */ | ||
1446 | allowednodes = zlc_setup(zonelist, alloc_flags); | 1520 | allowednodes = zlc_setup(zonelist, alloc_flags); |
1447 | zlc_active = 1; | 1521 | zlc_active = 1; |
1448 | did_zlc_setup = 1; | 1522 | did_zlc_setup = 1; |
@@ -1457,47 +1531,217 @@ try_next_zone: | |||
1457 | return page; | 1531 | return page; |
1458 | } | 1532 | } |
1459 | 1533 | ||
1534 | static inline int | ||
1535 | should_alloc_retry(gfp_t gfp_mask, unsigned int order, | ||
1536 | unsigned long pages_reclaimed) | ||
1537 | { | ||
1538 | /* Do not loop if specifically requested */ | ||
1539 | if (gfp_mask & __GFP_NORETRY) | ||
1540 | return 0; | ||
1541 | |||
1542 | /* | ||
1543 | * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER | ||
1544 | * means __GFP_NOFAIL, but that may not be true in other | ||
1545 | * implementations. | ||
1546 | */ | ||
1547 | if (order <= PAGE_ALLOC_COSTLY_ORDER) | ||
1548 | return 1; | ||
1549 | |||
1550 | /* | ||
1551 | * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is | ||
1552 | * specified, then we retry until we no longer reclaim any pages | ||
1553 | * (above), or we've reclaimed an order of pages at least as | ||
1554 | * large as the allocation's order. In both cases, if the | ||
1555 | * allocation still fails, we stop retrying. | ||
1556 | */ | ||
1557 | if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order)) | ||
1558 | return 1; | ||
1559 | |||
1560 | /* | ||
1561 | * Don't let big-order allocations loop unless the caller | ||
1562 | * explicitly requests that. | ||
1563 | */ | ||
1564 | if (gfp_mask & __GFP_NOFAIL) | ||
1565 | return 1; | ||
1566 | |||
1567 | return 0; | ||
1568 | } | ||
1569 | |||
1570 | static inline struct page * | ||
1571 | __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, | ||
1572 | struct zonelist *zonelist, enum zone_type high_zoneidx, | ||
1573 | nodemask_t *nodemask, struct zone *preferred_zone, | ||
1574 | int migratetype) | ||
1575 | { | ||
1576 | struct page *page; | ||
1577 | |||
1578 | /* Acquire the OOM killer lock for the zones in zonelist */ | ||
1579 | if (!try_set_zone_oom(zonelist, gfp_mask)) { | ||
1580 | schedule_timeout_uninterruptible(1); | ||
1581 | return NULL; | ||
1582 | } | ||
1583 | |||
1584 | /* | ||
1585 | * Go through the zonelist yet one more time, keep very high watermark | ||
1586 | * here, this is only to catch a parallel oom killing, we must fail if | ||
1587 | * we're still under heavy pressure. | ||
1588 | */ | ||
1589 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, | ||
1590 | order, zonelist, high_zoneidx, | ||
1591 | ALLOC_WMARK_HIGH|ALLOC_CPUSET, | ||
1592 | preferred_zone, migratetype); | ||
1593 | if (page) | ||
1594 | goto out; | ||
1595 | |||
1596 | /* The OOM killer will not help higher order allocs */ | ||
1597 | if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_NOFAIL)) | ||
1598 | goto out; | ||
1599 | |||
1600 | /* Exhausted what can be done so it's blamo time */ | ||
1601 | out_of_memory(zonelist, gfp_mask, order); | ||
1602 | |||
1603 | out: | ||
1604 | clear_zonelist_oom(zonelist, gfp_mask); | ||
1605 | return page; | ||
1606 | } | ||
1607 | |||
1608 | /* The really slow allocator path where we enter direct reclaim */ | ||
1609 | static inline struct page * | ||
1610 | __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, | ||
1611 | struct zonelist *zonelist, enum zone_type high_zoneidx, | ||
1612 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | ||
1613 | int migratetype, unsigned long *did_some_progress) | ||
1614 | { | ||
1615 | struct page *page = NULL; | ||
1616 | struct reclaim_state reclaim_state; | ||
1617 | struct task_struct *p = current; | ||
1618 | |||
1619 | cond_resched(); | ||
1620 | |||
1621 | /* We now go into synchronous reclaim */ | ||
1622 | cpuset_memory_pressure_bump(); | ||
1623 | |||
1624 | /* | ||
1625 | * The task's cpuset might have expanded its set of allowable nodes | ||
1626 | */ | ||
1627 | p->flags |= PF_MEMALLOC; | ||
1628 | lockdep_set_current_reclaim_state(gfp_mask); | ||
1629 | reclaim_state.reclaimed_slab = 0; | ||
1630 | p->reclaim_state = &reclaim_state; | ||
1631 | |||
1632 | *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); | ||
1633 | |||
1634 | p->reclaim_state = NULL; | ||
1635 | lockdep_clear_current_reclaim_state(); | ||
1636 | p->flags &= ~PF_MEMALLOC; | ||
1637 | |||
1638 | cond_resched(); | ||
1639 | |||
1640 | if (order != 0) | ||
1641 | drain_all_pages(); | ||
1642 | |||
1643 | if (likely(*did_some_progress)) | ||
1644 | page = get_page_from_freelist(gfp_mask, nodemask, order, | ||
1645 | zonelist, high_zoneidx, | ||
1646 | alloc_flags, preferred_zone, | ||
1647 | migratetype); | ||
1648 | return page; | ||
1649 | } | ||
1650 | |||
1460 | /* | 1651 | /* |
1461 | * This is the 'heart' of the zoned buddy allocator. | 1652 | * This is called in the allocator slow-path if the allocation request is of |
1653 | * sufficient urgency to ignore watermarks and take other desperate measures | ||
1462 | */ | 1654 | */ |
1463 | struct page * | 1655 | static inline struct page * |
1464 | __alloc_pages_internal(gfp_t gfp_mask, unsigned int order, | 1656 | __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, |
1465 | struct zonelist *zonelist, nodemask_t *nodemask) | 1657 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
1658 | nodemask_t *nodemask, struct zone *preferred_zone, | ||
1659 | int migratetype) | ||
1660 | { | ||
1661 | struct page *page; | ||
1662 | |||
1663 | do { | ||
1664 | page = get_page_from_freelist(gfp_mask, nodemask, order, | ||
1665 | zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, | ||
1666 | preferred_zone, migratetype); | ||
1667 | |||
1668 | if (!page && gfp_mask & __GFP_NOFAIL) | ||
1669 | congestion_wait(WRITE, HZ/50); | ||
1670 | } while (!page && (gfp_mask & __GFP_NOFAIL)); | ||
1671 | |||
1672 | return page; | ||
1673 | } | ||
1674 | |||
1675 | static inline | ||
1676 | void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, | ||
1677 | enum zone_type high_zoneidx) | ||
1466 | { | 1678 | { |
1467 | const gfp_t wait = gfp_mask & __GFP_WAIT; | ||
1468 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | ||
1469 | struct zoneref *z; | 1679 | struct zoneref *z; |
1470 | struct zone *zone; | 1680 | struct zone *zone; |
1471 | struct page *page; | ||
1472 | struct reclaim_state reclaim_state; | ||
1473 | struct task_struct *p = current; | ||
1474 | int do_retry; | ||
1475 | int alloc_flags; | ||
1476 | unsigned long did_some_progress; | ||
1477 | unsigned long pages_reclaimed = 0; | ||
1478 | 1681 | ||
1479 | lockdep_trace_alloc(gfp_mask); | 1682 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) |
1683 | wakeup_kswapd(zone, order); | ||
1684 | } | ||
1480 | 1685 | ||
1481 | might_sleep_if(wait); | 1686 | static inline int |
1687 | gfp_to_alloc_flags(gfp_t gfp_mask) | ||
1688 | { | ||
1689 | struct task_struct *p = current; | ||
1690 | int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; | ||
1691 | const gfp_t wait = gfp_mask & __GFP_WAIT; | ||
1482 | 1692 | ||
1483 | if (should_fail_alloc_page(gfp_mask, order)) | 1693 | /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ |
1484 | return NULL; | 1694 | BUILD_BUG_ON(__GFP_HIGH != ALLOC_HIGH); |
1485 | 1695 | ||
1486 | restart: | 1696 | /* |
1487 | z = zonelist->_zonerefs; /* the list of zones suitable for gfp_mask */ | 1697 | * The caller may dip into page reserves a bit more if the caller |
1698 | * cannot run direct reclaim, or if the caller has realtime scheduling | ||
1699 | * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will | ||
1700 | * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). | ||
1701 | */ | ||
1702 | alloc_flags |= (gfp_mask & __GFP_HIGH); | ||
1488 | 1703 | ||
1489 | if (unlikely(!z->zone)) { | 1704 | if (!wait) { |
1705 | alloc_flags |= ALLOC_HARDER; | ||
1490 | /* | 1706 | /* |
1491 | * Happens if we have an empty zonelist as a result of | 1707 | * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. |
1492 | * GFP_THISNODE being used on a memoryless node | 1708 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. |
1493 | */ | 1709 | */ |
1494 | return NULL; | 1710 | alloc_flags &= ~ALLOC_CPUSET; |
1711 | } else if (unlikely(rt_task(p))) | ||
1712 | alloc_flags |= ALLOC_HARDER; | ||
1713 | |||
1714 | if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { | ||
1715 | if (!in_interrupt() && | ||
1716 | ((p->flags & PF_MEMALLOC) || | ||
1717 | unlikely(test_thread_flag(TIF_MEMDIE)))) | ||
1718 | alloc_flags |= ALLOC_NO_WATERMARKS; | ||
1495 | } | 1719 | } |
1496 | 1720 | ||
1497 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, | 1721 | return alloc_flags; |
1498 | zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET); | 1722 | } |
1499 | if (page) | 1723 | |
1500 | goto got_pg; | 1724 | static inline struct page * |
1725 | __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | ||
1726 | struct zonelist *zonelist, enum zone_type high_zoneidx, | ||
1727 | nodemask_t *nodemask, struct zone *preferred_zone, | ||
1728 | int migratetype) | ||
1729 | { | ||
1730 | const gfp_t wait = gfp_mask & __GFP_WAIT; | ||
1731 | struct page *page = NULL; | ||
1732 | int alloc_flags; | ||
1733 | unsigned long pages_reclaimed = 0; | ||
1734 | unsigned long did_some_progress; | ||
1735 | struct task_struct *p = current; | ||
1736 | |||
1737 | /* | ||
1738 | * In the slowpath, we sanity check order to avoid ever trying to | ||
1739 | * reclaim >= MAX_ORDER areas which will never succeed. Callers may | ||
1740 | * be using allocators in order of preference for an area that is | ||
1741 | * too large. | ||
1742 | */ | ||
1743 | if (WARN_ON_ONCE(order >= MAX_ORDER)) | ||
1744 | return NULL; | ||
1501 | 1745 | ||
1502 | /* | 1746 | /* |
1503 | * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and | 1747 | * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and |
@@ -1510,154 +1754,83 @@ restart: | |||
1510 | if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) | 1754 | if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) |
1511 | goto nopage; | 1755 | goto nopage; |
1512 | 1756 | ||
1513 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) | 1757 | wake_all_kswapd(order, zonelist, high_zoneidx); |
1514 | wakeup_kswapd(zone, order); | ||
1515 | 1758 | ||
1516 | /* | 1759 | /* |
1517 | * OK, we're below the kswapd watermark and have kicked background | 1760 | * OK, we're below the kswapd watermark and have kicked background |
1518 | * reclaim. Now things get more complex, so set up alloc_flags according | 1761 | * reclaim. Now things get more complex, so set up alloc_flags according |
1519 | * to how we want to proceed. | 1762 | * to how we want to proceed. |
1520 | * | ||
1521 | * The caller may dip into page reserves a bit more if the caller | ||
1522 | * cannot run direct reclaim, or if the caller has realtime scheduling | ||
1523 | * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will | ||
1524 | * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). | ||
1525 | */ | 1763 | */ |
1526 | alloc_flags = ALLOC_WMARK_MIN; | 1764 | alloc_flags = gfp_to_alloc_flags(gfp_mask); |
1527 | if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait) | ||
1528 | alloc_flags |= ALLOC_HARDER; | ||
1529 | if (gfp_mask & __GFP_HIGH) | ||
1530 | alloc_flags |= ALLOC_HIGH; | ||
1531 | if (wait) | ||
1532 | alloc_flags |= ALLOC_CPUSET; | ||
1533 | 1765 | ||
1534 | /* | 1766 | restart: |
1535 | * Go through the zonelist again. Let __GFP_HIGH and allocations | 1767 | /* This is the last chance, in general, before the goto nopage. */ |
1536 | * coming from realtime tasks go deeper into reserves. | ||
1537 | * | ||
1538 | * This is the last chance, in general, before the goto nopage. | ||
1539 | * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. | ||
1540 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. | ||
1541 | */ | ||
1542 | page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, | 1768 | page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, |
1543 | high_zoneidx, alloc_flags); | 1769 | high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, |
1770 | preferred_zone, migratetype); | ||
1544 | if (page) | 1771 | if (page) |
1545 | goto got_pg; | 1772 | goto got_pg; |
1546 | 1773 | ||
1547 | /* This allocation should allow future memory freeing. */ | ||
1548 | |||
1549 | rebalance: | 1774 | rebalance: |
1550 | if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) | 1775 | /* Allocate without watermarks if the context allows */ |
1551 | && !in_interrupt()) { | 1776 | if (alloc_flags & ALLOC_NO_WATERMARKS) { |
1552 | if (!(gfp_mask & __GFP_NOMEMALLOC)) { | 1777 | page = __alloc_pages_high_priority(gfp_mask, order, |
1553 | nofail_alloc: | 1778 | zonelist, high_zoneidx, nodemask, |
1554 | /* go through the zonelist yet again, ignoring mins */ | 1779 | preferred_zone, migratetype); |
1555 | page = get_page_from_freelist(gfp_mask, nodemask, order, | 1780 | if (page) |
1556 | zonelist, high_zoneidx, ALLOC_NO_WATERMARKS); | 1781 | goto got_pg; |
1557 | if (page) | ||
1558 | goto got_pg; | ||
1559 | if (gfp_mask & __GFP_NOFAIL) { | ||
1560 | congestion_wait(WRITE, HZ/50); | ||
1561 | goto nofail_alloc; | ||
1562 | } | ||
1563 | } | ||
1564 | goto nopage; | ||
1565 | } | 1782 | } |
1566 | 1783 | ||
1567 | /* Atomic allocations - we can't balance anything */ | 1784 | /* Atomic allocations - we can't balance anything */ |
1568 | if (!wait) | 1785 | if (!wait) |
1569 | goto nopage; | 1786 | goto nopage; |
1570 | 1787 | ||
1571 | cond_resched(); | 1788 | /* Avoid recursion of direct reclaim */ |
1789 | if (p->flags & PF_MEMALLOC) | ||
1790 | goto nopage; | ||
1791 | |||
1792 | /* Try direct reclaim and then allocating */ | ||
1793 | page = __alloc_pages_direct_reclaim(gfp_mask, order, | ||
1794 | zonelist, high_zoneidx, | ||
1795 | nodemask, | ||
1796 | alloc_flags, preferred_zone, | ||
1797 | migratetype, &did_some_progress); | ||
1798 | if (page) | ||
1799 | goto got_pg; | ||
1572 | 1800 | ||
1573 | /* We now go into synchronous reclaim */ | ||
1574 | cpuset_memory_pressure_bump(); | ||
1575 | /* | 1801 | /* |
1576 | * The task's cpuset might have expanded its set of allowable nodes | 1802 | * If we failed to make any progress reclaiming, then we are |
1803 | * running out of options and have to consider going OOM | ||
1577 | */ | 1804 | */ |
1578 | cpuset_update_task_memory_state(); | 1805 | if (!did_some_progress) { |
1579 | p->flags |= PF_MEMALLOC; | 1806 | if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { |
1580 | 1807 | if (oom_killer_disabled) | |
1581 | lockdep_set_current_reclaim_state(gfp_mask); | 1808 | goto nopage; |
1582 | reclaim_state.reclaimed_slab = 0; | 1809 | page = __alloc_pages_may_oom(gfp_mask, order, |
1583 | p->reclaim_state = &reclaim_state; | 1810 | zonelist, high_zoneidx, |
1584 | 1811 | nodemask, preferred_zone, | |
1585 | did_some_progress = try_to_free_pages(zonelist, order, | 1812 | migratetype); |
1586 | gfp_mask, nodemask); | 1813 | if (page) |
1587 | 1814 | goto got_pg; | |
1588 | p->reclaim_state = NULL; | ||
1589 | lockdep_clear_current_reclaim_state(); | ||
1590 | p->flags &= ~PF_MEMALLOC; | ||
1591 | |||
1592 | cond_resched(); | ||
1593 | 1815 | ||
1594 | if (order != 0) | 1816 | /* |
1595 | drain_all_pages(); | 1817 | * The OOM killer does not trigger for high-order |
1818 | * ~__GFP_NOFAIL allocations so if no progress is being | ||
1819 | * made, there are no other options and retrying is | ||
1820 | * unlikely to help. | ||
1821 | */ | ||
1822 | if (order > PAGE_ALLOC_COSTLY_ORDER && | ||
1823 | !(gfp_mask & __GFP_NOFAIL)) | ||
1824 | goto nopage; | ||
1596 | 1825 | ||
1597 | if (likely(did_some_progress)) { | ||
1598 | page = get_page_from_freelist(gfp_mask, nodemask, order, | ||
1599 | zonelist, high_zoneidx, alloc_flags); | ||
1600 | if (page) | ||
1601 | goto got_pg; | ||
1602 | } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { | ||
1603 | if (!try_set_zone_oom(zonelist, gfp_mask)) { | ||
1604 | schedule_timeout_uninterruptible(1); | ||
1605 | goto restart; | 1826 | goto restart; |
1606 | } | 1827 | } |
1607 | |||
1608 | /* | ||
1609 | * Go through the zonelist yet one more time, keep | ||
1610 | * very high watermark here, this is only to catch | ||
1611 | * a parallel oom killing, we must fail if we're still | ||
1612 | * under heavy pressure. | ||
1613 | */ | ||
1614 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, | ||
1615 | order, zonelist, high_zoneidx, | ||
1616 | ALLOC_WMARK_HIGH|ALLOC_CPUSET); | ||
1617 | if (page) { | ||
1618 | clear_zonelist_oom(zonelist, gfp_mask); | ||
1619 | goto got_pg; | ||
1620 | } | ||
1621 | |||
1622 | /* The OOM killer will not help higher order allocs so fail */ | ||
1623 | if (order > PAGE_ALLOC_COSTLY_ORDER) { | ||
1624 | clear_zonelist_oom(zonelist, gfp_mask); | ||
1625 | goto nopage; | ||
1626 | } | ||
1627 | |||
1628 | out_of_memory(zonelist, gfp_mask, order); | ||
1629 | clear_zonelist_oom(zonelist, gfp_mask); | ||
1630 | goto restart; | ||
1631 | } | 1828 | } |
1632 | 1829 | ||
1633 | /* | 1830 | /* Check if we should retry the allocation */ |
1634 | * Don't let big-order allocations loop unless the caller explicitly | ||
1635 | * requests that. Wait for some write requests to complete then retry. | ||
1636 | * | ||
1637 | * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER | ||
1638 | * means __GFP_NOFAIL, but that may not be true in other | ||
1639 | * implementations. | ||
1640 | * | ||
1641 | * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is | ||
1642 | * specified, then we retry until we no longer reclaim any pages | ||
1643 | * (above), or we've reclaimed an order of pages at least as | ||
1644 | * large as the allocation's order. In both cases, if the | ||
1645 | * allocation still fails, we stop retrying. | ||
1646 | */ | ||
1647 | pages_reclaimed += did_some_progress; | 1831 | pages_reclaimed += did_some_progress; |
1648 | do_retry = 0; | 1832 | if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) { |
1649 | if (!(gfp_mask & __GFP_NORETRY)) { | 1833 | /* Wait for some write requests to complete then retry */ |
1650 | if (order <= PAGE_ALLOC_COSTLY_ORDER) { | ||
1651 | do_retry = 1; | ||
1652 | } else { | ||
1653 | if (gfp_mask & __GFP_REPEAT && | ||
1654 | pages_reclaimed < (1 << order)) | ||
1655 | do_retry = 1; | ||
1656 | } | ||
1657 | if (gfp_mask & __GFP_NOFAIL) | ||
1658 | do_retry = 1; | ||
1659 | } | ||
1660 | if (do_retry) { | ||
1661 | congestion_wait(WRITE, HZ/50); | 1834 | congestion_wait(WRITE, HZ/50); |
1662 | goto rebalance; | 1835 | goto rebalance; |
1663 | } | 1836 | } |
@@ -1670,10 +1843,60 @@ nopage: | |||
1670 | dump_stack(); | 1843 | dump_stack(); |
1671 | show_mem(); | 1844 | show_mem(); |
1672 | } | 1845 | } |
1846 | return page; | ||
1673 | got_pg: | 1847 | got_pg: |
1848 | if (kmemcheck_enabled) | ||
1849 | kmemcheck_pagealloc_alloc(page, order, gfp_mask); | ||
1850 | return page; | ||
1851 | |||
1852 | } | ||
1853 | |||
1854 | /* | ||
1855 | * This is the 'heart' of the zoned buddy allocator. | ||
1856 | */ | ||
1857 | struct page * | ||
1858 | __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | ||
1859 | struct zonelist *zonelist, nodemask_t *nodemask) | ||
1860 | { | ||
1861 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | ||
1862 | struct zone *preferred_zone; | ||
1863 | struct page *page; | ||
1864 | int migratetype = allocflags_to_migratetype(gfp_mask); | ||
1865 | |||
1866 | gfp_mask &= gfp_allowed_mask; | ||
1867 | |||
1868 | lockdep_trace_alloc(gfp_mask); | ||
1869 | |||
1870 | might_sleep_if(gfp_mask & __GFP_WAIT); | ||
1871 | |||
1872 | if (should_fail_alloc_page(gfp_mask, order)) | ||
1873 | return NULL; | ||
1874 | |||
1875 | /* | ||
1876 | * Check the zones suitable for the gfp_mask contain at least one | ||
1877 | * valid zone. It's possible to have an empty zonelist as a result | ||
1878 | * of GFP_THISNODE and a memoryless node | ||
1879 | */ | ||
1880 | if (unlikely(!zonelist->_zonerefs->zone)) | ||
1881 | return NULL; | ||
1882 | |||
1883 | /* The preferred zone is used for statistics later */ | ||
1884 | first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone); | ||
1885 | if (!preferred_zone) | ||
1886 | return NULL; | ||
1887 | |||
1888 | /* First allocation attempt */ | ||
1889 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, | ||
1890 | zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET, | ||
1891 | preferred_zone, migratetype); | ||
1892 | if (unlikely(!page)) | ||
1893 | page = __alloc_pages_slowpath(gfp_mask, order, | ||
1894 | zonelist, high_zoneidx, nodemask, | ||
1895 | preferred_zone, migratetype); | ||
1896 | |||
1674 | return page; | 1897 | return page; |
1675 | } | 1898 | } |
1676 | EXPORT_SYMBOL(__alloc_pages_internal); | 1899 | EXPORT_SYMBOL(__alloc_pages_nodemask); |
1677 | 1900 | ||
1678 | /* | 1901 | /* |
1679 | * Common helper functions. | 1902 | * Common helper functions. |
@@ -1802,7 +2025,7 @@ static unsigned int nr_free_zone_pages(int offset) | |||
1802 | 2025 | ||
1803 | for_each_zone_zonelist(zone, z, zonelist, offset) { | 2026 | for_each_zone_zonelist(zone, z, zonelist, offset) { |
1804 | unsigned long size = zone->present_pages; | 2027 | unsigned long size = zone->present_pages; |
1805 | unsigned long high = zone->pages_high; | 2028 | unsigned long high = high_wmark_pages(zone); |
1806 | if (size > high) | 2029 | if (size > high) |
1807 | sum += size - high; | 2030 | sum += size - high; |
1808 | } | 2031 | } |
@@ -1894,19 +2117,14 @@ void show_free_areas(void) | |||
1894 | 2117 | ||
1895 | printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n" | 2118 | printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n" |
1896 | " inactive_file:%lu" | 2119 | " inactive_file:%lu" |
1897 | //TODO: check/adjust line lengths | ||
1898 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
1899 | " unevictable:%lu" | 2120 | " unevictable:%lu" |
1900 | #endif | ||
1901 | " dirty:%lu writeback:%lu unstable:%lu\n" | 2121 | " dirty:%lu writeback:%lu unstable:%lu\n" |
1902 | " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", | 2122 | " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", |
1903 | global_page_state(NR_ACTIVE_ANON), | 2123 | global_page_state(NR_ACTIVE_ANON), |
1904 | global_page_state(NR_ACTIVE_FILE), | 2124 | global_page_state(NR_ACTIVE_FILE), |
1905 | global_page_state(NR_INACTIVE_ANON), | 2125 | global_page_state(NR_INACTIVE_ANON), |
1906 | global_page_state(NR_INACTIVE_FILE), | 2126 | global_page_state(NR_INACTIVE_FILE), |
1907 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
1908 | global_page_state(NR_UNEVICTABLE), | 2127 | global_page_state(NR_UNEVICTABLE), |
1909 | #endif | ||
1910 | global_page_state(NR_FILE_DIRTY), | 2128 | global_page_state(NR_FILE_DIRTY), |
1911 | global_page_state(NR_WRITEBACK), | 2129 | global_page_state(NR_WRITEBACK), |
1912 | global_page_state(NR_UNSTABLE_NFS), | 2130 | global_page_state(NR_UNSTABLE_NFS), |
@@ -1930,25 +2148,21 @@ void show_free_areas(void) | |||
1930 | " inactive_anon:%lukB" | 2148 | " inactive_anon:%lukB" |
1931 | " active_file:%lukB" | 2149 | " active_file:%lukB" |
1932 | " inactive_file:%lukB" | 2150 | " inactive_file:%lukB" |
1933 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
1934 | " unevictable:%lukB" | 2151 | " unevictable:%lukB" |
1935 | #endif | ||
1936 | " present:%lukB" | 2152 | " present:%lukB" |
1937 | " pages_scanned:%lu" | 2153 | " pages_scanned:%lu" |
1938 | " all_unreclaimable? %s" | 2154 | " all_unreclaimable? %s" |
1939 | "\n", | 2155 | "\n", |
1940 | zone->name, | 2156 | zone->name, |
1941 | K(zone_page_state(zone, NR_FREE_PAGES)), | 2157 | K(zone_page_state(zone, NR_FREE_PAGES)), |
1942 | K(zone->pages_min), | 2158 | K(min_wmark_pages(zone)), |
1943 | K(zone->pages_low), | 2159 | K(low_wmark_pages(zone)), |
1944 | K(zone->pages_high), | 2160 | K(high_wmark_pages(zone)), |
1945 | K(zone_page_state(zone, NR_ACTIVE_ANON)), | 2161 | K(zone_page_state(zone, NR_ACTIVE_ANON)), |
1946 | K(zone_page_state(zone, NR_INACTIVE_ANON)), | 2162 | K(zone_page_state(zone, NR_INACTIVE_ANON)), |
1947 | K(zone_page_state(zone, NR_ACTIVE_FILE)), | 2163 | K(zone_page_state(zone, NR_ACTIVE_FILE)), |
1948 | K(zone_page_state(zone, NR_INACTIVE_FILE)), | 2164 | K(zone_page_state(zone, NR_INACTIVE_FILE)), |
1949 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
1950 | K(zone_page_state(zone, NR_UNEVICTABLE)), | 2165 | K(zone_page_state(zone, NR_UNEVICTABLE)), |
1951 | #endif | ||
1952 | K(zone->present_pages), | 2166 | K(zone->present_pages), |
1953 | zone->pages_scanned, | 2167 | zone->pages_scanned, |
1954 | (zone_is_all_unreclaimable(zone) ? "yes" : "no") | 2168 | (zone_is_all_unreclaimable(zone) ? "yes" : "no") |
@@ -2106,7 +2320,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write, | |||
2106 | } | 2320 | } |
2107 | 2321 | ||
2108 | 2322 | ||
2109 | #define MAX_NODE_LOAD (num_online_nodes()) | 2323 | #define MAX_NODE_LOAD (nr_online_nodes) |
2110 | static int node_load[MAX_NUMNODES]; | 2324 | static int node_load[MAX_NUMNODES]; |
2111 | 2325 | ||
2112 | /** | 2326 | /** |
@@ -2315,7 +2529,7 @@ static void build_zonelists(pg_data_t *pgdat) | |||
2315 | 2529 | ||
2316 | /* NUMA-aware ordering of nodes */ | 2530 | /* NUMA-aware ordering of nodes */ |
2317 | local_node = pgdat->node_id; | 2531 | local_node = pgdat->node_id; |
2318 | load = num_online_nodes(); | 2532 | load = nr_online_nodes; |
2319 | prev_node = local_node; | 2533 | prev_node = local_node; |
2320 | nodes_clear(used_mask); | 2534 | nodes_clear(used_mask); |
2321 | 2535 | ||
@@ -2466,7 +2680,7 @@ void build_all_zonelists(void) | |||
2466 | 2680 | ||
2467 | printk("Built %i zonelists in %s order, mobility grouping %s. " | 2681 | printk("Built %i zonelists in %s order, mobility grouping %s. " |
2468 | "Total pages: %ld\n", | 2682 | "Total pages: %ld\n", |
2469 | num_online_nodes(), | 2683 | nr_online_nodes, |
2470 | zonelist_order_name[current_zonelist_order], | 2684 | zonelist_order_name[current_zonelist_order], |
2471 | page_group_by_mobility_disabled ? "off" : "on", | 2685 | page_group_by_mobility_disabled ? "off" : "on", |
2472 | vm_total_pages); | 2686 | vm_total_pages); |
@@ -2545,8 +2759,8 @@ static inline unsigned long wait_table_bits(unsigned long size) | |||
2545 | 2759 | ||
2546 | /* | 2760 | /* |
2547 | * Mark a number of pageblocks as MIGRATE_RESERVE. The number | 2761 | * Mark a number of pageblocks as MIGRATE_RESERVE. The number |
2548 | * of blocks reserved is based on zone->pages_min. The memory within the | 2762 | * of blocks reserved is based on min_wmark_pages(zone). The memory within |
2549 | * reserve will tend to store contiguous free pages. Setting min_free_kbytes | 2763 | * the reserve will tend to store contiguous free pages. Setting min_free_kbytes |
2550 | * higher will lead to a bigger reserve which will get freed as contiguous | 2764 | * higher will lead to a bigger reserve which will get freed as contiguous |
2551 | * blocks as reclaim kicks in | 2765 | * blocks as reclaim kicks in |
2552 | */ | 2766 | */ |
@@ -2559,7 +2773,7 @@ static void setup_zone_migrate_reserve(struct zone *zone) | |||
2559 | /* Get the start pfn, end pfn and the number of blocks to reserve */ | 2773 | /* Get the start pfn, end pfn and the number of blocks to reserve */ |
2560 | start_pfn = zone->zone_start_pfn; | 2774 | start_pfn = zone->zone_start_pfn; |
2561 | end_pfn = start_pfn + zone->spanned_pages; | 2775 | end_pfn = start_pfn + zone->spanned_pages; |
2562 | reserve = roundup(zone->pages_min, pageblock_nr_pages) >> | 2776 | reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> |
2563 | pageblock_order; | 2777 | pageblock_order; |
2564 | 2778 | ||
2565 | for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { | 2779 | for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { |
@@ -2812,7 +3026,7 @@ bad: | |||
2812 | if (dzone == zone) | 3026 | if (dzone == zone) |
2813 | break; | 3027 | break; |
2814 | kfree(zone_pcp(dzone, cpu)); | 3028 | kfree(zone_pcp(dzone, cpu)); |
2815 | zone_pcp(dzone, cpu) = NULL; | 3029 | zone_pcp(dzone, cpu) = &boot_pageset[cpu]; |
2816 | } | 3030 | } |
2817 | return -ENOMEM; | 3031 | return -ENOMEM; |
2818 | } | 3032 | } |
@@ -2827,7 +3041,7 @@ static inline void free_zone_pagesets(int cpu) | |||
2827 | /* Free per_cpu_pageset if it is slab allocated */ | 3041 | /* Free per_cpu_pageset if it is slab allocated */ |
2828 | if (pset != &boot_pageset[cpu]) | 3042 | if (pset != &boot_pageset[cpu]) |
2829 | kfree(pset); | 3043 | kfree(pset); |
2830 | zone_pcp(zone, cpu) = NULL; | 3044 | zone_pcp(zone, cpu) = &boot_pageset[cpu]; |
2831 | } | 3045 | } |
2832 | } | 3046 | } |
2833 | 3047 | ||
@@ -3103,64 +3317,6 @@ void __init sparse_memory_present_with_active_regions(int nid) | |||
3103 | } | 3317 | } |
3104 | 3318 | ||
3105 | /** | 3319 | /** |
3106 | * push_node_boundaries - Push node boundaries to at least the requested boundary | ||
3107 | * @nid: The nid of the node to push the boundary for | ||
3108 | * @start_pfn: The start pfn of the node | ||
3109 | * @end_pfn: The end pfn of the node | ||
3110 | * | ||
3111 | * In reserve-based hot-add, mem_map is allocated that is unused until hotadd | ||
3112 | * time. Specifically, on x86_64, SRAT will report ranges that can potentially | ||
3113 | * be hotplugged even though no physical memory exists. This function allows | ||
3114 | * an arch to push out the node boundaries so mem_map is allocated that can | ||
3115 | * be used later. | ||
3116 | */ | ||
3117 | #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE | ||
3118 | void __init push_node_boundaries(unsigned int nid, | ||
3119 | unsigned long start_pfn, unsigned long end_pfn) | ||
3120 | { | ||
3121 | mminit_dprintk(MMINIT_TRACE, "zoneboundary", | ||
3122 | "Entering push_node_boundaries(%u, %lu, %lu)\n", | ||
3123 | nid, start_pfn, end_pfn); | ||
3124 | |||
3125 | /* Initialise the boundary for this node if necessary */ | ||
3126 | if (node_boundary_end_pfn[nid] == 0) | ||
3127 | node_boundary_start_pfn[nid] = -1UL; | ||
3128 | |||
3129 | /* Update the boundaries */ | ||
3130 | if (node_boundary_start_pfn[nid] > start_pfn) | ||
3131 | node_boundary_start_pfn[nid] = start_pfn; | ||
3132 | if (node_boundary_end_pfn[nid] < end_pfn) | ||
3133 | node_boundary_end_pfn[nid] = end_pfn; | ||
3134 | } | ||
3135 | |||
3136 | /* If necessary, push the node boundary out for reserve hotadd */ | ||
3137 | static void __meminit account_node_boundary(unsigned int nid, | ||
3138 | unsigned long *start_pfn, unsigned long *end_pfn) | ||
3139 | { | ||
3140 | mminit_dprintk(MMINIT_TRACE, "zoneboundary", | ||
3141 | "Entering account_node_boundary(%u, %lu, %lu)\n", | ||
3142 | nid, *start_pfn, *end_pfn); | ||
3143 | |||
3144 | /* Return if boundary information has not been provided */ | ||
3145 | if (node_boundary_end_pfn[nid] == 0) | ||
3146 | return; | ||
3147 | |||
3148 | /* Check the boundaries and update if necessary */ | ||
3149 | if (node_boundary_start_pfn[nid] < *start_pfn) | ||
3150 | *start_pfn = node_boundary_start_pfn[nid]; | ||
3151 | if (node_boundary_end_pfn[nid] > *end_pfn) | ||
3152 | *end_pfn = node_boundary_end_pfn[nid]; | ||
3153 | } | ||
3154 | #else | ||
3155 | void __init push_node_boundaries(unsigned int nid, | ||
3156 | unsigned long start_pfn, unsigned long end_pfn) {} | ||
3157 | |||
3158 | static void __meminit account_node_boundary(unsigned int nid, | ||
3159 | unsigned long *start_pfn, unsigned long *end_pfn) {} | ||
3160 | #endif | ||
3161 | |||
3162 | |||
3163 | /** | ||
3164 | * get_pfn_range_for_nid - Return the start and end page frames for a node | 3320 | * get_pfn_range_for_nid - Return the start and end page frames for a node |
3165 | * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. | 3321 | * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. |
3166 | * @start_pfn: Passed by reference. On return, it will have the node start_pfn. | 3322 | * @start_pfn: Passed by reference. On return, it will have the node start_pfn. |
@@ -3185,9 +3341,6 @@ void __meminit get_pfn_range_for_nid(unsigned int nid, | |||
3185 | 3341 | ||
3186 | if (*start_pfn == -1UL) | 3342 | if (*start_pfn == -1UL) |
3187 | *start_pfn = 0; | 3343 | *start_pfn = 0; |
3188 | |||
3189 | /* Push the node boundaries out if requested */ | ||
3190 | account_node_boundary(nid, start_pfn, end_pfn); | ||
3191 | } | 3344 | } |
3192 | 3345 | ||
3193 | /* | 3346 | /* |
@@ -3552,7 +3705,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
3552 | zone_pcp_init(zone); | 3705 | zone_pcp_init(zone); |
3553 | for_each_lru(l) { | 3706 | for_each_lru(l) { |
3554 | INIT_LIST_HEAD(&zone->lru[l].list); | 3707 | INIT_LIST_HEAD(&zone->lru[l].list); |
3555 | zone->lru[l].nr_scan = 0; | 3708 | zone->lru[l].nr_saved_scan = 0; |
3556 | } | 3709 | } |
3557 | zone->reclaim_stat.recent_rotated[0] = 0; | 3710 | zone->reclaim_stat.recent_rotated[0] = 0; |
3558 | zone->reclaim_stat.recent_rotated[1] = 0; | 3711 | zone->reclaim_stat.recent_rotated[1] = 0; |
@@ -3793,10 +3946,6 @@ void __init remove_all_active_ranges(void) | |||
3793 | { | 3946 | { |
3794 | memset(early_node_map, 0, sizeof(early_node_map)); | 3947 | memset(early_node_map, 0, sizeof(early_node_map)); |
3795 | nr_nodemap_entries = 0; | 3948 | nr_nodemap_entries = 0; |
3796 | #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE | ||
3797 | memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn)); | ||
3798 | memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn)); | ||
3799 | #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ | ||
3800 | } | 3949 | } |
3801 | 3950 | ||
3802 | /* Compare two active node_active_regions */ | 3951 | /* Compare two active node_active_regions */ |
@@ -4093,6 +4242,11 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) | |||
4093 | early_node_map[i].start_pfn, | 4242 | early_node_map[i].start_pfn, |
4094 | early_node_map[i].end_pfn); | 4243 | early_node_map[i].end_pfn); |
4095 | 4244 | ||
4245 | /* | ||
4246 | * find_zone_movable_pfns_for_nodes/early_calculate_totalpages init | ||
4247 | * that node_mask, clear it at first | ||
4248 | */ | ||
4249 | nodes_clear(node_states[N_HIGH_MEMORY]); | ||
4096 | /* Initialise every node */ | 4250 | /* Initialise every node */ |
4097 | mminit_verify_pageflags_layout(); | 4251 | mminit_verify_pageflags_layout(); |
4098 | setup_nr_node_ids(); | 4252 | setup_nr_node_ids(); |
@@ -4227,8 +4381,8 @@ static void calculate_totalreserve_pages(void) | |||
4227 | max = zone->lowmem_reserve[j]; | 4381 | max = zone->lowmem_reserve[j]; |
4228 | } | 4382 | } |
4229 | 4383 | ||
4230 | /* we treat pages_high as reserved pages. */ | 4384 | /* we treat the high watermark as reserved pages. */ |
4231 | max += zone->pages_high; | 4385 | max += high_wmark_pages(zone); |
4232 | 4386 | ||
4233 | if (max > zone->present_pages) | 4387 | if (max > zone->present_pages) |
4234 | max = zone->present_pages; | 4388 | max = zone->present_pages; |
@@ -4278,12 +4432,13 @@ static void setup_per_zone_lowmem_reserve(void) | |||
4278 | } | 4432 | } |
4279 | 4433 | ||
4280 | /** | 4434 | /** |
4281 | * setup_per_zone_pages_min - called when min_free_kbytes changes. | 4435 | * setup_per_zone_wmarks - called when min_free_kbytes changes |
4436 | * or when memory is hot-{added|removed} | ||
4282 | * | 4437 | * |
4283 | * Ensures that the pages_{min,low,high} values for each zone are set correctly | 4438 | * Ensures that the watermark[min,low,high] values for each zone are set |
4284 | * with respect to min_free_kbytes. | 4439 | * correctly with respect to min_free_kbytes. |
4285 | */ | 4440 | */ |
4286 | void setup_per_zone_pages_min(void) | 4441 | void setup_per_zone_wmarks(void) |
4287 | { | 4442 | { |
4288 | unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); | 4443 | unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); |
4289 | unsigned long lowmem_pages = 0; | 4444 | unsigned long lowmem_pages = 0; |
@@ -4308,7 +4463,7 @@ void setup_per_zone_pages_min(void) | |||
4308 | * need highmem pages, so cap pages_min to a small | 4463 | * need highmem pages, so cap pages_min to a small |
4309 | * value here. | 4464 | * value here. |
4310 | * | 4465 | * |
4311 | * The (pages_high-pages_low) and (pages_low-pages_min) | 4466 | * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) |
4312 | * deltas controls asynch page reclaim, and so should | 4467 | * deltas controls asynch page reclaim, and so should |
4313 | * not be capped for highmem. | 4468 | * not be capped for highmem. |
4314 | */ | 4469 | */ |
@@ -4319,17 +4474,17 @@ void setup_per_zone_pages_min(void) | |||
4319 | min_pages = SWAP_CLUSTER_MAX; | 4474 | min_pages = SWAP_CLUSTER_MAX; |
4320 | if (min_pages > 128) | 4475 | if (min_pages > 128) |
4321 | min_pages = 128; | 4476 | min_pages = 128; |
4322 | zone->pages_min = min_pages; | 4477 | zone->watermark[WMARK_MIN] = min_pages; |
4323 | } else { | 4478 | } else { |
4324 | /* | 4479 | /* |
4325 | * If it's a lowmem zone, reserve a number of pages | 4480 | * If it's a lowmem zone, reserve a number of pages |
4326 | * proportionate to the zone's size. | 4481 | * proportionate to the zone's size. |
4327 | */ | 4482 | */ |
4328 | zone->pages_min = tmp; | 4483 | zone->watermark[WMARK_MIN] = tmp; |
4329 | } | 4484 | } |
4330 | 4485 | ||
4331 | zone->pages_low = zone->pages_min + (tmp >> 2); | 4486 | zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); |
4332 | zone->pages_high = zone->pages_min + (tmp >> 1); | 4487 | zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); |
4333 | setup_zone_migrate_reserve(zone); | 4488 | setup_zone_migrate_reserve(zone); |
4334 | spin_unlock_irqrestore(&zone->lock, flags); | 4489 | spin_unlock_irqrestore(&zone->lock, flags); |
4335 | } | 4490 | } |
@@ -4339,8 +4494,6 @@ void setup_per_zone_pages_min(void) | |||
4339 | } | 4494 | } |
4340 | 4495 | ||
4341 | /** | 4496 | /** |
4342 | * setup_per_zone_inactive_ratio - called when min_free_kbytes changes. | ||
4343 | * | ||
4344 | * The inactive anon list should be small enough that the VM never has to | 4497 | * The inactive anon list should be small enough that the VM never has to |
4345 | * do too much work, but large enough that each inactive page has a chance | 4498 | * do too much work, but large enough that each inactive page has a chance |
4346 | * to be referenced again before it is swapped out. | 4499 | * to be referenced again before it is swapped out. |
@@ -4361,21 +4514,26 @@ void setup_per_zone_pages_min(void) | |||
4361 | * 1TB 101 10GB | 4514 | * 1TB 101 10GB |
4362 | * 10TB 320 32GB | 4515 | * 10TB 320 32GB |
4363 | */ | 4516 | */ |
4364 | static void setup_per_zone_inactive_ratio(void) | 4517 | void calculate_zone_inactive_ratio(struct zone *zone) |
4365 | { | 4518 | { |
4366 | struct zone *zone; | 4519 | unsigned int gb, ratio; |
4367 | 4520 | ||
4368 | for_each_zone(zone) { | 4521 | /* Zone size in gigabytes */ |
4369 | unsigned int gb, ratio; | 4522 | gb = zone->present_pages >> (30 - PAGE_SHIFT); |
4370 | 4523 | if (gb) | |
4371 | /* Zone size in gigabytes */ | ||
4372 | gb = zone->present_pages >> (30 - PAGE_SHIFT); | ||
4373 | ratio = int_sqrt(10 * gb); | 4524 | ratio = int_sqrt(10 * gb); |
4374 | if (!ratio) | 4525 | else |
4375 | ratio = 1; | 4526 | ratio = 1; |
4376 | 4527 | ||
4377 | zone->inactive_ratio = ratio; | 4528 | zone->inactive_ratio = ratio; |
4378 | } | 4529 | } |
4530 | |||
4531 | static void __init setup_per_zone_inactive_ratio(void) | ||
4532 | { | ||
4533 | struct zone *zone; | ||
4534 | |||
4535 | for_each_zone(zone) | ||
4536 | calculate_zone_inactive_ratio(zone); | ||
4379 | } | 4537 | } |
4380 | 4538 | ||
4381 | /* | 4539 | /* |
@@ -4402,7 +4560,7 @@ static void setup_per_zone_inactive_ratio(void) | |||
4402 | * 8192MB: 11584k | 4560 | * 8192MB: 11584k |
4403 | * 16384MB: 16384k | 4561 | * 16384MB: 16384k |
4404 | */ | 4562 | */ |
4405 | static int __init init_per_zone_pages_min(void) | 4563 | static int __init init_per_zone_wmark_min(void) |
4406 | { | 4564 | { |
4407 | unsigned long lowmem_kbytes; | 4565 | unsigned long lowmem_kbytes; |
4408 | 4566 | ||
@@ -4413,12 +4571,12 @@ static int __init init_per_zone_pages_min(void) | |||
4413 | min_free_kbytes = 128; | 4571 | min_free_kbytes = 128; |
4414 | if (min_free_kbytes > 65536) | 4572 | if (min_free_kbytes > 65536) |
4415 | min_free_kbytes = 65536; | 4573 | min_free_kbytes = 65536; |
4416 | setup_per_zone_pages_min(); | 4574 | setup_per_zone_wmarks(); |
4417 | setup_per_zone_lowmem_reserve(); | 4575 | setup_per_zone_lowmem_reserve(); |
4418 | setup_per_zone_inactive_ratio(); | 4576 | setup_per_zone_inactive_ratio(); |
4419 | return 0; | 4577 | return 0; |
4420 | } | 4578 | } |
4421 | module_init(init_per_zone_pages_min) | 4579 | module_init(init_per_zone_wmark_min) |
4422 | 4580 | ||
4423 | /* | 4581 | /* |
4424 | * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so | 4582 | * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so |
@@ -4430,7 +4588,7 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write, | |||
4430 | { | 4588 | { |
4431 | proc_dointvec(table, write, file, buffer, length, ppos); | 4589 | proc_dointvec(table, write, file, buffer, length, ppos); |
4432 | if (write) | 4590 | if (write) |
4433 | setup_per_zone_pages_min(); | 4591 | setup_per_zone_wmarks(); |
4434 | return 0; | 4592 | return 0; |
4435 | } | 4593 | } |
4436 | 4594 | ||
@@ -4474,7 +4632,7 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, | |||
4474 | * whenever sysctl_lowmem_reserve_ratio changes. | 4632 | * whenever sysctl_lowmem_reserve_ratio changes. |
4475 | * | 4633 | * |
4476 | * The reserve ratio obviously has absolutely no relation with the | 4634 | * The reserve ratio obviously has absolutely no relation with the |
4477 | * pages_min watermarks. The lowmem reserve ratio can only make sense | 4635 | * minimum watermarks. The lowmem reserve ratio can only make sense |
4478 | * if in function of the boot time zone sizes. | 4636 | * if in function of the boot time zone sizes. |
4479 | */ | 4637 | */ |
4480 | int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, | 4638 | int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, |
@@ -4501,7 +4659,7 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, | |||
4501 | ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos); | 4659 | ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos); |
4502 | if (!write || (ret == -EINVAL)) | 4660 | if (!write || (ret == -EINVAL)) |
4503 | return ret; | 4661 | return ret; |
4504 | for_each_zone(zone) { | 4662 | for_each_populated_zone(zone) { |
4505 | for_each_online_cpu(cpu) { | 4663 | for_each_online_cpu(cpu) { |
4506 | unsigned long high; | 4664 | unsigned long high; |
4507 | high = zone->present_pages / percpu_pagelist_fraction; | 4665 | high = zone->present_pages / percpu_pagelist_fraction; |
@@ -4581,23 +4739,13 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
4581 | else if (hashdist) | 4739 | else if (hashdist) |
4582 | table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); | 4740 | table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); |
4583 | else { | 4741 | else { |
4584 | unsigned long order = get_order(size); | ||
4585 | table = (void*) __get_free_pages(GFP_ATOMIC, order); | ||
4586 | /* | 4742 | /* |
4587 | * If bucketsize is not a power-of-two, we may free | 4743 | * If bucketsize is not a power-of-two, we may free |
4588 | * some pages at the end of hash table. | 4744 | * some pages at the end of hash table which |
4745 | * alloc_pages_exact() automatically does | ||
4589 | */ | 4746 | */ |
4590 | if (table) { | 4747 | if (get_order(size) < MAX_ORDER) |
4591 | unsigned long alloc_end = (unsigned long)table + | 4748 | table = alloc_pages_exact(size, GFP_ATOMIC); |
4592 | (PAGE_SIZE << order); | ||
4593 | unsigned long used = (unsigned long)table + | ||
4594 | PAGE_ALIGN(size); | ||
4595 | split_page(virt_to_page(table), order); | ||
4596 | while (used < alloc_end) { | ||
4597 | free_page(used); | ||
4598 | used += PAGE_SIZE; | ||
4599 | } | ||
4600 | } | ||
4601 | } | 4749 | } |
4602 | } while (!table && size > PAGE_SIZE && --log2qty); | 4750 | } while (!table && size > PAGE_SIZE && --log2qty); |
4603 | 4751 | ||
@@ -4615,6 +4763,16 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
4615 | if (_hash_mask) | 4763 | if (_hash_mask) |
4616 | *_hash_mask = (1 << log2qty) - 1; | 4764 | *_hash_mask = (1 << log2qty) - 1; |
4617 | 4765 | ||
4766 | /* | ||
4767 | * If hashdist is set, the table allocation is done with __vmalloc() | ||
4768 | * which invokes the kmemleak_alloc() callback. This function may also | ||
4769 | * be called before the slab and kmemleak are initialised when | ||
4770 | * kmemleak simply buffers the request to be executed later | ||
4771 | * (GFP_ATOMIC flag ignored in this case). | ||
4772 | */ | ||
4773 | if (!hashdist) | ||
4774 | kmemleak_alloc(table, size, 1, GFP_ATOMIC); | ||
4775 | |||
4618 | return table; | 4776 | return table; |
4619 | } | 4777 | } |
4620 | 4778 | ||
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 791905c991df..f22b4ebbd8dc 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c | |||
@@ -69,7 +69,7 @@ static int __init alloc_node_page_cgroup(int nid) | |||
69 | return 0; | 69 | return 0; |
70 | } | 70 | } |
71 | 71 | ||
72 | void __init page_cgroup_init(void) | 72 | void __init page_cgroup_init_flatmem(void) |
73 | { | 73 | { |
74 | 74 | ||
75 | int nid, fail; | 75 | int nid, fail; |
@@ -83,12 +83,12 @@ void __init page_cgroup_init(void) | |||
83 | goto fail; | 83 | goto fail; |
84 | } | 84 | } |
85 | printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); | 85 | printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); |
86 | printk(KERN_INFO "please try cgroup_disable=memory option if you" | 86 | printk(KERN_INFO "please try 'cgroup_disable=memory' option if you" |
87 | " don't want\n"); | 87 | " don't want memory cgroups\n"); |
88 | return; | 88 | return; |
89 | fail: | 89 | fail: |
90 | printk(KERN_CRIT "allocation of page_cgroup was failed.\n"); | 90 | printk(KERN_CRIT "allocation of page_cgroup failed.\n"); |
91 | printk(KERN_CRIT "please try cgroup_disable=memory boot option\n"); | 91 | printk(KERN_CRIT "please try 'cgroup_disable=memory' boot option\n"); |
92 | panic("Out of memory"); | 92 | panic("Out of memory"); |
93 | } | 93 | } |
94 | 94 | ||
@@ -99,6 +99,8 @@ struct page_cgroup *lookup_page_cgroup(struct page *page) | |||
99 | unsigned long pfn = page_to_pfn(page); | 99 | unsigned long pfn = page_to_pfn(page); |
100 | struct mem_section *section = __pfn_to_section(pfn); | 100 | struct mem_section *section = __pfn_to_section(pfn); |
101 | 101 | ||
102 | if (!section->page_cgroup) | ||
103 | return NULL; | ||
102 | return section->page_cgroup + pfn; | 104 | return section->page_cgroup + pfn; |
103 | } | 105 | } |
104 | 106 | ||
@@ -113,16 +115,11 @@ static int __init_refok init_section_page_cgroup(unsigned long pfn) | |||
113 | if (!section->page_cgroup) { | 115 | if (!section->page_cgroup) { |
114 | nid = page_to_nid(pfn_to_page(pfn)); | 116 | nid = page_to_nid(pfn_to_page(pfn)); |
115 | table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; | 117 | table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; |
116 | if (slab_is_available()) { | 118 | VM_BUG_ON(!slab_is_available()); |
117 | base = kmalloc_node(table_size, | 119 | base = kmalloc_node(table_size, |
118 | GFP_KERNEL | __GFP_NOWARN, nid); | 120 | GFP_KERNEL | __GFP_NOWARN, nid); |
119 | if (!base) | 121 | if (!base) |
120 | base = vmalloc_node(table_size, nid); | 122 | base = vmalloc_node(table_size, nid); |
121 | } else { | ||
122 | base = __alloc_bootmem_node_nopanic(NODE_DATA(nid), | ||
123 | table_size, | ||
124 | PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); | ||
125 | } | ||
126 | } else { | 123 | } else { |
127 | /* | 124 | /* |
128 | * We don't have to allocate page_cgroup again, but | 125 | * We don't have to allocate page_cgroup again, but |
@@ -257,14 +254,14 @@ void __init page_cgroup_init(void) | |||
257 | fail = init_section_page_cgroup(pfn); | 254 | fail = init_section_page_cgroup(pfn); |
258 | } | 255 | } |
259 | if (fail) { | 256 | if (fail) { |
260 | printk(KERN_CRIT "try cgroup_disable=memory boot option\n"); | 257 | printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n"); |
261 | panic("Out of memory"); | 258 | panic("Out of memory"); |
262 | } else { | 259 | } else { |
263 | hotplug_memory_notifier(page_cgroup_callback, 0); | 260 | hotplug_memory_notifier(page_cgroup_callback, 0); |
264 | } | 261 | } |
265 | printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); | 262 | printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); |
266 | printk(KERN_INFO "please try cgroup_disable=memory option if you don't" | 263 | printk(KERN_INFO "please try 'cgroup_disable=memory' option if you don't" |
267 | " want\n"); | 264 | " want memory cgroups\n"); |
268 | } | 265 | } |
269 | 266 | ||
270 | void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) | 267 | void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) |
@@ -314,8 +311,6 @@ static int swap_cgroup_prepare(int type) | |||
314 | struct swap_cgroup_ctrl *ctrl; | 311 | struct swap_cgroup_ctrl *ctrl; |
315 | unsigned long idx, max; | 312 | unsigned long idx, max; |
316 | 313 | ||
317 | if (!do_swap_account) | ||
318 | return 0; | ||
319 | ctrl = &swap_cgroup_ctrl[type]; | 314 | ctrl = &swap_cgroup_ctrl[type]; |
320 | 315 | ||
321 | for (idx = 0; idx < ctrl->length; idx++) { | 316 | for (idx = 0; idx < ctrl->length; idx++) { |
@@ -352,9 +347,6 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) | |||
352 | struct swap_cgroup *sc; | 347 | struct swap_cgroup *sc; |
353 | unsigned short old; | 348 | unsigned short old; |
354 | 349 | ||
355 | if (!do_swap_account) | ||
356 | return 0; | ||
357 | |||
358 | ctrl = &swap_cgroup_ctrl[type]; | 350 | ctrl = &swap_cgroup_ctrl[type]; |
359 | 351 | ||
360 | mappage = ctrl->map[idx]; | 352 | mappage = ctrl->map[idx]; |
@@ -383,9 +375,6 @@ unsigned short lookup_swap_cgroup(swp_entry_t ent) | |||
383 | struct swap_cgroup *sc; | 375 | struct swap_cgroup *sc; |
384 | unsigned short ret; | 376 | unsigned short ret; |
385 | 377 | ||
386 | if (!do_swap_account) | ||
387 | return 0; | ||
388 | |||
389 | ctrl = &swap_cgroup_ctrl[type]; | 378 | ctrl = &swap_cgroup_ctrl[type]; |
390 | mappage = ctrl->map[idx]; | 379 | mappage = ctrl->map[idx]; |
391 | sc = page_address(mappage); | 380 | sc = page_address(mappage); |
diff --git a/mm/page_io.c b/mm/page_io.c index 3023c475e041..c6f3e5071de3 100644 --- a/mm/page_io.c +++ b/mm/page_io.c | |||
@@ -120,7 +120,7 @@ out: | |||
120 | return ret; | 120 | return ret; |
121 | } | 121 | } |
122 | 122 | ||
123 | int swap_readpage(struct file *file, struct page *page) | 123 | int swap_readpage(struct page *page) |
124 | { | 124 | { |
125 | struct bio *bio; | 125 | struct bio *bio; |
126 | int ret = 0; | 126 | int ret = 0; |
diff --git a/mm/percpu.c b/mm/percpu.c index 1aa5d8fbca12..c0b2c1a76e81 100644 --- a/mm/percpu.c +++ b/mm/percpu.c | |||
@@ -23,7 +23,7 @@ | |||
23 | * Allocation is done in offset-size areas of single unit space. Ie, | 23 | * Allocation is done in offset-size areas of single unit space. Ie, |
24 | * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0, | 24 | * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0, |
25 | * c1:u1, c1:u2 and c1:u3. Percpu access can be done by configuring | 25 | * c1:u1, c1:u2 and c1:u3. Percpu access can be done by configuring |
26 | * percpu base registers UNIT_SIZE apart. | 26 | * percpu base registers pcpu_unit_size apart. |
27 | * | 27 | * |
28 | * There are usually many small percpu allocations many of them as | 28 | * There are usually many small percpu allocations many of them as |
29 | * small as 4 bytes. The allocator organizes chunks into lists | 29 | * small as 4 bytes. The allocator organizes chunks into lists |
@@ -38,8 +38,8 @@ | |||
38 | * region and negative allocated. Allocation inside a chunk is done | 38 | * region and negative allocated. Allocation inside a chunk is done |
39 | * by scanning this map sequentially and serving the first matching | 39 | * by scanning this map sequentially and serving the first matching |
40 | * entry. This is mostly copied from the percpu_modalloc() allocator. | 40 | * entry. This is mostly copied from the percpu_modalloc() allocator. |
41 | * Chunks are also linked into a rb tree to ease address to chunk | 41 | * Chunks can be determined from the address using the index field |
42 | * mapping during free. | 42 | * in the page struct. The index field contains a pointer to the chunk. |
43 | * | 43 | * |
44 | * To use this allocator, arch code should do the followings. | 44 | * To use this allocator, arch code should do the followings. |
45 | * | 45 | * |
@@ -61,7 +61,6 @@ | |||
61 | #include <linux/mutex.h> | 61 | #include <linux/mutex.h> |
62 | #include <linux/percpu.h> | 62 | #include <linux/percpu.h> |
63 | #include <linux/pfn.h> | 63 | #include <linux/pfn.h> |
64 | #include <linux/rbtree.h> | ||
65 | #include <linux/slab.h> | 64 | #include <linux/slab.h> |
66 | #include <linux/spinlock.h> | 65 | #include <linux/spinlock.h> |
67 | #include <linux/vmalloc.h> | 66 | #include <linux/vmalloc.h> |
@@ -88,7 +87,6 @@ | |||
88 | 87 | ||
89 | struct pcpu_chunk { | 88 | struct pcpu_chunk { |
90 | struct list_head list; /* linked to pcpu_slot lists */ | 89 | struct list_head list; /* linked to pcpu_slot lists */ |
91 | struct rb_node rb_node; /* key is chunk->vm->addr */ | ||
92 | int free_size; /* free bytes in the chunk */ | 90 | int free_size; /* free bytes in the chunk */ |
93 | int contig_hint; /* max contiguous size hint */ | 91 | int contig_hint; /* max contiguous size hint */ |
94 | struct vm_struct *vm; /* mapped vmalloc region */ | 92 | struct vm_struct *vm; /* mapped vmalloc region */ |
@@ -110,9 +108,21 @@ static size_t pcpu_chunk_struct_size __read_mostly; | |||
110 | void *pcpu_base_addr __read_mostly; | 108 | void *pcpu_base_addr __read_mostly; |
111 | EXPORT_SYMBOL_GPL(pcpu_base_addr); | 109 | EXPORT_SYMBOL_GPL(pcpu_base_addr); |
112 | 110 | ||
113 | /* optional reserved chunk, only accessible for reserved allocations */ | 111 | /* |
112 | * The first chunk which always exists. Note that unlike other | ||
113 | * chunks, this one can be allocated and mapped in several different | ||
114 | * ways and thus often doesn't live in the vmalloc area. | ||
115 | */ | ||
116 | static struct pcpu_chunk *pcpu_first_chunk; | ||
117 | |||
118 | /* | ||
119 | * Optional reserved chunk. This chunk reserves part of the first | ||
120 | * chunk and serves it for reserved allocations. The amount of | ||
121 | * reserved offset is in pcpu_reserved_chunk_limit. When reserved | ||
122 | * area doesn't exist, the following variables contain NULL and 0 | ||
123 | * respectively. | ||
124 | */ | ||
114 | static struct pcpu_chunk *pcpu_reserved_chunk; | 125 | static struct pcpu_chunk *pcpu_reserved_chunk; |
115 | /* offset limit of the reserved chunk */ | ||
116 | static int pcpu_reserved_chunk_limit; | 126 | static int pcpu_reserved_chunk_limit; |
117 | 127 | ||
118 | /* | 128 | /* |
@@ -121,7 +131,7 @@ static int pcpu_reserved_chunk_limit; | |||
121 | * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former | 131 | * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former |
122 | * protects allocation/reclaim paths, chunks and chunk->page arrays. | 132 | * protects allocation/reclaim paths, chunks and chunk->page arrays. |
123 | * The latter is a spinlock and protects the index data structures - | 133 | * The latter is a spinlock and protects the index data structures - |
124 | * chunk slots, rbtree, chunks and area maps in chunks. | 134 | * chunk slots, chunks and area maps in chunks. |
125 | * | 135 | * |
126 | * During allocation, pcpu_alloc_mutex is kept locked all the time and | 136 | * During allocation, pcpu_alloc_mutex is kept locked all the time and |
127 | * pcpu_lock is grabbed and released as necessary. All actual memory | 137 | * pcpu_lock is grabbed and released as necessary. All actual memory |
@@ -140,7 +150,6 @@ static DEFINE_MUTEX(pcpu_alloc_mutex); /* protects whole alloc and reclaim */ | |||
140 | static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */ | 150 | static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */ |
141 | 151 | ||
142 | static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ | 152 | static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ |
143 | static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */ | ||
144 | 153 | ||
145 | /* reclaim work to release fully free chunks, scheduled from free path */ | 154 | /* reclaim work to release fully free chunks, scheduled from free path */ |
146 | static void pcpu_reclaim(struct work_struct *work); | 155 | static void pcpu_reclaim(struct work_struct *work); |
@@ -191,6 +200,18 @@ static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk, | |||
191 | return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL; | 200 | return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL; |
192 | } | 201 | } |
193 | 202 | ||
203 | /* set the pointer to a chunk in a page struct */ | ||
204 | static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu) | ||
205 | { | ||
206 | page->index = (unsigned long)pcpu; | ||
207 | } | ||
208 | |||
209 | /* obtain pointer to a chunk from a page struct */ | ||
210 | static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page) | ||
211 | { | ||
212 | return (struct pcpu_chunk *)page->index; | ||
213 | } | ||
214 | |||
194 | /** | 215 | /** |
195 | * pcpu_mem_alloc - allocate memory | 216 | * pcpu_mem_alloc - allocate memory |
196 | * @size: bytes to allocate | 217 | * @size: bytes to allocate |
@@ -257,93 +278,26 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) | |||
257 | } | 278 | } |
258 | } | 279 | } |
259 | 280 | ||
260 | static struct rb_node **pcpu_chunk_rb_search(void *addr, | ||
261 | struct rb_node **parentp) | ||
262 | { | ||
263 | struct rb_node **p = &pcpu_addr_root.rb_node; | ||
264 | struct rb_node *parent = NULL; | ||
265 | struct pcpu_chunk *chunk; | ||
266 | |||
267 | while (*p) { | ||
268 | parent = *p; | ||
269 | chunk = rb_entry(parent, struct pcpu_chunk, rb_node); | ||
270 | |||
271 | if (addr < chunk->vm->addr) | ||
272 | p = &(*p)->rb_left; | ||
273 | else if (addr > chunk->vm->addr) | ||
274 | p = &(*p)->rb_right; | ||
275 | else | ||
276 | break; | ||
277 | } | ||
278 | |||
279 | if (parentp) | ||
280 | *parentp = parent; | ||
281 | return p; | ||
282 | } | ||
283 | |||
284 | /** | 281 | /** |
285 | * pcpu_chunk_addr_search - search for chunk containing specified address | 282 | * pcpu_chunk_addr_search - determine chunk containing specified address |
286 | * @addr: address to search for | 283 | * @addr: address for which the chunk needs to be determined. |
287 | * | ||
288 | * Look for chunk which might contain @addr. More specifically, it | ||
289 | * searchs for the chunk with the highest start address which isn't | ||
290 | * beyond @addr. | ||
291 | * | ||
292 | * CONTEXT: | ||
293 | * pcpu_lock. | ||
294 | * | 284 | * |
295 | * RETURNS: | 285 | * RETURNS: |
296 | * The address of the found chunk. | 286 | * The address of the found chunk. |
297 | */ | 287 | */ |
298 | static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) | 288 | static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) |
299 | { | 289 | { |
300 | struct rb_node *n, *parent; | 290 | void *first_start = pcpu_first_chunk->vm->addr; |
301 | struct pcpu_chunk *chunk; | ||
302 | 291 | ||
303 | /* is it in the reserved chunk? */ | 292 | /* is it in the first chunk? */ |
304 | if (pcpu_reserved_chunk) { | 293 | if (addr >= first_start && addr < first_start + pcpu_chunk_size) { |
305 | void *start = pcpu_reserved_chunk->vm->addr; | 294 | /* is it in the reserved area? */ |
306 | 295 | if (addr < first_start + pcpu_reserved_chunk_limit) | |
307 | if (addr >= start && addr < start + pcpu_reserved_chunk_limit) | ||
308 | return pcpu_reserved_chunk; | 296 | return pcpu_reserved_chunk; |
297 | return pcpu_first_chunk; | ||
309 | } | 298 | } |
310 | 299 | ||
311 | /* nah... search the regular ones */ | 300 | return pcpu_get_page_chunk(vmalloc_to_page(addr)); |
312 | n = *pcpu_chunk_rb_search(addr, &parent); | ||
313 | if (!n) { | ||
314 | /* no exactly matching chunk, the parent is the closest */ | ||
315 | n = parent; | ||
316 | BUG_ON(!n); | ||
317 | } | ||
318 | chunk = rb_entry(n, struct pcpu_chunk, rb_node); | ||
319 | |||
320 | if (addr < chunk->vm->addr) { | ||
321 | /* the parent was the next one, look for the previous one */ | ||
322 | n = rb_prev(n); | ||
323 | BUG_ON(!n); | ||
324 | chunk = rb_entry(n, struct pcpu_chunk, rb_node); | ||
325 | } | ||
326 | |||
327 | return chunk; | ||
328 | } | ||
329 | |||
330 | /** | ||
331 | * pcpu_chunk_addr_insert - insert chunk into address rb tree | ||
332 | * @new: chunk to insert | ||
333 | * | ||
334 | * Insert @new into address rb tree. | ||
335 | * | ||
336 | * CONTEXT: | ||
337 | * pcpu_lock. | ||
338 | */ | ||
339 | static void pcpu_chunk_addr_insert(struct pcpu_chunk *new) | ||
340 | { | ||
341 | struct rb_node **p, *parent; | ||
342 | |||
343 | p = pcpu_chunk_rb_search(new->vm->addr, &parent); | ||
344 | BUG_ON(*p); | ||
345 | rb_link_node(&new->rb_node, parent, p); | ||
346 | rb_insert_color(&new->rb_node, &pcpu_addr_root); | ||
347 | } | 301 | } |
348 | 302 | ||
349 | /** | 303 | /** |
@@ -755,6 +709,7 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) | |||
755 | alloc_mask, 0); | 709 | alloc_mask, 0); |
756 | if (!*pagep) | 710 | if (!*pagep) |
757 | goto err; | 711 | goto err; |
712 | pcpu_set_page_chunk(*pagep, chunk); | ||
758 | } | 713 | } |
759 | } | 714 | } |
760 | 715 | ||
@@ -879,7 +834,6 @@ restart: | |||
879 | 834 | ||
880 | spin_lock_irq(&pcpu_lock); | 835 | spin_lock_irq(&pcpu_lock); |
881 | pcpu_chunk_relocate(chunk, -1); | 836 | pcpu_chunk_relocate(chunk, -1); |
882 | pcpu_chunk_addr_insert(chunk); | ||
883 | goto restart; | 837 | goto restart; |
884 | 838 | ||
885 | area_found: | 839 | area_found: |
@@ -968,7 +922,6 @@ static void pcpu_reclaim(struct work_struct *work) | |||
968 | if (chunk == list_first_entry(head, struct pcpu_chunk, list)) | 922 | if (chunk == list_first_entry(head, struct pcpu_chunk, list)) |
969 | continue; | 923 | continue; |
970 | 924 | ||
971 | rb_erase(&chunk->rb_node, &pcpu_addr_root); | ||
972 | list_move(&chunk->list, &todo); | 925 | list_move(&chunk->list, &todo); |
973 | } | 926 | } |
974 | 927 | ||
@@ -1147,7 +1100,8 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, | |||
1147 | 1100 | ||
1148 | if (reserved_size) { | 1101 | if (reserved_size) { |
1149 | schunk->free_size = reserved_size; | 1102 | schunk->free_size = reserved_size; |
1150 | pcpu_reserved_chunk = schunk; /* not for dynamic alloc */ | 1103 | pcpu_reserved_chunk = schunk; |
1104 | pcpu_reserved_chunk_limit = static_size + reserved_size; | ||
1151 | } else { | 1105 | } else { |
1152 | schunk->free_size = dyn_size; | 1106 | schunk->free_size = dyn_size; |
1153 | dyn_size = 0; /* dynamic area covered */ | 1107 | dyn_size = 0; /* dynamic area covered */ |
@@ -1158,8 +1112,6 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, | |||
1158 | if (schunk->free_size) | 1112 | if (schunk->free_size) |
1159 | schunk->map[schunk->map_used++] = schunk->free_size; | 1113 | schunk->map[schunk->map_used++] = schunk->free_size; |
1160 | 1114 | ||
1161 | pcpu_reserved_chunk_limit = static_size + schunk->free_size; | ||
1162 | |||
1163 | /* init dynamic chunk if necessary */ | 1115 | /* init dynamic chunk if necessary */ |
1164 | if (dyn_size) { | 1116 | if (dyn_size) { |
1165 | dchunk = alloc_bootmem(sizeof(struct pcpu_chunk)); | 1117 | dchunk = alloc_bootmem(sizeof(struct pcpu_chunk)); |
@@ -1226,13 +1178,8 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, | |||
1226 | } | 1178 | } |
1227 | 1179 | ||
1228 | /* link the first chunk in */ | 1180 | /* link the first chunk in */ |
1229 | if (!dchunk) { | 1181 | pcpu_first_chunk = dchunk ?: schunk; |
1230 | pcpu_chunk_relocate(schunk, -1); | 1182 | pcpu_chunk_relocate(pcpu_first_chunk, -1); |
1231 | pcpu_chunk_addr_insert(schunk); | ||
1232 | } else { | ||
1233 | pcpu_chunk_relocate(dchunk, -1); | ||
1234 | pcpu_chunk_addr_insert(dchunk); | ||
1235 | } | ||
1236 | 1183 | ||
1237 | /* we're done */ | 1184 | /* we're done */ |
1238 | pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0); | 1185 | pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0); |
diff --git a/mm/readahead.c b/mm/readahead.c index 133b6d525513..aa1aa2345235 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
@@ -133,15 +133,12 @@ out: | |||
133 | } | 133 | } |
134 | 134 | ||
135 | /* | 135 | /* |
136 | * do_page_cache_readahead actually reads a chunk of disk. It allocates all | 136 | * __do_page_cache_readahead() actually reads a chunk of disk. It allocates all |
137 | * the pages first, then submits them all for I/O. This avoids the very bad | 137 | * the pages first, then submits them all for I/O. This avoids the very bad |
138 | * behaviour which would occur if page allocations are causing VM writeback. | 138 | * behaviour which would occur if page allocations are causing VM writeback. |
139 | * We really don't want to intermingle reads and writes like that. | 139 | * We really don't want to intermingle reads and writes like that. |
140 | * | 140 | * |
141 | * Returns the number of pages requested, or the maximum amount of I/O allowed. | 141 | * Returns the number of pages requested, or the maximum amount of I/O allowed. |
142 | * | ||
143 | * do_page_cache_readahead() returns -1 if it encountered request queue | ||
144 | * congestion. | ||
145 | */ | 142 | */ |
146 | static int | 143 | static int |
147 | __do_page_cache_readahead(struct address_space *mapping, struct file *filp, | 144 | __do_page_cache_readahead(struct address_space *mapping, struct file *filp, |
@@ -210,6 +207,7 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, | |||
210 | if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages)) | 207 | if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages)) |
211 | return -EINVAL; | 208 | return -EINVAL; |
212 | 209 | ||
210 | nr_to_read = max_sane_readahead(nr_to_read); | ||
213 | while (nr_to_read) { | 211 | while (nr_to_read) { |
214 | int err; | 212 | int err; |
215 | 213 | ||
@@ -231,22 +229,6 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, | |||
231 | } | 229 | } |
232 | 230 | ||
233 | /* | 231 | /* |
234 | * This version skips the IO if the queue is read-congested, and will tell the | ||
235 | * block layer to abandon the readahead if request allocation would block. | ||
236 | * | ||
237 | * force_page_cache_readahead() will ignore queue congestion and will block on | ||
238 | * request queues. | ||
239 | */ | ||
240 | int do_page_cache_readahead(struct address_space *mapping, struct file *filp, | ||
241 | pgoff_t offset, unsigned long nr_to_read) | ||
242 | { | ||
243 | if (bdi_read_congested(mapping->backing_dev_info)) | ||
244 | return -1; | ||
245 | |||
246 | return __do_page_cache_readahead(mapping, filp, offset, nr_to_read, 0); | ||
247 | } | ||
248 | |||
249 | /* | ||
250 | * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a | 232 | * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a |
251 | * sensible upper limit. | 233 | * sensible upper limit. |
252 | */ | 234 | */ |
@@ -259,7 +241,7 @@ unsigned long max_sane_readahead(unsigned long nr) | |||
259 | /* | 241 | /* |
260 | * Submit IO for the read-ahead request in file_ra_state. | 242 | * Submit IO for the read-ahead request in file_ra_state. |
261 | */ | 243 | */ |
262 | static unsigned long ra_submit(struct file_ra_state *ra, | 244 | unsigned long ra_submit(struct file_ra_state *ra, |
263 | struct address_space *mapping, struct file *filp) | 245 | struct address_space *mapping, struct file *filp) |
264 | { | 246 | { |
265 | int actual; | 247 | int actual; |
@@ -348,6 +330,59 @@ static unsigned long get_next_ra_size(struct file_ra_state *ra, | |||
348 | */ | 330 | */ |
349 | 331 | ||
350 | /* | 332 | /* |
333 | * Count contiguously cached pages from @offset-1 to @offset-@max, | ||
334 | * this count is a conservative estimation of | ||
335 | * - length of the sequential read sequence, or | ||
336 | * - thrashing threshold in memory tight systems | ||
337 | */ | ||
338 | static pgoff_t count_history_pages(struct address_space *mapping, | ||
339 | struct file_ra_state *ra, | ||
340 | pgoff_t offset, unsigned long max) | ||
341 | { | ||
342 | pgoff_t head; | ||
343 | |||
344 | rcu_read_lock(); | ||
345 | head = radix_tree_prev_hole(&mapping->page_tree, offset - 1, max); | ||
346 | rcu_read_unlock(); | ||
347 | |||
348 | return offset - 1 - head; | ||
349 | } | ||
350 | |||
351 | /* | ||
352 | * page cache context based read-ahead | ||
353 | */ | ||
354 | static int try_context_readahead(struct address_space *mapping, | ||
355 | struct file_ra_state *ra, | ||
356 | pgoff_t offset, | ||
357 | unsigned long req_size, | ||
358 | unsigned long max) | ||
359 | { | ||
360 | pgoff_t size; | ||
361 | |||
362 | size = count_history_pages(mapping, ra, offset, max); | ||
363 | |||
364 | /* | ||
365 | * no history pages: | ||
366 | * it could be a random read | ||
367 | */ | ||
368 | if (!size) | ||
369 | return 0; | ||
370 | |||
371 | /* | ||
372 | * starts from beginning of file: | ||
373 | * it is a strong indication of long-run stream (or whole-file-read) | ||
374 | */ | ||
375 | if (size >= offset) | ||
376 | size *= 2; | ||
377 | |||
378 | ra->start = offset; | ||
379 | ra->size = get_init_ra_size(size + req_size, max); | ||
380 | ra->async_size = ra->size; | ||
381 | |||
382 | return 1; | ||
383 | } | ||
384 | |||
385 | /* | ||
351 | * A minimal readahead algorithm for trivial sequential/random reads. | 386 | * A minimal readahead algorithm for trivial sequential/random reads. |
352 | */ | 387 | */ |
353 | static unsigned long | 388 | static unsigned long |
@@ -356,34 +391,26 @@ ondemand_readahead(struct address_space *mapping, | |||
356 | bool hit_readahead_marker, pgoff_t offset, | 391 | bool hit_readahead_marker, pgoff_t offset, |
357 | unsigned long req_size) | 392 | unsigned long req_size) |
358 | { | 393 | { |
359 | int max = ra->ra_pages; /* max readahead pages */ | 394 | unsigned long max = max_sane_readahead(ra->ra_pages); |
360 | pgoff_t prev_offset; | 395 | |
361 | int sequential; | 396 | /* |
397 | * start of file | ||
398 | */ | ||
399 | if (!offset) | ||
400 | goto initial_readahead; | ||
362 | 401 | ||
363 | /* | 402 | /* |
364 | * It's the expected callback offset, assume sequential access. | 403 | * It's the expected callback offset, assume sequential access. |
365 | * Ramp up sizes, and push forward the readahead window. | 404 | * Ramp up sizes, and push forward the readahead window. |
366 | */ | 405 | */ |
367 | if (offset && (offset == (ra->start + ra->size - ra->async_size) || | 406 | if ((offset == (ra->start + ra->size - ra->async_size) || |
368 | offset == (ra->start + ra->size))) { | 407 | offset == (ra->start + ra->size))) { |
369 | ra->start += ra->size; | 408 | ra->start += ra->size; |
370 | ra->size = get_next_ra_size(ra, max); | 409 | ra->size = get_next_ra_size(ra, max); |
371 | ra->async_size = ra->size; | 410 | ra->async_size = ra->size; |
372 | goto readit; | 411 | goto readit; |
373 | } | 412 | } |
374 | 413 | ||
375 | prev_offset = ra->prev_pos >> PAGE_CACHE_SHIFT; | ||
376 | sequential = offset - prev_offset <= 1UL || req_size > max; | ||
377 | |||
378 | /* | ||
379 | * Standalone, small read. | ||
380 | * Read as is, and do not pollute the readahead state. | ||
381 | */ | ||
382 | if (!hit_readahead_marker && !sequential) { | ||
383 | return __do_page_cache_readahead(mapping, filp, | ||
384 | offset, req_size, 0); | ||
385 | } | ||
386 | |||
387 | /* | 414 | /* |
388 | * Hit a marked page without valid readahead state. | 415 | * Hit a marked page without valid readahead state. |
389 | * E.g. interleaved reads. | 416 | * E.g. interleaved reads. |
@@ -394,7 +421,7 @@ ondemand_readahead(struct address_space *mapping, | |||
394 | pgoff_t start; | 421 | pgoff_t start; |
395 | 422 | ||
396 | rcu_read_lock(); | 423 | rcu_read_lock(); |
397 | start = radix_tree_next_hole(&mapping->page_tree, offset,max+1); | 424 | start = radix_tree_next_hole(&mapping->page_tree, offset+1,max); |
398 | rcu_read_unlock(); | 425 | rcu_read_unlock(); |
399 | 426 | ||
400 | if (!start || start - offset > max) | 427 | if (!start || start - offset > max) |
@@ -402,23 +429,53 @@ ondemand_readahead(struct address_space *mapping, | |||
402 | 429 | ||
403 | ra->start = start; | 430 | ra->start = start; |
404 | ra->size = start - offset; /* old async_size */ | 431 | ra->size = start - offset; /* old async_size */ |
432 | ra->size += req_size; | ||
405 | ra->size = get_next_ra_size(ra, max); | 433 | ra->size = get_next_ra_size(ra, max); |
406 | ra->async_size = ra->size; | 434 | ra->async_size = ra->size; |
407 | goto readit; | 435 | goto readit; |
408 | } | 436 | } |
409 | 437 | ||
410 | /* | 438 | /* |
411 | * It may be one of | 439 | * oversize read |
412 | * - first read on start of file | 440 | */ |
413 | * - sequential cache miss | 441 | if (req_size > max) |
414 | * - oversize random read | 442 | goto initial_readahead; |
415 | * Start readahead for it. | 443 | |
444 | /* | ||
445 | * sequential cache miss | ||
446 | */ | ||
447 | if (offset - (ra->prev_pos >> PAGE_CACHE_SHIFT) <= 1UL) | ||
448 | goto initial_readahead; | ||
449 | |||
450 | /* | ||
451 | * Query the page cache and look for the traces(cached history pages) | ||
452 | * that a sequential stream would leave behind. | ||
453 | */ | ||
454 | if (try_context_readahead(mapping, ra, offset, req_size, max)) | ||
455 | goto readit; | ||
456 | |||
457 | /* | ||
458 | * standalone, small random read | ||
459 | * Read as is, and do not pollute the readahead state. | ||
416 | */ | 460 | */ |
461 | return __do_page_cache_readahead(mapping, filp, offset, req_size, 0); | ||
462 | |||
463 | initial_readahead: | ||
417 | ra->start = offset; | 464 | ra->start = offset; |
418 | ra->size = get_init_ra_size(req_size, max); | 465 | ra->size = get_init_ra_size(req_size, max); |
419 | ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size; | 466 | ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size; |
420 | 467 | ||
421 | readit: | 468 | readit: |
469 | /* | ||
470 | * Will this read hit the readahead marker made by itself? | ||
471 | * If so, trigger the readahead marker hit now, and merge | ||
472 | * the resulted next readahead window into the current one. | ||
473 | */ | ||
474 | if (offset == ra->start && ra->size == ra->async_size) { | ||
475 | ra->async_size = get_next_ra_size(ra, max); | ||
476 | ra->size += ra->async_size; | ||
477 | } | ||
478 | |||
422 | return ra_submit(ra, mapping, filp); | 479 | return ra_submit(ra, mapping, filp); |
423 | } | 480 | } |
424 | 481 | ||
@@ -333,7 +333,9 @@ static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) | |||
333 | * repeatedly from either page_referenced_anon or page_referenced_file. | 333 | * repeatedly from either page_referenced_anon or page_referenced_file. |
334 | */ | 334 | */ |
335 | static int page_referenced_one(struct page *page, | 335 | static int page_referenced_one(struct page *page, |
336 | struct vm_area_struct *vma, unsigned int *mapcount) | 336 | struct vm_area_struct *vma, |
337 | unsigned int *mapcount, | ||
338 | unsigned long *vm_flags) | ||
337 | { | 339 | { |
338 | struct mm_struct *mm = vma->vm_mm; | 340 | struct mm_struct *mm = vma->vm_mm; |
339 | unsigned long address; | 341 | unsigned long address; |
@@ -381,11 +383,14 @@ out_unmap: | |||
381 | (*mapcount)--; | 383 | (*mapcount)--; |
382 | pte_unmap_unlock(pte, ptl); | 384 | pte_unmap_unlock(pte, ptl); |
383 | out: | 385 | out: |
386 | if (referenced) | ||
387 | *vm_flags |= vma->vm_flags; | ||
384 | return referenced; | 388 | return referenced; |
385 | } | 389 | } |
386 | 390 | ||
387 | static int page_referenced_anon(struct page *page, | 391 | static int page_referenced_anon(struct page *page, |
388 | struct mem_cgroup *mem_cont) | 392 | struct mem_cgroup *mem_cont, |
393 | unsigned long *vm_flags) | ||
389 | { | 394 | { |
390 | unsigned int mapcount; | 395 | unsigned int mapcount; |
391 | struct anon_vma *anon_vma; | 396 | struct anon_vma *anon_vma; |
@@ -405,7 +410,8 @@ static int page_referenced_anon(struct page *page, | |||
405 | */ | 410 | */ |
406 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) | 411 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) |
407 | continue; | 412 | continue; |
408 | referenced += page_referenced_one(page, vma, &mapcount); | 413 | referenced += page_referenced_one(page, vma, |
414 | &mapcount, vm_flags); | ||
409 | if (!mapcount) | 415 | if (!mapcount) |
410 | break; | 416 | break; |
411 | } | 417 | } |
@@ -418,6 +424,7 @@ static int page_referenced_anon(struct page *page, | |||
418 | * page_referenced_file - referenced check for object-based rmap | 424 | * page_referenced_file - referenced check for object-based rmap |
419 | * @page: the page we're checking references on. | 425 | * @page: the page we're checking references on. |
420 | * @mem_cont: target memory controller | 426 | * @mem_cont: target memory controller |
427 | * @vm_flags: collect encountered vma->vm_flags who actually referenced the page | ||
421 | * | 428 | * |
422 | * For an object-based mapped page, find all the places it is mapped and | 429 | * For an object-based mapped page, find all the places it is mapped and |
423 | * check/clear the referenced flag. This is done by following the page->mapping | 430 | * check/clear the referenced flag. This is done by following the page->mapping |
@@ -427,7 +434,8 @@ static int page_referenced_anon(struct page *page, | |||
427 | * This function is only called from page_referenced for object-based pages. | 434 | * This function is only called from page_referenced for object-based pages. |
428 | */ | 435 | */ |
429 | static int page_referenced_file(struct page *page, | 436 | static int page_referenced_file(struct page *page, |
430 | struct mem_cgroup *mem_cont) | 437 | struct mem_cgroup *mem_cont, |
438 | unsigned long *vm_flags) | ||
431 | { | 439 | { |
432 | unsigned int mapcount; | 440 | unsigned int mapcount; |
433 | struct address_space *mapping = page->mapping; | 441 | struct address_space *mapping = page->mapping; |
@@ -467,7 +475,8 @@ static int page_referenced_file(struct page *page, | |||
467 | */ | 475 | */ |
468 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) | 476 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) |
469 | continue; | 477 | continue; |
470 | referenced += page_referenced_one(page, vma, &mapcount); | 478 | referenced += page_referenced_one(page, vma, |
479 | &mapcount, vm_flags); | ||
471 | if (!mapcount) | 480 | if (!mapcount) |
472 | break; | 481 | break; |
473 | } | 482 | } |
@@ -481,29 +490,35 @@ static int page_referenced_file(struct page *page, | |||
481 | * @page: the page to test | 490 | * @page: the page to test |
482 | * @is_locked: caller holds lock on the page | 491 | * @is_locked: caller holds lock on the page |
483 | * @mem_cont: target memory controller | 492 | * @mem_cont: target memory controller |
493 | * @vm_flags: collect encountered vma->vm_flags who actually referenced the page | ||
484 | * | 494 | * |
485 | * Quick test_and_clear_referenced for all mappings to a page, | 495 | * Quick test_and_clear_referenced for all mappings to a page, |
486 | * returns the number of ptes which referenced the page. | 496 | * returns the number of ptes which referenced the page. |
487 | */ | 497 | */ |
488 | int page_referenced(struct page *page, int is_locked, | 498 | int page_referenced(struct page *page, |
489 | struct mem_cgroup *mem_cont) | 499 | int is_locked, |
500 | struct mem_cgroup *mem_cont, | ||
501 | unsigned long *vm_flags) | ||
490 | { | 502 | { |
491 | int referenced = 0; | 503 | int referenced = 0; |
492 | 504 | ||
493 | if (TestClearPageReferenced(page)) | 505 | if (TestClearPageReferenced(page)) |
494 | referenced++; | 506 | referenced++; |
495 | 507 | ||
508 | *vm_flags = 0; | ||
496 | if (page_mapped(page) && page->mapping) { | 509 | if (page_mapped(page) && page->mapping) { |
497 | if (PageAnon(page)) | 510 | if (PageAnon(page)) |
498 | referenced += page_referenced_anon(page, mem_cont); | 511 | referenced += page_referenced_anon(page, mem_cont, |
512 | vm_flags); | ||
499 | else if (is_locked) | 513 | else if (is_locked) |
500 | referenced += page_referenced_file(page, mem_cont); | 514 | referenced += page_referenced_file(page, mem_cont, |
515 | vm_flags); | ||
501 | else if (!trylock_page(page)) | 516 | else if (!trylock_page(page)) |
502 | referenced++; | 517 | referenced++; |
503 | else { | 518 | else { |
504 | if (page->mapping) | 519 | if (page->mapping) |
505 | referenced += | 520 | referenced += page_referenced_file(page, |
506 | page_referenced_file(page, mem_cont); | 521 | mem_cont, vm_flags); |
507 | unlock_page(page); | 522 | unlock_page(page); |
508 | } | 523 | } |
509 | } | 524 | } |
@@ -688,8 +703,10 @@ void page_add_new_anon_rmap(struct page *page, | |||
688 | */ | 703 | */ |
689 | void page_add_file_rmap(struct page *page) | 704 | void page_add_file_rmap(struct page *page) |
690 | { | 705 | { |
691 | if (atomic_inc_and_test(&page->_mapcount)) | 706 | if (atomic_inc_and_test(&page->_mapcount)) { |
692 | __inc_zone_page_state(page, NR_FILE_MAPPED); | 707 | __inc_zone_page_state(page, NR_FILE_MAPPED); |
708 | mem_cgroup_update_mapped_file_stat(page, 1); | ||
709 | } | ||
693 | } | 710 | } |
694 | 711 | ||
695 | #ifdef CONFIG_DEBUG_VM | 712 | #ifdef CONFIG_DEBUG_VM |
@@ -738,6 +755,7 @@ void page_remove_rmap(struct page *page) | |||
738 | mem_cgroup_uncharge_page(page); | 755 | mem_cgroup_uncharge_page(page); |
739 | __dec_zone_page_state(page, | 756 | __dec_zone_page_state(page, |
740 | PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED); | 757 | PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED); |
758 | mem_cgroup_update_mapped_file_stat(page, -1); | ||
741 | /* | 759 | /* |
742 | * It would be tidy to reset the PageAnon mapping here, | 760 | * It would be tidy to reset the PageAnon mapping here, |
743 | * but that might overwrite a racing page_add_anon_rmap | 761 | * but that might overwrite a racing page_add_anon_rmap |
@@ -1202,7 +1220,6 @@ int try_to_unmap(struct page *page, int migration) | |||
1202 | return ret; | 1220 | return ret; |
1203 | } | 1221 | } |
1204 | 1222 | ||
1205 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
1206 | /** | 1223 | /** |
1207 | * try_to_munlock - try to munlock a page | 1224 | * try_to_munlock - try to munlock a page |
1208 | * @page: the page to be munlocked | 1225 | * @page: the page to be munlocked |
@@ -1226,4 +1243,4 @@ int try_to_munlock(struct page *page) | |||
1226 | else | 1243 | else |
1227 | return try_to_unmap_file(page, 1, 0); | 1244 | return try_to_unmap_file(page, 1, 0); |
1228 | } | 1245 | } |
1229 | #endif | 1246 | |
diff --git a/mm/shmem.c b/mm/shmem.c index b25f95ce3db7..d713239ce2ce 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -1097,7 +1097,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) | |||
1097 | shmem_swp_unmap(entry); | 1097 | shmem_swp_unmap(entry); |
1098 | unlock: | 1098 | unlock: |
1099 | spin_unlock(&info->lock); | 1099 | spin_unlock(&info->lock); |
1100 | swap_free(swap); | 1100 | swapcache_free(swap, NULL); |
1101 | redirty: | 1101 | redirty: |
1102 | set_page_dirty(page); | 1102 | set_page_dirty(page); |
1103 | if (wbc->for_reclaim) | 1103 | if (wbc->for_reclaim) |
@@ -1558,6 +1558,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, int mode, | |||
1558 | spin_lock_init(&info->lock); | 1558 | spin_lock_init(&info->lock); |
1559 | info->flags = flags & VM_NORESERVE; | 1559 | info->flags = flags & VM_NORESERVE; |
1560 | INIT_LIST_HEAD(&info->swaplist); | 1560 | INIT_LIST_HEAD(&info->swaplist); |
1561 | cache_no_acl(inode); | ||
1561 | 1562 | ||
1562 | switch (mode & S_IFMT) { | 1563 | switch (mode & S_IFMT) { |
1563 | default: | 1564 | default: |
@@ -2388,7 +2389,6 @@ static void shmem_destroy_inode(struct inode *inode) | |||
2388 | /* only struct inode is valid if it's an inline symlink */ | 2389 | /* only struct inode is valid if it's an inline symlink */ |
2389 | mpol_free_shared_policy(&SHMEM_I(inode)->policy); | 2390 | mpol_free_shared_policy(&SHMEM_I(inode)->policy); |
2390 | } | 2391 | } |
2391 | shmem_acl_destroy_inode(inode); | ||
2392 | kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); | 2392 | kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); |
2393 | } | 2393 | } |
2394 | 2394 | ||
@@ -2397,10 +2397,6 @@ static void init_once(void *foo) | |||
2397 | struct shmem_inode_info *p = (struct shmem_inode_info *) foo; | 2397 | struct shmem_inode_info *p = (struct shmem_inode_info *) foo; |
2398 | 2398 | ||
2399 | inode_init_once(&p->vfs_inode); | 2399 | inode_init_once(&p->vfs_inode); |
2400 | #ifdef CONFIG_TMPFS_POSIX_ACL | ||
2401 | p->i_acl = NULL; | ||
2402 | p->i_default_acl = NULL; | ||
2403 | #endif | ||
2404 | } | 2400 | } |
2405 | 2401 | ||
2406 | static int init_inodecache(void) | 2402 | static int init_inodecache(void) |
@@ -2612,7 +2608,7 @@ int shmem_unuse(swp_entry_t entry, struct page *page) | |||
2612 | * @size: size to be set for the file | 2608 | * @size: size to be set for the file |
2613 | * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size | 2609 | * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size |
2614 | */ | 2610 | */ |
2615 | struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) | 2611 | struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags) |
2616 | { | 2612 | { |
2617 | int error; | 2613 | int error; |
2618 | struct file *file; | 2614 | struct file *file; |
@@ -2659,6 +2655,7 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) | |||
2659 | if (error) | 2655 | if (error) |
2660 | goto close_file; | 2656 | goto close_file; |
2661 | #endif | 2657 | #endif |
2658 | ima_counts_get(file); | ||
2662 | return file; | 2659 | return file; |
2663 | 2660 | ||
2664 | close_file: | 2661 | close_file: |
@@ -2684,7 +2681,6 @@ int shmem_zero_setup(struct vm_area_struct *vma) | |||
2684 | if (IS_ERR(file)) | 2681 | if (IS_ERR(file)) |
2685 | return PTR_ERR(file); | 2682 | return PTR_ERR(file); |
2686 | 2683 | ||
2687 | ima_shm_check(file); | ||
2688 | if (vma->vm_file) | 2684 | if (vma->vm_file) |
2689 | fput(vma->vm_file); | 2685 | fput(vma->vm_file); |
2690 | vma->vm_file = file; | 2686 | vma->vm_file = file; |
diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c index 8e5aadd7dcd6..606a8e757a42 100644 --- a/mm/shmem_acl.c +++ b/mm/shmem_acl.c | |||
@@ -22,11 +22,11 @@ shmem_get_acl(struct inode *inode, int type) | |||
22 | spin_lock(&inode->i_lock); | 22 | spin_lock(&inode->i_lock); |
23 | switch(type) { | 23 | switch(type) { |
24 | case ACL_TYPE_ACCESS: | 24 | case ACL_TYPE_ACCESS: |
25 | acl = posix_acl_dup(SHMEM_I(inode)->i_acl); | 25 | acl = posix_acl_dup(inode->i_acl); |
26 | break; | 26 | break; |
27 | 27 | ||
28 | case ACL_TYPE_DEFAULT: | 28 | case ACL_TYPE_DEFAULT: |
29 | acl = posix_acl_dup(SHMEM_I(inode)->i_default_acl); | 29 | acl = posix_acl_dup(inode->i_default_acl); |
30 | break; | 30 | break; |
31 | } | 31 | } |
32 | spin_unlock(&inode->i_lock); | 32 | spin_unlock(&inode->i_lock); |
@@ -45,13 +45,13 @@ shmem_set_acl(struct inode *inode, int type, struct posix_acl *acl) | |||
45 | spin_lock(&inode->i_lock); | 45 | spin_lock(&inode->i_lock); |
46 | switch(type) { | 46 | switch(type) { |
47 | case ACL_TYPE_ACCESS: | 47 | case ACL_TYPE_ACCESS: |
48 | free = SHMEM_I(inode)->i_acl; | 48 | free = inode->i_acl; |
49 | SHMEM_I(inode)->i_acl = posix_acl_dup(acl); | 49 | inode->i_acl = posix_acl_dup(acl); |
50 | break; | 50 | break; |
51 | 51 | ||
52 | case ACL_TYPE_DEFAULT: | 52 | case ACL_TYPE_DEFAULT: |
53 | free = SHMEM_I(inode)->i_default_acl; | 53 | free = inode->i_default_acl; |
54 | SHMEM_I(inode)->i_default_acl = posix_acl_dup(acl); | 54 | inode->i_default_acl = posix_acl_dup(acl); |
55 | break; | 55 | break; |
56 | } | 56 | } |
57 | spin_unlock(&inode->i_lock); | 57 | spin_unlock(&inode->i_lock); |
@@ -155,23 +155,6 @@ shmem_acl_init(struct inode *inode, struct inode *dir) | |||
155 | } | 155 | } |
156 | 156 | ||
157 | /** | 157 | /** |
158 | * shmem_acl_destroy_inode - destroy acls hanging off the in-memory inode | ||
159 | * | ||
160 | * This is done before destroying the actual inode. | ||
161 | */ | ||
162 | |||
163 | void | ||
164 | shmem_acl_destroy_inode(struct inode *inode) | ||
165 | { | ||
166 | if (SHMEM_I(inode)->i_acl) | ||
167 | posix_acl_release(SHMEM_I(inode)->i_acl); | ||
168 | SHMEM_I(inode)->i_acl = NULL; | ||
169 | if (SHMEM_I(inode)->i_default_acl) | ||
170 | posix_acl_release(SHMEM_I(inode)->i_default_acl); | ||
171 | SHMEM_I(inode)->i_default_acl = NULL; | ||
172 | } | ||
173 | |||
174 | /** | ||
175 | * shmem_check_acl - check_acl() callback for generic_permission() | 158 | * shmem_check_acl - check_acl() callback for generic_permission() |
176 | */ | 159 | */ |
177 | static int | 160 | static int |
@@ -102,17 +102,19 @@ | |||
102 | #include <linux/cpu.h> | 102 | #include <linux/cpu.h> |
103 | #include <linux/sysctl.h> | 103 | #include <linux/sysctl.h> |
104 | #include <linux/module.h> | 104 | #include <linux/module.h> |
105 | #include <trace/kmemtrace.h> | 105 | #include <linux/kmemtrace.h> |
106 | #include <linux/rcupdate.h> | 106 | #include <linux/rcupdate.h> |
107 | #include <linux/string.h> | 107 | #include <linux/string.h> |
108 | #include <linux/uaccess.h> | 108 | #include <linux/uaccess.h> |
109 | #include <linux/nodemask.h> | 109 | #include <linux/nodemask.h> |
110 | #include <linux/kmemleak.h> | ||
110 | #include <linux/mempolicy.h> | 111 | #include <linux/mempolicy.h> |
111 | #include <linux/mutex.h> | 112 | #include <linux/mutex.h> |
112 | #include <linux/fault-inject.h> | 113 | #include <linux/fault-inject.h> |
113 | #include <linux/rtmutex.h> | 114 | #include <linux/rtmutex.h> |
114 | #include <linux/reciprocal_div.h> | 115 | #include <linux/reciprocal_div.h> |
115 | #include <linux/debugobjects.h> | 116 | #include <linux/debugobjects.h> |
117 | #include <linux/kmemcheck.h> | ||
116 | 118 | ||
117 | #include <asm/cacheflush.h> | 119 | #include <asm/cacheflush.h> |
118 | #include <asm/tlbflush.h> | 120 | #include <asm/tlbflush.h> |
@@ -178,13 +180,13 @@ | |||
178 | SLAB_STORE_USER | \ | 180 | SLAB_STORE_USER | \ |
179 | SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ | 181 | SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ |
180 | SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ | 182 | SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ |
181 | SLAB_DEBUG_OBJECTS) | 183 | SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK) |
182 | #else | 184 | #else |
183 | # define CREATE_MASK (SLAB_HWCACHE_ALIGN | \ | 185 | # define CREATE_MASK (SLAB_HWCACHE_ALIGN | \ |
184 | SLAB_CACHE_DMA | \ | 186 | SLAB_CACHE_DMA | \ |
185 | SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ | 187 | SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ |
186 | SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ | 188 | SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ |
187 | SLAB_DEBUG_OBJECTS) | 189 | SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK) |
188 | #endif | 190 | #endif |
189 | 191 | ||
190 | /* | 192 | /* |
@@ -315,7 +317,7 @@ static int drain_freelist(struct kmem_cache *cache, | |||
315 | struct kmem_list3 *l3, int tofree); | 317 | struct kmem_list3 *l3, int tofree); |
316 | static void free_block(struct kmem_cache *cachep, void **objpp, int len, | 318 | static void free_block(struct kmem_cache *cachep, void **objpp, int len, |
317 | int node); | 319 | int node); |
318 | static int enable_cpucache(struct kmem_cache *cachep); | 320 | static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp); |
319 | static void cache_reap(struct work_struct *unused); | 321 | static void cache_reap(struct work_struct *unused); |
320 | 322 | ||
321 | /* | 323 | /* |
@@ -373,87 +375,6 @@ static void kmem_list3_init(struct kmem_list3 *parent) | |||
373 | MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ | 375 | MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ |
374 | } while (0) | 376 | } while (0) |
375 | 377 | ||
376 | /* | ||
377 | * struct kmem_cache | ||
378 | * | ||
379 | * manages a cache. | ||
380 | */ | ||
381 | |||
382 | struct kmem_cache { | ||
383 | /* 1) per-cpu data, touched during every alloc/free */ | ||
384 | struct array_cache *array[NR_CPUS]; | ||
385 | /* 2) Cache tunables. Protected by cache_chain_mutex */ | ||
386 | unsigned int batchcount; | ||
387 | unsigned int limit; | ||
388 | unsigned int shared; | ||
389 | |||
390 | unsigned int buffer_size; | ||
391 | u32 reciprocal_buffer_size; | ||
392 | /* 3) touched by every alloc & free from the backend */ | ||
393 | |||
394 | unsigned int flags; /* constant flags */ | ||
395 | unsigned int num; /* # of objs per slab */ | ||
396 | |||
397 | /* 4) cache_grow/shrink */ | ||
398 | /* order of pgs per slab (2^n) */ | ||
399 | unsigned int gfporder; | ||
400 | |||
401 | /* force GFP flags, e.g. GFP_DMA */ | ||
402 | gfp_t gfpflags; | ||
403 | |||
404 | size_t colour; /* cache colouring range */ | ||
405 | unsigned int colour_off; /* colour offset */ | ||
406 | struct kmem_cache *slabp_cache; | ||
407 | unsigned int slab_size; | ||
408 | unsigned int dflags; /* dynamic flags */ | ||
409 | |||
410 | /* constructor func */ | ||
411 | void (*ctor)(void *obj); | ||
412 | |||
413 | /* 5) cache creation/removal */ | ||
414 | const char *name; | ||
415 | struct list_head next; | ||
416 | |||
417 | /* 6) statistics */ | ||
418 | #if STATS | ||
419 | unsigned long num_active; | ||
420 | unsigned long num_allocations; | ||
421 | unsigned long high_mark; | ||
422 | unsigned long grown; | ||
423 | unsigned long reaped; | ||
424 | unsigned long errors; | ||
425 | unsigned long max_freeable; | ||
426 | unsigned long node_allocs; | ||
427 | unsigned long node_frees; | ||
428 | unsigned long node_overflow; | ||
429 | atomic_t allochit; | ||
430 | atomic_t allocmiss; | ||
431 | atomic_t freehit; | ||
432 | atomic_t freemiss; | ||
433 | #endif | ||
434 | #if DEBUG | ||
435 | /* | ||
436 | * If debugging is enabled, then the allocator can add additional | ||
437 | * fields and/or padding to every object. buffer_size contains the total | ||
438 | * object size including these internal fields, the following two | ||
439 | * variables contain the offset to the user object and its size. | ||
440 | */ | ||
441 | int obj_offset; | ||
442 | int obj_size; | ||
443 | #endif | ||
444 | /* | ||
445 | * We put nodelists[] at the end of kmem_cache, because we want to size | ||
446 | * this array to nr_node_ids slots instead of MAX_NUMNODES | ||
447 | * (see kmem_cache_init()) | ||
448 | * We still use [MAX_NUMNODES] and not [1] or [0] because cache_cache | ||
449 | * is statically defined, so we reserve the max number of nodes. | ||
450 | */ | ||
451 | struct kmem_list3 *nodelists[MAX_NUMNODES]; | ||
452 | /* | ||
453 | * Do not add fields after nodelists[] | ||
454 | */ | ||
455 | }; | ||
456 | |||
457 | #define CFLGS_OFF_SLAB (0x80000000UL) | 378 | #define CFLGS_OFF_SLAB (0x80000000UL) |
458 | #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) | 379 | #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) |
459 | 380 | ||
@@ -752,6 +673,7 @@ static enum { | |||
752 | NONE, | 673 | NONE, |
753 | PARTIAL_AC, | 674 | PARTIAL_AC, |
754 | PARTIAL_L3, | 675 | PARTIAL_L3, |
676 | EARLY, | ||
755 | FULL | 677 | FULL |
756 | } g_cpucache_up; | 678 | } g_cpucache_up; |
757 | 679 | ||
@@ -760,7 +682,7 @@ static enum { | |||
760 | */ | 682 | */ |
761 | int slab_is_available(void) | 683 | int slab_is_available(void) |
762 | { | 684 | { |
763 | return g_cpucache_up == FULL; | 685 | return g_cpucache_up >= EARLY; |
764 | } | 686 | } |
765 | 687 | ||
766 | static DEFINE_PER_CPU(struct delayed_work, reap_work); | 688 | static DEFINE_PER_CPU(struct delayed_work, reap_work); |
@@ -890,7 +812,6 @@ static void __slab_error(const char *function, struct kmem_cache *cachep, | |||
890 | */ | 812 | */ |
891 | 813 | ||
892 | static int use_alien_caches __read_mostly = 1; | 814 | static int use_alien_caches __read_mostly = 1; |
893 | static int numa_platform __read_mostly = 1; | ||
894 | static int __init noaliencache_setup(char *s) | 815 | static int __init noaliencache_setup(char *s) |
895 | { | 816 | { |
896 | use_alien_caches = 0; | 817 | use_alien_caches = 0; |
@@ -958,12 +879,20 @@ static void __cpuinit start_cpu_timer(int cpu) | |||
958 | } | 879 | } |
959 | 880 | ||
960 | static struct array_cache *alloc_arraycache(int node, int entries, | 881 | static struct array_cache *alloc_arraycache(int node, int entries, |
961 | int batchcount) | 882 | int batchcount, gfp_t gfp) |
962 | { | 883 | { |
963 | int memsize = sizeof(void *) * entries + sizeof(struct array_cache); | 884 | int memsize = sizeof(void *) * entries + sizeof(struct array_cache); |
964 | struct array_cache *nc = NULL; | 885 | struct array_cache *nc = NULL; |
965 | 886 | ||
966 | nc = kmalloc_node(memsize, GFP_KERNEL, node); | 887 | nc = kmalloc_node(memsize, gfp, node); |
888 | /* | ||
889 | * The array_cache structures contain pointers to free objects. | ||
890 | * However, when such objects are allocated or transferred to another | ||
891 | * cache the pointers are not cleared and they could be counted as | ||
892 | * valid references during a kmemleak scan. Therefore, kmemleak must | ||
893 | * not scan such objects. | ||
894 | */ | ||
895 | kmemleak_no_scan(nc); | ||
967 | if (nc) { | 896 | if (nc) { |
968 | nc->avail = 0; | 897 | nc->avail = 0; |
969 | nc->limit = entries; | 898 | nc->limit = entries; |
@@ -1003,7 +932,7 @@ static int transfer_objects(struct array_cache *to, | |||
1003 | #define drain_alien_cache(cachep, alien) do { } while (0) | 932 | #define drain_alien_cache(cachep, alien) do { } while (0) |
1004 | #define reap_alien(cachep, l3) do { } while (0) | 933 | #define reap_alien(cachep, l3) do { } while (0) |
1005 | 934 | ||
1006 | static inline struct array_cache **alloc_alien_cache(int node, int limit) | 935 | static inline struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) |
1007 | { | 936 | { |
1008 | return (struct array_cache **)BAD_ALIEN_MAGIC; | 937 | return (struct array_cache **)BAD_ALIEN_MAGIC; |
1009 | } | 938 | } |
@@ -1034,7 +963,7 @@ static inline void *____cache_alloc_node(struct kmem_cache *cachep, | |||
1034 | static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); | 963 | static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); |
1035 | static void *alternate_node_alloc(struct kmem_cache *, gfp_t); | 964 | static void *alternate_node_alloc(struct kmem_cache *, gfp_t); |
1036 | 965 | ||
1037 | static struct array_cache **alloc_alien_cache(int node, int limit) | 966 | static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) |
1038 | { | 967 | { |
1039 | struct array_cache **ac_ptr; | 968 | struct array_cache **ac_ptr; |
1040 | int memsize = sizeof(void *) * nr_node_ids; | 969 | int memsize = sizeof(void *) * nr_node_ids; |
@@ -1042,14 +971,14 @@ static struct array_cache **alloc_alien_cache(int node, int limit) | |||
1042 | 971 | ||
1043 | if (limit > 1) | 972 | if (limit > 1) |
1044 | limit = 12; | 973 | limit = 12; |
1045 | ac_ptr = kmalloc_node(memsize, GFP_KERNEL, node); | 974 | ac_ptr = kmalloc_node(memsize, gfp, node); |
1046 | if (ac_ptr) { | 975 | if (ac_ptr) { |
1047 | for_each_node(i) { | 976 | for_each_node(i) { |
1048 | if (i == node || !node_online(i)) { | 977 | if (i == node || !node_online(i)) { |
1049 | ac_ptr[i] = NULL; | 978 | ac_ptr[i] = NULL; |
1050 | continue; | 979 | continue; |
1051 | } | 980 | } |
1052 | ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d); | 981 | ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp); |
1053 | if (!ac_ptr[i]) { | 982 | if (!ac_ptr[i]) { |
1054 | for (i--; i >= 0; i--) | 983 | for (i--; i >= 0; i--) |
1055 | kfree(ac_ptr[i]); | 984 | kfree(ac_ptr[i]); |
@@ -1282,20 +1211,20 @@ static int __cpuinit cpuup_prepare(long cpu) | |||
1282 | struct array_cache **alien = NULL; | 1211 | struct array_cache **alien = NULL; |
1283 | 1212 | ||
1284 | nc = alloc_arraycache(node, cachep->limit, | 1213 | nc = alloc_arraycache(node, cachep->limit, |
1285 | cachep->batchcount); | 1214 | cachep->batchcount, GFP_KERNEL); |
1286 | if (!nc) | 1215 | if (!nc) |
1287 | goto bad; | 1216 | goto bad; |
1288 | if (cachep->shared) { | 1217 | if (cachep->shared) { |
1289 | shared = alloc_arraycache(node, | 1218 | shared = alloc_arraycache(node, |
1290 | cachep->shared * cachep->batchcount, | 1219 | cachep->shared * cachep->batchcount, |
1291 | 0xbaadf00d); | 1220 | 0xbaadf00d, GFP_KERNEL); |
1292 | if (!shared) { | 1221 | if (!shared) { |
1293 | kfree(nc); | 1222 | kfree(nc); |
1294 | goto bad; | 1223 | goto bad; |
1295 | } | 1224 | } |
1296 | } | 1225 | } |
1297 | if (use_alien_caches) { | 1226 | if (use_alien_caches) { |
1298 | alien = alloc_alien_cache(node, cachep->limit); | 1227 | alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL); |
1299 | if (!alien) { | 1228 | if (!alien) { |
1300 | kfree(shared); | 1229 | kfree(shared); |
1301 | kfree(nc); | 1230 | kfree(nc); |
@@ -1399,10 +1328,9 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, | |||
1399 | { | 1328 | { |
1400 | struct kmem_list3 *ptr; | 1329 | struct kmem_list3 *ptr; |
1401 | 1330 | ||
1402 | ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid); | 1331 | ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_NOWAIT, nodeid); |
1403 | BUG_ON(!ptr); | 1332 | BUG_ON(!ptr); |
1404 | 1333 | ||
1405 | local_irq_disable(); | ||
1406 | memcpy(ptr, list, sizeof(struct kmem_list3)); | 1334 | memcpy(ptr, list, sizeof(struct kmem_list3)); |
1407 | /* | 1335 | /* |
1408 | * Do not assume that spinlocks can be initialized via memcpy: | 1336 | * Do not assume that spinlocks can be initialized via memcpy: |
@@ -1411,7 +1339,6 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, | |||
1411 | 1339 | ||
1412 | MAKE_ALL_LISTS(cachep, ptr, nodeid); | 1340 | MAKE_ALL_LISTS(cachep, ptr, nodeid); |
1413 | cachep->nodelists[nodeid] = ptr; | 1341 | cachep->nodelists[nodeid] = ptr; |
1414 | local_irq_enable(); | ||
1415 | } | 1342 | } |
1416 | 1343 | ||
1417 | /* | 1344 | /* |
@@ -1443,10 +1370,8 @@ void __init kmem_cache_init(void) | |||
1443 | int order; | 1370 | int order; |
1444 | int node; | 1371 | int node; |
1445 | 1372 | ||
1446 | if (num_possible_nodes() == 1) { | 1373 | if (num_possible_nodes() == 1) |
1447 | use_alien_caches = 0; | 1374 | use_alien_caches = 0; |
1448 | numa_platform = 0; | ||
1449 | } | ||
1450 | 1375 | ||
1451 | for (i = 0; i < NUM_INIT_LISTS; i++) { | 1376 | for (i = 0; i < NUM_INIT_LISTS; i++) { |
1452 | kmem_list3_init(&initkmem_list3[i]); | 1377 | kmem_list3_init(&initkmem_list3[i]); |
@@ -1575,9 +1500,8 @@ void __init kmem_cache_init(void) | |||
1575 | { | 1500 | { |
1576 | struct array_cache *ptr; | 1501 | struct array_cache *ptr; |
1577 | 1502 | ||
1578 | ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); | 1503 | ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); |
1579 | 1504 | ||
1580 | local_irq_disable(); | ||
1581 | BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache); | 1505 | BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache); |
1582 | memcpy(ptr, cpu_cache_get(&cache_cache), | 1506 | memcpy(ptr, cpu_cache_get(&cache_cache), |
1583 | sizeof(struct arraycache_init)); | 1507 | sizeof(struct arraycache_init)); |
@@ -1587,11 +1511,9 @@ void __init kmem_cache_init(void) | |||
1587 | spin_lock_init(&ptr->lock); | 1511 | spin_lock_init(&ptr->lock); |
1588 | 1512 | ||
1589 | cache_cache.array[smp_processor_id()] = ptr; | 1513 | cache_cache.array[smp_processor_id()] = ptr; |
1590 | local_irq_enable(); | ||
1591 | 1514 | ||
1592 | ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); | 1515 | ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); |
1593 | 1516 | ||
1594 | local_irq_disable(); | ||
1595 | BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep) | 1517 | BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep) |
1596 | != &initarray_generic.cache); | 1518 | != &initarray_generic.cache); |
1597 | memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep), | 1519 | memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep), |
@@ -1603,7 +1525,6 @@ void __init kmem_cache_init(void) | |||
1603 | 1525 | ||
1604 | malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] = | 1526 | malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] = |
1605 | ptr; | 1527 | ptr; |
1606 | local_irq_enable(); | ||
1607 | } | 1528 | } |
1608 | /* 5) Replace the bootstrap kmem_list3's */ | 1529 | /* 5) Replace the bootstrap kmem_list3's */ |
1609 | { | 1530 | { |
@@ -1622,19 +1543,22 @@ void __init kmem_cache_init(void) | |||
1622 | } | 1543 | } |
1623 | } | 1544 | } |
1624 | 1545 | ||
1625 | /* 6) resize the head arrays to their final sizes */ | 1546 | g_cpucache_up = EARLY; |
1626 | { | ||
1627 | struct kmem_cache *cachep; | ||
1628 | mutex_lock(&cache_chain_mutex); | ||
1629 | list_for_each_entry(cachep, &cache_chain, next) | ||
1630 | if (enable_cpucache(cachep)) | ||
1631 | BUG(); | ||
1632 | mutex_unlock(&cache_chain_mutex); | ||
1633 | } | ||
1634 | 1547 | ||
1635 | /* Annotate slab for lockdep -- annotate the malloc caches */ | 1548 | /* Annotate slab for lockdep -- annotate the malloc caches */ |
1636 | init_lock_keys(); | 1549 | init_lock_keys(); |
1550 | } | ||
1551 | |||
1552 | void __init kmem_cache_init_late(void) | ||
1553 | { | ||
1554 | struct kmem_cache *cachep; | ||
1637 | 1555 | ||
1556 | /* 6) resize the head arrays to their final sizes */ | ||
1557 | mutex_lock(&cache_chain_mutex); | ||
1558 | list_for_each_entry(cachep, &cache_chain, next) | ||
1559 | if (enable_cpucache(cachep, GFP_NOWAIT)) | ||
1560 | BUG(); | ||
1561 | mutex_unlock(&cache_chain_mutex); | ||
1638 | 1562 | ||
1639 | /* Done! */ | 1563 | /* Done! */ |
1640 | g_cpucache_up = FULL; | 1564 | g_cpucache_up = FULL; |
@@ -1689,7 +1613,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
1689 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) | 1613 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) |
1690 | flags |= __GFP_RECLAIMABLE; | 1614 | flags |= __GFP_RECLAIMABLE; |
1691 | 1615 | ||
1692 | page = alloc_pages_node(nodeid, flags, cachep->gfporder); | 1616 | page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); |
1693 | if (!page) | 1617 | if (!page) |
1694 | return NULL; | 1618 | return NULL; |
1695 | 1619 | ||
@@ -1702,6 +1626,16 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
1702 | NR_SLAB_UNRECLAIMABLE, nr_pages); | 1626 | NR_SLAB_UNRECLAIMABLE, nr_pages); |
1703 | for (i = 0; i < nr_pages; i++) | 1627 | for (i = 0; i < nr_pages; i++) |
1704 | __SetPageSlab(page + i); | 1628 | __SetPageSlab(page + i); |
1629 | |||
1630 | if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { | ||
1631 | kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); | ||
1632 | |||
1633 | if (cachep->ctor) | ||
1634 | kmemcheck_mark_uninitialized_pages(page, nr_pages); | ||
1635 | else | ||
1636 | kmemcheck_mark_unallocated_pages(page, nr_pages); | ||
1637 | } | ||
1638 | |||
1705 | return page_address(page); | 1639 | return page_address(page); |
1706 | } | 1640 | } |
1707 | 1641 | ||
@@ -1714,6 +1648,8 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) | |||
1714 | struct page *page = virt_to_page(addr); | 1648 | struct page *page = virt_to_page(addr); |
1715 | const unsigned long nr_freed = i; | 1649 | const unsigned long nr_freed = i; |
1716 | 1650 | ||
1651 | kmemcheck_free_shadow(page, cachep->gfporder); | ||
1652 | |||
1717 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) | 1653 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) |
1718 | sub_zone_page_state(page_zone(page), | 1654 | sub_zone_page_state(page_zone(page), |
1719 | NR_SLAB_RECLAIMABLE, nr_freed); | 1655 | NR_SLAB_RECLAIMABLE, nr_freed); |
@@ -2064,10 +2000,10 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, | |||
2064 | return left_over; | 2000 | return left_over; |
2065 | } | 2001 | } |
2066 | 2002 | ||
2067 | static int __init_refok setup_cpu_cache(struct kmem_cache *cachep) | 2003 | static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) |
2068 | { | 2004 | { |
2069 | if (g_cpucache_up == FULL) | 2005 | if (g_cpucache_up == FULL) |
2070 | return enable_cpucache(cachep); | 2006 | return enable_cpucache(cachep, gfp); |
2071 | 2007 | ||
2072 | if (g_cpucache_up == NONE) { | 2008 | if (g_cpucache_up == NONE) { |
2073 | /* | 2009 | /* |
@@ -2089,7 +2025,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep) | |||
2089 | g_cpucache_up = PARTIAL_AC; | 2025 | g_cpucache_up = PARTIAL_AC; |
2090 | } else { | 2026 | } else { |
2091 | cachep->array[smp_processor_id()] = | 2027 | cachep->array[smp_processor_id()] = |
2092 | kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); | 2028 | kmalloc(sizeof(struct arraycache_init), gfp); |
2093 | 2029 | ||
2094 | if (g_cpucache_up == PARTIAL_AC) { | 2030 | if (g_cpucache_up == PARTIAL_AC) { |
2095 | set_up_list3s(cachep, SIZE_L3); | 2031 | set_up_list3s(cachep, SIZE_L3); |
@@ -2099,7 +2035,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep) | |||
2099 | for_each_online_node(node) { | 2035 | for_each_online_node(node) { |
2100 | cachep->nodelists[node] = | 2036 | cachep->nodelists[node] = |
2101 | kmalloc_node(sizeof(struct kmem_list3), | 2037 | kmalloc_node(sizeof(struct kmem_list3), |
2102 | GFP_KERNEL, node); | 2038 | gfp, node); |
2103 | BUG_ON(!cachep->nodelists[node]); | 2039 | BUG_ON(!cachep->nodelists[node]); |
2104 | kmem_list3_init(cachep->nodelists[node]); | 2040 | kmem_list3_init(cachep->nodelists[node]); |
2105 | } | 2041 | } |
@@ -2153,6 +2089,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2153 | { | 2089 | { |
2154 | size_t left_over, slab_size, ralign; | 2090 | size_t left_over, slab_size, ralign; |
2155 | struct kmem_cache *cachep = NULL, *pc; | 2091 | struct kmem_cache *cachep = NULL, *pc; |
2092 | gfp_t gfp; | ||
2156 | 2093 | ||
2157 | /* | 2094 | /* |
2158 | * Sanity checks... these are all serious usage bugs. | 2095 | * Sanity checks... these are all serious usage bugs. |
@@ -2168,8 +2105,10 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2168 | * We use cache_chain_mutex to ensure a consistent view of | 2105 | * We use cache_chain_mutex to ensure a consistent view of |
2169 | * cpu_online_mask as well. Please see cpuup_callback | 2106 | * cpu_online_mask as well. Please see cpuup_callback |
2170 | */ | 2107 | */ |
2171 | get_online_cpus(); | 2108 | if (slab_is_available()) { |
2172 | mutex_lock(&cache_chain_mutex); | 2109 | get_online_cpus(); |
2110 | mutex_lock(&cache_chain_mutex); | ||
2111 | } | ||
2173 | 2112 | ||
2174 | list_for_each_entry(pc, &cache_chain, next) { | 2113 | list_for_each_entry(pc, &cache_chain, next) { |
2175 | char tmp; | 2114 | char tmp; |
@@ -2278,8 +2217,13 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2278 | */ | 2217 | */ |
2279 | align = ralign; | 2218 | align = ralign; |
2280 | 2219 | ||
2220 | if (slab_is_available()) | ||
2221 | gfp = GFP_KERNEL; | ||
2222 | else | ||
2223 | gfp = GFP_NOWAIT; | ||
2224 | |||
2281 | /* Get cache's description obj. */ | 2225 | /* Get cache's description obj. */ |
2282 | cachep = kmem_cache_zalloc(&cache_cache, GFP_KERNEL); | 2226 | cachep = kmem_cache_zalloc(&cache_cache, gfp); |
2283 | if (!cachep) | 2227 | if (!cachep) |
2284 | goto oops; | 2228 | goto oops; |
2285 | 2229 | ||
@@ -2353,6 +2297,15 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2353 | /* really off slab. No need for manual alignment */ | 2297 | /* really off slab. No need for manual alignment */ |
2354 | slab_size = | 2298 | slab_size = |
2355 | cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab); | 2299 | cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab); |
2300 | |||
2301 | #ifdef CONFIG_PAGE_POISONING | ||
2302 | /* If we're going to use the generic kernel_map_pages() | ||
2303 | * poisoning, then it's going to smash the contents of | ||
2304 | * the redzone and userword anyhow, so switch them off. | ||
2305 | */ | ||
2306 | if (size % PAGE_SIZE == 0 && flags & SLAB_POISON) | ||
2307 | flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); | ||
2308 | #endif | ||
2356 | } | 2309 | } |
2357 | 2310 | ||
2358 | cachep->colour_off = cache_line_size(); | 2311 | cachep->colour_off = cache_line_size(); |
@@ -2382,7 +2335,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2382 | cachep->ctor = ctor; | 2335 | cachep->ctor = ctor; |
2383 | cachep->name = name; | 2336 | cachep->name = name; |
2384 | 2337 | ||
2385 | if (setup_cpu_cache(cachep)) { | 2338 | if (setup_cpu_cache(cachep, gfp)) { |
2386 | __kmem_cache_destroy(cachep); | 2339 | __kmem_cache_destroy(cachep); |
2387 | cachep = NULL; | 2340 | cachep = NULL; |
2388 | goto oops; | 2341 | goto oops; |
@@ -2394,8 +2347,10 @@ oops: | |||
2394 | if (!cachep && (flags & SLAB_PANIC)) | 2347 | if (!cachep && (flags & SLAB_PANIC)) |
2395 | panic("kmem_cache_create(): failed to create slab `%s'\n", | 2348 | panic("kmem_cache_create(): failed to create slab `%s'\n", |
2396 | name); | 2349 | name); |
2397 | mutex_unlock(&cache_chain_mutex); | 2350 | if (slab_is_available()) { |
2398 | put_online_cpus(); | 2351 | mutex_unlock(&cache_chain_mutex); |
2352 | put_online_cpus(); | ||
2353 | } | ||
2399 | return cachep; | 2354 | return cachep; |
2400 | } | 2355 | } |
2401 | EXPORT_SYMBOL(kmem_cache_create); | 2356 | EXPORT_SYMBOL(kmem_cache_create); |
@@ -2621,6 +2576,14 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, | |||
2621 | /* Slab management obj is off-slab. */ | 2576 | /* Slab management obj is off-slab. */ |
2622 | slabp = kmem_cache_alloc_node(cachep->slabp_cache, | 2577 | slabp = kmem_cache_alloc_node(cachep->slabp_cache, |
2623 | local_flags, nodeid); | 2578 | local_flags, nodeid); |
2579 | /* | ||
2580 | * If the first object in the slab is leaked (it's allocated | ||
2581 | * but no one has a reference to it), we want to make sure | ||
2582 | * kmemleak does not treat the ->s_mem pointer as a reference | ||
2583 | * to the object. Otherwise we will not report the leak. | ||
2584 | */ | ||
2585 | kmemleak_scan_area(slabp, offsetof(struct slab, list), | ||
2586 | sizeof(struct list_head), local_flags); | ||
2624 | if (!slabp) | 2587 | if (!slabp) |
2625 | return NULL; | 2588 | return NULL; |
2626 | } else { | 2589 | } else { |
@@ -3141,6 +3104,12 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
3141 | STATS_INC_ALLOCMISS(cachep); | 3104 | STATS_INC_ALLOCMISS(cachep); |
3142 | objp = cache_alloc_refill(cachep, flags); | 3105 | objp = cache_alloc_refill(cachep, flags); |
3143 | } | 3106 | } |
3107 | /* | ||
3108 | * To avoid a false negative, if an object that is in one of the | ||
3109 | * per-CPU caches is leaked, we need to make sure kmemleak doesn't | ||
3110 | * treat the array pointers as a reference to the object. | ||
3111 | */ | ||
3112 | kmemleak_erase(&ac->entry[ac->avail]); | ||
3144 | return objp; | 3113 | return objp; |
3145 | } | 3114 | } |
3146 | 3115 | ||
@@ -3219,7 +3188,7 @@ retry: | |||
3219 | if (local_flags & __GFP_WAIT) | 3188 | if (local_flags & __GFP_WAIT) |
3220 | local_irq_enable(); | 3189 | local_irq_enable(); |
3221 | kmem_flagcheck(cache, flags); | 3190 | kmem_flagcheck(cache, flags); |
3222 | obj = kmem_getpages(cache, local_flags, -1); | 3191 | obj = kmem_getpages(cache, local_flags, numa_node_id()); |
3223 | if (local_flags & __GFP_WAIT) | 3192 | if (local_flags & __GFP_WAIT) |
3224 | local_irq_disable(); | 3193 | local_irq_disable(); |
3225 | if (obj) { | 3194 | if (obj) { |
@@ -3327,6 +3296,8 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, | |||
3327 | unsigned long save_flags; | 3296 | unsigned long save_flags; |
3328 | void *ptr; | 3297 | void *ptr; |
3329 | 3298 | ||
3299 | flags &= gfp_allowed_mask; | ||
3300 | |||
3330 | lockdep_trace_alloc(flags); | 3301 | lockdep_trace_alloc(flags); |
3331 | 3302 | ||
3332 | if (slab_should_failslab(cachep, flags)) | 3303 | if (slab_should_failslab(cachep, flags)) |
@@ -3360,6 +3331,11 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, | |||
3360 | out: | 3331 | out: |
3361 | local_irq_restore(save_flags); | 3332 | local_irq_restore(save_flags); |
3362 | ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); | 3333 | ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); |
3334 | kmemleak_alloc_recursive(ptr, obj_size(cachep), 1, cachep->flags, | ||
3335 | flags); | ||
3336 | |||
3337 | if (likely(ptr)) | ||
3338 | kmemcheck_slab_alloc(cachep, flags, ptr, obj_size(cachep)); | ||
3363 | 3339 | ||
3364 | if (unlikely((flags & __GFP_ZERO) && ptr)) | 3340 | if (unlikely((flags & __GFP_ZERO) && ptr)) |
3365 | memset(ptr, 0, obj_size(cachep)); | 3341 | memset(ptr, 0, obj_size(cachep)); |
@@ -3405,6 +3381,8 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) | |||
3405 | unsigned long save_flags; | 3381 | unsigned long save_flags; |
3406 | void *objp; | 3382 | void *objp; |
3407 | 3383 | ||
3384 | flags &= gfp_allowed_mask; | ||
3385 | |||
3408 | lockdep_trace_alloc(flags); | 3386 | lockdep_trace_alloc(flags); |
3409 | 3387 | ||
3410 | if (slab_should_failslab(cachep, flags)) | 3388 | if (slab_should_failslab(cachep, flags)) |
@@ -3415,8 +3393,13 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) | |||
3415 | objp = __do_cache_alloc(cachep, flags); | 3393 | objp = __do_cache_alloc(cachep, flags); |
3416 | local_irq_restore(save_flags); | 3394 | local_irq_restore(save_flags); |
3417 | objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); | 3395 | objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); |
3396 | kmemleak_alloc_recursive(objp, obj_size(cachep), 1, cachep->flags, | ||
3397 | flags); | ||
3418 | prefetchw(objp); | 3398 | prefetchw(objp); |
3419 | 3399 | ||
3400 | if (likely(objp)) | ||
3401 | kmemcheck_slab_alloc(cachep, flags, objp, obj_size(cachep)); | ||
3402 | |||
3420 | if (unlikely((flags & __GFP_ZERO) && objp)) | 3403 | if (unlikely((flags & __GFP_ZERO) && objp)) |
3421 | memset(objp, 0, obj_size(cachep)); | 3404 | memset(objp, 0, obj_size(cachep)); |
3422 | 3405 | ||
@@ -3530,8 +3513,11 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp) | |||
3530 | struct array_cache *ac = cpu_cache_get(cachep); | 3513 | struct array_cache *ac = cpu_cache_get(cachep); |
3531 | 3514 | ||
3532 | check_irq_off(); | 3515 | check_irq_off(); |
3516 | kmemleak_free_recursive(objp, cachep->flags); | ||
3533 | objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); | 3517 | objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); |
3534 | 3518 | ||
3519 | kmemcheck_slab_free(cachep, objp, obj_size(cachep)); | ||
3520 | |||
3535 | /* | 3521 | /* |
3536 | * Skip calling cache_free_alien() when the platform is not numa. | 3522 | * Skip calling cache_free_alien() when the platform is not numa. |
3537 | * This will avoid cache misses that happen while accessing slabp (which | 3523 | * This will avoid cache misses that happen while accessing slabp (which |
@@ -3539,7 +3525,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp) | |||
3539 | * variable to skip the call, which is mostly likely to be present in | 3525 | * variable to skip the call, which is mostly likely to be present in |
3540 | * the cache. | 3526 | * the cache. |
3541 | */ | 3527 | */ |
3542 | if (numa_platform && cache_free_alien(cachep, objp)) | 3528 | if (nr_online_nodes > 1 && cache_free_alien(cachep, objp)) |
3543 | return; | 3529 | return; |
3544 | 3530 | ||
3545 | if (likely(ac->avail < ac->limit)) { | 3531 | if (likely(ac->avail < ac->limit)) { |
@@ -3802,7 +3788,7 @@ EXPORT_SYMBOL_GPL(kmem_cache_name); | |||
3802 | /* | 3788 | /* |
3803 | * This initializes kmem_list3 or resizes various caches for all nodes. | 3789 | * This initializes kmem_list3 or resizes various caches for all nodes. |
3804 | */ | 3790 | */ |
3805 | static int alloc_kmemlist(struct kmem_cache *cachep) | 3791 | static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp) |
3806 | { | 3792 | { |
3807 | int node; | 3793 | int node; |
3808 | struct kmem_list3 *l3; | 3794 | struct kmem_list3 *l3; |
@@ -3812,7 +3798,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep) | |||
3812 | for_each_online_node(node) { | 3798 | for_each_online_node(node) { |
3813 | 3799 | ||
3814 | if (use_alien_caches) { | 3800 | if (use_alien_caches) { |
3815 | new_alien = alloc_alien_cache(node, cachep->limit); | 3801 | new_alien = alloc_alien_cache(node, cachep->limit, gfp); |
3816 | if (!new_alien) | 3802 | if (!new_alien) |
3817 | goto fail; | 3803 | goto fail; |
3818 | } | 3804 | } |
@@ -3821,7 +3807,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep) | |||
3821 | if (cachep->shared) { | 3807 | if (cachep->shared) { |
3822 | new_shared = alloc_arraycache(node, | 3808 | new_shared = alloc_arraycache(node, |
3823 | cachep->shared*cachep->batchcount, | 3809 | cachep->shared*cachep->batchcount, |
3824 | 0xbaadf00d); | 3810 | 0xbaadf00d, gfp); |
3825 | if (!new_shared) { | 3811 | if (!new_shared) { |
3826 | free_alien_cache(new_alien); | 3812 | free_alien_cache(new_alien); |
3827 | goto fail; | 3813 | goto fail; |
@@ -3850,7 +3836,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep) | |||
3850 | free_alien_cache(new_alien); | 3836 | free_alien_cache(new_alien); |
3851 | continue; | 3837 | continue; |
3852 | } | 3838 | } |
3853 | l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node); | 3839 | l3 = kmalloc_node(sizeof(struct kmem_list3), gfp, node); |
3854 | if (!l3) { | 3840 | if (!l3) { |
3855 | free_alien_cache(new_alien); | 3841 | free_alien_cache(new_alien); |
3856 | kfree(new_shared); | 3842 | kfree(new_shared); |
@@ -3906,18 +3892,18 @@ static void do_ccupdate_local(void *info) | |||
3906 | 3892 | ||
3907 | /* Always called with the cache_chain_mutex held */ | 3893 | /* Always called with the cache_chain_mutex held */ |
3908 | static int do_tune_cpucache(struct kmem_cache *cachep, int limit, | 3894 | static int do_tune_cpucache(struct kmem_cache *cachep, int limit, |
3909 | int batchcount, int shared) | 3895 | int batchcount, int shared, gfp_t gfp) |
3910 | { | 3896 | { |
3911 | struct ccupdate_struct *new; | 3897 | struct ccupdate_struct *new; |
3912 | int i; | 3898 | int i; |
3913 | 3899 | ||
3914 | new = kzalloc(sizeof(*new), GFP_KERNEL); | 3900 | new = kzalloc(sizeof(*new), gfp); |
3915 | if (!new) | 3901 | if (!new) |
3916 | return -ENOMEM; | 3902 | return -ENOMEM; |
3917 | 3903 | ||
3918 | for_each_online_cpu(i) { | 3904 | for_each_online_cpu(i) { |
3919 | new->new[i] = alloc_arraycache(cpu_to_node(i), limit, | 3905 | new->new[i] = alloc_arraycache(cpu_to_node(i), limit, |
3920 | batchcount); | 3906 | batchcount, gfp); |
3921 | if (!new->new[i]) { | 3907 | if (!new->new[i]) { |
3922 | for (i--; i >= 0; i--) | 3908 | for (i--; i >= 0; i--) |
3923 | kfree(new->new[i]); | 3909 | kfree(new->new[i]); |
@@ -3944,11 +3930,11 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, | |||
3944 | kfree(ccold); | 3930 | kfree(ccold); |
3945 | } | 3931 | } |
3946 | kfree(new); | 3932 | kfree(new); |
3947 | return alloc_kmemlist(cachep); | 3933 | return alloc_kmemlist(cachep, gfp); |
3948 | } | 3934 | } |
3949 | 3935 | ||
3950 | /* Called with cache_chain_mutex held always */ | 3936 | /* Called with cache_chain_mutex held always */ |
3951 | static int enable_cpucache(struct kmem_cache *cachep) | 3937 | static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) |
3952 | { | 3938 | { |
3953 | int err; | 3939 | int err; |
3954 | int limit, shared; | 3940 | int limit, shared; |
@@ -3994,7 +3980,7 @@ static int enable_cpucache(struct kmem_cache *cachep) | |||
3994 | if (limit > 32) | 3980 | if (limit > 32) |
3995 | limit = 32; | 3981 | limit = 32; |
3996 | #endif | 3982 | #endif |
3997 | err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared); | 3983 | err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared, gfp); |
3998 | if (err) | 3984 | if (err) |
3999 | printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", | 3985 | printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", |
4000 | cachep->name, -err); | 3986 | cachep->name, -err); |
@@ -4300,7 +4286,8 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer, | |||
4300 | res = 0; | 4286 | res = 0; |
4301 | } else { | 4287 | } else { |
4302 | res = do_tune_cpucache(cachep, limit, | 4288 | res = do_tune_cpucache(cachep, limit, |
4303 | batchcount, shared); | 4289 | batchcount, shared, |
4290 | GFP_KERNEL); | ||
4304 | } | 4291 | } |
4305 | break; | 4292 | break; |
4306 | } | 4293 | } |
@@ -46,7 +46,7 @@ | |||
46 | * NUMA support in SLOB is fairly simplistic, pushing most of the real | 46 | * NUMA support in SLOB is fairly simplistic, pushing most of the real |
47 | * logic down to the page allocator, and simply doing the node accounting | 47 | * logic down to the page allocator, and simply doing the node accounting |
48 | * on the upper levels. In the event that a node id is explicitly | 48 | * on the upper levels. In the event that a node id is explicitly |
49 | * provided, alloc_pages_node() with the specified node id is used | 49 | * provided, alloc_pages_exact_node() with the specified node id is used |
50 | * instead. The common case (or when the node id isn't explicitly provided) | 50 | * instead. The common case (or when the node id isn't explicitly provided) |
51 | * will default to the current node, as per numa_node_id(). | 51 | * will default to the current node, as per numa_node_id(). |
52 | * | 52 | * |
@@ -66,7 +66,8 @@ | |||
66 | #include <linux/module.h> | 66 | #include <linux/module.h> |
67 | #include <linux/rcupdate.h> | 67 | #include <linux/rcupdate.h> |
68 | #include <linux/list.h> | 68 | #include <linux/list.h> |
69 | #include <trace/kmemtrace.h> | 69 | #include <linux/kmemtrace.h> |
70 | #include <linux/kmemleak.h> | ||
70 | #include <asm/atomic.h> | 71 | #include <asm/atomic.h> |
71 | 72 | ||
72 | /* | 73 | /* |
@@ -132,17 +133,17 @@ static LIST_HEAD(free_slob_large); | |||
132 | */ | 133 | */ |
133 | static inline int is_slob_page(struct slob_page *sp) | 134 | static inline int is_slob_page(struct slob_page *sp) |
134 | { | 135 | { |
135 | return PageSlobPage((struct page *)sp); | 136 | return PageSlab((struct page *)sp); |
136 | } | 137 | } |
137 | 138 | ||
138 | static inline void set_slob_page(struct slob_page *sp) | 139 | static inline void set_slob_page(struct slob_page *sp) |
139 | { | 140 | { |
140 | __SetPageSlobPage((struct page *)sp); | 141 | __SetPageSlab((struct page *)sp); |
141 | } | 142 | } |
142 | 143 | ||
143 | static inline void clear_slob_page(struct slob_page *sp) | 144 | static inline void clear_slob_page(struct slob_page *sp) |
144 | { | 145 | { |
145 | __ClearPageSlobPage((struct page *)sp); | 146 | __ClearPageSlab((struct page *)sp); |
146 | } | 147 | } |
147 | 148 | ||
148 | static inline struct slob_page *slob_page(const void *addr) | 149 | static inline struct slob_page *slob_page(const void *addr) |
@@ -243,7 +244,7 @@ static void *slob_new_pages(gfp_t gfp, int order, int node) | |||
243 | 244 | ||
244 | #ifdef CONFIG_NUMA | 245 | #ifdef CONFIG_NUMA |
245 | if (node != -1) | 246 | if (node != -1) |
246 | page = alloc_pages_node(node, gfp, order); | 247 | page = alloc_pages_exact_node(node, gfp, order); |
247 | else | 248 | else |
248 | #endif | 249 | #endif |
249 | page = alloc_pages(gfp, order); | 250 | page = alloc_pages(gfp, order); |
@@ -509,6 +510,7 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node) | |||
509 | size, PAGE_SIZE << order, gfp, node); | 510 | size, PAGE_SIZE << order, gfp, node); |
510 | } | 511 | } |
511 | 512 | ||
513 | kmemleak_alloc(ret, size, 1, gfp); | ||
512 | return ret; | 514 | return ret; |
513 | } | 515 | } |
514 | EXPORT_SYMBOL(__kmalloc_node); | 516 | EXPORT_SYMBOL(__kmalloc_node); |
@@ -521,6 +523,7 @@ void kfree(const void *block) | |||
521 | 523 | ||
522 | if (unlikely(ZERO_OR_NULL_PTR(block))) | 524 | if (unlikely(ZERO_OR_NULL_PTR(block))) |
523 | return; | 525 | return; |
526 | kmemleak_free(block); | ||
524 | 527 | ||
525 | sp = slob_page(block); | 528 | sp = slob_page(block); |
526 | if (is_slob_page(sp)) { | 529 | if (is_slob_page(sp)) { |
@@ -584,12 +587,14 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, | |||
584 | } else if (flags & SLAB_PANIC) | 587 | } else if (flags & SLAB_PANIC) |
585 | panic("Cannot create slab cache %s\n", name); | 588 | panic("Cannot create slab cache %s\n", name); |
586 | 589 | ||
590 | kmemleak_alloc(c, sizeof(struct kmem_cache), 1, GFP_KERNEL); | ||
587 | return c; | 591 | return c; |
588 | } | 592 | } |
589 | EXPORT_SYMBOL(kmem_cache_create); | 593 | EXPORT_SYMBOL(kmem_cache_create); |
590 | 594 | ||
591 | void kmem_cache_destroy(struct kmem_cache *c) | 595 | void kmem_cache_destroy(struct kmem_cache *c) |
592 | { | 596 | { |
597 | kmemleak_free(c); | ||
593 | slob_free(c, sizeof(struct kmem_cache)); | 598 | slob_free(c, sizeof(struct kmem_cache)); |
594 | } | 599 | } |
595 | EXPORT_SYMBOL(kmem_cache_destroy); | 600 | EXPORT_SYMBOL(kmem_cache_destroy); |
@@ -613,6 +618,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) | |||
613 | if (c->ctor) | 618 | if (c->ctor) |
614 | c->ctor(b); | 619 | c->ctor(b); |
615 | 620 | ||
621 | kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags); | ||
616 | return b; | 622 | return b; |
617 | } | 623 | } |
618 | EXPORT_SYMBOL(kmem_cache_alloc_node); | 624 | EXPORT_SYMBOL(kmem_cache_alloc_node); |
@@ -635,6 +641,7 @@ static void kmem_rcu_free(struct rcu_head *head) | |||
635 | 641 | ||
636 | void kmem_cache_free(struct kmem_cache *c, void *b) | 642 | void kmem_cache_free(struct kmem_cache *c, void *b) |
637 | { | 643 | { |
644 | kmemleak_free_recursive(b, c->flags); | ||
638 | if (unlikely(c->flags & SLAB_DESTROY_BY_RCU)) { | 645 | if (unlikely(c->flags & SLAB_DESTROY_BY_RCU)) { |
639 | struct slob_rcu *slob_rcu; | 646 | struct slob_rcu *slob_rcu; |
640 | slob_rcu = b + (c->size - sizeof(struct slob_rcu)); | 647 | slob_rcu = b + (c->size - sizeof(struct slob_rcu)); |
@@ -17,9 +17,11 @@ | |||
17 | #include <linux/slab.h> | 17 | #include <linux/slab.h> |
18 | #include <linux/proc_fs.h> | 18 | #include <linux/proc_fs.h> |
19 | #include <linux/seq_file.h> | 19 | #include <linux/seq_file.h> |
20 | #include <trace/kmemtrace.h> | 20 | #include <linux/kmemtrace.h> |
21 | #include <linux/kmemcheck.h> | ||
21 | #include <linux/cpu.h> | 22 | #include <linux/cpu.h> |
22 | #include <linux/cpuset.h> | 23 | #include <linux/cpuset.h> |
24 | #include <linux/kmemleak.h> | ||
23 | #include <linux/mempolicy.h> | 25 | #include <linux/mempolicy.h> |
24 | #include <linux/ctype.h> | 26 | #include <linux/ctype.h> |
25 | #include <linux/debugobjects.h> | 27 | #include <linux/debugobjects.h> |
@@ -143,10 +145,10 @@ | |||
143 | * Set of flags that will prevent slab merging | 145 | * Set of flags that will prevent slab merging |
144 | */ | 146 | */ |
145 | #define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ | 147 | #define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ |
146 | SLAB_TRACE | SLAB_DESTROY_BY_RCU) | 148 | SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE) |
147 | 149 | ||
148 | #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ | 150 | #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ |
149 | SLAB_CACHE_DMA) | 151 | SLAB_CACHE_DMA | SLAB_NOTRACK) |
150 | 152 | ||
151 | #ifndef ARCH_KMALLOC_MINALIGN | 153 | #ifndef ARCH_KMALLOC_MINALIGN |
152 | #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) | 154 | #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) |
@@ -832,6 +834,11 @@ static inline unsigned long slabs_node(struct kmem_cache *s, int node) | |||
832 | return atomic_long_read(&n->nr_slabs); | 834 | return atomic_long_read(&n->nr_slabs); |
833 | } | 835 | } |
834 | 836 | ||
837 | static inline unsigned long node_nr_slabs(struct kmem_cache_node *n) | ||
838 | { | ||
839 | return atomic_long_read(&n->nr_slabs); | ||
840 | } | ||
841 | |||
835 | static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects) | 842 | static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects) |
836 | { | 843 | { |
837 | struct kmem_cache_node *n = get_node(s, node); | 844 | struct kmem_cache_node *n = get_node(s, node); |
@@ -1050,6 +1057,8 @@ static inline unsigned long kmem_cache_flags(unsigned long objsize, | |||
1050 | 1057 | ||
1051 | static inline unsigned long slabs_node(struct kmem_cache *s, int node) | 1058 | static inline unsigned long slabs_node(struct kmem_cache *s, int node) |
1052 | { return 0; } | 1059 | { return 0; } |
1060 | static inline unsigned long node_nr_slabs(struct kmem_cache_node *n) | ||
1061 | { return 0; } | ||
1053 | static inline void inc_slabs_node(struct kmem_cache *s, int node, | 1062 | static inline void inc_slabs_node(struct kmem_cache *s, int node, |
1054 | int objects) {} | 1063 | int objects) {} |
1055 | static inline void dec_slabs_node(struct kmem_cache *s, int node, | 1064 | static inline void dec_slabs_node(struct kmem_cache *s, int node, |
@@ -1064,6 +1073,8 @@ static inline struct page *alloc_slab_page(gfp_t flags, int node, | |||
1064 | { | 1073 | { |
1065 | int order = oo_order(oo); | 1074 | int order = oo_order(oo); |
1066 | 1075 | ||
1076 | flags |= __GFP_NOTRACK; | ||
1077 | |||
1067 | if (node == -1) | 1078 | if (node == -1) |
1068 | return alloc_pages(flags, order); | 1079 | return alloc_pages(flags, order); |
1069 | else | 1080 | else |
@@ -1074,11 +1085,17 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1074 | { | 1085 | { |
1075 | struct page *page; | 1086 | struct page *page; |
1076 | struct kmem_cache_order_objects oo = s->oo; | 1087 | struct kmem_cache_order_objects oo = s->oo; |
1088 | gfp_t alloc_gfp; | ||
1077 | 1089 | ||
1078 | flags |= s->allocflags; | 1090 | flags |= s->allocflags; |
1079 | 1091 | ||
1080 | page = alloc_slab_page(flags | __GFP_NOWARN | __GFP_NORETRY, node, | 1092 | /* |
1081 | oo); | 1093 | * Let the initial higher-order allocation fail under memory pressure |
1094 | * so we fall-back to the minimum order allocation. | ||
1095 | */ | ||
1096 | alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; | ||
1097 | |||
1098 | page = alloc_slab_page(alloc_gfp, node, oo); | ||
1082 | if (unlikely(!page)) { | 1099 | if (unlikely(!page)) { |
1083 | oo = s->min; | 1100 | oo = s->min; |
1084 | /* | 1101 | /* |
@@ -1091,6 +1108,24 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1091 | 1108 | ||
1092 | stat(get_cpu_slab(s, raw_smp_processor_id()), ORDER_FALLBACK); | 1109 | stat(get_cpu_slab(s, raw_smp_processor_id()), ORDER_FALLBACK); |
1093 | } | 1110 | } |
1111 | |||
1112 | if (kmemcheck_enabled | ||
1113 | && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) | ||
1114 | { | ||
1115 | int pages = 1 << oo_order(oo); | ||
1116 | |||
1117 | kmemcheck_alloc_shadow(page, oo_order(oo), flags, node); | ||
1118 | |||
1119 | /* | ||
1120 | * Objects from caches that have a constructor don't get | ||
1121 | * cleared when they're allocated, so we need to do it here. | ||
1122 | */ | ||
1123 | if (s->ctor) | ||
1124 | kmemcheck_mark_uninitialized_pages(page, pages); | ||
1125 | else | ||
1126 | kmemcheck_mark_unallocated_pages(page, pages); | ||
1127 | } | ||
1128 | |||
1094 | page->objects = oo_objects(oo); | 1129 | page->objects = oo_objects(oo); |
1095 | mod_zone_page_state(page_zone(page), | 1130 | mod_zone_page_state(page_zone(page), |
1096 | (s->flags & SLAB_RECLAIM_ACCOUNT) ? | 1131 | (s->flags & SLAB_RECLAIM_ACCOUNT) ? |
@@ -1164,6 +1199,8 @@ static void __free_slab(struct kmem_cache *s, struct page *page) | |||
1164 | __ClearPageSlubDebug(page); | 1199 | __ClearPageSlubDebug(page); |
1165 | } | 1200 | } |
1166 | 1201 | ||
1202 | kmemcheck_free_shadow(page, compound_order(page)); | ||
1203 | |||
1167 | mod_zone_page_state(page_zone(page), | 1204 | mod_zone_page_state(page_zone(page), |
1168 | (s->flags & SLAB_RECLAIM_ACCOUNT) ? | 1205 | (s->flags & SLAB_RECLAIM_ACCOUNT) ? |
1169 | NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, | 1206 | NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, |
@@ -1484,6 +1521,65 @@ static inline int node_match(struct kmem_cache_cpu *c, int node) | |||
1484 | return 1; | 1521 | return 1; |
1485 | } | 1522 | } |
1486 | 1523 | ||
1524 | static int count_free(struct page *page) | ||
1525 | { | ||
1526 | return page->objects - page->inuse; | ||
1527 | } | ||
1528 | |||
1529 | static unsigned long count_partial(struct kmem_cache_node *n, | ||
1530 | int (*get_count)(struct page *)) | ||
1531 | { | ||
1532 | unsigned long flags; | ||
1533 | unsigned long x = 0; | ||
1534 | struct page *page; | ||
1535 | |||
1536 | spin_lock_irqsave(&n->list_lock, flags); | ||
1537 | list_for_each_entry(page, &n->partial, lru) | ||
1538 | x += get_count(page); | ||
1539 | spin_unlock_irqrestore(&n->list_lock, flags); | ||
1540 | return x; | ||
1541 | } | ||
1542 | |||
1543 | static inline unsigned long node_nr_objs(struct kmem_cache_node *n) | ||
1544 | { | ||
1545 | #ifdef CONFIG_SLUB_DEBUG | ||
1546 | return atomic_long_read(&n->total_objects); | ||
1547 | #else | ||
1548 | return 0; | ||
1549 | #endif | ||
1550 | } | ||
1551 | |||
1552 | static noinline void | ||
1553 | slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) | ||
1554 | { | ||
1555 | int node; | ||
1556 | |||
1557 | printk(KERN_WARNING | ||
1558 | "SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", | ||
1559 | nid, gfpflags); | ||
1560 | printk(KERN_WARNING " cache: %s, object size: %d, buffer size: %d, " | ||
1561 | "default order: %d, min order: %d\n", s->name, s->objsize, | ||
1562 | s->size, oo_order(s->oo), oo_order(s->min)); | ||
1563 | |||
1564 | for_each_online_node(node) { | ||
1565 | struct kmem_cache_node *n = get_node(s, node); | ||
1566 | unsigned long nr_slabs; | ||
1567 | unsigned long nr_objs; | ||
1568 | unsigned long nr_free; | ||
1569 | |||
1570 | if (!n) | ||
1571 | continue; | ||
1572 | |||
1573 | nr_free = count_partial(n, count_free); | ||
1574 | nr_slabs = node_nr_slabs(n); | ||
1575 | nr_objs = node_nr_objs(n); | ||
1576 | |||
1577 | printk(KERN_WARNING | ||
1578 | " node %d: slabs: %ld, objs: %ld, free: %ld\n", | ||
1579 | node, nr_slabs, nr_objs, nr_free); | ||
1580 | } | ||
1581 | } | ||
1582 | |||
1487 | /* | 1583 | /* |
1488 | * Slow path. The lockless freelist is empty or we need to perform | 1584 | * Slow path. The lockless freelist is empty or we need to perform |
1489 | * debugging duties. | 1585 | * debugging duties. |
@@ -1565,6 +1661,8 @@ new_slab: | |||
1565 | c->page = new; | 1661 | c->page = new; |
1566 | goto load_freelist; | 1662 | goto load_freelist; |
1567 | } | 1663 | } |
1664 | if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) | ||
1665 | slab_out_of_memory(s, gfpflags, node); | ||
1568 | return NULL; | 1666 | return NULL; |
1569 | debug: | 1667 | debug: |
1570 | if (!alloc_debug_processing(s, c->page, object, addr)) | 1668 | if (!alloc_debug_processing(s, c->page, object, addr)) |
@@ -1594,6 +1692,8 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, | |||
1594 | unsigned long flags; | 1692 | unsigned long flags; |
1595 | unsigned int objsize; | 1693 | unsigned int objsize; |
1596 | 1694 | ||
1695 | gfpflags &= gfp_allowed_mask; | ||
1696 | |||
1597 | lockdep_trace_alloc(gfpflags); | 1697 | lockdep_trace_alloc(gfpflags); |
1598 | might_sleep_if(gfpflags & __GFP_WAIT); | 1698 | might_sleep_if(gfpflags & __GFP_WAIT); |
1599 | 1699 | ||
@@ -1617,6 +1717,9 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, | |||
1617 | if (unlikely((gfpflags & __GFP_ZERO) && object)) | 1717 | if (unlikely((gfpflags & __GFP_ZERO) && object)) |
1618 | memset(object, 0, objsize); | 1718 | memset(object, 0, objsize); |
1619 | 1719 | ||
1720 | kmemcheck_slab_alloc(s, gfpflags, object, c->objsize); | ||
1721 | kmemleak_alloc_recursive(object, objsize, 1, s->flags, gfpflags); | ||
1722 | |||
1620 | return object; | 1723 | return object; |
1621 | } | 1724 | } |
1622 | 1725 | ||
@@ -1746,8 +1849,10 @@ static __always_inline void slab_free(struct kmem_cache *s, | |||
1746 | struct kmem_cache_cpu *c; | 1849 | struct kmem_cache_cpu *c; |
1747 | unsigned long flags; | 1850 | unsigned long flags; |
1748 | 1851 | ||
1852 | kmemleak_free_recursive(x, s->flags); | ||
1749 | local_irq_save(flags); | 1853 | local_irq_save(flags); |
1750 | c = get_cpu_slab(s, smp_processor_id()); | 1854 | c = get_cpu_slab(s, smp_processor_id()); |
1855 | kmemcheck_slab_free(s, object, c->objsize); | ||
1751 | debug_check_no_locks_freed(object, c->objsize); | 1856 | debug_check_no_locks_freed(object, c->objsize); |
1752 | if (!(s->flags & SLAB_DEBUG_OBJECTS)) | 1857 | if (!(s->flags & SLAB_DEBUG_OBJECTS)) |
1753 | debug_check_no_obj_freed(object, c->objsize); | 1858 | debug_check_no_obj_freed(object, c->objsize); |
@@ -2557,13 +2662,16 @@ static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s, | |||
2557 | if (gfp_flags & SLUB_DMA) | 2662 | if (gfp_flags & SLUB_DMA) |
2558 | flags = SLAB_CACHE_DMA; | 2663 | flags = SLAB_CACHE_DMA; |
2559 | 2664 | ||
2560 | down_write(&slub_lock); | 2665 | /* |
2666 | * This function is called with IRQs disabled during early-boot on | ||
2667 | * single CPU so there's no need to take slub_lock here. | ||
2668 | */ | ||
2561 | if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN, | 2669 | if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN, |
2562 | flags, NULL)) | 2670 | flags, NULL)) |
2563 | goto panic; | 2671 | goto panic; |
2564 | 2672 | ||
2565 | list_add(&s->list, &slab_caches); | 2673 | list_add(&s->list, &slab_caches); |
2566 | up_write(&slub_lock); | 2674 | |
2567 | if (sysfs_slab_add(s)) | 2675 | if (sysfs_slab_add(s)) |
2568 | goto panic; | 2676 | goto panic; |
2569 | return s; | 2677 | return s; |
@@ -2596,6 +2704,7 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) | |||
2596 | struct kmem_cache *s; | 2704 | struct kmem_cache *s; |
2597 | char *text; | 2705 | char *text; |
2598 | size_t realsize; | 2706 | size_t realsize; |
2707 | unsigned long slabflags; | ||
2599 | 2708 | ||
2600 | s = kmalloc_caches_dma[index]; | 2709 | s = kmalloc_caches_dma[index]; |
2601 | if (s) | 2710 | if (s) |
@@ -2617,9 +2726,18 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) | |||
2617 | (unsigned int)realsize); | 2726 | (unsigned int)realsize); |
2618 | s = kmalloc(kmem_size, flags & ~SLUB_DMA); | 2727 | s = kmalloc(kmem_size, flags & ~SLUB_DMA); |
2619 | 2728 | ||
2729 | /* | ||
2730 | * Must defer sysfs creation to a workqueue because we don't know | ||
2731 | * what context we are called from. Before sysfs comes up, we don't | ||
2732 | * need to do anything because our sysfs initcall will start by | ||
2733 | * adding all existing slabs to sysfs. | ||
2734 | */ | ||
2735 | slabflags = SLAB_CACHE_DMA|SLAB_NOTRACK; | ||
2736 | if (slab_state >= SYSFS) | ||
2737 | slabflags |= __SYSFS_ADD_DEFERRED; | ||
2738 | |||
2620 | if (!s || !text || !kmem_cache_open(s, flags, text, | 2739 | if (!s || !text || !kmem_cache_open(s, flags, text, |
2621 | realsize, ARCH_KMALLOC_MINALIGN, | 2740 | realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) { |
2622 | SLAB_CACHE_DMA|__SYSFS_ADD_DEFERRED, NULL)) { | ||
2623 | kfree(s); | 2741 | kfree(s); |
2624 | kfree(text); | 2742 | kfree(text); |
2625 | goto unlock_out; | 2743 | goto unlock_out; |
@@ -2628,7 +2746,8 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) | |||
2628 | list_add(&s->list, &slab_caches); | 2746 | list_add(&s->list, &slab_caches); |
2629 | kmalloc_caches_dma[index] = s; | 2747 | kmalloc_caches_dma[index] = s; |
2630 | 2748 | ||
2631 | schedule_work(&sysfs_add_work); | 2749 | if (slab_state >= SYSFS) |
2750 | schedule_work(&sysfs_add_work); | ||
2632 | 2751 | ||
2633 | unlock_out: | 2752 | unlock_out: |
2634 | up_write(&slub_lock); | 2753 | up_write(&slub_lock); |
@@ -2713,9 +2832,10 @@ EXPORT_SYMBOL(__kmalloc); | |||
2713 | 2832 | ||
2714 | static void *kmalloc_large_node(size_t size, gfp_t flags, int node) | 2833 | static void *kmalloc_large_node(size_t size, gfp_t flags, int node) |
2715 | { | 2834 | { |
2716 | struct page *page = alloc_pages_node(node, flags | __GFP_COMP, | 2835 | struct page *page; |
2717 | get_order(size)); | ||
2718 | 2836 | ||
2837 | flags |= __GFP_COMP | __GFP_NOTRACK; | ||
2838 | page = alloc_pages_node(node, flags, get_order(size)); | ||
2719 | if (page) | 2839 | if (page) |
2720 | return page_address(page); | 2840 | return page_address(page); |
2721 | else | 2841 | else |
@@ -3021,7 +3141,7 @@ void __init kmem_cache_init(void) | |||
3021 | * kmem_cache_open for slab_state == DOWN. | 3141 | * kmem_cache_open for slab_state == DOWN. |
3022 | */ | 3142 | */ |
3023 | create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", | 3143 | create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", |
3024 | sizeof(struct kmem_cache_node), GFP_KERNEL); | 3144 | sizeof(struct kmem_cache_node), GFP_NOWAIT); |
3025 | kmalloc_caches[0].refcount = -1; | 3145 | kmalloc_caches[0].refcount = -1; |
3026 | caches++; | 3146 | caches++; |
3027 | 3147 | ||
@@ -3034,16 +3154,16 @@ void __init kmem_cache_init(void) | |||
3034 | /* Caches that are not of the two-to-the-power-of size */ | 3154 | /* Caches that are not of the two-to-the-power-of size */ |
3035 | if (KMALLOC_MIN_SIZE <= 64) { | 3155 | if (KMALLOC_MIN_SIZE <= 64) { |
3036 | create_kmalloc_cache(&kmalloc_caches[1], | 3156 | create_kmalloc_cache(&kmalloc_caches[1], |
3037 | "kmalloc-96", 96, GFP_KERNEL); | 3157 | "kmalloc-96", 96, GFP_NOWAIT); |
3038 | caches++; | 3158 | caches++; |
3039 | create_kmalloc_cache(&kmalloc_caches[2], | 3159 | create_kmalloc_cache(&kmalloc_caches[2], |
3040 | "kmalloc-192", 192, GFP_KERNEL); | 3160 | "kmalloc-192", 192, GFP_NOWAIT); |
3041 | caches++; | 3161 | caches++; |
3042 | } | 3162 | } |
3043 | 3163 | ||
3044 | for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { | 3164 | for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { |
3045 | create_kmalloc_cache(&kmalloc_caches[i], | 3165 | create_kmalloc_cache(&kmalloc_caches[i], |
3046 | "kmalloc", 1 << i, GFP_KERNEL); | 3166 | "kmalloc", 1 << i, GFP_NOWAIT); |
3047 | caches++; | 3167 | caches++; |
3048 | } | 3168 | } |
3049 | 3169 | ||
@@ -3080,7 +3200,7 @@ void __init kmem_cache_init(void) | |||
3080 | /* Provide the correct kmalloc names now that the caches are up */ | 3200 | /* Provide the correct kmalloc names now that the caches are up */ |
3081 | for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) | 3201 | for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) |
3082 | kmalloc_caches[i]. name = | 3202 | kmalloc_caches[i]. name = |
3083 | kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i); | 3203 | kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i); |
3084 | 3204 | ||
3085 | #ifdef CONFIG_SMP | 3205 | #ifdef CONFIG_SMP |
3086 | register_cpu_notifier(&slab_notifier); | 3206 | register_cpu_notifier(&slab_notifier); |
@@ -3098,6 +3218,10 @@ void __init kmem_cache_init(void) | |||
3098 | nr_cpu_ids, nr_node_ids); | 3218 | nr_cpu_ids, nr_node_ids); |
3099 | } | 3219 | } |
3100 | 3220 | ||
3221 | void __init kmem_cache_init_late(void) | ||
3222 | { | ||
3223 | } | ||
3224 | |||
3101 | /* | 3225 | /* |
3102 | * Find a mergeable slab cache | 3226 | * Find a mergeable slab cache |
3103 | */ | 3227 | */ |
@@ -3318,20 +3442,6 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, | |||
3318 | } | 3442 | } |
3319 | 3443 | ||
3320 | #ifdef CONFIG_SLUB_DEBUG | 3444 | #ifdef CONFIG_SLUB_DEBUG |
3321 | static unsigned long count_partial(struct kmem_cache_node *n, | ||
3322 | int (*get_count)(struct page *)) | ||
3323 | { | ||
3324 | unsigned long flags; | ||
3325 | unsigned long x = 0; | ||
3326 | struct page *page; | ||
3327 | |||
3328 | spin_lock_irqsave(&n->list_lock, flags); | ||
3329 | list_for_each_entry(page, &n->partial, lru) | ||
3330 | x += get_count(page); | ||
3331 | spin_unlock_irqrestore(&n->list_lock, flags); | ||
3332 | return x; | ||
3333 | } | ||
3334 | |||
3335 | static int count_inuse(struct page *page) | 3445 | static int count_inuse(struct page *page) |
3336 | { | 3446 | { |
3337 | return page->inuse; | 3447 | return page->inuse; |
@@ -3342,11 +3452,6 @@ static int count_total(struct page *page) | |||
3342 | return page->objects; | 3452 | return page->objects; |
3343 | } | 3453 | } |
3344 | 3454 | ||
3345 | static int count_free(struct page *page) | ||
3346 | { | ||
3347 | return page->objects - page->inuse; | ||
3348 | } | ||
3349 | |||
3350 | static int validate_slab(struct kmem_cache *s, struct page *page, | 3455 | static int validate_slab(struct kmem_cache *s, struct page *page, |
3351 | unsigned long *map) | 3456 | unsigned long *map) |
3352 | { | 3457 | { |
@@ -3715,7 +3820,7 @@ static int list_locations(struct kmem_cache *s, char *buf, | |||
3715 | to_cpumask(l->cpus)); | 3820 | to_cpumask(l->cpus)); |
3716 | } | 3821 | } |
3717 | 3822 | ||
3718 | if (num_online_nodes() > 1 && !nodes_empty(l->nodes) && | 3823 | if (nr_online_nodes > 1 && !nodes_empty(l->nodes) && |
3719 | len < PAGE_SIZE - 60) { | 3824 | len < PAGE_SIZE - 60) { |
3720 | len += sprintf(buf + len, " nodes="); | 3825 | len += sprintf(buf + len, " nodes="); |
3721 | len += nodelist_scnprintf(buf + len, PAGE_SIZE - len - 50, | 3826 | len += nodelist_scnprintf(buf + len, PAGE_SIZE - len - 50, |
@@ -4390,6 +4495,8 @@ static char *create_unique_id(struct kmem_cache *s) | |||
4390 | *p++ = 'a'; | 4495 | *p++ = 'a'; |
4391 | if (s->flags & SLAB_DEBUG_FREE) | 4496 | if (s->flags & SLAB_DEBUG_FREE) |
4392 | *p++ = 'F'; | 4497 | *p++ = 'F'; |
4498 | if (!(s->flags & SLAB_NOTRACK)) | ||
4499 | *p++ = 't'; | ||
4393 | if (p != name + 1) | 4500 | if (p != name + 1) |
4394 | *p++ = '-'; | 4501 | *p++ = '-'; |
4395 | p += sprintf(p, "%07d", s->size); | 4502 | p += sprintf(p, "%07d", s->size); |
diff --git a/mm/swap_state.c b/mm/swap_state.c index 1416e7e9e02d..42cd38eba79f 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -124,7 +124,6 @@ void __delete_from_swap_cache(struct page *page) | |||
124 | /** | 124 | /** |
125 | * add_to_swap - allocate swap space for a page | 125 | * add_to_swap - allocate swap space for a page |
126 | * @page: page we want to move to swap | 126 | * @page: page we want to move to swap |
127 | * @gfp_mask: memory allocation flags | ||
128 | * | 127 | * |
129 | * Allocate swap space for the page and add the page to the | 128 | * Allocate swap space for the page and add the page to the |
130 | * swap cache. Caller needs to hold the page lock. | 129 | * swap cache. Caller needs to hold the page lock. |
@@ -162,11 +161,11 @@ int add_to_swap(struct page *page) | |||
162 | return 1; | 161 | return 1; |
163 | case -EEXIST: | 162 | case -EEXIST: |
164 | /* Raced with "speculative" read_swap_cache_async */ | 163 | /* Raced with "speculative" read_swap_cache_async */ |
165 | swap_free(entry); | 164 | swapcache_free(entry, NULL); |
166 | continue; | 165 | continue; |
167 | default: | 166 | default: |
168 | /* -ENOMEM radix-tree allocation failure */ | 167 | /* -ENOMEM radix-tree allocation failure */ |
169 | swap_free(entry); | 168 | swapcache_free(entry, NULL); |
170 | return 0; | 169 | return 0; |
171 | } | 170 | } |
172 | } | 171 | } |
@@ -188,8 +187,7 @@ void delete_from_swap_cache(struct page *page) | |||
188 | __delete_from_swap_cache(page); | 187 | __delete_from_swap_cache(page); |
189 | spin_unlock_irq(&swapper_space.tree_lock); | 188 | spin_unlock_irq(&swapper_space.tree_lock); |
190 | 189 | ||
191 | mem_cgroup_uncharge_swapcache(page, entry); | 190 | swapcache_free(entry, page); |
192 | swap_free(entry); | ||
193 | page_cache_release(page); | 191 | page_cache_release(page); |
194 | } | 192 | } |
195 | 193 | ||
@@ -293,7 +291,10 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, | |||
293 | /* | 291 | /* |
294 | * Swap entry may have been freed since our caller observed it. | 292 | * Swap entry may have been freed since our caller observed it. |
295 | */ | 293 | */ |
296 | if (!swap_duplicate(entry)) | 294 | err = swapcache_prepare(entry); |
295 | if (err == -EEXIST) /* seems racy */ | ||
296 | continue; | ||
297 | if (err) /* swp entry is obsolete ? */ | ||
297 | break; | 298 | break; |
298 | 299 | ||
299 | /* | 300 | /* |
@@ -312,12 +313,12 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, | |||
312 | * Initiate read into locked page and return. | 313 | * Initiate read into locked page and return. |
313 | */ | 314 | */ |
314 | lru_cache_add_anon(new_page); | 315 | lru_cache_add_anon(new_page); |
315 | swap_readpage(NULL, new_page); | 316 | swap_readpage(new_page); |
316 | return new_page; | 317 | return new_page; |
317 | } | 318 | } |
318 | ClearPageSwapBacked(new_page); | 319 | ClearPageSwapBacked(new_page); |
319 | __clear_page_locked(new_page); | 320 | __clear_page_locked(new_page); |
320 | swap_free(entry); | 321 | swapcache_free(entry, NULL); |
321 | } while (err != -ENOMEM); | 322 | } while (err != -ENOMEM); |
322 | 323 | ||
323 | if (new_page) | 324 | if (new_page) |
diff --git a/mm/swapfile.c b/mm/swapfile.c index 312fafe0ab6e..d1ade1a48ee7 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -53,6 +53,59 @@ static struct swap_info_struct swap_info[MAX_SWAPFILES]; | |||
53 | 53 | ||
54 | static DEFINE_MUTEX(swapon_mutex); | 54 | static DEFINE_MUTEX(swapon_mutex); |
55 | 55 | ||
56 | /* For reference count accounting in swap_map */ | ||
57 | /* enum for swap_map[] handling. internal use only */ | ||
58 | enum { | ||
59 | SWAP_MAP = 0, /* ops for reference from swap users */ | ||
60 | SWAP_CACHE, /* ops for reference from swap cache */ | ||
61 | }; | ||
62 | |||
63 | static inline int swap_count(unsigned short ent) | ||
64 | { | ||
65 | return ent & SWAP_COUNT_MASK; | ||
66 | } | ||
67 | |||
68 | static inline bool swap_has_cache(unsigned short ent) | ||
69 | { | ||
70 | return !!(ent & SWAP_HAS_CACHE); | ||
71 | } | ||
72 | |||
73 | static inline unsigned short encode_swapmap(int count, bool has_cache) | ||
74 | { | ||
75 | unsigned short ret = count; | ||
76 | |||
77 | if (has_cache) | ||
78 | return SWAP_HAS_CACHE | ret; | ||
79 | return ret; | ||
80 | } | ||
81 | |||
82 | /* returns 1 if swap entry is freed */ | ||
83 | static int | ||
84 | __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) | ||
85 | { | ||
86 | int type = si - swap_info; | ||
87 | swp_entry_t entry = swp_entry(type, offset); | ||
88 | struct page *page; | ||
89 | int ret = 0; | ||
90 | |||
91 | page = find_get_page(&swapper_space, entry.val); | ||
92 | if (!page) | ||
93 | return 0; | ||
94 | /* | ||
95 | * This function is called from scan_swap_map() and it's called | ||
96 | * by vmscan.c at reclaiming pages. So, we hold a lock on a page, here. | ||
97 | * We have to use trylock for avoiding deadlock. This is a special | ||
98 | * case and you should use try_to_free_swap() with explicit lock_page() | ||
99 | * in usual operations. | ||
100 | */ | ||
101 | if (trylock_page(page)) { | ||
102 | ret = try_to_free_swap(page); | ||
103 | unlock_page(page); | ||
104 | } | ||
105 | page_cache_release(page); | ||
106 | return ret; | ||
107 | } | ||
108 | |||
56 | /* | 109 | /* |
57 | * We need this because the bdev->unplug_fn can sleep and we cannot | 110 | * We need this because the bdev->unplug_fn can sleep and we cannot |
58 | * hold swap_lock while calling the unplug_fn. And swap_lock | 111 | * hold swap_lock while calling the unplug_fn. And swap_lock |
@@ -167,7 +220,8 @@ static int wait_for_discard(void *word) | |||
167 | #define SWAPFILE_CLUSTER 256 | 220 | #define SWAPFILE_CLUSTER 256 |
168 | #define LATENCY_LIMIT 256 | 221 | #define LATENCY_LIMIT 256 |
169 | 222 | ||
170 | static inline unsigned long scan_swap_map(struct swap_info_struct *si) | 223 | static inline unsigned long scan_swap_map(struct swap_info_struct *si, |
224 | int cache) | ||
171 | { | 225 | { |
172 | unsigned long offset; | 226 | unsigned long offset; |
173 | unsigned long scan_base; | 227 | unsigned long scan_base; |
@@ -273,6 +327,19 @@ checks: | |||
273 | goto no_page; | 327 | goto no_page; |
274 | if (offset > si->highest_bit) | 328 | if (offset > si->highest_bit) |
275 | scan_base = offset = si->lowest_bit; | 329 | scan_base = offset = si->lowest_bit; |
330 | |||
331 | /* reuse swap entry of cache-only swap if not busy. */ | ||
332 | if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { | ||
333 | int swap_was_freed; | ||
334 | spin_unlock(&swap_lock); | ||
335 | swap_was_freed = __try_to_reclaim_swap(si, offset); | ||
336 | spin_lock(&swap_lock); | ||
337 | /* entry was freed successfully, try to use this again */ | ||
338 | if (swap_was_freed) | ||
339 | goto checks; | ||
340 | goto scan; /* check next one */ | ||
341 | } | ||
342 | |||
276 | if (si->swap_map[offset]) | 343 | if (si->swap_map[offset]) |
277 | goto scan; | 344 | goto scan; |
278 | 345 | ||
@@ -285,7 +352,10 @@ checks: | |||
285 | si->lowest_bit = si->max; | 352 | si->lowest_bit = si->max; |
286 | si->highest_bit = 0; | 353 | si->highest_bit = 0; |
287 | } | 354 | } |
288 | si->swap_map[offset] = 1; | 355 | if (cache == SWAP_CACHE) /* at usual swap-out via vmscan.c */ |
356 | si->swap_map[offset] = encode_swapmap(0, true); | ||
357 | else /* at suspend */ | ||
358 | si->swap_map[offset] = encode_swapmap(1, false); | ||
289 | si->cluster_next = offset + 1; | 359 | si->cluster_next = offset + 1; |
290 | si->flags -= SWP_SCANNING; | 360 | si->flags -= SWP_SCANNING; |
291 | 361 | ||
@@ -351,6 +421,10 @@ scan: | |||
351 | spin_lock(&swap_lock); | 421 | spin_lock(&swap_lock); |
352 | goto checks; | 422 | goto checks; |
353 | } | 423 | } |
424 | if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { | ||
425 | spin_lock(&swap_lock); | ||
426 | goto checks; | ||
427 | } | ||
354 | if (unlikely(--latency_ration < 0)) { | 428 | if (unlikely(--latency_ration < 0)) { |
355 | cond_resched(); | 429 | cond_resched(); |
356 | latency_ration = LATENCY_LIMIT; | 430 | latency_ration = LATENCY_LIMIT; |
@@ -362,6 +436,10 @@ scan: | |||
362 | spin_lock(&swap_lock); | 436 | spin_lock(&swap_lock); |
363 | goto checks; | 437 | goto checks; |
364 | } | 438 | } |
439 | if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { | ||
440 | spin_lock(&swap_lock); | ||
441 | goto checks; | ||
442 | } | ||
365 | if (unlikely(--latency_ration < 0)) { | 443 | if (unlikely(--latency_ration < 0)) { |
366 | cond_resched(); | 444 | cond_resched(); |
367 | latency_ration = LATENCY_LIMIT; | 445 | latency_ration = LATENCY_LIMIT; |
@@ -401,7 +479,8 @@ swp_entry_t get_swap_page(void) | |||
401 | continue; | 479 | continue; |
402 | 480 | ||
403 | swap_list.next = next; | 481 | swap_list.next = next; |
404 | offset = scan_swap_map(si); | 482 | /* This is called for allocating swap entry for cache */ |
483 | offset = scan_swap_map(si, SWAP_CACHE); | ||
405 | if (offset) { | 484 | if (offset) { |
406 | spin_unlock(&swap_lock); | 485 | spin_unlock(&swap_lock); |
407 | return swp_entry(type, offset); | 486 | return swp_entry(type, offset); |
@@ -415,6 +494,7 @@ noswap: | |||
415 | return (swp_entry_t) {0}; | 494 | return (swp_entry_t) {0}; |
416 | } | 495 | } |
417 | 496 | ||
497 | /* The only caller of this function is now the suspend routine */ | ||
418 | swp_entry_t get_swap_page_of_type(int type) | 498 | swp_entry_t get_swap_page_of_type(int type) |
419 | { | 499 | { |
420 | struct swap_info_struct *si; | 500 | struct swap_info_struct *si; |
@@ -424,7 +504,8 @@ swp_entry_t get_swap_page_of_type(int type) | |||
424 | si = swap_info + type; | 504 | si = swap_info + type; |
425 | if (si->flags & SWP_WRITEOK) { | 505 | if (si->flags & SWP_WRITEOK) { |
426 | nr_swap_pages--; | 506 | nr_swap_pages--; |
427 | offset = scan_swap_map(si); | 507 | /* This is called for allocating swap entry, not cache */ |
508 | offset = scan_swap_map(si, SWAP_MAP); | ||
428 | if (offset) { | 509 | if (offset) { |
429 | spin_unlock(&swap_lock); | 510 | spin_unlock(&swap_lock); |
430 | return swp_entry(type, offset); | 511 | return swp_entry(type, offset); |
@@ -471,26 +552,40 @@ out: | |||
471 | return NULL; | 552 | return NULL; |
472 | } | 553 | } |
473 | 554 | ||
474 | static int swap_entry_free(struct swap_info_struct *p, swp_entry_t ent) | 555 | static int swap_entry_free(struct swap_info_struct *p, |
556 | swp_entry_t ent, int cache) | ||
475 | { | 557 | { |
476 | unsigned long offset = swp_offset(ent); | 558 | unsigned long offset = swp_offset(ent); |
477 | int count = p->swap_map[offset]; | 559 | int count = swap_count(p->swap_map[offset]); |
478 | 560 | bool has_cache; | |
479 | if (count < SWAP_MAP_MAX) { | 561 | |
480 | count--; | 562 | has_cache = swap_has_cache(p->swap_map[offset]); |
481 | p->swap_map[offset] = count; | 563 | |
482 | if (!count) { | 564 | if (cache == SWAP_MAP) { /* dropping usage count of swap */ |
483 | if (offset < p->lowest_bit) | 565 | if (count < SWAP_MAP_MAX) { |
484 | p->lowest_bit = offset; | 566 | count--; |
485 | if (offset > p->highest_bit) | 567 | p->swap_map[offset] = encode_swapmap(count, has_cache); |
486 | p->highest_bit = offset; | ||
487 | if (p->prio > swap_info[swap_list.next].prio) | ||
488 | swap_list.next = p - swap_info; | ||
489 | nr_swap_pages++; | ||
490 | p->inuse_pages--; | ||
491 | mem_cgroup_uncharge_swap(ent); | ||
492 | } | 568 | } |
569 | } else { /* dropping swap cache flag */ | ||
570 | VM_BUG_ON(!has_cache); | ||
571 | p->swap_map[offset] = encode_swapmap(count, false); | ||
572 | |||
573 | } | ||
574 | /* return code. */ | ||
575 | count = p->swap_map[offset]; | ||
576 | /* free if no reference */ | ||
577 | if (!count) { | ||
578 | if (offset < p->lowest_bit) | ||
579 | p->lowest_bit = offset; | ||
580 | if (offset > p->highest_bit) | ||
581 | p->highest_bit = offset; | ||
582 | if (p->prio > swap_info[swap_list.next].prio) | ||
583 | swap_list.next = p - swap_info; | ||
584 | nr_swap_pages++; | ||
585 | p->inuse_pages--; | ||
493 | } | 586 | } |
587 | if (!swap_count(count)) | ||
588 | mem_cgroup_uncharge_swap(ent); | ||
494 | return count; | 589 | return count; |
495 | } | 590 | } |
496 | 591 | ||
@@ -504,9 +599,33 @@ void swap_free(swp_entry_t entry) | |||
504 | 599 | ||
505 | p = swap_info_get(entry); | 600 | p = swap_info_get(entry); |
506 | if (p) { | 601 | if (p) { |
507 | swap_entry_free(p, entry); | 602 | swap_entry_free(p, entry, SWAP_MAP); |
603 | spin_unlock(&swap_lock); | ||
604 | } | ||
605 | } | ||
606 | |||
607 | /* | ||
608 | * Called after dropping swapcache to decrease refcnt to swap entries. | ||
609 | */ | ||
610 | void swapcache_free(swp_entry_t entry, struct page *page) | ||
611 | { | ||
612 | struct swap_info_struct *p; | ||
613 | int ret; | ||
614 | |||
615 | p = swap_info_get(entry); | ||
616 | if (p) { | ||
617 | ret = swap_entry_free(p, entry, SWAP_CACHE); | ||
618 | if (page) { | ||
619 | bool swapout; | ||
620 | if (ret) | ||
621 | swapout = true; /* the end of swap out */ | ||
622 | else | ||
623 | swapout = false; /* no more swap users! */ | ||
624 | mem_cgroup_uncharge_swapcache(page, entry, swapout); | ||
625 | } | ||
508 | spin_unlock(&swap_lock); | 626 | spin_unlock(&swap_lock); |
509 | } | 627 | } |
628 | return; | ||
510 | } | 629 | } |
511 | 630 | ||
512 | /* | 631 | /* |
@@ -521,8 +640,7 @@ static inline int page_swapcount(struct page *page) | |||
521 | entry.val = page_private(page); | 640 | entry.val = page_private(page); |
522 | p = swap_info_get(entry); | 641 | p = swap_info_get(entry); |
523 | if (p) { | 642 | if (p) { |
524 | /* Subtract the 1 for the swap cache itself */ | 643 | count = swap_count(p->swap_map[swp_offset(entry)]); |
525 | count = p->swap_map[swp_offset(entry)] - 1; | ||
526 | spin_unlock(&swap_lock); | 644 | spin_unlock(&swap_lock); |
527 | } | 645 | } |
528 | return count; | 646 | return count; |
@@ -584,7 +702,7 @@ int free_swap_and_cache(swp_entry_t entry) | |||
584 | 702 | ||
585 | p = swap_info_get(entry); | 703 | p = swap_info_get(entry); |
586 | if (p) { | 704 | if (p) { |
587 | if (swap_entry_free(p, entry) == 1) { | 705 | if (swap_entry_free(p, entry, SWAP_MAP) == SWAP_HAS_CACHE) { |
588 | page = find_get_page(&swapper_space, entry.val); | 706 | page = find_get_page(&swapper_space, entry.val); |
589 | if (page && !trylock_page(page)) { | 707 | if (page && !trylock_page(page)) { |
590 | page_cache_release(page); | 708 | page_cache_release(page); |
@@ -891,7 +1009,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, | |||
891 | i = 1; | 1009 | i = 1; |
892 | } | 1010 | } |
893 | count = si->swap_map[i]; | 1011 | count = si->swap_map[i]; |
894 | if (count && count != SWAP_MAP_BAD) | 1012 | if (count && swap_count(count) != SWAP_MAP_BAD) |
895 | break; | 1013 | break; |
896 | } | 1014 | } |
897 | return i; | 1015 | return i; |
@@ -995,13 +1113,13 @@ static int try_to_unuse(unsigned int type) | |||
995 | */ | 1113 | */ |
996 | shmem = 0; | 1114 | shmem = 0; |
997 | swcount = *swap_map; | 1115 | swcount = *swap_map; |
998 | if (swcount > 1) { | 1116 | if (swap_count(swcount)) { |
999 | if (start_mm == &init_mm) | 1117 | if (start_mm == &init_mm) |
1000 | shmem = shmem_unuse(entry, page); | 1118 | shmem = shmem_unuse(entry, page); |
1001 | else | 1119 | else |
1002 | retval = unuse_mm(start_mm, entry, page); | 1120 | retval = unuse_mm(start_mm, entry, page); |
1003 | } | 1121 | } |
1004 | if (*swap_map > 1) { | 1122 | if (swap_count(*swap_map)) { |
1005 | int set_start_mm = (*swap_map >= swcount); | 1123 | int set_start_mm = (*swap_map >= swcount); |
1006 | struct list_head *p = &start_mm->mmlist; | 1124 | struct list_head *p = &start_mm->mmlist; |
1007 | struct mm_struct *new_start_mm = start_mm; | 1125 | struct mm_struct *new_start_mm = start_mm; |
@@ -1011,7 +1129,7 @@ static int try_to_unuse(unsigned int type) | |||
1011 | atomic_inc(&new_start_mm->mm_users); | 1129 | atomic_inc(&new_start_mm->mm_users); |
1012 | atomic_inc(&prev_mm->mm_users); | 1130 | atomic_inc(&prev_mm->mm_users); |
1013 | spin_lock(&mmlist_lock); | 1131 | spin_lock(&mmlist_lock); |
1014 | while (*swap_map > 1 && !retval && !shmem && | 1132 | while (swap_count(*swap_map) && !retval && !shmem && |
1015 | (p = p->next) != &start_mm->mmlist) { | 1133 | (p = p->next) != &start_mm->mmlist) { |
1016 | mm = list_entry(p, struct mm_struct, mmlist); | 1134 | mm = list_entry(p, struct mm_struct, mmlist); |
1017 | if (!atomic_inc_not_zero(&mm->mm_users)) | 1135 | if (!atomic_inc_not_zero(&mm->mm_users)) |
@@ -1023,14 +1141,16 @@ static int try_to_unuse(unsigned int type) | |||
1023 | cond_resched(); | 1141 | cond_resched(); |
1024 | 1142 | ||
1025 | swcount = *swap_map; | 1143 | swcount = *swap_map; |
1026 | if (swcount <= 1) | 1144 | if (!swap_count(swcount)) /* any usage ? */ |
1027 | ; | 1145 | ; |
1028 | else if (mm == &init_mm) { | 1146 | else if (mm == &init_mm) { |
1029 | set_start_mm = 1; | 1147 | set_start_mm = 1; |
1030 | shmem = shmem_unuse(entry, page); | 1148 | shmem = shmem_unuse(entry, page); |
1031 | } else | 1149 | } else |
1032 | retval = unuse_mm(mm, entry, page); | 1150 | retval = unuse_mm(mm, entry, page); |
1033 | if (set_start_mm && *swap_map < swcount) { | 1151 | |
1152 | if (set_start_mm && | ||
1153 | swap_count(*swap_map) < swcount) { | ||
1034 | mmput(new_start_mm); | 1154 | mmput(new_start_mm); |
1035 | atomic_inc(&mm->mm_users); | 1155 | atomic_inc(&mm->mm_users); |
1036 | new_start_mm = mm; | 1156 | new_start_mm = mm; |
@@ -1057,21 +1177,25 @@ static int try_to_unuse(unsigned int type) | |||
1057 | } | 1177 | } |
1058 | 1178 | ||
1059 | /* | 1179 | /* |
1060 | * How could swap count reach 0x7fff when the maximum | 1180 | * How could swap count reach 0x7ffe ? |
1061 | * pid is 0x7fff, and there's no way to repeat a swap | 1181 | * There's no way to repeat a swap page within an mm |
1062 | * page within an mm (except in shmem, where it's the | 1182 | * (except in shmem, where it's the shared object which takes |
1063 | * shared object which takes the reference count)? | 1183 | * the reference count)? |
1064 | * We believe SWAP_MAP_MAX cannot occur in Linux 2.4. | 1184 | * We believe SWAP_MAP_MAX cannot occur.(if occur, unsigned |
1065 | * | 1185 | * short is too small....) |
1066 | * If that's wrong, then we should worry more about | 1186 | * If that's wrong, then we should worry more about |
1067 | * exit_mmap() and do_munmap() cases described above: | 1187 | * exit_mmap() and do_munmap() cases described above: |
1068 | * we might be resetting SWAP_MAP_MAX too early here. | 1188 | * we might be resetting SWAP_MAP_MAX too early here. |
1069 | * We know "Undead"s can happen, they're okay, so don't | 1189 | * We know "Undead"s can happen, they're okay, so don't |
1070 | * report them; but do report if we reset SWAP_MAP_MAX. | 1190 | * report them; but do report if we reset SWAP_MAP_MAX. |
1071 | */ | 1191 | */ |
1072 | if (*swap_map == SWAP_MAP_MAX) { | 1192 | /* We might release the lock_page() in unuse_mm(). */ |
1193 | if (!PageSwapCache(page) || page_private(page) != entry.val) | ||
1194 | goto retry; | ||
1195 | |||
1196 | if (swap_count(*swap_map) == SWAP_MAP_MAX) { | ||
1073 | spin_lock(&swap_lock); | 1197 | spin_lock(&swap_lock); |
1074 | *swap_map = 1; | 1198 | *swap_map = encode_swapmap(0, true); |
1075 | spin_unlock(&swap_lock); | 1199 | spin_unlock(&swap_lock); |
1076 | reset_overflow = 1; | 1200 | reset_overflow = 1; |
1077 | } | 1201 | } |
@@ -1089,7 +1213,8 @@ static int try_to_unuse(unsigned int type) | |||
1089 | * pages would be incorrect if swap supported "shared | 1213 | * pages would be incorrect if swap supported "shared |
1090 | * private" pages, but they are handled by tmpfs files. | 1214 | * private" pages, but they are handled by tmpfs files. |
1091 | */ | 1215 | */ |
1092 | if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) { | 1216 | if (swap_count(*swap_map) && |
1217 | PageDirty(page) && PageSwapCache(page)) { | ||
1093 | struct writeback_control wbc = { | 1218 | struct writeback_control wbc = { |
1094 | .sync_mode = WB_SYNC_NONE, | 1219 | .sync_mode = WB_SYNC_NONE, |
1095 | }; | 1220 | }; |
@@ -1116,6 +1241,7 @@ static int try_to_unuse(unsigned int type) | |||
1116 | * mark page dirty so shrink_page_list will preserve it. | 1241 | * mark page dirty so shrink_page_list will preserve it. |
1117 | */ | 1242 | */ |
1118 | SetPageDirty(page); | 1243 | SetPageDirty(page); |
1244 | retry: | ||
1119 | unlock_page(page); | 1245 | unlock_page(page); |
1120 | page_cache_release(page); | 1246 | page_cache_release(page); |
1121 | 1247 | ||
@@ -1942,15 +2068,23 @@ void si_swapinfo(struct sysinfo *val) | |||
1942 | * | 2068 | * |
1943 | * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as | 2069 | * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as |
1944 | * "permanent", but will be reclaimed by the next swapoff. | 2070 | * "permanent", but will be reclaimed by the next swapoff. |
2071 | * Returns error code in following case. | ||
2072 | * - success -> 0 | ||
2073 | * - swp_entry is invalid -> EINVAL | ||
2074 | * - swp_entry is migration entry -> EINVAL | ||
2075 | * - swap-cache reference is requested but there is already one. -> EEXIST | ||
2076 | * - swap-cache reference is requested but the entry is not used. -> ENOENT | ||
1945 | */ | 2077 | */ |
1946 | int swap_duplicate(swp_entry_t entry) | 2078 | static int __swap_duplicate(swp_entry_t entry, bool cache) |
1947 | { | 2079 | { |
1948 | struct swap_info_struct * p; | 2080 | struct swap_info_struct * p; |
1949 | unsigned long offset, type; | 2081 | unsigned long offset, type; |
1950 | int result = 0; | 2082 | int result = -EINVAL; |
2083 | int count; | ||
2084 | bool has_cache; | ||
1951 | 2085 | ||
1952 | if (is_migration_entry(entry)) | 2086 | if (is_migration_entry(entry)) |
1953 | return 1; | 2087 | return -EINVAL; |
1954 | 2088 | ||
1955 | type = swp_type(entry); | 2089 | type = swp_type(entry); |
1956 | if (type >= nr_swapfiles) | 2090 | if (type >= nr_swapfiles) |
@@ -1959,17 +2093,40 @@ int swap_duplicate(swp_entry_t entry) | |||
1959 | offset = swp_offset(entry); | 2093 | offset = swp_offset(entry); |
1960 | 2094 | ||
1961 | spin_lock(&swap_lock); | 2095 | spin_lock(&swap_lock); |
1962 | if (offset < p->max && p->swap_map[offset]) { | 2096 | |
1963 | if (p->swap_map[offset] < SWAP_MAP_MAX - 1) { | 2097 | if (unlikely(offset >= p->max)) |
1964 | p->swap_map[offset]++; | 2098 | goto unlock_out; |
1965 | result = 1; | 2099 | |
1966 | } else if (p->swap_map[offset] <= SWAP_MAP_MAX) { | 2100 | count = swap_count(p->swap_map[offset]); |
2101 | has_cache = swap_has_cache(p->swap_map[offset]); | ||
2102 | |||
2103 | if (cache == SWAP_CACHE) { /* called for swapcache/swapin-readahead */ | ||
2104 | |||
2105 | /* set SWAP_HAS_CACHE if there is no cache and entry is used */ | ||
2106 | if (!has_cache && count) { | ||
2107 | p->swap_map[offset] = encode_swapmap(count, true); | ||
2108 | result = 0; | ||
2109 | } else if (has_cache) /* someone added cache */ | ||
2110 | result = -EEXIST; | ||
2111 | else if (!count) /* no users */ | ||
2112 | result = -ENOENT; | ||
2113 | |||
2114 | } else if (count || has_cache) { | ||
2115 | if (count < SWAP_MAP_MAX - 1) { | ||
2116 | p->swap_map[offset] = encode_swapmap(count + 1, | ||
2117 | has_cache); | ||
2118 | result = 0; | ||
2119 | } else if (count <= SWAP_MAP_MAX) { | ||
1967 | if (swap_overflow++ < 5) | 2120 | if (swap_overflow++ < 5) |
1968 | printk(KERN_WARNING "swap_dup: swap entry overflow\n"); | 2121 | printk(KERN_WARNING |
1969 | p->swap_map[offset] = SWAP_MAP_MAX; | 2122 | "swap_dup: swap entry overflow\n"); |
1970 | result = 1; | 2123 | p->swap_map[offset] = encode_swapmap(SWAP_MAP_MAX, |
2124 | has_cache); | ||
2125 | result = 0; | ||
1971 | } | 2126 | } |
1972 | } | 2127 | } else |
2128 | result = -ENOENT; /* unused swap entry */ | ||
2129 | unlock_out: | ||
1973 | spin_unlock(&swap_lock); | 2130 | spin_unlock(&swap_lock); |
1974 | out: | 2131 | out: |
1975 | return result; | 2132 | return result; |
@@ -1978,6 +2135,27 @@ bad_file: | |||
1978 | printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); | 2135 | printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); |
1979 | goto out; | 2136 | goto out; |
1980 | } | 2137 | } |
2138 | /* | ||
2139 | * increase reference count of swap entry by 1. | ||
2140 | */ | ||
2141 | void swap_duplicate(swp_entry_t entry) | ||
2142 | { | ||
2143 | __swap_duplicate(entry, SWAP_MAP); | ||
2144 | } | ||
2145 | |||
2146 | /* | ||
2147 | * @entry: swap entry for which we allocate swap cache. | ||
2148 | * | ||
2149 | * Called when allocating swap cache for an existing swap entry. | ||
2150 | * This can return error codes. Returns 0 on success. | ||
2151 | * -EEXIST means there is already a swap cache. | ||
2152 | * Note: return code is different from swap_duplicate(). | ||
2153 | */ | ||
2154 | int swapcache_prepare(swp_entry_t entry) | ||
2155 | { | ||
2156 | return __swap_duplicate(entry, SWAP_CACHE); | ||
2157 | } | ||
2158 | |||
1981 | 2159 | ||
1982 | struct swap_info_struct * | 2160 | struct swap_info_struct * |
1983 | get_swap_info_struct(unsigned type) | 2161 | get_swap_info_struct(unsigned type) |
@@ -2016,7 +2194,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset) | |||
2016 | /* Don't read in free or bad pages */ | 2194 | /* Don't read in free or bad pages */ |
2017 | if (!si->swap_map[toff]) | 2195 | if (!si->swap_map[toff]) |
2018 | break; | 2196 | break; |
2019 | if (si->swap_map[toff] == SWAP_MAP_BAD) | 2197 | if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD) |
2020 | break; | 2198 | break; |
2021 | } | 2199 | } |
2022 | /* Count contiguous allocated slots below our target */ | 2200 | /* Count contiguous allocated slots below our target */ |
@@ -2024,7 +2202,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset) | |||
2024 | /* Don't read in free or bad pages */ | 2202 | /* Don't read in free or bad pages */ |
2025 | if (!si->swap_map[toff]) | 2203 | if (!si->swap_map[toff]) |
2026 | break; | 2204 | break; |
2027 | if (si->swap_map[toff] == SWAP_MAP_BAD) | 2205 | if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD) |
2028 | break; | 2206 | break; |
2029 | } | 2207 | } |
2030 | spin_unlock(&swap_lock); | 2208 | spin_unlock(&swap_lock); |
diff --git a/mm/thrash.c b/mm/thrash.c index c4c5205a9c35..2372d4ed5dd8 100644 --- a/mm/thrash.c +++ b/mm/thrash.c | |||
@@ -26,47 +26,45 @@ static DEFINE_SPINLOCK(swap_token_lock); | |||
26 | struct mm_struct *swap_token_mm; | 26 | struct mm_struct *swap_token_mm; |
27 | static unsigned int global_faults; | 27 | static unsigned int global_faults; |
28 | 28 | ||
29 | void grab_swap_token(void) | 29 | void grab_swap_token(struct mm_struct *mm) |
30 | { | 30 | { |
31 | int current_interval; | 31 | int current_interval; |
32 | 32 | ||
33 | global_faults++; | 33 | global_faults++; |
34 | 34 | ||
35 | current_interval = global_faults - current->mm->faultstamp; | 35 | current_interval = global_faults - mm->faultstamp; |
36 | 36 | ||
37 | if (!spin_trylock(&swap_token_lock)) | 37 | if (!spin_trylock(&swap_token_lock)) |
38 | return; | 38 | return; |
39 | 39 | ||
40 | /* First come first served */ | 40 | /* First come first served */ |
41 | if (swap_token_mm == NULL) { | 41 | if (swap_token_mm == NULL) { |
42 | current->mm->token_priority = current->mm->token_priority + 2; | 42 | mm->token_priority = mm->token_priority + 2; |
43 | swap_token_mm = current->mm; | 43 | swap_token_mm = mm; |
44 | goto out; | 44 | goto out; |
45 | } | 45 | } |
46 | 46 | ||
47 | if (current->mm != swap_token_mm) { | 47 | if (mm != swap_token_mm) { |
48 | if (current_interval < current->mm->last_interval) | 48 | if (current_interval < mm->last_interval) |
49 | current->mm->token_priority++; | 49 | mm->token_priority++; |
50 | else { | 50 | else { |
51 | if (likely(current->mm->token_priority > 0)) | 51 | if (likely(mm->token_priority > 0)) |
52 | current->mm->token_priority--; | 52 | mm->token_priority--; |
53 | } | 53 | } |
54 | /* Check if we deserve the token */ | 54 | /* Check if we deserve the token */ |
55 | if (current->mm->token_priority > | 55 | if (mm->token_priority > swap_token_mm->token_priority) { |
56 | swap_token_mm->token_priority) { | 56 | mm->token_priority += 2; |
57 | current->mm->token_priority += 2; | 57 | swap_token_mm = mm; |
58 | swap_token_mm = current->mm; | ||
59 | } | 58 | } |
60 | } else { | 59 | } else { |
61 | /* Token holder came in again! */ | 60 | /* Token holder came in again! */ |
62 | current->mm->token_priority += 2; | 61 | mm->token_priority += 2; |
63 | } | 62 | } |
64 | 63 | ||
65 | out: | 64 | out: |
66 | current->mm->faultstamp = global_faults; | 65 | mm->faultstamp = global_faults; |
67 | current->mm->last_interval = current_interval; | 66 | mm->last_interval = current_interval; |
68 | spin_unlock(&swap_token_lock); | 67 | spin_unlock(&swap_token_lock); |
69 | return; | ||
70 | } | 68 | } |
71 | 69 | ||
72 | /* Called on process exit. */ | 70 | /* Called on process exit. */ |
diff --git a/mm/truncate.c b/mm/truncate.c index 12e1579f9165..ccc3ecf7cb98 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -267,8 +267,21 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart) | |||
267 | } | 267 | } |
268 | EXPORT_SYMBOL(truncate_inode_pages); | 268 | EXPORT_SYMBOL(truncate_inode_pages); |
269 | 269 | ||
270 | unsigned long __invalidate_mapping_pages(struct address_space *mapping, | 270 | /** |
271 | pgoff_t start, pgoff_t end, bool be_atomic) | 271 | * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode |
272 | * @mapping: the address_space which holds the pages to invalidate | ||
273 | * @start: the offset 'from' which to invalidate | ||
274 | * @end: the offset 'to' which to invalidate (inclusive) | ||
275 | * | ||
276 | * This function only removes the unlocked pages, if you want to | ||
277 | * remove all the pages of one inode, you must call truncate_inode_pages. | ||
278 | * | ||
279 | * invalidate_mapping_pages() will not block on IO activity. It will not | ||
280 | * invalidate pages which are dirty, locked, under writeback or mapped into | ||
281 | * pagetables. | ||
282 | */ | ||
283 | unsigned long invalidate_mapping_pages(struct address_space *mapping, | ||
284 | pgoff_t start, pgoff_t end) | ||
272 | { | 285 | { |
273 | struct pagevec pvec; | 286 | struct pagevec pvec; |
274 | pgoff_t next = start; | 287 | pgoff_t next = start; |
@@ -309,30 +322,10 @@ unlock: | |||
309 | break; | 322 | break; |
310 | } | 323 | } |
311 | pagevec_release(&pvec); | 324 | pagevec_release(&pvec); |
312 | if (likely(!be_atomic)) | 325 | cond_resched(); |
313 | cond_resched(); | ||
314 | } | 326 | } |
315 | return ret; | 327 | return ret; |
316 | } | 328 | } |
317 | |||
318 | /** | ||
319 | * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode | ||
320 | * @mapping: the address_space which holds the pages to invalidate | ||
321 | * @start: the offset 'from' which to invalidate | ||
322 | * @end: the offset 'to' which to invalidate (inclusive) | ||
323 | * | ||
324 | * This function only removes the unlocked pages, if you want to | ||
325 | * remove all the pages of one inode, you must call truncate_inode_pages. | ||
326 | * | ||
327 | * invalidate_mapping_pages() will not block on IO activity. It will not | ||
328 | * invalidate pages which are dirty, locked, under writeback or mapped into | ||
329 | * pagetables. | ||
330 | */ | ||
331 | unsigned long invalidate_mapping_pages(struct address_space *mapping, | ||
332 | pgoff_t start, pgoff_t end) | ||
333 | { | ||
334 | return __invalidate_mapping_pages(mapping, start, end, false); | ||
335 | } | ||
336 | EXPORT_SYMBOL(invalidate_mapping_pages); | 329 | EXPORT_SYMBOL(invalidate_mapping_pages); |
337 | 330 | ||
338 | /* | 331 | /* |
@@ -4,9 +4,11 @@ | |||
4 | #include <linux/module.h> | 4 | #include <linux/module.h> |
5 | #include <linux/err.h> | 5 | #include <linux/err.h> |
6 | #include <linux/sched.h> | 6 | #include <linux/sched.h> |
7 | #include <linux/tracepoint.h> | ||
8 | #include <asm/uaccess.h> | 7 | #include <asm/uaccess.h> |
9 | 8 | ||
9 | #define CREATE_TRACE_POINTS | ||
10 | #include <trace/events/kmem.h> | ||
11 | |||
10 | /** | 12 | /** |
11 | * kstrdup - allocate space for and copy an existing string | 13 | * kstrdup - allocate space for and copy an existing string |
12 | * @s: the string to duplicate | 14 | * @s: the string to duplicate |
@@ -166,6 +168,10 @@ EXPORT_SYMBOL(krealloc); | |||
166 | * | 168 | * |
167 | * The memory of the object @p points to is zeroed before freed. | 169 | * The memory of the object @p points to is zeroed before freed. |
168 | * If @p is %NULL, kzfree() does nothing. | 170 | * If @p is %NULL, kzfree() does nothing. |
171 | * | ||
172 | * Note: this function zeroes the whole allocated buffer which can be a good | ||
173 | * deal bigger than the requested buffer size passed to kmalloc(). So be | ||
174 | * careful when using this function in performance sensitive code. | ||
169 | */ | 175 | */ |
170 | void kzfree(const void *p) | 176 | void kzfree(const void *p) |
171 | { | 177 | { |
@@ -231,13 +237,21 @@ void arch_pick_mmap_layout(struct mm_struct *mm) | |||
231 | * @pages: array that receives pointers to the pages pinned. | 237 | * @pages: array that receives pointers to the pages pinned. |
232 | * Should be at least nr_pages long. | 238 | * Should be at least nr_pages long. |
233 | * | 239 | * |
234 | * Attempt to pin user pages in memory without taking mm->mmap_sem. | ||
235 | * If not successful, it will fall back to taking the lock and | ||
236 | * calling get_user_pages(). | ||
237 | * | ||
238 | * Returns number of pages pinned. This may be fewer than the number | 240 | * Returns number of pages pinned. This may be fewer than the number |
239 | * requested. If nr_pages is 0 or negative, returns 0. If no pages | 241 | * requested. If nr_pages is 0 or negative, returns 0. If no pages |
240 | * were pinned, returns -errno. | 242 | * were pinned, returns -errno. |
243 | * | ||
244 | * get_user_pages_fast provides equivalent functionality to get_user_pages, | ||
245 | * operating on current and current->mm, with force=0 and vma=NULL. However | ||
246 | * unlike get_user_pages, it must be called without mmap_sem held. | ||
247 | * | ||
248 | * get_user_pages_fast may take mmap_sem and page table locks, so no | ||
249 | * assumptions can be made about lack of locking. get_user_pages_fast is to be | ||
250 | * implemented in a way that is advantageous (vs get_user_pages()) when the | ||
251 | * user memory area is already faulted in and present in ptes. However if the | ||
252 | * pages have to be faulted in, it may turn out to be slightly slower so | ||
253 | * callers need to carefully consider what to use. On many architectures, | ||
254 | * get_user_pages_fast simply falls back to get_user_pages. | ||
241 | */ | 255 | */ |
242 | int __attribute__((weak)) get_user_pages_fast(unsigned long start, | 256 | int __attribute__((weak)) get_user_pages_fast(unsigned long start, |
243 | int nr_pages, int write, struct page **pages) | 257 | int nr_pages, int write, struct page **pages) |
@@ -255,13 +269,6 @@ int __attribute__((weak)) get_user_pages_fast(unsigned long start, | |||
255 | EXPORT_SYMBOL_GPL(get_user_pages_fast); | 269 | EXPORT_SYMBOL_GPL(get_user_pages_fast); |
256 | 270 | ||
257 | /* Tracepoints definitions. */ | 271 | /* Tracepoints definitions. */ |
258 | DEFINE_TRACE(kmalloc); | ||
259 | DEFINE_TRACE(kmem_cache_alloc); | ||
260 | DEFINE_TRACE(kmalloc_node); | ||
261 | DEFINE_TRACE(kmem_cache_alloc_node); | ||
262 | DEFINE_TRACE(kfree); | ||
263 | DEFINE_TRACE(kmem_cache_free); | ||
264 | |||
265 | EXPORT_TRACEPOINT_SYMBOL(kmalloc); | 272 | EXPORT_TRACEPOINT_SYMBOL(kmalloc); |
266 | EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); | 273 | EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); |
267 | EXPORT_TRACEPOINT_SYMBOL(kmalloc_node); | 274 | EXPORT_TRACEPOINT_SYMBOL(kmalloc_node); |
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 083716ea38c9..f8189a4b3e13 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -23,8 +23,8 @@ | |||
23 | #include <linux/rbtree.h> | 23 | #include <linux/rbtree.h> |
24 | #include <linux/radix-tree.h> | 24 | #include <linux/radix-tree.h> |
25 | #include <linux/rcupdate.h> | 25 | #include <linux/rcupdate.h> |
26 | #include <linux/bootmem.h> | ||
27 | #include <linux/pfn.h> | 26 | #include <linux/pfn.h> |
27 | #include <linux/kmemleak.h> | ||
28 | 28 | ||
29 | #include <asm/atomic.h> | 29 | #include <asm/atomic.h> |
30 | #include <asm/uaccess.h> | 30 | #include <asm/uaccess.h> |
@@ -1032,7 +1032,7 @@ void __init vmalloc_init(void) | |||
1032 | 1032 | ||
1033 | /* Import existing vmlist entries. */ | 1033 | /* Import existing vmlist entries. */ |
1034 | for (tmp = vmlist; tmp; tmp = tmp->next) { | 1034 | for (tmp = vmlist; tmp; tmp = tmp->next) { |
1035 | va = alloc_bootmem(sizeof(struct vmap_area)); | 1035 | va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT); |
1036 | va->flags = tmp->flags | VM_VM_AREA; | 1036 | va->flags = tmp->flags | VM_VM_AREA; |
1037 | va->va_start = (unsigned long)tmp->addr; | 1037 | va->va_start = (unsigned long)tmp->addr; |
1038 | va->va_end = va->va_start + tmp->size; | 1038 | va->va_end = va->va_start + tmp->size; |
@@ -1327,6 +1327,9 @@ static void __vunmap(const void *addr, int deallocate_pages) | |||
1327 | void vfree(const void *addr) | 1327 | void vfree(const void *addr) |
1328 | { | 1328 | { |
1329 | BUG_ON(in_interrupt()); | 1329 | BUG_ON(in_interrupt()); |
1330 | |||
1331 | kmemleak_free(addr); | ||
1332 | |||
1330 | __vunmap(addr, 1); | 1333 | __vunmap(addr, 1); |
1331 | } | 1334 | } |
1332 | EXPORT_SYMBOL(vfree); | 1335 | EXPORT_SYMBOL(vfree); |
@@ -1439,8 +1442,17 @@ fail: | |||
1439 | 1442 | ||
1440 | void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) | 1443 | void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) |
1441 | { | 1444 | { |
1442 | return __vmalloc_area_node(area, gfp_mask, prot, -1, | 1445 | void *addr = __vmalloc_area_node(area, gfp_mask, prot, -1, |
1443 | __builtin_return_address(0)); | 1446 | __builtin_return_address(0)); |
1447 | |||
1448 | /* | ||
1449 | * A ref_count = 3 is needed because the vm_struct and vmap_area | ||
1450 | * structures allocated in the __get_vm_area_node() function contain | ||
1451 | * references to the virtual address of the vmalloc'ed block. | ||
1452 | */ | ||
1453 | kmemleak_alloc(addr, area->size - PAGE_SIZE, 3, gfp_mask); | ||
1454 | |||
1455 | return addr; | ||
1444 | } | 1456 | } |
1445 | 1457 | ||
1446 | /** | 1458 | /** |
@@ -1459,6 +1471,8 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, | |||
1459 | int node, void *caller) | 1471 | int node, void *caller) |
1460 | { | 1472 | { |
1461 | struct vm_struct *area; | 1473 | struct vm_struct *area; |
1474 | void *addr; | ||
1475 | unsigned long real_size = size; | ||
1462 | 1476 | ||
1463 | size = PAGE_ALIGN(size); | 1477 | size = PAGE_ALIGN(size); |
1464 | if (!size || (size >> PAGE_SHIFT) > num_physpages) | 1478 | if (!size || (size >> PAGE_SHIFT) > num_physpages) |
@@ -1470,7 +1484,16 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, | |||
1470 | if (!area) | 1484 | if (!area) |
1471 | return NULL; | 1485 | return NULL; |
1472 | 1486 | ||
1473 | return __vmalloc_area_node(area, gfp_mask, prot, node, caller); | 1487 | addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller); |
1488 | |||
1489 | /* | ||
1490 | * A ref_count = 3 is needed because the vm_struct and vmap_area | ||
1491 | * structures allocated in the __get_vm_area_node() function contain | ||
1492 | * references to the virtual address of the vmalloc'ed block. | ||
1493 | */ | ||
1494 | kmemleak_alloc(addr, real_size, 3, gfp_mask); | ||
1495 | |||
1496 | return addr; | ||
1474 | } | 1497 | } |
1475 | 1498 | ||
1476 | void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) | 1499 | void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) |
diff --git a/mm/vmscan.c b/mm/vmscan.c index d254306562cd..54155268dfca 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -470,8 +470,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page) | |||
470 | swp_entry_t swap = { .val = page_private(page) }; | 470 | swp_entry_t swap = { .val = page_private(page) }; |
471 | __delete_from_swap_cache(page); | 471 | __delete_from_swap_cache(page); |
472 | spin_unlock_irq(&mapping->tree_lock); | 472 | spin_unlock_irq(&mapping->tree_lock); |
473 | mem_cgroup_uncharge_swapcache(page, swap); | 473 | swapcache_free(swap, page); |
474 | swap_free(swap); | ||
475 | } else { | 474 | } else { |
476 | __remove_from_page_cache(page); | 475 | __remove_from_page_cache(page); |
477 | spin_unlock_irq(&mapping->tree_lock); | 476 | spin_unlock_irq(&mapping->tree_lock); |
@@ -514,7 +513,6 @@ int remove_mapping(struct address_space *mapping, struct page *page) | |||
514 | * | 513 | * |
515 | * lru_lock must not be held, interrupts must be enabled. | 514 | * lru_lock must not be held, interrupts must be enabled. |
516 | */ | 515 | */ |
517 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
518 | void putback_lru_page(struct page *page) | 516 | void putback_lru_page(struct page *page) |
519 | { | 517 | { |
520 | int lru; | 518 | int lru; |
@@ -568,20 +566,6 @@ redo: | |||
568 | put_page(page); /* drop ref from isolate */ | 566 | put_page(page); /* drop ref from isolate */ |
569 | } | 567 | } |
570 | 568 | ||
571 | #else /* CONFIG_UNEVICTABLE_LRU */ | ||
572 | |||
573 | void putback_lru_page(struct page *page) | ||
574 | { | ||
575 | int lru; | ||
576 | VM_BUG_ON(PageLRU(page)); | ||
577 | |||
578 | lru = !!TestClearPageActive(page) + page_is_file_cache(page); | ||
579 | lru_cache_add_lru(page, lru); | ||
580 | put_page(page); | ||
581 | } | ||
582 | #endif /* CONFIG_UNEVICTABLE_LRU */ | ||
583 | |||
584 | |||
585 | /* | 569 | /* |
586 | * shrink_page_list() returns the number of reclaimed pages | 570 | * shrink_page_list() returns the number of reclaimed pages |
587 | */ | 571 | */ |
@@ -593,6 +577,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
593 | struct pagevec freed_pvec; | 577 | struct pagevec freed_pvec; |
594 | int pgactivate = 0; | 578 | int pgactivate = 0; |
595 | unsigned long nr_reclaimed = 0; | 579 | unsigned long nr_reclaimed = 0; |
580 | unsigned long vm_flags; | ||
596 | 581 | ||
597 | cond_resched(); | 582 | cond_resched(); |
598 | 583 | ||
@@ -643,7 +628,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
643 | goto keep_locked; | 628 | goto keep_locked; |
644 | } | 629 | } |
645 | 630 | ||
646 | referenced = page_referenced(page, 1, sc->mem_cgroup); | 631 | referenced = page_referenced(page, 1, |
632 | sc->mem_cgroup, &vm_flags); | ||
647 | /* In active use or really unfreeable? Activate it. */ | 633 | /* In active use or really unfreeable? Activate it. */ |
648 | if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && | 634 | if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && |
649 | referenced && page_mapping_inuse(page)) | 635 | referenced && page_mapping_inuse(page)) |
@@ -851,7 +837,6 @@ int __isolate_lru_page(struct page *page, int mode, int file) | |||
851 | */ | 837 | */ |
852 | ClearPageLRU(page); | 838 | ClearPageLRU(page); |
853 | ret = 0; | 839 | ret = 0; |
854 | mem_cgroup_del_lru(page); | ||
855 | } | 840 | } |
856 | 841 | ||
857 | return ret; | 842 | return ret; |
@@ -899,12 +884,14 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
899 | switch (__isolate_lru_page(page, mode, file)) { | 884 | switch (__isolate_lru_page(page, mode, file)) { |
900 | case 0: | 885 | case 0: |
901 | list_move(&page->lru, dst); | 886 | list_move(&page->lru, dst); |
887 | mem_cgroup_del_lru(page); | ||
902 | nr_taken++; | 888 | nr_taken++; |
903 | break; | 889 | break; |
904 | 890 | ||
905 | case -EBUSY: | 891 | case -EBUSY: |
906 | /* else it is being freed elsewhere */ | 892 | /* else it is being freed elsewhere */ |
907 | list_move(&page->lru, src); | 893 | list_move(&page->lru, src); |
894 | mem_cgroup_rotate_lru_list(page, page_lru(page)); | ||
908 | continue; | 895 | continue; |
909 | 896 | ||
910 | default: | 897 | default: |
@@ -943,18 +930,11 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
943 | /* Check that we have not crossed a zone boundary. */ | 930 | /* Check that we have not crossed a zone boundary. */ |
944 | if (unlikely(page_zone_id(cursor_page) != zone_id)) | 931 | if (unlikely(page_zone_id(cursor_page) != zone_id)) |
945 | continue; | 932 | continue; |
946 | switch (__isolate_lru_page(cursor_page, mode, file)) { | 933 | if (__isolate_lru_page(cursor_page, mode, file) == 0) { |
947 | case 0: | ||
948 | list_move(&cursor_page->lru, dst); | 934 | list_move(&cursor_page->lru, dst); |
935 | mem_cgroup_del_lru(cursor_page); | ||
949 | nr_taken++; | 936 | nr_taken++; |
950 | scan++; | 937 | scan++; |
951 | break; | ||
952 | |||
953 | case -EBUSY: | ||
954 | /* else it is being freed elsewhere */ | ||
955 | list_move(&cursor_page->lru, src); | ||
956 | default: | ||
957 | break; /* ! on LRU or wrong list */ | ||
958 | } | 938 | } |
959 | } | 939 | } |
960 | } | 940 | } |
@@ -1061,6 +1041,19 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
1061 | unsigned long nr_scanned = 0; | 1041 | unsigned long nr_scanned = 0; |
1062 | unsigned long nr_reclaimed = 0; | 1042 | unsigned long nr_reclaimed = 0; |
1063 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); | 1043 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); |
1044 | int lumpy_reclaim = 0; | ||
1045 | |||
1046 | /* | ||
1047 | * If we need a large contiguous chunk of memory, or have | ||
1048 | * trouble getting a small set of contiguous pages, we | ||
1049 | * will reclaim both active and inactive pages. | ||
1050 | * | ||
1051 | * We use the same threshold as pageout congestion_wait below. | ||
1052 | */ | ||
1053 | if (sc->order > PAGE_ALLOC_COSTLY_ORDER) | ||
1054 | lumpy_reclaim = 1; | ||
1055 | else if (sc->order && priority < DEF_PRIORITY - 2) | ||
1056 | lumpy_reclaim = 1; | ||
1064 | 1057 | ||
1065 | pagevec_init(&pvec, 1); | 1058 | pagevec_init(&pvec, 1); |
1066 | 1059 | ||
@@ -1073,19 +1066,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
1073 | unsigned long nr_freed; | 1066 | unsigned long nr_freed; |
1074 | unsigned long nr_active; | 1067 | unsigned long nr_active; |
1075 | unsigned int count[NR_LRU_LISTS] = { 0, }; | 1068 | unsigned int count[NR_LRU_LISTS] = { 0, }; |
1076 | int mode = ISOLATE_INACTIVE; | 1069 | int mode = lumpy_reclaim ? ISOLATE_BOTH : ISOLATE_INACTIVE; |
1077 | |||
1078 | /* | ||
1079 | * If we need a large contiguous chunk of memory, or have | ||
1080 | * trouble getting a small set of contiguous pages, we | ||
1081 | * will reclaim both active and inactive pages. | ||
1082 | * | ||
1083 | * We use the same threshold as pageout congestion_wait below. | ||
1084 | */ | ||
1085 | if (sc->order > PAGE_ALLOC_COSTLY_ORDER) | ||
1086 | mode = ISOLATE_BOTH; | ||
1087 | else if (sc->order && priority < DEF_PRIORITY - 2) | ||
1088 | mode = ISOLATE_BOTH; | ||
1089 | 1070 | ||
1090 | nr_taken = sc->isolate_pages(sc->swap_cluster_max, | 1071 | nr_taken = sc->isolate_pages(sc->swap_cluster_max, |
1091 | &page_list, &nr_scan, sc->order, mode, | 1072 | &page_list, &nr_scan, sc->order, mode, |
@@ -1122,7 +1103,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
1122 | * but that should be acceptable to the caller | 1103 | * but that should be acceptable to the caller |
1123 | */ | 1104 | */ |
1124 | if (nr_freed < nr_taken && !current_is_kswapd() && | 1105 | if (nr_freed < nr_taken && !current_is_kswapd() && |
1125 | sc->order > PAGE_ALLOC_COSTLY_ORDER) { | 1106 | lumpy_reclaim) { |
1126 | congestion_wait(WRITE, HZ/10); | 1107 | congestion_wait(WRITE, HZ/10); |
1127 | 1108 | ||
1128 | /* | 1109 | /* |
@@ -1217,18 +1198,54 @@ static inline void note_zone_scanning_priority(struct zone *zone, int priority) | |||
1217 | * But we had to alter page->flags anyway. | 1198 | * But we had to alter page->flags anyway. |
1218 | */ | 1199 | */ |
1219 | 1200 | ||
1201 | static void move_active_pages_to_lru(struct zone *zone, | ||
1202 | struct list_head *list, | ||
1203 | enum lru_list lru) | ||
1204 | { | ||
1205 | unsigned long pgmoved = 0; | ||
1206 | struct pagevec pvec; | ||
1207 | struct page *page; | ||
1208 | |||
1209 | pagevec_init(&pvec, 1); | ||
1210 | |||
1211 | while (!list_empty(list)) { | ||
1212 | page = lru_to_page(list); | ||
1213 | prefetchw_prev_lru_page(page, list, flags); | ||
1214 | |||
1215 | VM_BUG_ON(PageLRU(page)); | ||
1216 | SetPageLRU(page); | ||
1217 | |||
1218 | VM_BUG_ON(!PageActive(page)); | ||
1219 | if (!is_active_lru(lru)) | ||
1220 | ClearPageActive(page); /* we are de-activating */ | ||
1221 | |||
1222 | list_move(&page->lru, &zone->lru[lru].list); | ||
1223 | mem_cgroup_add_lru_list(page, lru); | ||
1224 | pgmoved++; | ||
1225 | |||
1226 | if (!pagevec_add(&pvec, page) || list_empty(list)) { | ||
1227 | spin_unlock_irq(&zone->lru_lock); | ||
1228 | if (buffer_heads_over_limit) | ||
1229 | pagevec_strip(&pvec); | ||
1230 | __pagevec_release(&pvec); | ||
1231 | spin_lock_irq(&zone->lru_lock); | ||
1232 | } | ||
1233 | } | ||
1234 | __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); | ||
1235 | if (!is_active_lru(lru)) | ||
1236 | __count_vm_events(PGDEACTIVATE, pgmoved); | ||
1237 | } | ||
1220 | 1238 | ||
1221 | static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | 1239 | static void shrink_active_list(unsigned long nr_pages, struct zone *zone, |
1222 | struct scan_control *sc, int priority, int file) | 1240 | struct scan_control *sc, int priority, int file) |
1223 | { | 1241 | { |
1224 | unsigned long pgmoved; | 1242 | unsigned long pgmoved; |
1225 | int pgdeactivate = 0; | ||
1226 | unsigned long pgscanned; | 1243 | unsigned long pgscanned; |
1244 | unsigned long vm_flags; | ||
1227 | LIST_HEAD(l_hold); /* The pages which were snipped off */ | 1245 | LIST_HEAD(l_hold); /* The pages which were snipped off */ |
1246 | LIST_HEAD(l_active); | ||
1228 | LIST_HEAD(l_inactive); | 1247 | LIST_HEAD(l_inactive); |
1229 | struct page *page; | 1248 | struct page *page; |
1230 | struct pagevec pvec; | ||
1231 | enum lru_list lru; | ||
1232 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); | 1249 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); |
1233 | 1250 | ||
1234 | lru_add_drain(); | 1251 | lru_add_drain(); |
@@ -1245,13 +1262,14 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
1245 | } | 1262 | } |
1246 | reclaim_stat->recent_scanned[!!file] += pgmoved; | 1263 | reclaim_stat->recent_scanned[!!file] += pgmoved; |
1247 | 1264 | ||
1265 | __count_zone_vm_events(PGREFILL, zone, pgscanned); | ||
1248 | if (file) | 1266 | if (file) |
1249 | __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved); | 1267 | __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved); |
1250 | else | 1268 | else |
1251 | __mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved); | 1269 | __mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved); |
1252 | spin_unlock_irq(&zone->lru_lock); | 1270 | spin_unlock_irq(&zone->lru_lock); |
1253 | 1271 | ||
1254 | pgmoved = 0; | 1272 | pgmoved = 0; /* count referenced (mapping) mapped pages */ |
1255 | while (!list_empty(&l_hold)) { | 1273 | while (!list_empty(&l_hold)) { |
1256 | cond_resched(); | 1274 | cond_resched(); |
1257 | page = lru_to_page(&l_hold); | 1275 | page = lru_to_page(&l_hold); |
@@ -1264,58 +1282,44 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
1264 | 1282 | ||
1265 | /* page_referenced clears PageReferenced */ | 1283 | /* page_referenced clears PageReferenced */ |
1266 | if (page_mapping_inuse(page) && | 1284 | if (page_mapping_inuse(page) && |
1267 | page_referenced(page, 0, sc->mem_cgroup)) | 1285 | page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { |
1268 | pgmoved++; | 1286 | pgmoved++; |
1287 | /* | ||
1288 | * Identify referenced, file-backed active pages and | ||
1289 | * give them one more trip around the active list. So | ||
1290 | * that executable code gets better chances to stay in | ||
1291 | * memory under moderate memory pressure. Anon pages | ||
1292 | * are not likely to be evicted by use-once streaming | ||
1293 | * IO, plus JVM can create lots of anon VM_EXEC pages, | ||
1294 | * so we ignore them here. | ||
1295 | */ | ||
1296 | if ((vm_flags & VM_EXEC) && !PageAnon(page)) { | ||
1297 | list_add(&page->lru, &l_active); | ||
1298 | continue; | ||
1299 | } | ||
1300 | } | ||
1269 | 1301 | ||
1270 | list_add(&page->lru, &l_inactive); | 1302 | list_add(&page->lru, &l_inactive); |
1271 | } | 1303 | } |
1272 | 1304 | ||
1273 | /* | 1305 | /* |
1274 | * Move the pages to the [file or anon] inactive list. | 1306 | * Move pages back to the lru list. |
1275 | */ | 1307 | */ |
1276 | pagevec_init(&pvec, 1); | ||
1277 | lru = LRU_BASE + file * LRU_FILE; | ||
1278 | |||
1279 | spin_lock_irq(&zone->lru_lock); | 1308 | spin_lock_irq(&zone->lru_lock); |
1280 | /* | 1309 | /* |
1281 | * Count referenced pages from currently used mappings as | 1310 | * Count referenced pages from currently used mappings as rotated, |
1282 | * rotated, even though they are moved to the inactive list. | 1311 | * even though only some of them are actually re-activated. This |
1283 | * This helps balance scan pressure between file and anonymous | 1312 | * helps balance scan pressure between file and anonymous pages in |
1284 | * pages in get_scan_ratio. | 1313 | * get_scan_ratio. |
1285 | */ | 1314 | */ |
1286 | reclaim_stat->recent_rotated[!!file] += pgmoved; | 1315 | reclaim_stat->recent_rotated[!!file] += pgmoved; |
1287 | 1316 | ||
1288 | pgmoved = 0; | 1317 | move_active_pages_to_lru(zone, &l_active, |
1289 | while (!list_empty(&l_inactive)) { | 1318 | LRU_ACTIVE + file * LRU_FILE); |
1290 | page = lru_to_page(&l_inactive); | 1319 | move_active_pages_to_lru(zone, &l_inactive, |
1291 | prefetchw_prev_lru_page(page, &l_inactive, flags); | 1320 | LRU_BASE + file * LRU_FILE); |
1292 | VM_BUG_ON(PageLRU(page)); | ||
1293 | SetPageLRU(page); | ||
1294 | VM_BUG_ON(!PageActive(page)); | ||
1295 | ClearPageActive(page); | ||
1296 | 1321 | ||
1297 | list_move(&page->lru, &zone->lru[lru].list); | ||
1298 | mem_cgroup_add_lru_list(page, lru); | ||
1299 | pgmoved++; | ||
1300 | if (!pagevec_add(&pvec, page)) { | ||
1301 | __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); | ||
1302 | spin_unlock_irq(&zone->lru_lock); | ||
1303 | pgdeactivate += pgmoved; | ||
1304 | pgmoved = 0; | ||
1305 | if (buffer_heads_over_limit) | ||
1306 | pagevec_strip(&pvec); | ||
1307 | __pagevec_release(&pvec); | ||
1308 | spin_lock_irq(&zone->lru_lock); | ||
1309 | } | ||
1310 | } | ||
1311 | __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); | ||
1312 | pgdeactivate += pgmoved; | ||
1313 | __count_zone_vm_events(PGREFILL, zone, pgscanned); | ||
1314 | __count_vm_events(PGDEACTIVATE, pgdeactivate); | ||
1315 | spin_unlock_irq(&zone->lru_lock); | 1322 | spin_unlock_irq(&zone->lru_lock); |
1316 | if (buffer_heads_over_limit) | ||
1317 | pagevec_strip(&pvec); | ||
1318 | pagevec_release(&pvec); | ||
1319 | } | 1323 | } |
1320 | 1324 | ||
1321 | static int inactive_anon_is_low_global(struct zone *zone) | 1325 | static int inactive_anon_is_low_global(struct zone *zone) |
@@ -1350,12 +1354,48 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc) | |||
1350 | return low; | 1354 | return low; |
1351 | } | 1355 | } |
1352 | 1356 | ||
1357 | static int inactive_file_is_low_global(struct zone *zone) | ||
1358 | { | ||
1359 | unsigned long active, inactive; | ||
1360 | |||
1361 | active = zone_page_state(zone, NR_ACTIVE_FILE); | ||
1362 | inactive = zone_page_state(zone, NR_INACTIVE_FILE); | ||
1363 | |||
1364 | return (active > inactive); | ||
1365 | } | ||
1366 | |||
1367 | /** | ||
1368 | * inactive_file_is_low - check if file pages need to be deactivated | ||
1369 | * @zone: zone to check | ||
1370 | * @sc: scan control of this context | ||
1371 | * | ||
1372 | * When the system is doing streaming IO, memory pressure here | ||
1373 | * ensures that active file pages get deactivated, until more | ||
1374 | * than half of the file pages are on the inactive list. | ||
1375 | * | ||
1376 | * Once we get to that situation, protect the system's working | ||
1377 | * set from being evicted by disabling active file page aging. | ||
1378 | * | ||
1379 | * This uses a different ratio than the anonymous pages, because | ||
1380 | * the page cache uses a use-once replacement algorithm. | ||
1381 | */ | ||
1382 | static int inactive_file_is_low(struct zone *zone, struct scan_control *sc) | ||
1383 | { | ||
1384 | int low; | ||
1385 | |||
1386 | if (scanning_global_lru(sc)) | ||
1387 | low = inactive_file_is_low_global(zone); | ||
1388 | else | ||
1389 | low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup); | ||
1390 | return low; | ||
1391 | } | ||
1392 | |||
1353 | static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, | 1393 | static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, |
1354 | struct zone *zone, struct scan_control *sc, int priority) | 1394 | struct zone *zone, struct scan_control *sc, int priority) |
1355 | { | 1395 | { |
1356 | int file = is_file_lru(lru); | 1396 | int file = is_file_lru(lru); |
1357 | 1397 | ||
1358 | if (lru == LRU_ACTIVE_FILE) { | 1398 | if (lru == LRU_ACTIVE_FILE && inactive_file_is_low(zone, sc)) { |
1359 | shrink_active_list(nr_to_scan, zone, sc, priority, file); | 1399 | shrink_active_list(nr_to_scan, zone, sc, priority, file); |
1360 | return 0; | 1400 | return 0; |
1361 | } | 1401 | } |
@@ -1384,13 +1424,6 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, | |||
1384 | unsigned long ap, fp; | 1424 | unsigned long ap, fp; |
1385 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); | 1425 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); |
1386 | 1426 | ||
1387 | /* If we have no swap space, do not bother scanning anon pages. */ | ||
1388 | if (!sc->may_swap || (nr_swap_pages <= 0)) { | ||
1389 | percent[0] = 0; | ||
1390 | percent[1] = 100; | ||
1391 | return; | ||
1392 | } | ||
1393 | |||
1394 | anon = zone_nr_pages(zone, sc, LRU_ACTIVE_ANON) + | 1427 | anon = zone_nr_pages(zone, sc, LRU_ACTIVE_ANON) + |
1395 | zone_nr_pages(zone, sc, LRU_INACTIVE_ANON); | 1428 | zone_nr_pages(zone, sc, LRU_INACTIVE_ANON); |
1396 | file = zone_nr_pages(zone, sc, LRU_ACTIVE_FILE) + | 1429 | file = zone_nr_pages(zone, sc, LRU_ACTIVE_FILE) + |
@@ -1400,7 +1433,7 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, | |||
1400 | free = zone_page_state(zone, NR_FREE_PAGES); | 1433 | free = zone_page_state(zone, NR_FREE_PAGES); |
1401 | /* If we have very few page cache pages, | 1434 | /* If we have very few page cache pages, |
1402 | force-scan anon pages. */ | 1435 | force-scan anon pages. */ |
1403 | if (unlikely(file + free <= zone->pages_high)) { | 1436 | if (unlikely(file + free <= high_wmark_pages(zone))) { |
1404 | percent[0] = 100; | 1437 | percent[0] = 100; |
1405 | percent[1] = 0; | 1438 | percent[1] = 0; |
1406 | return; | 1439 | return; |
@@ -1455,6 +1488,26 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, | |||
1455 | percent[1] = 100 - percent[0]; | 1488 | percent[1] = 100 - percent[0]; |
1456 | } | 1489 | } |
1457 | 1490 | ||
1491 | /* | ||
1492 | * Smallish @nr_to_scan's are deposited in @nr_saved_scan, | ||
1493 | * until we have collected @swap_cluster_max pages to scan. | ||
1494 | */ | ||
1495 | static unsigned long nr_scan_try_batch(unsigned long nr_to_scan, | ||
1496 | unsigned long *nr_saved_scan, | ||
1497 | unsigned long swap_cluster_max) | ||
1498 | { | ||
1499 | unsigned long nr; | ||
1500 | |||
1501 | *nr_saved_scan += nr_to_scan; | ||
1502 | nr = *nr_saved_scan; | ||
1503 | |||
1504 | if (nr >= swap_cluster_max) | ||
1505 | *nr_saved_scan = 0; | ||
1506 | else | ||
1507 | nr = 0; | ||
1508 | |||
1509 | return nr; | ||
1510 | } | ||
1458 | 1511 | ||
1459 | /* | 1512 | /* |
1460 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. | 1513 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. |
@@ -1468,26 +1521,30 @@ static void shrink_zone(int priority, struct zone *zone, | |||
1468 | enum lru_list l; | 1521 | enum lru_list l; |
1469 | unsigned long nr_reclaimed = sc->nr_reclaimed; | 1522 | unsigned long nr_reclaimed = sc->nr_reclaimed; |
1470 | unsigned long swap_cluster_max = sc->swap_cluster_max; | 1523 | unsigned long swap_cluster_max = sc->swap_cluster_max; |
1524 | int noswap = 0; | ||
1471 | 1525 | ||
1472 | get_scan_ratio(zone, sc, percent); | 1526 | /* If we have no swap space, do not bother scanning anon pages. */ |
1527 | if (!sc->may_swap || (nr_swap_pages <= 0)) { | ||
1528 | noswap = 1; | ||
1529 | percent[0] = 0; | ||
1530 | percent[1] = 100; | ||
1531 | } else | ||
1532 | get_scan_ratio(zone, sc, percent); | ||
1473 | 1533 | ||
1474 | for_each_evictable_lru(l) { | 1534 | for_each_evictable_lru(l) { |
1475 | int file = is_file_lru(l); | 1535 | int file = is_file_lru(l); |
1476 | unsigned long scan; | 1536 | unsigned long scan; |
1477 | 1537 | ||
1478 | scan = zone_nr_pages(zone, sc, l); | 1538 | scan = zone_nr_pages(zone, sc, l); |
1479 | if (priority) { | 1539 | if (priority || noswap) { |
1480 | scan >>= priority; | 1540 | scan >>= priority; |
1481 | scan = (scan * percent[file]) / 100; | 1541 | scan = (scan * percent[file]) / 100; |
1482 | } | 1542 | } |
1483 | if (scanning_global_lru(sc)) { | 1543 | if (scanning_global_lru(sc)) |
1484 | zone->lru[l].nr_scan += scan; | 1544 | nr[l] = nr_scan_try_batch(scan, |
1485 | nr[l] = zone->lru[l].nr_scan; | 1545 | &zone->lru[l].nr_saved_scan, |
1486 | if (nr[l] >= swap_cluster_max) | 1546 | swap_cluster_max); |
1487 | zone->lru[l].nr_scan = 0; | 1547 | else |
1488 | else | ||
1489 | nr[l] = 0; | ||
1490 | } else | ||
1491 | nr[l] = scan; | 1548 | nr[l] = scan; |
1492 | } | 1549 | } |
1493 | 1550 | ||
@@ -1521,7 +1578,7 @@ static void shrink_zone(int priority, struct zone *zone, | |||
1521 | * Even if we did not try to evict anon pages at all, we want to | 1578 | * Even if we did not try to evict anon pages at all, we want to |
1522 | * rebalance the anon lru active/inactive ratio. | 1579 | * rebalance the anon lru active/inactive ratio. |
1523 | */ | 1580 | */ |
1524 | if (inactive_anon_is_low(zone, sc)) | 1581 | if (inactive_anon_is_low(zone, sc) && nr_swap_pages > 0) |
1525 | shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); | 1582 | shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); |
1526 | 1583 | ||
1527 | throttle_vm_writeout(sc->gfp_mask); | 1584 | throttle_vm_writeout(sc->gfp_mask); |
@@ -1532,11 +1589,13 @@ static void shrink_zone(int priority, struct zone *zone, | |||
1532 | * try to reclaim pages from zones which will satisfy the caller's allocation | 1589 | * try to reclaim pages from zones which will satisfy the caller's allocation |
1533 | * request. | 1590 | * request. |
1534 | * | 1591 | * |
1535 | * We reclaim from a zone even if that zone is over pages_high. Because: | 1592 | * We reclaim from a zone even if that zone is over high_wmark_pages(zone). |
1593 | * Because: | ||
1536 | * a) The caller may be trying to free *extra* pages to satisfy a higher-order | 1594 | * a) The caller may be trying to free *extra* pages to satisfy a higher-order |
1537 | * allocation or | 1595 | * allocation or |
1538 | * b) The zones may be over pages_high but they must go *over* pages_high to | 1596 | * b) The target zone may be at high_wmark_pages(zone) but the lower zones |
1539 | * satisfy the `incremental min' zone defense algorithm. | 1597 | * must go *over* high_wmark_pages(zone) to satisfy the `incremental min' |
1598 | * zone defense algorithm. | ||
1540 | * | 1599 | * |
1541 | * If a zone is deemed to be full of pinned pages then just give it a light | 1600 | * If a zone is deemed to be full of pinned pages then just give it a light |
1542 | * scan then give up on it. | 1601 | * scan then give up on it. |
@@ -1742,7 +1801,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | |||
1742 | 1801 | ||
1743 | /* | 1802 | /* |
1744 | * For kswapd, balance_pgdat() will work across all this node's zones until | 1803 | * For kswapd, balance_pgdat() will work across all this node's zones until |
1745 | * they are all at pages_high. | 1804 | * they are all at high_wmark_pages(zone). |
1746 | * | 1805 | * |
1747 | * Returns the number of pages which were actually freed. | 1806 | * Returns the number of pages which were actually freed. |
1748 | * | 1807 | * |
@@ -1755,11 +1814,11 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | |||
1755 | * the zone for when the problem goes away. | 1814 | * the zone for when the problem goes away. |
1756 | * | 1815 | * |
1757 | * kswapd scans the zones in the highmem->normal->dma direction. It skips | 1816 | * kswapd scans the zones in the highmem->normal->dma direction. It skips |
1758 | * zones which have free_pages > pages_high, but once a zone is found to have | 1817 | * zones which have free_pages > high_wmark_pages(zone), but once a zone is |
1759 | * free_pages <= pages_high, we scan that zone and the lower zones regardless | 1818 | * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the |
1760 | * of the number of free pages in the lower zones. This interoperates with | 1819 | * lower zones regardless of the number of free pages in the lower zones. This |
1761 | * the page allocator fallback scheme to ensure that aging of pages is balanced | 1820 | * interoperates with the page allocator fallback scheme to ensure that aging |
1762 | * across the zones. | 1821 | * of pages is balanced across the zones. |
1763 | */ | 1822 | */ |
1764 | static unsigned long balance_pgdat(pg_data_t *pgdat, int order) | 1823 | static unsigned long balance_pgdat(pg_data_t *pgdat, int order) |
1765 | { | 1824 | { |
@@ -1780,7 +1839,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) | |||
1780 | }; | 1839 | }; |
1781 | /* | 1840 | /* |
1782 | * temp_priority is used to remember the scanning priority at which | 1841 | * temp_priority is used to remember the scanning priority at which |
1783 | * this zone was successfully refilled to free_pages == pages_high. | 1842 | * this zone was successfully refilled to |
1843 | * free_pages == high_wmark_pages(zone). | ||
1784 | */ | 1844 | */ |
1785 | int temp_priority[MAX_NR_ZONES]; | 1845 | int temp_priority[MAX_NR_ZONES]; |
1786 | 1846 | ||
@@ -1825,8 +1885,8 @@ loop_again: | |||
1825 | shrink_active_list(SWAP_CLUSTER_MAX, zone, | 1885 | shrink_active_list(SWAP_CLUSTER_MAX, zone, |
1826 | &sc, priority, 0); | 1886 | &sc, priority, 0); |
1827 | 1887 | ||
1828 | if (!zone_watermark_ok(zone, order, zone->pages_high, | 1888 | if (!zone_watermark_ok(zone, order, |
1829 | 0, 0)) { | 1889 | high_wmark_pages(zone), 0, 0)) { |
1830 | end_zone = i; | 1890 | end_zone = i; |
1831 | break; | 1891 | break; |
1832 | } | 1892 | } |
@@ -1860,8 +1920,8 @@ loop_again: | |||
1860 | priority != DEF_PRIORITY) | 1920 | priority != DEF_PRIORITY) |
1861 | continue; | 1921 | continue; |
1862 | 1922 | ||
1863 | if (!zone_watermark_ok(zone, order, zone->pages_high, | 1923 | if (!zone_watermark_ok(zone, order, |
1864 | end_zone, 0)) | 1924 | high_wmark_pages(zone), end_zone, 0)) |
1865 | all_zones_ok = 0; | 1925 | all_zones_ok = 0; |
1866 | temp_priority[i] = priority; | 1926 | temp_priority[i] = priority; |
1867 | sc.nr_scanned = 0; | 1927 | sc.nr_scanned = 0; |
@@ -1870,8 +1930,8 @@ loop_again: | |||
1870 | * We put equal pressure on every zone, unless one | 1930 | * We put equal pressure on every zone, unless one |
1871 | * zone has way too many pages free already. | 1931 | * zone has way too many pages free already. |
1872 | */ | 1932 | */ |
1873 | if (!zone_watermark_ok(zone, order, 8*zone->pages_high, | 1933 | if (!zone_watermark_ok(zone, order, |
1874 | end_zone, 0)) | 1934 | 8*high_wmark_pages(zone), end_zone, 0)) |
1875 | shrink_zone(priority, zone, &sc); | 1935 | shrink_zone(priority, zone, &sc); |
1876 | reclaim_state->reclaimed_slab = 0; | 1936 | reclaim_state->reclaimed_slab = 0; |
1877 | nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, | 1937 | nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, |
@@ -2037,7 +2097,7 @@ void wakeup_kswapd(struct zone *zone, int order) | |||
2037 | return; | 2097 | return; |
2038 | 2098 | ||
2039 | pgdat = zone->zone_pgdat; | 2099 | pgdat = zone->zone_pgdat; |
2040 | if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0)) | 2100 | if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) |
2041 | return; | 2101 | return; |
2042 | if (pgdat->kswapd_max_order < order) | 2102 | if (pgdat->kswapd_max_order < order) |
2043 | pgdat->kswapd_max_order = order; | 2103 | pgdat->kswapd_max_order = order; |
@@ -2056,7 +2116,7 @@ unsigned long global_lru_pages(void) | |||
2056 | + global_page_state(NR_INACTIVE_FILE); | 2116 | + global_page_state(NR_INACTIVE_FILE); |
2057 | } | 2117 | } |
2058 | 2118 | ||
2059 | #ifdef CONFIG_PM | 2119 | #ifdef CONFIG_HIBERNATION |
2060 | /* | 2120 | /* |
2061 | * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages | 2121 | * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages |
2062 | * from LRU lists system-wide, for given pass and priority. | 2122 | * from LRU lists system-wide, for given pass and priority. |
@@ -2084,11 +2144,11 @@ static void shrink_all_zones(unsigned long nr_pages, int prio, | |||
2084 | l == LRU_ACTIVE_FILE)) | 2144 | l == LRU_ACTIVE_FILE)) |
2085 | continue; | 2145 | continue; |
2086 | 2146 | ||
2087 | zone->lru[l].nr_scan += (lru_pages >> prio) + 1; | 2147 | zone->lru[l].nr_saved_scan += (lru_pages >> prio) + 1; |
2088 | if (zone->lru[l].nr_scan >= nr_pages || pass > 3) { | 2148 | if (zone->lru[l].nr_saved_scan >= nr_pages || pass > 3) { |
2089 | unsigned long nr_to_scan; | 2149 | unsigned long nr_to_scan; |
2090 | 2150 | ||
2091 | zone->lru[l].nr_scan = 0; | 2151 | zone->lru[l].nr_saved_scan = 0; |
2092 | nr_to_scan = min(nr_pages, lru_pages); | 2152 | nr_to_scan = min(nr_pages, lru_pages); |
2093 | nr_reclaimed += shrink_list(l, nr_to_scan, zone, | 2153 | nr_reclaimed += shrink_list(l, nr_to_scan, zone, |
2094 | sc, prio); | 2154 | sc, prio); |
@@ -2196,7 +2256,7 @@ out: | |||
2196 | 2256 | ||
2197 | return sc.nr_reclaimed; | 2257 | return sc.nr_reclaimed; |
2198 | } | 2258 | } |
2199 | #endif | 2259 | #endif /* CONFIG_HIBERNATION */ |
2200 | 2260 | ||
2201 | /* It's optimal to keep kswapds on the same CPUs as their memory, but | 2261 | /* It's optimal to keep kswapds on the same CPUs as their memory, but |
2202 | not required for correctness. So if the last cpu in a node goes | 2262 | not required for correctness. So if the last cpu in a node goes |
@@ -2290,6 +2350,48 @@ int sysctl_min_unmapped_ratio = 1; | |||
2290 | */ | 2350 | */ |
2291 | int sysctl_min_slab_ratio = 5; | 2351 | int sysctl_min_slab_ratio = 5; |
2292 | 2352 | ||
2353 | static inline unsigned long zone_unmapped_file_pages(struct zone *zone) | ||
2354 | { | ||
2355 | unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED); | ||
2356 | unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) + | ||
2357 | zone_page_state(zone, NR_ACTIVE_FILE); | ||
2358 | |||
2359 | /* | ||
2360 | * It's possible for there to be more file mapped pages than | ||
2361 | * accounted for by the pages on the file LRU lists because | ||
2362 | * tmpfs pages accounted for as ANON can also be FILE_MAPPED | ||
2363 | */ | ||
2364 | return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0; | ||
2365 | } | ||
2366 | |||
2367 | /* Work out how many page cache pages we can reclaim in this reclaim_mode */ | ||
2368 | static long zone_pagecache_reclaimable(struct zone *zone) | ||
2369 | { | ||
2370 | long nr_pagecache_reclaimable; | ||
2371 | long delta = 0; | ||
2372 | |||
2373 | /* | ||
2374 | * If RECLAIM_SWAP is set, then all file pages are considered | ||
2375 | * potentially reclaimable. Otherwise, we have to worry about | ||
2376 | * pages like swapcache, and zone_unmapped_file_pages() provides | ||
2377 | * a better estimate. | ||
2378 | */ | ||
2379 | if (zone_reclaim_mode & RECLAIM_SWAP) | ||
2380 | nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES); | ||
2381 | else | ||
2382 | nr_pagecache_reclaimable = zone_unmapped_file_pages(zone); | ||
2383 | |||
2384 | /* If we can't clean pages, remove dirty pages from consideration */ | ||
2385 | if (!(zone_reclaim_mode & RECLAIM_WRITE)) | ||
2386 | delta += zone_page_state(zone, NR_FILE_DIRTY); | ||
2387 | |||
2388 | /* Watch for any possible underflows due to delta */ | ||
2389 | if (unlikely(delta > nr_pagecache_reclaimable)) | ||
2390 | delta = nr_pagecache_reclaimable; | ||
2391 | |||
2392 | return nr_pagecache_reclaimable - delta; | ||
2393 | } | ||
2394 | |||
2293 | /* | 2395 | /* |
2294 | * Try to free up some pages from this zone through reclaim. | 2396 | * Try to free up some pages from this zone through reclaim. |
2295 | */ | 2397 | */ |
@@ -2324,9 +2426,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
2324 | reclaim_state.reclaimed_slab = 0; | 2426 | reclaim_state.reclaimed_slab = 0; |
2325 | p->reclaim_state = &reclaim_state; | 2427 | p->reclaim_state = &reclaim_state; |
2326 | 2428 | ||
2327 | if (zone_page_state(zone, NR_FILE_PAGES) - | 2429 | if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) { |
2328 | zone_page_state(zone, NR_FILE_MAPPED) > | ||
2329 | zone->min_unmapped_pages) { | ||
2330 | /* | 2430 | /* |
2331 | * Free memory by calling shrink zone with increasing | 2431 | * Free memory by calling shrink zone with increasing |
2332 | * priorities until we have enough memory freed. | 2432 | * priorities until we have enough memory freed. |
@@ -2384,20 +2484,18 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
2384 | * if less than a specified percentage of the zone is used by | 2484 | * if less than a specified percentage of the zone is used by |
2385 | * unmapped file backed pages. | 2485 | * unmapped file backed pages. |
2386 | */ | 2486 | */ |
2387 | if (zone_page_state(zone, NR_FILE_PAGES) - | 2487 | if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages && |
2388 | zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages | 2488 | zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) |
2389 | && zone_page_state(zone, NR_SLAB_RECLAIMABLE) | 2489 | return ZONE_RECLAIM_FULL; |
2390 | <= zone->min_slab_pages) | ||
2391 | return 0; | ||
2392 | 2490 | ||
2393 | if (zone_is_all_unreclaimable(zone)) | 2491 | if (zone_is_all_unreclaimable(zone)) |
2394 | return 0; | 2492 | return ZONE_RECLAIM_FULL; |
2395 | 2493 | ||
2396 | /* | 2494 | /* |
2397 | * Do not scan if the allocation should not be delayed. | 2495 | * Do not scan if the allocation should not be delayed. |
2398 | */ | 2496 | */ |
2399 | if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC)) | 2497 | if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC)) |
2400 | return 0; | 2498 | return ZONE_RECLAIM_NOSCAN; |
2401 | 2499 | ||
2402 | /* | 2500 | /* |
2403 | * Only run zone reclaim on the local zone or on zones that do not | 2501 | * Only run zone reclaim on the local zone or on zones that do not |
@@ -2407,18 +2505,21 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
2407 | */ | 2505 | */ |
2408 | node_id = zone_to_nid(zone); | 2506 | node_id = zone_to_nid(zone); |
2409 | if (node_state(node_id, N_CPU) && node_id != numa_node_id()) | 2507 | if (node_state(node_id, N_CPU) && node_id != numa_node_id()) |
2410 | return 0; | 2508 | return ZONE_RECLAIM_NOSCAN; |
2411 | 2509 | ||
2412 | if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED)) | 2510 | if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED)) |
2413 | return 0; | 2511 | return ZONE_RECLAIM_NOSCAN; |
2512 | |||
2414 | ret = __zone_reclaim(zone, gfp_mask, order); | 2513 | ret = __zone_reclaim(zone, gfp_mask, order); |
2415 | zone_clear_flag(zone, ZONE_RECLAIM_LOCKED); | 2514 | zone_clear_flag(zone, ZONE_RECLAIM_LOCKED); |
2416 | 2515 | ||
2516 | if (!ret) | ||
2517 | count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED); | ||
2518 | |||
2417 | return ret; | 2519 | return ret; |
2418 | } | 2520 | } |
2419 | #endif | 2521 | #endif |
2420 | 2522 | ||
2421 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
2422 | /* | 2523 | /* |
2423 | * page_evictable - test whether a page is evictable | 2524 | * page_evictable - test whether a page is evictable |
2424 | * @page: the page to test | 2525 | * @page: the page to test |
@@ -2665,4 +2766,3 @@ void scan_unevictable_unregister_node(struct node *node) | |||
2665 | sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages); | 2766 | sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages); |
2666 | } | 2767 | } |
2667 | 2768 | ||
2668 | #endif | ||
diff --git a/mm/vmstat.c b/mm/vmstat.c index 74d66dba0cbe..138bed53706e 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -629,10 +629,8 @@ static const char * const vmstat_text[] = { | |||
629 | "nr_active_anon", | 629 | "nr_active_anon", |
630 | "nr_inactive_file", | 630 | "nr_inactive_file", |
631 | "nr_active_file", | 631 | "nr_active_file", |
632 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
633 | "nr_unevictable", | 632 | "nr_unevictable", |
634 | "nr_mlock", | 633 | "nr_mlock", |
635 | #endif | ||
636 | "nr_anon_pages", | 634 | "nr_anon_pages", |
637 | "nr_mapped", | 635 | "nr_mapped", |
638 | "nr_file_pages", | 636 | "nr_file_pages", |
@@ -675,6 +673,9 @@ static const char * const vmstat_text[] = { | |||
675 | TEXTS_FOR_ZONES("pgscan_kswapd") | 673 | TEXTS_FOR_ZONES("pgscan_kswapd") |
676 | TEXTS_FOR_ZONES("pgscan_direct") | 674 | TEXTS_FOR_ZONES("pgscan_direct") |
677 | 675 | ||
676 | #ifdef CONFIG_NUMA | ||
677 | "zone_reclaim_failed", | ||
678 | #endif | ||
678 | "pginodesteal", | 679 | "pginodesteal", |
679 | "slabs_scanned", | 680 | "slabs_scanned", |
680 | "kswapd_steal", | 681 | "kswapd_steal", |
@@ -687,7 +688,6 @@ static const char * const vmstat_text[] = { | |||
687 | "htlb_buddy_alloc_success", | 688 | "htlb_buddy_alloc_success", |
688 | "htlb_buddy_alloc_fail", | 689 | "htlb_buddy_alloc_fail", |
689 | #endif | 690 | #endif |
690 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
691 | "unevictable_pgs_culled", | 691 | "unevictable_pgs_culled", |
692 | "unevictable_pgs_scanned", | 692 | "unevictable_pgs_scanned", |
693 | "unevictable_pgs_rescued", | 693 | "unevictable_pgs_rescued", |
@@ -697,7 +697,6 @@ static const char * const vmstat_text[] = { | |||
697 | "unevictable_pgs_stranded", | 697 | "unevictable_pgs_stranded", |
698 | "unevictable_pgs_mlockfreed", | 698 | "unevictable_pgs_mlockfreed", |
699 | #endif | 699 | #endif |
700 | #endif | ||
701 | }; | 700 | }; |
702 | 701 | ||
703 | static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, | 702 | static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, |
@@ -710,18 +709,14 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, | |||
710 | "\n min %lu" | 709 | "\n min %lu" |
711 | "\n low %lu" | 710 | "\n low %lu" |
712 | "\n high %lu" | 711 | "\n high %lu" |
713 | "\n scanned %lu (aa: %lu ia: %lu af: %lu if: %lu)" | 712 | "\n scanned %lu" |
714 | "\n spanned %lu" | 713 | "\n spanned %lu" |
715 | "\n present %lu", | 714 | "\n present %lu", |
716 | zone_page_state(zone, NR_FREE_PAGES), | 715 | zone_page_state(zone, NR_FREE_PAGES), |
717 | zone->pages_min, | 716 | min_wmark_pages(zone), |
718 | zone->pages_low, | 717 | low_wmark_pages(zone), |
719 | zone->pages_high, | 718 | high_wmark_pages(zone), |
720 | zone->pages_scanned, | 719 | zone->pages_scanned, |
721 | zone->lru[LRU_ACTIVE_ANON].nr_scan, | ||
722 | zone->lru[LRU_INACTIVE_ANON].nr_scan, | ||
723 | zone->lru[LRU_ACTIVE_FILE].nr_scan, | ||
724 | zone->lru[LRU_INACTIVE_FILE].nr_scan, | ||
725 | zone->spanned_pages, | 720 | zone->spanned_pages, |
726 | zone->present_pages); | 721 | zone->present_pages); |
727 | 722 | ||