author		Linus Torvalds <torvalds@linux-foundation.org>	2014-04-07 19:38:06 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2014-04-07 19:38:06 -0400
commit		26c12d93348f0bda0756aff83f4867d9ae58a5a6 (patch)
tree		65221f6837c66a9260c5c973e5fb908b10e0d504 /mm
parent		dc5ed40686a4da95881c35d913b60f867755cbe2 (diff)
parent		fdc5813fbbd484a54c88477f91a78934cda8bb32 (diff)
Merge branch 'akpm' (incoming from Andrew)
Merge second patch-bomb from Andrew Morton:
- the rest of MM
- zram updates
- zswap updates
- exit
- procfs
- exec
- wait
- crash dump
- lib/idr
- rapidio
- adfs, affs, bfs, ufs
- cris
- Kconfig things
- initramfs
- small amount of IPC material
- percpu enhancements
- early ioremap support
- various other misc things
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (156 commits)
MAINTAINERS: update Intel C600 SAS driver maintainers
fs/ufs: remove unused ufs_super_block_third pointer
fs/ufs: remove unused ufs_super_block_second pointer
fs/ufs: remove unused ufs_super_block_first pointer
fs/ufs/super.c: add __init to init_inodecache()
doc/kernel-parameters.txt: add early_ioremap_debug
arm64: add early_ioremap support
arm64: initialize pgprot info earlier in boot
x86: use generic early_ioremap
mm: create generic early_ioremap() support
x86/mm: sparse warning fix for early_memremap
lglock: map to spinlock when !CONFIG_SMP
percpu: add preemption checks to __this_cpu ops
vmstat: use raw_cpu_ops to avoid false positives on preemption checks
slub: use raw_cpu_inc for incrementing statistics
net: replace __this_cpu_inc in route.c with raw_cpu_inc
modules: use raw_cpu_write for initialization of per cpu refcount.
mm: use raw_cpu ops for determining current NUMA node
percpu: add raw_cpu_ops
slub: fix leak of 'name' in sysfs_slab_add
...
Diffstat (limited to 'mm')
 mm/Kconfig          |   4
 mm/Makefile         |   3
 mm/compaction.c     |  84
 mm/early_ioremap.c  | 245
 mm/filemap.c        |  86
 mm/huge_memory.c    |  21
 mm/hugetlb.c        |  14
 mm/internal.h       |  16
 mm/memblock.c       |  28
 mm/memcontrol.c     | 453
 mm/memory.c         | 147
 mm/mempolicy.c      |  46
 mm/mempool.c        |   4
 mm/mlock.c          |   2
 mm/mmap.c           |  55
 mm/mprotect.c       |  56
 mm/nommu.c          |  49
 mm/page-writeback.c |   4
 mm/page_alloc.c     | 118
 mm/readahead.c      |  21
 mm/rmap.c           |  14
 mm/shmem.c          |   7
 mm/slab.c           |   8
 mm/slab.h           |  21
 mm/slab_common.c    | 250
 mm/slub.c           |  87
 mm/sparse.c         |   4
 mm/util.c           |   5
 mm/vmacache.c       | 112
 mm/vmalloc.c        |  10
 mm/vmscan.c         |  12
 mm/zswap.c          |  78
 32 files changed, 1342 insertions, 722 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 2888024e0b0a..ebe5880c29d6 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -216,6 +216,7 @@ config PAGEFLAGS_EXTENDED
 #
 config SPLIT_PTLOCK_CPUS
 	int
+	default "999999" if !MMU
 	default "999999" if ARM && !CPU_CACHE_VIPT
 	default "999999" if PARISC && !PA20
 	default "4"
@@ -577,3 +578,6 @@ config PGTABLE_MAPPING

 	  You can check speed with zsmalloc benchmark:
 	  https://github.com/spartacus06/zsmapbench
+
+config GENERIC_EARLY_IOREMAP
+	bool
diff --git a/mm/Makefile b/mm/Makefile
index cdd741519ee0..9e5aaf92197d 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -16,7 +16,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
 	   readahead.o swap.o truncate.o vmscan.o shmem.o \
 	   util.o mmzone.o vmstat.o backing-dev.o \
 	   mm_init.o mmu_context.o percpu.o slab_common.o \
-	   compaction.o balloon_compaction.o \
+	   compaction.o balloon_compaction.o vmacache.o \
 	   interval_tree.o list_lru.o workingset.o $(mmu-y)

 obj-y += init-mm.o
@@ -61,3 +61,4 @@ obj-$(CONFIG_CLEANCACHE) += cleancache.o
 obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
 obj-$(CONFIG_ZBUD) += zbud.o
 obj-$(CONFIG_ZSMALLOC) += zsmalloc.o
+obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
diff --git a/mm/compaction.c b/mm/compaction.c
index b6ab77160068..37f976287068 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -217,21 +217,12 @@ static inline bool compact_trylock_irqsave(spinlock_t *lock,
 /* Returns true if the page is within a block suitable for migration to */
 static bool suitable_migration_target(struct page *page)
 {
-	int migratetype = get_pageblock_migratetype(page);
-
-	/* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
-	if (migratetype == MIGRATE_RESERVE)
-		return false;
-
-	if (is_migrate_isolate(migratetype))
-		return false;
-
-	/* If the page is a large free page, then allow migration */
+	/* If the page is a large free page, then disallow migration */
 	if (PageBuddy(page) && page_order(page) >= pageblock_order)
-		return true;
+		return false;

 	/* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
-	if (migrate_async_suitable(migratetype))
+	if (migrate_async_suitable(get_pageblock_migratetype(page)))
 		return true;

 	/* Otherwise skip the block */
@@ -253,6 +244,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
 	struct page *cursor, *valid_page = NULL;
 	unsigned long flags;
 	bool locked = false;
+	bool checked_pageblock = false;

 	cursor = pfn_to_page(blockpfn);

@@ -284,8 +276,16 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
 			break;

 		/* Recheck this is a suitable migration target under lock */
-		if (!strict && !suitable_migration_target(page))
-			break;
+		if (!strict && !checked_pageblock) {
+			/*
+			 * We need to check suitability of pageblock only once
+			 * and this isolate_freepages_block() is called with
+			 * pageblock range, so just check once is sufficient.
+			 */
+			checked_pageblock = true;
+			if (!suitable_migration_target(page))
+				break;
+		}

 		/* Recheck this is a buddy page under lock */
 		if (!PageBuddy(page))
@@ -460,12 +460,13 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 	unsigned long last_pageblock_nr = 0, pageblock_nr;
 	unsigned long nr_scanned = 0, nr_isolated = 0;
 	struct list_head *migratelist = &cc->migratepages;
-	isolate_mode_t mode = 0;
 	struct lruvec *lruvec;
 	unsigned long flags;
 	bool locked = false;
 	struct page *page = NULL, *valid_page = NULL;
 	bool skipped_async_unsuitable = false;
+	const isolate_mode_t mode = (!cc->sync ? ISOLATE_ASYNC_MIGRATE : 0) |
+				    (unevictable ? ISOLATE_UNEVICTABLE : 0);

 	/*
 	 * Ensure that there are not too many pages isolated from the LRU
@@ -487,7 +488,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 	cond_resched();
 	for (; low_pfn < end_pfn; low_pfn++) {
 		/* give a chance to irqs before checking need_resched() */
-		if (locked && !((low_pfn+1) % SWAP_CLUSTER_MAX)) {
+		if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) {
 			if (should_release_lock(&zone->lru_lock)) {
 				spin_unlock_irqrestore(&zone->lru_lock, flags);
 				locked = false;
@@ -526,8 +527,25 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,

 		/* If isolation recently failed, do not retry */
 		pageblock_nr = low_pfn >> pageblock_order;
-		if (!isolation_suitable(cc, page))
-			goto next_pageblock;
+		if (last_pageblock_nr != pageblock_nr) {
+			int mt;
+
+			last_pageblock_nr = pageblock_nr;
+			if (!isolation_suitable(cc, page))
+				goto next_pageblock;
+
+			/*
+			 * For async migration, also only scan in MOVABLE
+			 * blocks. Async migration is optimistic to see if
+			 * the minimum amount of work satisfies the allocation
+			 */
+			mt = get_pageblock_migratetype(page);
+			if (!cc->sync && !migrate_async_suitable(mt)) {
+				cc->finished_update_migrate = true;
+				skipped_async_unsuitable = true;
+				goto next_pageblock;
+			}
+		}

 		/*
 		 * Skip if free. page_order cannot be used without zone->lock
@@ -537,18 +555,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 			continue;

 		/*
-		 * For async migration, also only scan in MOVABLE blocks. Async
-		 * migration is optimistic to see if the minimum amount of work
-		 * satisfies the allocation
-		 */
-		if (!cc->sync && last_pageblock_nr != pageblock_nr &&
-		    !migrate_async_suitable(get_pageblock_migratetype(page))) {
-			cc->finished_update_migrate = true;
-			skipped_async_unsuitable = true;
-			goto next_pageblock;
-		}
-
-		/*
 		 * Check may be lockless but that's ok as we recheck later.
 		 * It's possible to migrate LRU pages and balloon pages
 		 * Skip any other type of page
@@ -557,11 +563,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 		if (unlikely(balloon_page_movable(page))) {
 			if (locked && balloon_page_isolate(page)) {
 				/* Successfully isolated */
-				cc->finished_update_migrate = true;
-				list_add(&page->lru, migratelist);
-				cc->nr_migratepages++;
-				nr_isolated++;
-				goto check_compact_cluster;
+				goto isolate_success;
 			}
 		}
 		continue;
@@ -607,12 +609,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 			continue;
 		}

-		if (!cc->sync)
-			mode |= ISOLATE_ASYNC_MIGRATE;
-
-		if (unevictable)
-			mode |= ISOLATE_UNEVICTABLE;
-
 		lruvec = mem_cgroup_page_lruvec(page, zone);

 		/* Try isolate the page */
@@ -622,13 +618,14 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 		VM_BUG_ON_PAGE(PageTransCompound(page), page);

 		/* Successfully isolated */
-		cc->finished_update_migrate = true;
 		del_page_from_lru_list(page, lruvec, page_lru(page));
+
+isolate_success:
+		cc->finished_update_migrate = true;
 		list_add(&page->lru, migratelist);
 		cc->nr_migratepages++;
 		nr_isolated++;

-check_compact_cluster:
 		/* Avoid isolating too much */
 		if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
 			++low_pfn;
@@ -639,7 +636,6 @@ check_compact_cluster:

 next_pageblock:
 		low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1;
-		last_pageblock_nr = pageblock_nr;
 	}

 	acct_isolated(zone, locked, cc);
diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c
new file mode 100644
index 000000000000..e10ccd299d66
--- /dev/null
+++ b/mm/early_ioremap.c
@@ -0,0 +1,245 @@
+/*
+ * Provide common bits of early_ioremap() support for architectures needing
+ * temporary mappings during boot before ioremap() is available.
+ *
+ * This is mostly a direct copy of the x86 early_ioremap implementation.
+ *
+ * (C) Copyright 1995 1996, 2014 Linus Torvalds
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <asm/fixmap.h>
+
+#ifdef CONFIG_MMU
+static int early_ioremap_debug __initdata;
+
+static int __init early_ioremap_debug_setup(char *str)
+{
+	early_ioremap_debug = 1;
+
+	return 0;
+}
+early_param("early_ioremap_debug", early_ioremap_debug_setup);
+
+static int after_paging_init __initdata;
+
+void __init __weak early_ioremap_shutdown(void)
+{
+}
+
+void __init early_ioremap_reset(void)
+{
+	early_ioremap_shutdown();
+	after_paging_init = 1;
+}
+
+/*
+ * Generally, ioremap() is available after paging_init() has been called.
+ * Architectures wanting to allow early_ioremap after paging_init() can
+ * define __late_set_fixmap and __late_clear_fixmap to do the right thing.
+ */
+#ifndef __late_set_fixmap
+static inline void __init __late_set_fixmap(enum fixed_addresses idx,
+					    phys_addr_t phys, pgprot_t prot)
+{
+	BUG();
+}
+#endif
+
+#ifndef __late_clear_fixmap
+static inline void __init __late_clear_fixmap(enum fixed_addresses idx)
+{
+	BUG();
+}
+#endif
+
+static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata;
+static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata;
+static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata;
+
+void __init early_ioremap_setup(void)
+{
+	int i;
+
+	for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
+		if (WARN_ON(prev_map[i]))
+			break;
+
+	for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
+		slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i);
+}
+
+static int __init check_early_ioremap_leak(void)
+{
+	int count = 0;
+	int i;
+
+	for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
+		if (prev_map[i])
+			count++;
+
+	if (WARN(count, KERN_WARNING
+		 "Debug warning: early ioremap leak of %d areas detected.\n"
+		 "please boot with early_ioremap_debug and report the dmesg.\n",
+		 count))
+		return 1;
+	return 0;
+}
+late_initcall(check_early_ioremap_leak);
+
+static void __init __iomem *
+__early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot)
+{
+	unsigned long offset;
+	resource_size_t last_addr;
+	unsigned int nrpages;
+	enum fixed_addresses idx;
+	int i, slot;
+
+	WARN_ON(system_state != SYSTEM_BOOTING);
+
+	slot = -1;
+	for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
+		if (!prev_map[i]) {
+			slot = i;
+			break;
+		}
+	}
+
+	if (WARN(slot < 0, "%s(%08llx, %08lx) not found slot\n",
+		 __func__, (u64)phys_addr, size))
+		return NULL;
+
+	/* Don't allow wraparound or zero size */
+	last_addr = phys_addr + size - 1;
+	if (WARN_ON(!size || last_addr < phys_addr))
+		return NULL;
+
+	prev_size[slot] = size;
+	/*
+	 * Mappings have to be page-aligned
+	 */
+	offset = phys_addr & ~PAGE_MASK;
+	phys_addr &= PAGE_MASK;
+	size = PAGE_ALIGN(last_addr + 1) - phys_addr;
+
+	/*
+	 * Mappings have to fit in the FIX_BTMAP area.
+	 */
+	nrpages = size >> PAGE_SHIFT;
+	if (WARN_ON(nrpages > NR_FIX_BTMAPS))
+		return NULL;
+
+	/*
+	 * Ok, go for it..
+	 */
+	idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
+	while (nrpages > 0) {
+		if (after_paging_init)
+			__late_set_fixmap(idx, phys_addr, prot);
+		else
+			__early_set_fixmap(idx, phys_addr, prot);
+		phys_addr += PAGE_SIZE;
+		--idx;
+		--nrpages;
+	}
+	WARN(early_ioremap_debug, "%s(%08llx, %08lx) [%d] => %08lx + %08lx\n",
+	     __func__, (u64)phys_addr, size, slot, offset, slot_virt[slot]);
+
+	prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]);
+	return prev_map[slot];
+}
+
+void __init early_iounmap(void __iomem *addr, unsigned long size)
+{
+	unsigned long virt_addr;
+	unsigned long offset;
+	unsigned int nrpages;
+	enum fixed_addresses idx;
+	int i, slot;
+
+	slot = -1;
+	for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
+		if (prev_map[i] == addr) {
+			slot = i;
+			break;
+		}
+	}
+
+	if (WARN(slot < 0, "early_iounmap(%p, %08lx) not found slot\n",
+		 addr, size))
+		return;
+
+	if (WARN(prev_size[slot] != size,
+		 "early_iounmap(%p, %08lx) [%d] size not consistent %08lx\n",
+		 addr, size, slot, prev_size[slot]))
+		return;
+
+	WARN(early_ioremap_debug, "early_iounmap(%p, %08lx) [%d]\n",
+	     addr, size, slot);
+
+	virt_addr = (unsigned long)addr;
+	if (WARN_ON(virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)))
+		return;
+
+	offset = virt_addr & ~PAGE_MASK;
+	nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT;
+
+	idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
+	while (nrpages > 0) {
+		if (after_paging_init)
+			__late_clear_fixmap(idx);
+		else
+			__early_set_fixmap(idx, 0, FIXMAP_PAGE_CLEAR);
+		--idx;
+		--nrpages;
+	}
+	prev_map[slot] = NULL;
+}
+
+/* Remap an IO device */
+void __init __iomem *
+early_ioremap(resource_size_t phys_addr, unsigned long size)
+{
+	return __early_ioremap(phys_addr, size, FIXMAP_PAGE_IO);
+}
+
+/* Remap memory */
+void __init *
+early_memremap(resource_size_t phys_addr, unsigned long size)
+{
+	return (__force void *)__early_ioremap(phys_addr, size,
+					       FIXMAP_PAGE_NORMAL);
+}
+#else /* CONFIG_MMU */
+
+void __init __iomem *
+early_ioremap(resource_size_t phys_addr, unsigned long size)
+{
+	return (__force void __iomem *)phys_addr;
+}
+
+/* Remap memory */
+void __init *
+early_memremap(resource_size_t phys_addr, unsigned long size)
+{
+	return (void *)phys_addr;
+}
+
+void __init early_iounmap(void __iomem *addr, unsigned long size)
+{
+}
+
+#endif /* CONFIG_MMU */
+
+
+void __init early_memunmap(void *addr, unsigned long size)
+{
+	early_iounmap((__force void __iomem *)addr, size);
+}
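
For context, the interface added above is meant to be called from early architecture/boot code, before ioremap() is usable. A minimal, hypothetical sketch of a caller (the function name, physical address and length are illustrative only and not part of this series):

/* Hypothetical example: map and read a firmware table during early boot. */
static void __init example_read_fw_table(phys_addr_t table_phys, unsigned long len)
{
	void *tbl;

	tbl = early_memremap(table_phys, len);	/* temporary fixmap-backed mapping */
	if (!tbl)
		return;

	/* ... parse the table contents here ... */

	early_memunmap(tbl, len);		/* every early mapping must be torn down */
}

The mappings come from a small fixed pool of FIX_BTMAPS_SLOTS slots, which is why check_early_ioremap_leak() above warns at late_initcall time if any mapping was never unmapped.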
diff --git a/mm/filemap.c b/mm/filemap.c
index 21781f1fe52b..27ebc0c9571b 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -33,6 +33,7 @@
 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
 #include <linux/memcontrol.h>
 #include <linux/cleancache.h>
+#include <linux/rmap.h>
 #include "internal.h"

 #define CREATE_TRACE_POINTS
@@ -562,7 +563,7 @@ static int __add_to_page_cache_locked(struct page *page,
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
 	VM_BUG_ON_PAGE(PageSwapBacked(page), page);

-	error = mem_cgroup_cache_charge(page, current->mm,
+	error = mem_cgroup_charge_file(page, current->mm,
 					gfp_mask & GFP_RECLAIM_MASK);
 	if (error)
 		return error;
@@ -1952,11 +1953,11 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct inode *inode = mapping->host;
 	pgoff_t offset = vmf->pgoff;
 	struct page *page;
-	pgoff_t size;
+	loff_t size;
 	int ret = 0;

-	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-	if (offset >= size)
+	size = round_up(i_size_read(inode), PAGE_CACHE_SIZE);
+	if (offset >= size >> PAGE_CACHE_SHIFT)
 		return VM_FAULT_SIGBUS;

 	/*
@@ -2005,8 +2006,8 @@ retry_find:
 	 * Found the page and have a reference on it.
 	 * We must recheck i_size under page lock.
 	 */
-	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-	if (unlikely(offset >= size)) {
+	size = round_up(i_size_read(inode), PAGE_CACHE_SIZE);
+	if (unlikely(offset >= size >> PAGE_CACHE_SHIFT)) {
 		unlock_page(page);
 		page_cache_release(page);
 		return VM_FAULT_SIGBUS;
@@ -2064,6 +2065,78 @@ page_not_uptodate:
 }
 EXPORT_SYMBOL(filemap_fault);

+void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct radix_tree_iter iter;
+	void **slot;
+	struct file *file = vma->vm_file;
+	struct address_space *mapping = file->f_mapping;
+	loff_t size;
+	struct page *page;
+	unsigned long address = (unsigned long) vmf->virtual_address;
+	unsigned long addr;
+	pte_t *pte;
+
+	rcu_read_lock();
+	radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, vmf->pgoff) {
+		if (iter.index > vmf->max_pgoff)
+			break;
+repeat:
+		page = radix_tree_deref_slot(slot);
+		if (unlikely(!page))
+			goto next;
+		if (radix_tree_exception(page)) {
+			if (radix_tree_deref_retry(page))
+				break;
+			else
+				goto next;
+		}
+
+		if (!page_cache_get_speculative(page))
+			goto repeat;
+
+		/* Has the page moved? */
+		if (unlikely(page != *slot)) {
+			page_cache_release(page);
+			goto repeat;
+		}
+
+		if (!PageUptodate(page) ||
+				PageReadahead(page) ||
+				PageHWPoison(page))
+			goto skip;
+		if (!trylock_page(page))
+			goto skip;
+
+		if (page->mapping != mapping || !PageUptodate(page))
+			goto unlock;
+
+		size = round_up(i_size_read(mapping->host), PAGE_CACHE_SIZE);
+		if (page->index >= size >> PAGE_CACHE_SHIFT)
+			goto unlock;
+
+		pte = vmf->pte + page->index - vmf->pgoff;
+		if (!pte_none(*pte))
+			goto unlock;
+
+		if (file->f_ra.mmap_miss > 0)
+			file->f_ra.mmap_miss--;
+		addr = address + (page->index - vmf->pgoff) * PAGE_SIZE;
+		do_set_pte(vma, addr, page, pte, false, false);
+		unlock_page(page);
+		goto next;
+unlock:
+		unlock_page(page);
+skip:
+		page_cache_release(page);
+next:
+		if (iter.index == vmf->max_pgoff)
+			break;
+	}
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL(filemap_map_pages);
+
 int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	struct page *page = vmf->page;
@@ -2093,6 +2166,7 @@ EXPORT_SYMBOL(filemap_page_mkwrite);

 const struct vm_operations_struct generic_file_vm_ops = {
 	.fault		= filemap_fault,
+	.map_pages	= filemap_map_pages,
 	.page_mkwrite	= filemap_page_mkwrite,
 	.remap_pages	= generic_file_remap_pages,
 };
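
The exported filemap_map_pages() helper is wired up through the .map_pages member of vm_operations_struct, as generic_file_vm_ops above shows; the fault path can then populate several already-cached neighbouring pages in one go instead of taking a separate fault per page. A filesystem with its own vm_ops would hook it up the same way; a hedged sketch (the ops structure name is illustrative):

/* Hypothetical example: a filesystem reusing the generic fault-around helper. */
static const struct vm_operations_struct examplefs_file_vm_ops = {
	.fault		= filemap_fault,	/* single-page fault path */
	.map_pages	= filemap_map_pages,	/* map nearby pages already in the page cache */
	.page_mkwrite	= filemap_page_mkwrite,
};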
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 6ac89e9f82ef..64635f5278ff 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -827,7 +827,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		count_vm_event(THP_FAULT_FALLBACK);
 		return VM_FAULT_FALLBACK;
 	}
-	if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
+	if (unlikely(mem_cgroup_charge_anon(page, mm, GFP_KERNEL))) {
 		put_page(page);
 		count_vm_event(THP_FAULT_FALLBACK);
 		return VM_FAULT_FALLBACK;
@@ -968,7 +968,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
 					       __GFP_OTHER_NODE,
 					       vma, address, page_to_nid(page));
 		if (unlikely(!pages[i] ||
-			     mem_cgroup_newpage_charge(pages[i], mm,
+			     mem_cgroup_charge_anon(pages[i], mm,
 						       GFP_KERNEL))) {
 			if (pages[i])
 				put_page(pages[i]);
@@ -1101,7 +1101,7 @@ alloc:
 		goto out;
 	}

-	if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
+	if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))) {
 		put_page(new_page);
 		if (page) {
 			split_huge_page(page);
@@ -1891,17 +1891,22 @@ out:
 int hugepage_madvise(struct vm_area_struct *vma,
 		     unsigned long *vm_flags, int advice)
 {
-	struct mm_struct *mm = vma->vm_mm;
-
 	switch (advice) {
 	case MADV_HUGEPAGE:
+#ifdef CONFIG_S390
+		/*
+		 * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390
+		 * can't handle this properly after s390_enable_sie, so we simply
+		 * ignore the madvise to prevent qemu from causing a SIGSEGV.
+		 */
+		if (mm_has_pgste(vma->vm_mm))
+			return 0;
+#endif
 		/*
 		 * Be somewhat over-protective like KSM for now!
 		 */
 		if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP))
 			return -EINVAL;
-		if (mm->def_flags & VM_NOHUGEPAGE)
-			return -EINVAL;
 		*vm_flags &= ~VM_NOHUGEPAGE;
 		*vm_flags |= VM_HUGEPAGE;
 		/*
@@ -2354,7 +2359,7 @@ static void collapse_huge_page(struct mm_struct *mm,
 	if (!new_page)
 		return;

-	if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)))
+	if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)))
 		return;

 	/*
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 7c02b9dadfb0..dd30f22b35e0 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -13,6 +13,7 @@
 #include <linux/nodemask.h>
 #include <linux/pagemap.h>
 #include <linux/mempolicy.h>
+#include <linux/compiler.h>
 #include <linux/cpuset.h>
 #include <linux/mutex.h>
 #include <linux/bootmem.h>
@@ -1535,6 +1536,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
 	while (min_count < persistent_huge_pages(h)) {
 		if (!free_pool_huge_page(h, nodes_allowed, 0))
 			break;
+		cond_resched_lock(&hugetlb_lock);
 	}
 	while (count < persistent_huge_pages(h)) {
 		if (!adjust_pool_surplus(h, nodes_allowed, 1))
@@ -2690,7 +2692,8 @@ retry_avoidcopy:
 			BUG_ON(huge_pte_none(pte));
 			spin_lock(ptl);
 			ptep = huge_pte_offset(mm, address & huge_page_mask(h));
-			if (likely(pte_same(huge_ptep_get(ptep), pte)))
+			if (likely(ptep &&
+				   pte_same(huge_ptep_get(ptep), pte)))
 				goto retry_avoidcopy;
 			/*
 			 * race occurs while re-acquiring page table
@@ -2734,7 +2737,7 @@ retry_avoidcopy:
 	 */
 	spin_lock(ptl);
 	ptep = huge_pte_offset(mm, address & huge_page_mask(h));
-	if (likely(pte_same(huge_ptep_get(ptep), pte))) {
+	if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
 		ClearPagePrivate(new_page);

 		/* Break COW */
@@ -2896,8 +2899,7 @@ retry:
 	if (anon_rmap) {
 		ClearPagePrivate(page);
 		hugepage_add_new_anon_rmap(page, vma, address);
-	}
-	else
+	} else
 		page_dup_rmap(page);
 	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
 				&& (vma->vm_flags & VM_SHARED)));
@@ -3185,6 +3187,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 	BUG_ON(address >= end);
 	flush_cache_range(vma, address, end);

+	mmu_notifier_invalidate_range_start(mm, start, end);
 	mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
 	for (; address < end; address += huge_page_size(h)) {
 		spinlock_t *ptl;
@@ -3214,6 +3217,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 	 */
 	flush_tlb_range(vma, start, end);
 	mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
+	mmu_notifier_invalidate_range_end(mm, start, end);

 	return pages << h->order;
 }
@@ -3518,7 +3522,7 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
 #else /* !CONFIG_ARCH_WANT_GENERAL_HUGETLB */

 /* Can be overriden by architectures */
-__attribute__((weak)) struct page *
+struct page * __weak
 follow_huge_pud(struct mm_struct *mm, unsigned long address,
 		pud_t *pud, int write)
 {
diff --git a/mm/internal.h b/mm/internal.h
index 29e1e761f9eb..07b67361a40a 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -11,6 +11,7 @@
 #ifndef __MM_INTERNAL_H
 #define __MM_INTERNAL_H

+#include <linux/fs.h>
 #include <linux/mm.h>

 void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
@@ -21,6 +22,20 @@ static inline void set_page_count(struct page *page, int v)
 	atomic_set(&page->_count, v);
 }

+extern int __do_page_cache_readahead(struct address_space *mapping,
+	struct file *filp, pgoff_t offset, unsigned long nr_to_read,
+	unsigned long lookahead_size);
+
+/*
+ * Submit IO for the read-ahead request in file_ra_state.
+ */
+static inline unsigned long ra_submit(struct file_ra_state *ra,
+	struct address_space *mapping, struct file *filp)
+{
+	return __do_page_cache_readahead(mapping, filp,
+					ra->start, ra->size, ra->async_size);
+}
+
 /*
  * Turn a non-refcounted page (->_count == 0) into refcounted with
  * a count of one.
@@ -370,5 +385,6 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
 #define ALLOC_HIGH		0x20 /* __GFP_HIGH set */
 #define ALLOC_CPUSET		0x40 /* check for correct cpuset */
 #define ALLOC_CMA		0x80 /* allow allocations from CMA areas */
+#define ALLOC_FAIR		0x100 /* fair zone allocation */

 #endif /* __MM_INTERNAL_H */
diff --git a/mm/memblock.c b/mm/memblock.c
index 7fe5354e7552..e9d6ca9a01a9 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1253,7 +1253,7 @@ phys_addr_t __init memblock_mem_size(unsigned long limit_pfn)
 		pages += end_pfn - start_pfn;
 	}

-	return (phys_addr_t)pages << PAGE_SHIFT;
+	return PFN_PHYS(pages);
 }

 /* lowest address */
@@ -1271,16 +1271,14 @@ phys_addr_t __init_memblock memblock_end_of_DRAM(void)

 void __init memblock_enforce_memory_limit(phys_addr_t limit)
 {
-	unsigned long i;
 	phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX;
+	struct memblock_region *r;

 	if (!limit)
 		return;

 	/* find out max address */
-	for (i = 0; i < memblock.memory.cnt; i++) {
-		struct memblock_region *r = &memblock.memory.regions[i];
-
+	for_each_memblock(memory, r) {
 		if (limit <= r->size) {
 			max_addr = r->base + limit;
 			break;
@@ -1326,7 +1324,7 @@ int __init_memblock memblock_search_pfn_nid(unsigned long pfn,
 			 unsigned long *start_pfn, unsigned long *end_pfn)
 {
 	struct memblock_type *type = &memblock.memory;
-	int mid = memblock_search(type, (phys_addr_t)pfn << PAGE_SHIFT);
+	int mid = memblock_search(type, PFN_PHYS(pfn));

 	if (mid == -1)
 		return -1;
@@ -1379,13 +1377,12 @@ int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t si

 void __init_memblock memblock_trim_memory(phys_addr_t align)
 {
-	int i;
 	phys_addr_t start, end, orig_start, orig_end;
-	struct memblock_type *mem = &memblock.memory;
+	struct memblock_region *r;

-	for (i = 0; i < mem->cnt; i++) {
-		orig_start = mem->regions[i].base;
-		orig_end = mem->regions[i].base + mem->regions[i].size;
+	for_each_memblock(memory, r) {
+		orig_start = r->base;
+		orig_end = r->base + r->size;
 		start = round_up(orig_start, align);
 		end = round_down(orig_end, align);

@@ -1393,11 +1390,12 @@ void __init_memblock memblock_trim_memory(phys_addr_t align)
 			continue;

 		if (start < end) {
-			mem->regions[i].base = start;
-			mem->regions[i].size = end - start;
+			r->base = start;
+			r->size = end - start;
 		} else {
-			memblock_remove_region(mem, i);
-			i--;
+			memblock_remove_region(&memblock.memory,
+					       r - memblock.memory.regions);
+			r--;
 		}
 	}
 }
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index dcc8153a1681..29501f040568 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -921,8 +921,6 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, | |||
921 | struct page *page, | 921 | struct page *page, |
922 | bool anon, int nr_pages) | 922 | bool anon, int nr_pages) |
923 | { | 923 | { |
924 | preempt_disable(); | ||
925 | |||
926 | /* | 924 | /* |
927 | * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is | 925 | * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is |
928 | * counted as CACHE even if it's on ANON LRU. | 926 | * counted as CACHE even if it's on ANON LRU. |
@@ -947,8 +945,6 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, | |||
947 | } | 945 | } |
948 | 946 | ||
949 | __this_cpu_add(memcg->stat->nr_page_events, nr_pages); | 947 | __this_cpu_add(memcg->stat->nr_page_events, nr_pages); |
950 | |||
951 | preempt_enable(); | ||
952 | } | 948 | } |
953 | 949 | ||
954 | unsigned long | 950 | unsigned long |
@@ -1075,22 +1071,15 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) | |||
1075 | return mem_cgroup_from_css(task_css(p, memory_cgrp_id)); | 1071 | return mem_cgroup_from_css(task_css(p, memory_cgrp_id)); |
1076 | } | 1072 | } |
1077 | 1073 | ||
1078 | struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) | 1074 | static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) |
1079 | { | 1075 | { |
1080 | struct mem_cgroup *memcg = NULL; | 1076 | struct mem_cgroup *memcg = NULL; |
1081 | 1077 | ||
1082 | if (!mm) | ||
1083 | return NULL; | ||
1084 | /* | ||
1085 | * Because we have no locks, mm->owner's may be being moved to other | ||
1086 | * cgroup. We use css_tryget() here even if this looks | ||
1087 | * pessimistic (rather than adding locks here). | ||
1088 | */ | ||
1089 | rcu_read_lock(); | 1078 | rcu_read_lock(); |
1090 | do { | 1079 | do { |
1091 | memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); | 1080 | memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); |
1092 | if (unlikely(!memcg)) | 1081 | if (unlikely(!memcg)) |
1093 | break; | 1082 | memcg = root_mem_cgroup; |
1094 | } while (!css_tryget(&memcg->css)); | 1083 | } while (!css_tryget(&memcg->css)); |
1095 | rcu_read_unlock(); | 1084 | rcu_read_unlock(); |
1096 | return memcg; | 1085 | return memcg; |
@@ -1486,7 +1475,7 @@ bool task_in_mem_cgroup(struct task_struct *task, | |||
1486 | 1475 | ||
1487 | p = find_lock_task_mm(task); | 1476 | p = find_lock_task_mm(task); |
1488 | if (p) { | 1477 | if (p) { |
1489 | curr = try_get_mem_cgroup_from_mm(p->mm); | 1478 | curr = get_mem_cgroup_from_mm(p->mm); |
1490 | task_unlock(p); | 1479 | task_unlock(p); |
1491 | } else { | 1480 | } else { |
1492 | /* | 1481 | /* |
@@ -1500,8 +1489,6 @@ bool task_in_mem_cgroup(struct task_struct *task, | |||
1500 | css_get(&curr->css); | 1489 | css_get(&curr->css); |
1501 | rcu_read_unlock(); | 1490 | rcu_read_unlock(); |
1502 | } | 1491 | } |
1503 | if (!curr) | ||
1504 | return false; | ||
1505 | /* | 1492 | /* |
1506 | * We should check use_hierarchy of "memcg" not "curr". Because checking | 1493 | * We should check use_hierarchy of "memcg" not "curr". Because checking |
1507 | * use_hierarchy of "curr" here make this function true if hierarchy is | 1494 | * use_hierarchy of "curr" here make this function true if hierarchy is |
@@ -2588,7 +2575,7 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb, | |||
2588 | } | 2575 | } |
2589 | 2576 | ||
2590 | 2577 | ||
2591 | /* See __mem_cgroup_try_charge() for details */ | 2578 | /* See mem_cgroup_try_charge() for details */ |
2592 | enum { | 2579 | enum { |
2593 | CHARGE_OK, /* success */ | 2580 | CHARGE_OK, /* success */ |
2594 | CHARGE_RETRY, /* need to retry but retry is not bad */ | 2581 | CHARGE_RETRY, /* need to retry but retry is not bad */ |
@@ -2661,45 +2648,34 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
2661 | return CHARGE_NOMEM; | 2648 | return CHARGE_NOMEM; |
2662 | } | 2649 | } |
2663 | 2650 | ||
2664 | /* | 2651 | /** |
2665 | * __mem_cgroup_try_charge() does | 2652 | * mem_cgroup_try_charge - try charging a memcg |
2666 | * 1. detect memcg to be charged against from passed *mm and *ptr, | 2653 | * @memcg: memcg to charge |
2667 | * 2. update res_counter | 2654 | * @nr_pages: number of pages to charge |
2668 | * 3. call memory reclaim if necessary. | 2655 | * @oom: trigger OOM if reclaim fails |
2669 | * | ||
2670 | * In some special case, if the task is fatal, fatal_signal_pending() or | ||
2671 | * has TIF_MEMDIE, this function returns -EINTR while writing root_mem_cgroup | ||
2672 | * to *ptr. There are two reasons for this. 1: fatal threads should quit as soon | ||
2673 | * as possible without any hazards. 2: all pages should have a valid | ||
2674 | * pc->mem_cgroup. If mm is NULL and the caller doesn't pass a valid memcg | ||
2675 | * pointer, that is treated as a charge to root_mem_cgroup. | ||
2676 | * | ||
2677 | * So __mem_cgroup_try_charge() will return | ||
2678 | * 0 ... on success, filling *ptr with a valid memcg pointer. | ||
2679 | * -ENOMEM ... charge failure because of resource limits. | ||
2680 | * -EINTR ... if thread is fatal. *ptr is filled with root_mem_cgroup. | ||
2681 | * | 2656 | * |
2682 | * Unlike the exported interface, an "oom" parameter is added. if oom==true, | 2657 | * Returns 0 if @memcg was charged successfully, -EINTR if the charge |
2683 | * the oom-killer can be invoked. | 2658 | * was bypassed to root_mem_cgroup, and -ENOMEM if the charge failed. |
2684 | */ | 2659 | */ |
2685 | static int __mem_cgroup_try_charge(struct mm_struct *mm, | 2660 | static int mem_cgroup_try_charge(struct mem_cgroup *memcg, |
2686 | gfp_t gfp_mask, | 2661 | gfp_t gfp_mask, |
2687 | unsigned int nr_pages, | 2662 | unsigned int nr_pages, |
2688 | struct mem_cgroup **ptr, | 2663 | bool oom) |
2689 | bool oom) | ||
2690 | { | 2664 | { |
2691 | unsigned int batch = max(CHARGE_BATCH, nr_pages); | 2665 | unsigned int batch = max(CHARGE_BATCH, nr_pages); |
2692 | int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; | 2666 | int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; |
2693 | struct mem_cgroup *memcg = NULL; | ||
2694 | int ret; | 2667 | int ret; |
2695 | 2668 | ||
2669 | if (mem_cgroup_is_root(memcg)) | ||
2670 | goto done; | ||
2696 | /* | 2671 | /* |
2697 | * Unlike gloval-vm's OOM-kill, we're not in memory shortage | 2672 | * Unlike in global OOM situations, memcg is not in a physical |
2698 | * in system level. So, allow to go ahead dying process in addition to | 2673 | * memory shortage. Allow dying and OOM-killed tasks to |
2699 | * MEMDIE process. | 2674 | * bypass the last charges so that they can exit quickly and |
2675 | * free their memory. | ||
2700 | */ | 2676 | */ |
2701 | if (unlikely(test_thread_flag(TIF_MEMDIE) | 2677 | if (unlikely(test_thread_flag(TIF_MEMDIE) || |
2702 | || fatal_signal_pending(current))) | 2678 | fatal_signal_pending(current))) |
2703 | goto bypass; | 2679 | goto bypass; |
2704 | 2680 | ||
2705 | if (unlikely(task_in_memcg_oom(current))) | 2681 | if (unlikely(task_in_memcg_oom(current))) |
@@ -2707,73 +2683,16 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
2707 | 2683 | ||
2708 | if (gfp_mask & __GFP_NOFAIL) | 2684 | if (gfp_mask & __GFP_NOFAIL) |
2709 | oom = false; | 2685 | oom = false; |
2710 | |||
2711 | /* | ||
2712 | * We always charge the cgroup the mm_struct belongs to. | ||
2713 | * The mm_struct's mem_cgroup changes on task migration if the | ||
2714 | * thread group leader migrates. It's possible that mm is not | ||
2715 | * set, if so charge the root memcg (happens for pagecache usage). | ||
2716 | */ | ||
2717 | if (!*ptr && !mm) | ||
2718 | *ptr = root_mem_cgroup; | ||
2719 | again: | 2686 | again: |
2720 | if (*ptr) { /* css should be a valid one */ | 2687 | if (consume_stock(memcg, nr_pages)) |
2721 | memcg = *ptr; | 2688 | goto done; |
2722 | if (mem_cgroup_is_root(memcg)) | ||
2723 | goto done; | ||
2724 | if (consume_stock(memcg, nr_pages)) | ||
2725 | goto done; | ||
2726 | css_get(&memcg->css); | ||
2727 | } else { | ||
2728 | struct task_struct *p; | ||
2729 | |||
2730 | rcu_read_lock(); | ||
2731 | p = rcu_dereference(mm->owner); | ||
2732 | /* | ||
2733 | * Because we don't have task_lock(), "p" can exit. | ||
2734 | * In that case, "memcg" can point to root or p can be NULL with | ||
2735 | * race with swapoff. Then, we have small risk of mis-accouning. | ||
2736 | * But such kind of mis-account by race always happens because | ||
2737 | * we don't have cgroup_mutex(). It's overkill and we allo that | ||
2738 | * small race, here. | ||
2739 | * (*) swapoff at el will charge against mm-struct not against | ||
2740 | * task-struct. So, mm->owner can be NULL. | ||
2741 | */ | ||
2742 | memcg = mem_cgroup_from_task(p); | ||
2743 | if (!memcg) | ||
2744 | memcg = root_mem_cgroup; | ||
2745 | if (mem_cgroup_is_root(memcg)) { | ||
2746 | rcu_read_unlock(); | ||
2747 | goto done; | ||
2748 | } | ||
2749 | if (consume_stock(memcg, nr_pages)) { | ||
2750 | /* | ||
2751 | * It seems dagerous to access memcg without css_get(). | ||
2752 | * But considering how consume_stok works, it's not | ||
2753 | * necessary. If consume_stock success, some charges | ||
2754 | * from this memcg are cached on this cpu. So, we | ||
2755 | * don't need to call css_get()/css_tryget() before | ||
2756 | * calling consume_stock(). | ||
2757 | */ | ||
2758 | rcu_read_unlock(); | ||
2759 | goto done; | ||
2760 | } | ||
2761 | /* after here, we may be blocked. we need to get refcnt */ | ||
2762 | if (!css_tryget(&memcg->css)) { | ||
2763 | rcu_read_unlock(); | ||
2764 | goto again; | ||
2765 | } | ||
2766 | rcu_read_unlock(); | ||
2767 | } | ||
2768 | 2689 | ||
2769 | do { | 2690 | do { |
2770 | bool invoke_oom = oom && !nr_oom_retries; | 2691 | bool invoke_oom = oom && !nr_oom_retries; |
2771 | 2692 | ||
2772 | /* If killed, bypass charge */ | 2693 | /* If killed, bypass charge */ |
2773 | if (fatal_signal_pending(current)) { | 2694 | if (fatal_signal_pending(current)) |
2774 | css_put(&memcg->css); | ||
2775 | goto bypass; | 2695 | goto bypass; |
2776 | } | ||
2777 | 2696 | ||
2778 | ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, | 2697 | ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, |
2779 | nr_pages, invoke_oom); | 2698 | nr_pages, invoke_oom); |
@@ -2782,17 +2701,12 @@ again: | |||
2782 | break; | 2701 | break; |
2783 | case CHARGE_RETRY: /* not in OOM situation but retry */ | 2702 | case CHARGE_RETRY: /* not in OOM situation but retry */ |
2784 | batch = nr_pages; | 2703 | batch = nr_pages; |
2785 | css_put(&memcg->css); | ||
2786 | memcg = NULL; | ||
2787 | goto again; | 2704 | goto again; |
2788 | case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ | 2705 | case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ |
2789 | css_put(&memcg->css); | ||
2790 | goto nomem; | 2706 | goto nomem; |
2791 | case CHARGE_NOMEM: /* OOM routine works */ | 2707 | case CHARGE_NOMEM: /* OOM routine works */ |
2792 | if (!oom || invoke_oom) { | 2708 | if (!oom || invoke_oom) |
2793 | css_put(&memcg->css); | ||
2794 | goto nomem; | 2709 | goto nomem; |
2795 | } | ||
2796 | nr_oom_retries--; | 2710 | nr_oom_retries--; |
2797 | break; | 2711 | break; |
2798 | } | 2712 | } |
@@ -2800,20 +2714,44 @@ again: | |||
2800 | 2714 | ||
2801 | if (batch > nr_pages) | 2715 | if (batch > nr_pages) |
2802 | refill_stock(memcg, batch - nr_pages); | 2716 | refill_stock(memcg, batch - nr_pages); |
2803 | css_put(&memcg->css); | ||
2804 | done: | 2717 | done: |
2805 | *ptr = memcg; | ||
2806 | return 0; | 2718 | return 0; |
2807 | nomem: | 2719 | nomem: |
2808 | if (!(gfp_mask & __GFP_NOFAIL)) { | 2720 | if (!(gfp_mask & __GFP_NOFAIL)) |
2809 | *ptr = NULL; | ||
2810 | return -ENOMEM; | 2721 | return -ENOMEM; |
2811 | } | ||
2812 | bypass: | 2722 | bypass: |
2813 | *ptr = root_mem_cgroup; | ||
2814 | return -EINTR; | 2723 | return -EINTR; |
2815 | } | 2724 | } |
2816 | 2725 | ||
2726 | /** | ||
2727 | * mem_cgroup_try_charge_mm - try charging a mm | ||
2728 | * @mm: mm_struct to charge | ||
2729 | * @nr_pages: number of pages to charge | ||
2730 | * @oom: trigger OOM if reclaim fails | ||
2731 | * | ||
2732 | * Returns the charged mem_cgroup associated with the given mm_struct or | ||
2733 | * NULL the charge failed. | ||
2734 | */ | ||
2735 | static struct mem_cgroup *mem_cgroup_try_charge_mm(struct mm_struct *mm, | ||
2736 | gfp_t gfp_mask, | ||
2737 | unsigned int nr_pages, | ||
2738 | bool oom) | ||
2739 | |||
2740 | { | ||
2741 | struct mem_cgroup *memcg; | ||
2742 | int ret; | ||
2743 | |||
2744 | memcg = get_mem_cgroup_from_mm(mm); | ||
2745 | ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages, oom); | ||
2746 | css_put(&memcg->css); | ||
2747 | if (ret == -EINTR) | ||
2748 | memcg = root_mem_cgroup; | ||
2749 | else if (ret) | ||
2750 | memcg = NULL; | ||
2751 | |||
2752 | return memcg; | ||
2753 | } | ||
2754 | |||
2817 | /* | 2755 | /* |
2818 | * Somemtimes we have to undo a charge we got by try_charge(). | 2756 | * Somemtimes we have to undo a charge we got by try_charge(). |
2819 | * This function is for that and do uncharge, put css's refcnt. | 2757 | * This function is for that and do uncharge, put css's refcnt. |
@@ -3009,20 +2947,17 @@ static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v) | |||
3009 | static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) | 2947 | static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) |
3010 | { | 2948 | { |
3011 | struct res_counter *fail_res; | 2949 | struct res_counter *fail_res; |
3012 | struct mem_cgroup *_memcg; | ||
3013 | int ret = 0; | 2950 | int ret = 0; |
3014 | 2951 | ||
3015 | ret = res_counter_charge(&memcg->kmem, size, &fail_res); | 2952 | ret = res_counter_charge(&memcg->kmem, size, &fail_res); |
3016 | if (ret) | 2953 | if (ret) |
3017 | return ret; | 2954 | return ret; |
3018 | 2955 | ||
3019 | _memcg = memcg; | 2956 | ret = mem_cgroup_try_charge(memcg, gfp, size >> PAGE_SHIFT, |
3020 | ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT, | 2957 | oom_gfp_allowed(gfp)); |
3021 | &_memcg, oom_gfp_allowed(gfp)); | ||
3022 | |||
3023 | if (ret == -EINTR) { | 2958 | if (ret == -EINTR) { |
3024 | /* | 2959 | /* |
3025 | * __mem_cgroup_try_charge() chose to bypass to root due to | 2960 | * mem_cgroup_try_charge() chose to bypass to root due to
3026 | * OOM kill or fatal signal. Since our only options are to | 2961 | * OOM kill or fatal signal. Since our only options are to |
3027 | * either fail the allocation or charge it to this cgroup, do | 2962 | * either fail the allocation or charge it to this cgroup, do |
3028 | * it as a temporary condition. But we can't fail. From a | 2963 | * it as a temporary condition. But we can't fail. From a |
@@ -3032,7 +2967,7 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) | |||
3032 | * | 2967 | * |
3033 | * This condition will only trigger if the task entered | 2968 | * This condition will only trigger if the task entered |
3034 | * memcg_charge_kmem in a sane state, but was OOM-killed during | 2969 | * memcg_charge_kmem in a sane state, but was OOM-killed during |
3035 | * __mem_cgroup_try_charge() above. Tasks that were already | 2970 | * mem_cgroup_try_charge() above. Tasks that were already |
3036 | * dying when the allocation triggers should have been already | 2971 | * dying when the allocation triggers should have been already |
3037 | * directed to the root cgroup in memcontrol.h | 2972 | * directed to the root cgroup in memcontrol.h |
3038 | */ | 2973 | */ |
@@ -3159,6 +3094,29 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups) | |||
3159 | return 0; | 3094 | return 0; |
3160 | } | 3095 | } |
3161 | 3096 | ||
3097 | char *memcg_create_cache_name(struct mem_cgroup *memcg, | ||
3098 | struct kmem_cache *root_cache) | ||
3099 | { | ||
3100 | static char *buf = NULL; | ||
3101 | |||
3102 | /* | ||
3103 | * We need a mutex here to protect the shared buffer. Since this is | ||
3104 | * expected to be called only on cache creation, we can employ the | ||
3105 | * slab_mutex for that purpose. | ||
3106 | */ | ||
3107 | lockdep_assert_held(&slab_mutex); | ||
3108 | |||
3109 | if (!buf) { | ||
3110 | buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); | ||
3111 | if (!buf) | ||
3112 | return NULL; | ||
3113 | } | ||
3114 | |||
3115 | cgroup_name(memcg->css.cgroup, buf, NAME_MAX + 1); | ||
3116 | return kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, | ||
3117 | memcg_cache_id(memcg), buf); | ||
3118 | } | ||
3119 | |||
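The new memcg_create_cache_name() keeps one lazily allocated scratch buffer shared by all callers under slab_mutex and then builds a "rootname(id:groupname)" string. Below is a rough user-space sketch of that scheme only; the pthread mutex, the fixed buffer size and the plain string copy are stand-ins for slab_mutex, NAME_MAX and cgroup_name().

        #include <pthread.h>
        #include <stdio.h>
        #include <stdlib.h>
        #include <string.h>

        #define SCRATCH_MAX 255

        static pthread_mutex_t name_lock = PTHREAD_MUTEX_INITIALIZER;
        static char *scratch;   /* shared between callers, protected by name_lock */

        static char *create_cache_name(const char *root, int id, const char *group)
        {
                char *name = NULL;
                size_t len;

                pthread_mutex_lock(&name_lock);
                if (!scratch)
                        scratch = malloc(SCRATCH_MAX + 1);
                if (scratch) {
                        /* the kernel copies the cgroup name here via cgroup_name() */
                        snprintf(scratch, SCRATCH_MAX + 1, "%s", group);
                        len = strlen(root) + strlen(scratch) + 32;
                        name = malloc(len);
                        if (name)
                                snprintf(name, len, "%s(%d:%s)", root, id, scratch);
                }
                pthread_mutex_unlock(&name_lock);
                return name;    /* caller frees */
        }

        int main(void)
        {
                char *n = create_cache_name("dentry", 3, "mygroup");

                if (n) {
                        puts(n);        /* dentry(3:mygroup) */
                        free(n);
                }
                return 0;
        }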
3162 | int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, | 3120 | int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, |
3163 | struct kmem_cache *root_cache) | 3121 | struct kmem_cache *root_cache) |
3164 | { | 3122 | { |
@@ -3182,6 +3140,7 @@ int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, | |||
3182 | s->memcg_params->root_cache = root_cache; | 3140 | s->memcg_params->root_cache = root_cache; |
3183 | INIT_WORK(&s->memcg_params->destroy, | 3141 | INIT_WORK(&s->memcg_params->destroy, |
3184 | kmem_cache_destroy_work_func); | 3142 | kmem_cache_destroy_work_func); |
3143 | css_get(&memcg->css); | ||
3185 | } else | 3144 | } else |
3186 | s->memcg_params->is_root_cache = true; | 3145 | s->memcg_params->is_root_cache = true; |
3187 | 3146 | ||
@@ -3190,6 +3149,10 @@ int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, | |||
3190 | 3149 | ||
3191 | void memcg_free_cache_params(struct kmem_cache *s) | 3150 | void memcg_free_cache_params(struct kmem_cache *s) |
3192 | { | 3151 | { |
3152 | if (!s->memcg_params) | ||
3153 | return; | ||
3154 | if (!s->memcg_params->is_root_cache) | ||
3155 | css_put(&s->memcg_params->memcg->css); | ||
3193 | kfree(s->memcg_params); | 3156 | kfree(s->memcg_params); |
3194 | } | 3157 | } |
3195 | 3158 | ||
@@ -3212,9 +3175,6 @@ void memcg_register_cache(struct kmem_cache *s) | |||
3212 | memcg = s->memcg_params->memcg; | 3175 | memcg = s->memcg_params->memcg; |
3213 | id = memcg_cache_id(memcg); | 3176 | id = memcg_cache_id(memcg); |
3214 | 3177 | ||
3215 | css_get(&memcg->css); | ||
3216 | |||
3217 | |||
3218 | /* | 3178 | /* |
3219 | * Since readers won't lock (see cache_from_memcg_idx()), we need a | 3179 | * Since readers won't lock (see cache_from_memcg_idx()), we need a |
3220 | * barrier here to ensure nobody will see the kmem_cache partially | 3180 | * barrier here to ensure nobody will see the kmem_cache partially |
@@ -3263,10 +3223,8 @@ void memcg_unregister_cache(struct kmem_cache *s) | |||
3263 | * after removing it from the memcg_slab_caches list, otherwise we can | 3223 | * after removing it from the memcg_slab_caches list, otherwise we can |
3264 | * fail to convert memcg_params_to_cache() while traversing the list. | 3224 | * fail to convert memcg_params_to_cache() while traversing the list. |
3265 | */ | 3225 | */ |
3266 | VM_BUG_ON(!root->memcg_params->memcg_caches[id]); | 3226 | VM_BUG_ON(root->memcg_params->memcg_caches[id] != s); |
3267 | root->memcg_params->memcg_caches[id] = NULL; | 3227 | root->memcg_params->memcg_caches[id] = NULL; |
3268 | |||
3269 | css_put(&memcg->css); | ||
3270 | } | 3228 | } |
3271 | 3229 | ||
3272 | /* | 3230 | /* |
@@ -3363,55 +3321,10 @@ void mem_cgroup_destroy_cache(struct kmem_cache *cachep) | |||
3363 | schedule_work(&cachep->memcg_params->destroy); | 3321 | schedule_work(&cachep->memcg_params->destroy); |
3364 | } | 3322 | } |
3365 | 3323 | ||
3366 | static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, | 3324 | int __kmem_cache_destroy_memcg_children(struct kmem_cache *s) |
3367 | struct kmem_cache *s) | ||
3368 | { | ||
3369 | struct kmem_cache *new = NULL; | ||
3370 | static char *tmp_path = NULL, *tmp_name = NULL; | ||
3371 | static DEFINE_MUTEX(mutex); /* protects tmp_name */ | ||
3372 | |||
3373 | BUG_ON(!memcg_can_account_kmem(memcg)); | ||
3374 | |||
3375 | mutex_lock(&mutex); | ||
3376 | /* | ||
3377 | * kmem_cache_create_memcg duplicates the given name and | ||
3378 | * cgroup_name for this name requires RCU context. | ||
3379 | * This static temporary buffer is used to prevent from | ||
3380 | * pointless shortliving allocation. | ||
3381 | */ | ||
3382 | if (!tmp_path || !tmp_name) { | ||
3383 | if (!tmp_path) | ||
3384 | tmp_path = kmalloc(PATH_MAX, GFP_KERNEL); | ||
3385 | if (!tmp_name) | ||
3386 | tmp_name = kmalloc(NAME_MAX + 1, GFP_KERNEL); | ||
3387 | if (!tmp_path || !tmp_name) | ||
3388 | goto out; | ||
3389 | } | ||
3390 | |||
3391 | cgroup_name(memcg->css.cgroup, tmp_name, NAME_MAX + 1); | ||
3392 | snprintf(tmp_path, PATH_MAX, "%s(%d:%s)", s->name, | ||
3393 | memcg_cache_id(memcg), tmp_name); | ||
3394 | |||
3395 | new = kmem_cache_create_memcg(memcg, tmp_path, s->object_size, s->align, | ||
3396 | (s->flags & ~SLAB_PANIC), s->ctor, s); | ||
3397 | if (new) | ||
3398 | new->allocflags |= __GFP_KMEMCG; | ||
3399 | else | ||
3400 | new = s; | ||
3401 | out: | ||
3402 | mutex_unlock(&mutex); | ||
3403 | return new; | ||
3404 | } | ||
3405 | |||
3406 | void kmem_cache_destroy_memcg_children(struct kmem_cache *s) | ||
3407 | { | 3325 | { |
3408 | struct kmem_cache *c; | 3326 | struct kmem_cache *c; |
3409 | int i; | 3327 | int i, failed = 0; |
3410 | |||
3411 | if (!s->memcg_params) | ||
3412 | return; | ||
3413 | if (!s->memcg_params->is_root_cache) | ||
3414 | return; | ||
3415 | 3328 | ||
3416 | /* | 3329 | /* |
3417 | * If the cache is being destroyed, we trust that there is no one else | 3330 | * If the cache is being destroyed, we trust that there is no one else |
@@ -3445,16 +3358,14 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache *s) | |||
3445 | c->memcg_params->dead = false; | 3358 | c->memcg_params->dead = false; |
3446 | cancel_work_sync(&c->memcg_params->destroy); | 3359 | cancel_work_sync(&c->memcg_params->destroy); |
3447 | kmem_cache_destroy(c); | 3360 | kmem_cache_destroy(c); |
3361 | |||
3362 | if (cache_from_memcg_idx(s, i)) | ||
3363 | failed++; | ||
3448 | } | 3364 | } |
3449 | mutex_unlock(&activate_kmem_mutex); | 3365 | mutex_unlock(&activate_kmem_mutex); |
3366 | return failed; | ||
3450 | } | 3367 | } |
3451 | 3368 | ||
3452 | struct create_work { | ||
3453 | struct mem_cgroup *memcg; | ||
3454 | struct kmem_cache *cachep; | ||
3455 | struct work_struct work; | ||
3456 | }; | ||
3457 | |||
3458 | static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) | 3369 | static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) |
3459 | { | 3370 | { |
3460 | struct kmem_cache *cachep; | 3371 | struct kmem_cache *cachep; |
@@ -3472,13 +3383,20 @@ static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) | |||
3472 | mutex_unlock(&memcg->slab_caches_mutex); | 3383 | mutex_unlock(&memcg->slab_caches_mutex); |
3473 | } | 3384 | } |
3474 | 3385 | ||
3386 | struct create_work { | ||
3387 | struct mem_cgroup *memcg; | ||
3388 | struct kmem_cache *cachep; | ||
3389 | struct work_struct work; | ||
3390 | }; | ||
3391 | |||
3475 | static void memcg_create_cache_work_func(struct work_struct *w) | 3392 | static void memcg_create_cache_work_func(struct work_struct *w) |
3476 | { | 3393 | { |
3477 | struct create_work *cw; | 3394 | struct create_work *cw = container_of(w, struct create_work, work); |
3395 | struct mem_cgroup *memcg = cw->memcg; | ||
3396 | struct kmem_cache *cachep = cw->cachep; | ||
3478 | 3397 | ||
3479 | cw = container_of(w, struct create_work, work); | 3398 | kmem_cache_create_memcg(memcg, cachep); |
3480 | memcg_create_kmem_cache(cw->memcg, cw->cachep); | 3399 | css_put(&memcg->css); |
3481 | css_put(&cw->memcg->css); | ||
3482 | kfree(cw); | 3400 | kfree(cw); |
3483 | } | 3401 | } |
3484 | 3402 | ||
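The reworked work handler recovers its request with container_of() from the embedded work item. The self-contained sketch below shows that pattern with a minimal container_of macro and simplified work/request structures; the real workqueue machinery is omitted and the callback is invoked directly.

        #include <stddef.h>
        #include <stdio.h>
        #include <stdlib.h>

        #define container_of(ptr, type, member) \
                ((type *)((char *)(ptr) - offsetof(type, member)))

        struct work {
                void (*func)(struct work *w);
        };

        struct create_request {
                const char *group;
                const char *cache;
                struct work work;       /* embedded, handed to the work queue */
        };

        static void create_work_func(struct work *w)
        {
                struct create_request *req =
                        container_of(w, struct create_request, work);

                printf("create %s for %s\n", req->cache, req->group);
                free(req);      /* mirrors the kfree(cw) at the end of the callback */
        }

        int main(void)
        {
                struct create_request *req = malloc(sizeof(*req));

                if (!req)
                        return 1;
                req->group = "mygroup";
                req->cache = "dentry";
                req->work.func = create_work_func;

                /* a real workqueue would run this later; call it directly here */
                req->work.func(&req->work);
                return 0;
        }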
@@ -3637,15 +3555,7 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) | |||
3637 | if (!current->mm || current->memcg_kmem_skip_account) | 3555 | if (!current->mm || current->memcg_kmem_skip_account) |
3638 | return true; | 3556 | return true; |
3639 | 3557 | ||
3640 | memcg = try_get_mem_cgroup_from_mm(current->mm); | 3558 | memcg = get_mem_cgroup_from_mm(current->mm); |
3641 | |||
3642 | /* | ||
3643 | * very rare case described in mem_cgroup_from_task. Unfortunately there | ||
3644 | * isn't much we can do without complicating this too much, and it would | ||
3645 | * be gfp-dependent anyway. Just let it go | ||
3646 | */ | ||
3647 | if (unlikely(!memcg)) | ||
3648 | return true; | ||
3649 | 3559 | ||
3650 | if (!memcg_can_account_kmem(memcg)) { | 3560 | if (!memcg_can_account_kmem(memcg)) { |
3651 | css_put(&memcg->css); | 3561 | css_put(&memcg->css); |
@@ -3748,19 +3658,6 @@ void mem_cgroup_split_huge_fixup(struct page *head) | |||
3748 | } | 3658 | } |
3749 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | 3659 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
3750 | 3660 | ||
3751 | static inline | ||
3752 | void mem_cgroup_move_account_page_stat(struct mem_cgroup *from, | ||
3753 | struct mem_cgroup *to, | ||
3754 | unsigned int nr_pages, | ||
3755 | enum mem_cgroup_stat_index idx) | ||
3756 | { | ||
3757 | /* Update stat data for mem_cgroup */ | ||
3758 | preempt_disable(); | ||
3759 | __this_cpu_sub(from->stat->count[idx], nr_pages); | ||
3760 | __this_cpu_add(to->stat->count[idx], nr_pages); | ||
3761 | preempt_enable(); | ||
3762 | } | ||
3763 | |||
3764 | /** | 3661 | /** |
3765 | * mem_cgroup_move_account - move account of the page | 3662 | * mem_cgroup_move_account - move account of the page |
3766 | * @page: the page | 3663 | * @page: the page |
@@ -3806,13 +3703,19 @@ static int mem_cgroup_move_account(struct page *page, | |||
3806 | 3703 | ||
3807 | move_lock_mem_cgroup(from, &flags); | 3704 | move_lock_mem_cgroup(from, &flags); |
3808 | 3705 | ||
3809 | if (!anon && page_mapped(page)) | 3706 | if (!anon && page_mapped(page)) { |
3810 | mem_cgroup_move_account_page_stat(from, to, nr_pages, | 3707 | __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], |
3811 | MEM_CGROUP_STAT_FILE_MAPPED); | 3708 | nr_pages); |
3709 | __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], | ||
3710 | nr_pages); | ||
3711 | } | ||
3812 | 3712 | ||
3813 | if (PageWriteback(page)) | 3713 | if (PageWriteback(page)) { |
3814 | mem_cgroup_move_account_page_stat(from, to, nr_pages, | 3714 | __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK], |
3815 | MEM_CGROUP_STAT_WRITEBACK); | 3715 | nr_pages); |
3716 | __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK], | ||
3717 | nr_pages); | ||
3718 | } | ||
3816 | 3719 | ||
3817 | mem_cgroup_charge_statistics(from, page, anon, -nr_pages); | 3720 | mem_cgroup_charge_statistics(from, page, anon, -nr_pages); |
3818 | 3721 | ||
@@ -3898,19 +3801,19 @@ out: | |||
3898 | return ret; | 3801 | return ret; |
3899 | } | 3802 | } |
3900 | 3803 | ||
3901 | /* | 3804 | int mem_cgroup_charge_anon(struct page *page, |
3902 | * Charge the memory controller for page usage. | 3805 | struct mm_struct *mm, gfp_t gfp_mask) |
3903 | * Return | ||
3904 | * 0 if the charge was successful | ||
3905 | * < 0 if the cgroup is over its limit | ||
3906 | */ | ||
3907 | static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | ||
3908 | gfp_t gfp_mask, enum charge_type ctype) | ||
3909 | { | 3806 | { |
3910 | struct mem_cgroup *memcg = NULL; | ||
3911 | unsigned int nr_pages = 1; | 3807 | unsigned int nr_pages = 1; |
3808 | struct mem_cgroup *memcg; | ||
3912 | bool oom = true; | 3809 | bool oom = true; |
3913 | int ret; | 3810 | |
3811 | if (mem_cgroup_disabled()) | ||
3812 | return 0; | ||
3813 | |||
3814 | VM_BUG_ON_PAGE(page_mapped(page), page); | ||
3815 | VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page); | ||
3816 | VM_BUG_ON(!mm); | ||
3914 | 3817 | ||
3915 | if (PageTransHuge(page)) { | 3818 | if (PageTransHuge(page)) { |
3916 | nr_pages <<= compound_order(page); | 3819 | nr_pages <<= compound_order(page); |
@@ -3922,25 +3825,14 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | |||
3922 | oom = false; | 3825 | oom = false; |
3923 | } | 3826 | } |
3924 | 3827 | ||
3925 | ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom); | 3828 | memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, nr_pages, oom); |
3926 | if (ret == -ENOMEM) | 3829 | if (!memcg) |
3927 | return ret; | 3830 | return -ENOMEM; |
3928 | __mem_cgroup_commit_charge(memcg, page, nr_pages, ctype, false); | 3831 | __mem_cgroup_commit_charge(memcg, page, nr_pages, |
3832 | MEM_CGROUP_CHARGE_TYPE_ANON, false); | ||
3929 | return 0; | 3833 | return 0; |
3930 | } | 3834 | } |
3931 | 3835 | ||
3932 | int mem_cgroup_newpage_charge(struct page *page, | ||
3933 | struct mm_struct *mm, gfp_t gfp_mask) | ||
3934 | { | ||
3935 | if (mem_cgroup_disabled()) | ||
3936 | return 0; | ||
3937 | VM_BUG_ON_PAGE(page_mapped(page), page); | ||
3938 | VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page); | ||
3939 | VM_BUG_ON(!mm); | ||
3940 | return mem_cgroup_charge_common(page, mm, gfp_mask, | ||
3941 | MEM_CGROUP_CHARGE_TYPE_ANON); | ||
3942 | } | ||
3943 | |||
3944 | /* | 3836 | /* |
3945 | * While swap-in, try_charge -> commit or cancel, the page is locked. | 3837 | * While swap-in, try_charge -> commit or cancel, the page is locked. |
3946 | * And when try_charge() successfully returns, one refcnt to memcg without | 3838 | * And when try_charge() successfully returns, one refcnt to memcg without |
@@ -3952,7 +3844,7 @@ static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm, | |||
3952 | gfp_t mask, | 3844 | gfp_t mask, |
3953 | struct mem_cgroup **memcgp) | 3845 | struct mem_cgroup **memcgp) |
3954 | { | 3846 | { |
3955 | struct mem_cgroup *memcg; | 3847 | struct mem_cgroup *memcg = NULL; |
3956 | struct page_cgroup *pc; | 3848 | struct page_cgroup *pc; |
3957 | int ret; | 3849 | int ret; |
3958 | 3850 | ||
@@ -3965,31 +3857,29 @@ static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm, | |||
3965 | * in turn serializes uncharging. | 3857 | * in turn serializes uncharging. |
3966 | */ | 3858 | */ |
3967 | if (PageCgroupUsed(pc)) | 3859 | if (PageCgroupUsed(pc)) |
3968 | return 0; | 3860 | goto out; |
3969 | if (!do_swap_account) | 3861 | if (do_swap_account) |
3970 | goto charge_cur_mm; | 3862 | memcg = try_get_mem_cgroup_from_page(page); |
3971 | memcg = try_get_mem_cgroup_from_page(page); | ||
3972 | if (!memcg) | 3863 | if (!memcg) |
3973 | goto charge_cur_mm; | 3864 | memcg = get_mem_cgroup_from_mm(mm); |
3974 | *memcgp = memcg; | 3865 | ret = mem_cgroup_try_charge(memcg, mask, 1, true); |
3975 | ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true); | ||
3976 | css_put(&memcg->css); | 3866 | css_put(&memcg->css); |
3977 | if (ret == -EINTR) | 3867 | if (ret == -EINTR) |
3978 | ret = 0; | 3868 | memcg = root_mem_cgroup; |
3979 | return ret; | 3869 | else if (ret) |
3980 | charge_cur_mm: | 3870 | return ret; |
3981 | ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true); | 3871 | out: |
3982 | if (ret == -EINTR) | 3872 | *memcgp = memcg; |
3983 | ret = 0; | 3873 | return 0; |
3984 | return ret; | ||
3985 | } | 3874 | } |
3986 | 3875 | ||
3987 | int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page, | 3876 | int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page, |
3988 | gfp_t gfp_mask, struct mem_cgroup **memcgp) | 3877 | gfp_t gfp_mask, struct mem_cgroup **memcgp) |
3989 | { | 3878 | { |
3990 | *memcgp = NULL; | 3879 | if (mem_cgroup_disabled()) { |
3991 | if (mem_cgroup_disabled()) | 3880 | *memcgp = NULL; |
3992 | return 0; | 3881 | return 0; |
3882 | } | ||
3993 | /* | 3883 | /* |
3994 | * A racing thread's fault, or swapoff, may have already | 3884 | * A racing thread's fault, or swapoff, may have already |
3995 | * updated the pte, and even removed page from swap cache: in | 3885 | * updated the pte, and even removed page from swap cache: in |
@@ -3997,12 +3887,13 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page, | |||
3997 | * there's also a KSM case which does need to charge the page. | 3887 | * there's also a KSM case which does need to charge the page. |
3998 | */ | 3888 | */ |
3999 | if (!PageSwapCache(page)) { | 3889 | if (!PageSwapCache(page)) { |
4000 | int ret; | 3890 | struct mem_cgroup *memcg; |
4001 | 3891 | ||
4002 | ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, memcgp, true); | 3892 | memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true); |
4003 | if (ret == -EINTR) | 3893 | if (!memcg) |
4004 | ret = 0; | 3894 | return -ENOMEM; |
4005 | return ret; | 3895 | *memcgp = memcg; |
3896 | return 0; | ||
4006 | } | 3897 | } |
4007 | return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp); | 3898 | return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp); |
4008 | } | 3899 | } |
@@ -4046,11 +3937,11 @@ void mem_cgroup_commit_charge_swapin(struct page *page, | |||
4046 | MEM_CGROUP_CHARGE_TYPE_ANON); | 3937 | MEM_CGROUP_CHARGE_TYPE_ANON); |
4047 | } | 3938 | } |
4048 | 3939 | ||
4049 | int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | 3940 | int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm, |
4050 | gfp_t gfp_mask) | 3941 | gfp_t gfp_mask) |
4051 | { | 3942 | { |
4052 | struct mem_cgroup *memcg = NULL; | ||
4053 | enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; | 3943 | enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; |
3944 | struct mem_cgroup *memcg; | ||
4054 | int ret; | 3945 | int ret; |
4055 | 3946 | ||
4056 | if (mem_cgroup_disabled()) | 3947 | if (mem_cgroup_disabled()) |
@@ -4058,15 +3949,28 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | |||
4058 | if (PageCompound(page)) | 3949 | if (PageCompound(page)) |
4059 | return 0; | 3950 | return 0; |
4060 | 3951 | ||
4061 | if (!PageSwapCache(page)) | 3952 | if (PageSwapCache(page)) { /* shmem */ |
4062 | ret = mem_cgroup_charge_common(page, mm, gfp_mask, type); | ||
4063 | else { /* page is swapcache/shmem */ | ||
4064 | ret = __mem_cgroup_try_charge_swapin(mm, page, | 3953 | ret = __mem_cgroup_try_charge_swapin(mm, page, |
4065 | gfp_mask, &memcg); | 3954 | gfp_mask, &memcg); |
4066 | if (!ret) | 3955 | if (ret) |
4067 | __mem_cgroup_commit_charge_swapin(page, memcg, type); | 3956 | return ret; |
3957 | __mem_cgroup_commit_charge_swapin(page, memcg, type); | ||
3958 | return 0; | ||
4068 | } | 3959 | } |
4069 | return ret; | 3960 | |
3961 | /* | ||
3962 | * Page cache insertions can happen without an actual mm | ||
3963 | * context, e.g. during disk probing on boot. | ||
3964 | */ | ||
3965 | if (unlikely(!mm)) | ||
3966 | memcg = root_mem_cgroup; | ||
3967 | else { | ||
3968 | memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true); | ||
3969 | if (!memcg) | ||
3970 | return -ENOMEM; | ||
3971 | } | ||
3972 | __mem_cgroup_commit_charge(memcg, page, 1, type, false); | ||
3973 | return 0; | ||
4070 | } | 3974 | } |
4071 | 3975 | ||
4072 | static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg, | 3976 | static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg, |
@@ -6678,8 +6582,7 @@ one_by_one: | |||
6678 | batch_count = PRECHARGE_COUNT_AT_ONCE; | 6582 | batch_count = PRECHARGE_COUNT_AT_ONCE; |
6679 | cond_resched(); | 6583 | cond_resched(); |
6680 | } | 6584 | } |
6681 | ret = __mem_cgroup_try_charge(NULL, | 6585 | ret = mem_cgroup_try_charge(memcg, GFP_KERNEL, 1, false); |
6682 | GFP_KERNEL, 1, &memcg, false); | ||
6683 | if (ret) | 6586 | if (ret) |
6684 | /* mem_cgroup_clear_mc() will do uncharge later */ | 6587 | /* mem_cgroup_clear_mc() will do uncharge later */ |
6685 | return ret; | 6588 | return ret; |
diff --git a/mm/memory.c b/mm/memory.c index 82c1e4cf00d1..d0f0bef3be48 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -60,6 +60,7 @@ | |||
60 | #include <linux/migrate.h> | 60 | #include <linux/migrate.h> |
61 | #include <linux/string.h> | 61 | #include <linux/string.h> |
62 | #include <linux/dma-debug.h> | 62 | #include <linux/dma-debug.h> |
63 | #include <linux/debugfs.h> | ||
63 | 64 | ||
64 | #include <asm/io.h> | 65 | #include <asm/io.h> |
65 | #include <asm/pgalloc.h> | 66 | #include <asm/pgalloc.h> |
@@ -1320,9 +1321,9 @@ static void unmap_single_vma(struct mmu_gather *tlb, | |||
1320 | * It is undesirable to test vma->vm_file as it | 1321 | * It is undesirable to test vma->vm_file as it |
1321 | * should be non-null for valid hugetlb area. | 1322 | * should be non-null for valid hugetlb area. |
1322 | * However, vm_file will be NULL in the error | 1323 | * However, vm_file will be NULL in the error |
1323 | * cleanup path of do_mmap_pgoff. When | 1324 | * cleanup path of mmap_region. When |
1324 | * hugetlbfs ->mmap method fails, | 1325 | * hugetlbfs ->mmap method fails, |
1325 | * do_mmap_pgoff() nullifies vma->vm_file | 1326 | * mmap_region() nullifies vma->vm_file |
1326 | * before calling this function to clean up. | 1327 | * before calling this function to clean up. |
1327 | * Since no pte has actually been setup, it is | 1328 | * Since no pte has actually been setup, it is |
1328 | * safe to do nothing in this case. | 1329 | * safe to do nothing in this case. |
@@ -2781,7 +2782,7 @@ reuse: | |||
2781 | */ | 2782 | */ |
2782 | if (!page_mkwrite) { | 2783 | if (!page_mkwrite) { |
2783 | wait_on_page_locked(dirty_page); | 2784 | wait_on_page_locked(dirty_page); |
2784 | set_page_dirty_balance(dirty_page, page_mkwrite); | 2785 | set_page_dirty_balance(dirty_page); |
2785 | /* file_update_time outside page_lock */ | 2786 | /* file_update_time outside page_lock */ |
2786 | if (vma->vm_file) | 2787 | if (vma->vm_file) |
2787 | file_update_time(vma->vm_file); | 2788 | file_update_time(vma->vm_file); |
@@ -2827,7 +2828,7 @@ gotten: | |||
2827 | } | 2828 | } |
2828 | __SetPageUptodate(new_page); | 2829 | __SetPageUptodate(new_page); |
2829 | 2830 | ||
2830 | if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) | 2831 | if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)) |
2831 | goto oom_free_new; | 2832 | goto oom_free_new; |
2832 | 2833 | ||
2833 | mmun_start = address & PAGE_MASK; | 2834 | mmun_start = address & PAGE_MASK; |
@@ -3280,7 +3281,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3280 | */ | 3281 | */ |
3281 | __SetPageUptodate(page); | 3282 | __SetPageUptodate(page); |
3282 | 3283 | ||
3283 | if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) | 3284 | if (mem_cgroup_charge_anon(page, mm, GFP_KERNEL)) |
3284 | goto oom_free_page; | 3285 | goto oom_free_page; |
3285 | 3286 | ||
3286 | entry = mk_pte(page, vma->vm_page_prot); | 3287 | entry = mk_pte(page, vma->vm_page_prot); |
@@ -3342,7 +3343,22 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address, | |||
3342 | return ret; | 3343 | return ret; |
3343 | } | 3344 | } |
3344 | 3345 | ||
3345 | static void do_set_pte(struct vm_area_struct *vma, unsigned long address, | 3346 | /** |
3347 | * do_set_pte - setup new PTE entry for given page and add reverse page mapping. | ||
3348 | * | ||
3349 | * @vma: virtual memory area | ||
3350 | * @address: user virtual address | ||
3351 | * @page: page to map | ||
3352 | * @pte: pointer to target page table entry | ||
3353 | * @write: true, if new entry is writable | ||
3354 | * @anon: true, if it's anonymous page | ||
3355 | * | ||
3356 | * Caller must hold page table lock relevant for @pte. | ||
3357 | * | ||
3358 | * Target users are the page fault handler itself and implementations of | ||
3359 | * vm_ops->map_pages. | ||
3360 | */ | ||
3361 | void do_set_pte(struct vm_area_struct *vma, unsigned long address, | ||
3346 | struct page *page, pte_t *pte, bool write, bool anon) | 3362 | struct page *page, pte_t *pte, bool write, bool anon) |
3347 | { | 3363 | { |
3348 | pte_t entry; | 3364 | pte_t entry; |
@@ -3366,6 +3382,105 @@ static void do_set_pte(struct vm_area_struct *vma, unsigned long address, | |||
3366 | update_mmu_cache(vma, address, pte); | 3382 | update_mmu_cache(vma, address, pte); |
3367 | } | 3383 | } |
3368 | 3384 | ||
3385 | #define FAULT_AROUND_ORDER 4 | ||
3386 | |||
3387 | #ifdef CONFIG_DEBUG_FS | ||
3388 | static unsigned int fault_around_order = FAULT_AROUND_ORDER; | ||
3389 | |||
3390 | static int fault_around_order_get(void *data, u64 *val) | ||
3391 | { | ||
3392 | *val = fault_around_order; | ||
3393 | return 0; | ||
3394 | } | ||
3395 | |||
3396 | static int fault_around_order_set(void *data, u64 val) | ||
3397 | { | ||
3398 | BUILD_BUG_ON((1UL << FAULT_AROUND_ORDER) > PTRS_PER_PTE); | ||
3399 | if (1UL << val > PTRS_PER_PTE) | ||
3400 | return -EINVAL; | ||
3401 | fault_around_order = val; | ||
3402 | return 0; | ||
3403 | } | ||
3404 | DEFINE_SIMPLE_ATTRIBUTE(fault_around_order_fops, | ||
3405 | fault_around_order_get, fault_around_order_set, "%llu\n"); | ||
3406 | |||
3407 | static int __init fault_around_debugfs(void) | ||
3408 | { | ||
3409 | void *ret; | ||
3410 | |||
3411 | ret = debugfs_create_file("fault_around_order", 0644, NULL, NULL, | ||
3412 | &fault_around_order_fops); | ||
3413 | if (!ret) | ||
3414 | pr_warn("Failed to create fault_around_order in debugfs"); | ||
3415 | return 0; | ||
3416 | } | ||
3417 | late_initcall(fault_around_debugfs); | ||
3418 | |||
3419 | static inline unsigned long fault_around_pages(void) | ||
3420 | { | ||
3421 | return 1UL << fault_around_order; | ||
3422 | } | ||
3423 | |||
3424 | static inline unsigned long fault_around_mask(void) | ||
3425 | { | ||
3426 | return ~((1UL << (PAGE_SHIFT + fault_around_order)) - 1); | ||
3427 | } | ||
3428 | #else | ||
3429 | static inline unsigned long fault_around_pages(void) | ||
3430 | { | ||
3431 | unsigned long nr_pages; | ||
3432 | |||
3433 | nr_pages = 1UL << FAULT_AROUND_ORDER; | ||
3434 | BUILD_BUG_ON(nr_pages > PTRS_PER_PTE); | ||
3435 | return nr_pages; | ||
3436 | } | ||
3437 | |||
3438 | static inline unsigned long fault_around_mask(void) | ||
3439 | { | ||
3440 | return ~((1UL << (PAGE_SHIFT + FAULT_AROUND_ORDER)) - 1); | ||
3441 | } | ||
3442 | #endif | ||
3443 | |||
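For readers unfamiliar with the window arithmetic: fault_around_mask() rounds the faulting address down to a 2^order-page boundary, and fault_around_pages() is the window size in pages. A small stand-alone demo of the same expressions, using assumed values for PAGE_SHIFT and the order:

        #include <stdio.h>

        #define PAGE_SHIFT              12
        #define FAULT_AROUND_ORDER      4

        static unsigned long fault_around_pages(void)
        {
                return 1UL << FAULT_AROUND_ORDER;
        }

        static unsigned long fault_around_mask(void)
        {
                return ~((1UL << (PAGE_SHIFT + FAULT_AROUND_ORDER)) - 1);
        }

        int main(void)
        {
                unsigned long address = 0x7f1234567890UL;

                printf("pages: %lu\n", fault_around_pages());           /* 16 */
                printf("start: %#lx\n", address & fault_around_mask()); /* 0x7f1234560000 */
                return 0;
        }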
3444 | static void do_fault_around(struct vm_area_struct *vma, unsigned long address, | ||
3445 | pte_t *pte, pgoff_t pgoff, unsigned int flags) | ||
3446 | { | ||
3447 | unsigned long start_addr; | ||
3448 | pgoff_t max_pgoff; | ||
3449 | struct vm_fault vmf; | ||
3450 | int off; | ||
3451 | |||
3452 | start_addr = max(address & fault_around_mask(), vma->vm_start); | ||
3453 | off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); | ||
3454 | pte -= off; | ||
3455 | pgoff -= off; | ||
3456 | |||
3457 | /* | ||
3458 | * max_pgoff is either end of page table or end of vma | ||
3459 | * or fault_around_pages() from pgoff, depending on what is nearest. | ||
3460 | */ | ||
3461 | max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + | ||
3462 | PTRS_PER_PTE - 1; | ||
3463 | max_pgoff = min3(max_pgoff, vma_pages(vma) + vma->vm_pgoff - 1, | ||
3464 | pgoff + fault_around_pages() - 1); | ||
3465 | |||
3466 | /* Check if it makes any sense to call ->map_pages */ | ||
3467 | while (!pte_none(*pte)) { | ||
3468 | if (++pgoff > max_pgoff) | ||
3469 | return; | ||
3470 | start_addr += PAGE_SIZE; | ||
3471 | if (start_addr >= vma->vm_end) | ||
3472 | return; | ||
3473 | pte++; | ||
3474 | } | ||
3475 | |||
3476 | vmf.virtual_address = (void __user *) start_addr; | ||
3477 | vmf.pte = pte; | ||
3478 | vmf.pgoff = pgoff; | ||
3479 | vmf.max_pgoff = max_pgoff; | ||
3480 | vmf.flags = flags; | ||
3481 | vma->vm_ops->map_pages(vma, &vmf); | ||
3482 | } | ||
3483 | |||
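do_fault_around() then clamps its window against the end of the page table, the end of the VMA and the fault-around size. Here is a worked example of that clamping as a plain C program; every constant and the VMA layout are made-up demo values, not a real configuration.

        #include <stdio.h>

        #define PAGE_SHIFT      12
        #define PTRS_PER_PTE    512
        #define ORDER           4

        static unsigned long min3ul(unsigned long a, unsigned long b, unsigned long c)
        {
                unsigned long m = a < b ? a : b;
                return m < c ? m : c;
        }

        int main(void)
        {
                unsigned long vm_start = 0x400000, vm_pgoff = 0, vma_pages = 100;
                unsigned long address = 0x40a500;               /* page 10 of the vma */
                unsigned long pgoff = 10;
                unsigned long mask = ~((1UL << (PAGE_SHIFT + ORDER)) - 1);
                unsigned long start_addr, off, max_pgoff;

                start_addr = address & mask;
                if (start_addr < vm_start)
                        start_addr = vm_start;
                off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
                pgoff -= off;

                /* end of page table, end of vma, or the fault-around window */
                max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
                                + PTRS_PER_PTE - 1;
                max_pgoff = min3ul(max_pgoff, vma_pages + vm_pgoff - 1,
                                   pgoff + (1UL << ORDER) - 1);

                printf("start_addr=%#lx pgoff=%lu max_pgoff=%lu\n",
                       start_addr, pgoff, max_pgoff);   /* 0x400000 0 15 */
                return 0;
        }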
3369 | static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 3484 | static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, |
3370 | unsigned long address, pmd_t *pmd, | 3485 | unsigned long address, pmd_t *pmd, |
3371 | pgoff_t pgoff, unsigned int flags, pte_t orig_pte) | 3486 | pgoff_t pgoff, unsigned int flags, pte_t orig_pte) |
@@ -3373,7 +3488,20 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3373 | struct page *fault_page; | 3488 | struct page *fault_page; |
3374 | spinlock_t *ptl; | 3489 | spinlock_t *ptl; |
3375 | pte_t *pte; | 3490 | pte_t *pte; |
3376 | int ret; | 3491 | int ret = 0; |
3492 | |||
3493 | /* | ||
3494 | * Let's call ->map_pages() first and use ->fault() as fallback | ||
3495 | * if page by the offset is not ready to be mapped (cold cache or | ||
3496 | * something). | ||
3497 | */ | ||
3498 | if (vma->vm_ops->map_pages) { | ||
3499 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); | ||
3500 | do_fault_around(vma, address, pte, pgoff, flags); | ||
3501 | if (!pte_same(*pte, orig_pte)) | ||
3502 | goto unlock_out; | ||
3503 | pte_unmap_unlock(pte, ptl); | ||
3504 | } | ||
3377 | 3505 | ||
3378 | ret = __do_fault(vma, address, pgoff, flags, &fault_page); | 3506 | ret = __do_fault(vma, address, pgoff, flags, &fault_page); |
3379 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) | 3507 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) |
@@ -3387,8 +3515,9 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3387 | return ret; | 3515 | return ret; |
3388 | } | 3516 | } |
3389 | do_set_pte(vma, address, fault_page, pte, false, false); | 3517 | do_set_pte(vma, address, fault_page, pte, false, false); |
3390 | pte_unmap_unlock(pte, ptl); | ||
3391 | unlock_page(fault_page); | 3518 | unlock_page(fault_page); |
3519 | unlock_out: | ||
3520 | pte_unmap_unlock(pte, ptl); | ||
3392 | return ret; | 3521 | return ret; |
3393 | } | 3522 | } |
3394 | 3523 | ||
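The reshaped do_read_fault() tries the batched ->map_pages() path first and only falls back to the single-page ->fault() path when the entry it actually needs is still unmapped. A toy C sketch of that try-fast-then-fall-back shape; the ready/mapped arrays stand in for the page cache and the page table, with no locking modelled.

        #include <stdio.h>
        #include <stdbool.h>

        #define N 8

        static bool ready[N] = { false, true, true, false, true, false, false, false };
        static bool mapped[N];

        /* cheap: map every already-ready neighbour in the window */
        static void map_around(int idx)
        {
                for (int i = 0; i < N; i++)
                        if (ready[i])
                                mapped[i] = true;
                (void)idx;
        }

        /* expensive: actually bring one entry in (I/O in the real fault path) */
        static void slow_fill(int idx)
        {
                ready[idx] = true;
                mapped[idx] = true;
        }

        static void handle_fault(int idx)
        {
                map_around(idx);
                if (!mapped[idx])       /* fast path didn't cover the needed entry */
                        slow_fill(idx);
        }

        int main(void)
        {
                handle_fault(3);        /* 3 was not ready: falls back to slow_fill() */
                handle_fault(1);        /* 1 was ready: satisfied by map_around()     */
                for (int i = 0; i < N; i++)
                        printf("%d%c", (int)mapped[i], i == N - 1 ? '\n' : ' ');
                return 0;
        }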
@@ -3408,7 +3537,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3408 | if (!new_page) | 3537 | if (!new_page) |
3409 | return VM_FAULT_OOM; | 3538 | return VM_FAULT_OOM; |
3410 | 3539 | ||
3411 | if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) { | 3540 | if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)) { |
3412 | page_cache_release(new_page); | 3541 | page_cache_release(new_page); |
3413 | return VM_FAULT_OOM; | 3542 | return VM_FAULT_OOM; |
3414 | } | 3543 | } |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e3ab02822799..78e1472933ea 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -795,36 +795,6 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, | |||
795 | return err; | 795 | return err; |
796 | } | 796 | } |
797 | 797 | ||
798 | /* | ||
799 | * Update task->flags PF_MEMPOLICY bit: set iff non-default | ||
800 | * mempolicy. Allows more rapid checking of this (combined perhaps | ||
801 | * with other PF_* flag bits) on memory allocation hot code paths. | ||
802 | * | ||
803 | * If called from outside this file, the task 'p' should -only- be | ||
804 | * a newly forked child not yet visible on the task list, because | ||
805 | * manipulating the task flags of a visible task is not safe. | ||
806 | * | ||
807 | * The above limitation is why this routine has the funny name | ||
808 | * mpol_fix_fork_child_flag(). | ||
809 | * | ||
810 | * It is also safe to call this with a task pointer of current, | ||
811 | * which the static wrapper mpol_set_task_struct_flag() does, | ||
812 | * for use within this file. | ||
813 | */ | ||
814 | |||
815 | void mpol_fix_fork_child_flag(struct task_struct *p) | ||
816 | { | ||
817 | if (p->mempolicy) | ||
818 | p->flags |= PF_MEMPOLICY; | ||
819 | else | ||
820 | p->flags &= ~PF_MEMPOLICY; | ||
821 | } | ||
822 | |||
823 | static void mpol_set_task_struct_flag(void) | ||
824 | { | ||
825 | mpol_fix_fork_child_flag(current); | ||
826 | } | ||
827 | |||
828 | /* Set the process memory policy */ | 798 | /* Set the process memory policy */ |
829 | static long do_set_mempolicy(unsigned short mode, unsigned short flags, | 799 | static long do_set_mempolicy(unsigned short mode, unsigned short flags, |
830 | nodemask_t *nodes) | 800 | nodemask_t *nodes) |
@@ -861,7 +831,6 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, | |||
861 | } | 831 | } |
862 | old = current->mempolicy; | 832 | old = current->mempolicy; |
863 | current->mempolicy = new; | 833 | current->mempolicy = new; |
864 | mpol_set_task_struct_flag(); | ||
865 | if (new && new->mode == MPOL_INTERLEAVE && | 834 | if (new && new->mode == MPOL_INTERLEAVE && |
866 | nodes_weight(new->v.nodes)) | 835 | nodes_weight(new->v.nodes)) |
867 | current->il_next = first_node(new->v.nodes); | 836 | current->il_next = first_node(new->v.nodes); |
@@ -1782,21 +1751,18 @@ static unsigned interleave_nodes(struct mempolicy *policy) | |||
1782 | /* | 1751 | /* |
1783 | * Depending on the memory policy provide a node from which to allocate the | 1752 | * Depending on the memory policy provide a node from which to allocate the |
1784 | * next slab entry. | 1753 | * next slab entry. |
1785 | * @policy must be protected by freeing by the caller. If @policy is | ||
1786 | * the current task's mempolicy, this protection is implicit, as only the | ||
1787 | * task can change it's policy. The system default policy requires no | ||
1788 | * such protection. | ||
1789 | */ | 1754 | */ |
1790 | unsigned slab_node(void) | 1755 | unsigned int mempolicy_slab_node(void) |
1791 | { | 1756 | { |
1792 | struct mempolicy *policy; | 1757 | struct mempolicy *policy; |
1758 | int node = numa_mem_id(); | ||
1793 | 1759 | ||
1794 | if (in_interrupt()) | 1760 | if (in_interrupt()) |
1795 | return numa_node_id(); | 1761 | return node; |
1796 | 1762 | ||
1797 | policy = current->mempolicy; | 1763 | policy = current->mempolicy; |
1798 | if (!policy || policy->flags & MPOL_F_LOCAL) | 1764 | if (!policy || policy->flags & MPOL_F_LOCAL) |
1799 | return numa_node_id(); | 1765 | return node; |
1800 | 1766 | ||
1801 | switch (policy->mode) { | 1767 | switch (policy->mode) { |
1802 | case MPOL_PREFERRED: | 1768 | case MPOL_PREFERRED: |
@@ -1816,11 +1782,11 @@ unsigned slab_node(void) | |||
1816 | struct zonelist *zonelist; | 1782 | struct zonelist *zonelist; |
1817 | struct zone *zone; | 1783 | struct zone *zone; |
1818 | enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); | 1784 | enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); |
1819 | zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0]; | 1785 | zonelist = &NODE_DATA(node)->node_zonelists[0]; |
1820 | (void)first_zones_zonelist(zonelist, highest_zoneidx, | 1786 | (void)first_zones_zonelist(zonelist, highest_zoneidx, |
1821 | &policy->v.nodes, | 1787 | &policy->v.nodes, |
1822 | &zone); | 1788 | &zone); |
1823 | return zone ? zone->node : numa_node_id(); | 1789 | return zone ? zone->node : node; |
1824 | } | 1790 | } |
1825 | 1791 | ||
1826 | default: | 1792 | default: |
diff --git a/mm/mempool.c b/mm/mempool.c index 659aa42bad16..905434f18c97 100644 --- a/mm/mempool.c +++ b/mm/mempool.c | |||
@@ -304,9 +304,9 @@ void mempool_free(void *element, mempool_t *pool) | |||
304 | * ensures that there will be frees which return elements to the | 304 | * ensures that there will be frees which return elements to the |
305 | * pool waking up the waiters. | 305 | * pool waking up the waiters. |
306 | */ | 306 | */ |
307 | if (pool->curr_nr < pool->min_nr) { | 307 | if (unlikely(pool->curr_nr < pool->min_nr)) { |
308 | spin_lock_irqsave(&pool->lock, flags); | 308 | spin_lock_irqsave(&pool->lock, flags); |
309 | if (pool->curr_nr < pool->min_nr) { | 309 | if (likely(pool->curr_nr < pool->min_nr)) { |
310 | add_element(pool, element); | 310 | add_element(pool, element); |
311 | spin_unlock_irqrestore(&pool->lock, flags); | 311 | spin_unlock_irqrestore(&pool->lock, flags); |
312 | wake_up(&pool->wait); | 312 | wake_up(&pool->wait); |
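The unlikely()/likely() annotations mark the usual check/lock/re-check shape: a cheap unlocked test filters the common case, and the test is repeated under the lock before the element is put back. A user-space sketch of that shape with a simplified pool (no waiters, no element destructor):

        #include <pthread.h>
        #include <stdio.h>

        struct pool {
                pthread_mutex_t lock;
                int curr_nr;
                int min_nr;
                void *elements[16];
        };

        static void pool_free(struct pool *p, void *element)
        {
                int returned = 0;

                if (p->curr_nr < p->min_nr) {           /* racy fast check */
                        pthread_mutex_lock(&p->lock);
                        if (p->curr_nr < p->min_nr) {   /* re-check under the lock */
                                p->elements[p->curr_nr++] = element;
                                returned = 1;
                        }
                        pthread_mutex_unlock(&p->lock);
                }
                if (!returned) {
                        /* pool already full enough: give the element back directly */
                        printf("freeing %p outside the pool\n", element);
                }
        }

        int main(void)
        {
                struct pool p = { .lock = PTHREAD_MUTEX_INITIALIZER,
                                  .curr_nr = 1, .min_nr = 2 };
                int a, b;

                pool_free(&p, &a);      /* refills the pool (1 < 2)         */
                pool_free(&p, &b);      /* pool is topped up, freed outside */
                printf("curr_nr=%d\n", p.curr_nr);      /* 2 */
                return 0;
        }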
diff --git a/mm/mlock.c b/mm/mlock.c index 4e1a68162285..b1eb53634005 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
@@ -79,6 +79,7 @@ void clear_page_mlock(struct page *page) | |||
79 | */ | 79 | */ |
80 | void mlock_vma_page(struct page *page) | 80 | void mlock_vma_page(struct page *page) |
81 | { | 81 | { |
82 | /* Serialize with page migration */ | ||
82 | BUG_ON(!PageLocked(page)); | 83 | BUG_ON(!PageLocked(page)); |
83 | 84 | ||
84 | if (!TestSetPageMlocked(page)) { | 85 | if (!TestSetPageMlocked(page)) { |
@@ -174,6 +175,7 @@ unsigned int munlock_vma_page(struct page *page) | |||
174 | unsigned int nr_pages; | 175 | unsigned int nr_pages; |
175 | struct zone *zone = page_zone(page); | 176 | struct zone *zone = page_zone(page); |
176 | 177 | ||
178 | /* For try_to_munlock() and to serialize with page migration */ | ||
177 | BUG_ON(!PageLocked(page)); | 179 | BUG_ON(!PageLocked(page)); |
178 | 180 | ||
179 | /* | 181 | /* |
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
@@ -10,6 +10,7 @@ | |||
10 | #include <linux/slab.h> | 10 | #include <linux/slab.h> |
11 | #include <linux/backing-dev.h> | 11 | #include <linux/backing-dev.h> |
12 | #include <linux/mm.h> | 12 | #include <linux/mm.h> |
13 | #include <linux/vmacache.h> | ||
13 | #include <linux/shm.h> | 14 | #include <linux/shm.h> |
14 | #include <linux/mman.h> | 15 | #include <linux/mman.h> |
15 | #include <linux/pagemap.h> | 16 | #include <linux/pagemap.h> |
@@ -681,8 +682,9 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, | |||
681 | prev->vm_next = next = vma->vm_next; | 682 | prev->vm_next = next = vma->vm_next; |
682 | if (next) | 683 | if (next) |
683 | next->vm_prev = prev; | 684 | next->vm_prev = prev; |
684 | if (mm->mmap_cache == vma) | 685 | |
685 | mm->mmap_cache = prev; | 686 | /* Kill the cache */ |
687 | vmacache_invalidate(mm); | ||
686 | } | 688 | } |
687 | 689 | ||
688 | /* | 690 | /* |
@@ -1989,34 +1991,33 @@ EXPORT_SYMBOL(get_unmapped_area); | |||
1989 | /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ | 1991 | /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ |
1990 | struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) | 1992 | struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) |
1991 | { | 1993 | { |
1992 | struct vm_area_struct *vma = NULL; | 1994 | struct rb_node *rb_node; |
1995 | struct vm_area_struct *vma; | ||
1993 | 1996 | ||
1994 | /* Check the cache first. */ | 1997 | /* Check the cache first. */ |
1995 | /* (Cache hit rate is typically around 35%.) */ | 1998 | vma = vmacache_find(mm, addr); |
1996 | vma = ACCESS_ONCE(mm->mmap_cache); | 1999 | if (likely(vma)) |
1997 | if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { | 2000 | return vma; |
1998 | struct rb_node *rb_node; | ||
1999 | 2001 | ||
2000 | rb_node = mm->mm_rb.rb_node; | 2002 | rb_node = mm->mm_rb.rb_node; |
2001 | vma = NULL; | 2003 | vma = NULL; |
2002 | 2004 | ||
2003 | while (rb_node) { | 2005 | while (rb_node) { |
2004 | struct vm_area_struct *vma_tmp; | 2006 | struct vm_area_struct *tmp; |
2005 | 2007 | ||
2006 | vma_tmp = rb_entry(rb_node, | 2008 | tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb); |
2007 | struct vm_area_struct, vm_rb); | 2009 | |
2008 | 2010 | if (tmp->vm_end > addr) { | |
2009 | if (vma_tmp->vm_end > addr) { | 2011 | vma = tmp; |
2010 | vma = vma_tmp; | 2012 | if (tmp->vm_start <= addr) |
2011 | if (vma_tmp->vm_start <= addr) | 2013 | break; |
2012 | break; | 2014 | rb_node = rb_node->rb_left; |
2013 | rb_node = rb_node->rb_left; | 2015 | } else |
2014 | } else | 2016 | rb_node = rb_node->rb_right; |
2015 | rb_node = rb_node->rb_right; | ||
2016 | } | ||
2017 | if (vma) | ||
2018 | mm->mmap_cache = vma; | ||
2019 | } | 2017 | } |
2018 | |||
2019 | if (vma) | ||
2020 | vmacache_update(addr, vma); | ||
2020 | return vma; | 2021 | return vma; |
2021 | } | 2022 | } |
2022 | 2023 | ||
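The rewritten find_vma() now consults the per-task VMA cache before walking the rbtree for the first VMA whose vm_end lies above the address, and records a hit afterwards. A simplified user-space model of that flow, with a sorted array and a tiny hash-indexed cache standing in for the rbtree and the vmacache helpers:

        #include <stdio.h>
        #include <stddef.h>

        #define CACHE_SIZE 4

        struct region { unsigned long start, end; };

        static struct region regions[] = {
                { 0x1000, 0x3000 }, { 0x5000, 0x6000 }, { 0x9000, 0xb000 },
        };
        static struct region *cache[CACHE_SIZE];

        static struct region *cache_find(unsigned long addr)
        {
                for (int i = 0; i < CACHE_SIZE; i++)
                        if (cache[i] && cache[i]->start <= addr && cache[i]->end > addr)
                                return cache[i];
                return NULL;
        }

        static void cache_update(unsigned long addr, struct region *r)
        {
                cache[(addr >> 12) % CACHE_SIZE] = r;   /* hash on the page number */
        }

        static struct region *find_region(unsigned long addr)
        {
                struct region *r = cache_find(addr);

                if (r)
                        return r;
                for (size_t i = 0; i < sizeof(regions) / sizeof(regions[0]); i++) {
                        if (regions[i].end > addr) {    /* first region ending above addr */
                                r = &regions[i];
                                break;
                        }
                }
                if (r)
                        cache_update(addr, r);
                return r;
        }

        int main(void)
        {
                struct region *r = find_region(0x5800);

                printf("%#lx-%#lx\n", r->start, r->end);        /* 0x5000-0x6000 */
                printf("cached: %d\n", find_region(0x5800) == r);       /* 1 */
                return 0;
        }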
@@ -2388,7 +2389,9 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2388 | } else | 2389 | } else |
2389 | mm->highest_vm_end = prev ? prev->vm_end : 0; | 2390 | mm->highest_vm_end = prev ? prev->vm_end : 0; |
2390 | tail_vma->vm_next = NULL; | 2391 | tail_vma->vm_next = NULL; |
2391 | mm->mmap_cache = NULL; /* Kill the cache. */ | 2392 | |
2393 | /* Kill the cache */ | ||
2394 | vmacache_invalidate(mm); | ||
2392 | } | 2395 | } |
2393 | 2396 | ||
2394 | /* | 2397 | /* |
diff --git a/mm/mprotect.c b/mm/mprotect.c index 769a67a15803..c43d557941f8 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
@@ -36,6 +36,34 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) | |||
36 | } | 36 | } |
37 | #endif | 37 | #endif |
38 | 38 | ||
39 | /* | ||
40 | * For a prot_numa update we only hold mmap_sem for read so there is a | ||
41 | * potential race with faulting where a pmd was temporarily none. This | ||
42 | * function checks for a transhuge pmd under the appropriate lock. It | ||
43 | * returns a pte if it was successfully locked or NULL if it raced with | ||
44 | * a transhuge insertion. | ||
45 | */ | ||
46 | static pte_t *lock_pte_protection(struct vm_area_struct *vma, pmd_t *pmd, | ||
47 | unsigned long addr, int prot_numa, spinlock_t **ptl) | ||
48 | { | ||
49 | pte_t *pte; | ||
50 | spinlock_t *pmdl; | ||
51 | |||
52 | /* !prot_numa is protected by mmap_sem held for write */ | ||
53 | if (!prot_numa) | ||
54 | return pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl); | ||
55 | |||
56 | pmdl = pmd_lock(vma->vm_mm, pmd); | ||
57 | if (unlikely(pmd_trans_huge(*pmd) || pmd_none(*pmd))) { | ||
58 | spin_unlock(pmdl); | ||
59 | return NULL; | ||
60 | } | ||
61 | |||
62 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl); | ||
63 | spin_unlock(pmdl); | ||
64 | return pte; | ||
65 | } | ||
66 | |||
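lock_pte_protection() takes the outer (pmd) lock only in the racy read-locked case, re-validates the entry, then takes the inner (pte) lock and drops the outer one. The pthread analogue below is a loose sketch of that ordering only; the two mutexes and the "huge" flag are stand-ins and no real page-table state is modelled.

        #include <pthread.h>
        #include <stdbool.h>
        #include <stdio.h>

        struct table {
                pthread_mutex_t outer;  /* stands in for the pmd lock */
                pthread_mutex_t inner;  /* stands in for the pte lock */
                bool huge;              /* stands in for pmd_trans_huge()/pmd_none() */
        };

        /* Returns true with the inner lock held, or false if we raced. */
        static bool lock_for_update(struct table *t, bool have_exclusive)
        {
                if (have_exclusive) {           /* like !prot_numa under mmap_sem write */
                        pthread_mutex_lock(&t->inner);
                        return true;
                }
                pthread_mutex_lock(&t->outer);
                if (t->huge) {                  /* raced with a huge insertion: bail */
                        pthread_mutex_unlock(&t->outer);
                        return false;
                }
                pthread_mutex_lock(&t->inner);
                pthread_mutex_unlock(&t->outer);
                return true;
        }

        int main(void)
        {
                struct table t = { PTHREAD_MUTEX_INITIALIZER,
                                   PTHREAD_MUTEX_INITIALIZER, false };

                if (lock_for_update(&t, false)) {
                        puts("locked for pte updates");
                        pthread_mutex_unlock(&t.inner);
                }
                return 0;
        }

A failed lock_for_update() corresponds to change_pte_range() returning 0: the caller simply skips the range instead of blocking on the racing huge-page insertion.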
39 | static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | 67 | static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, |
40 | unsigned long addr, unsigned long end, pgprot_t newprot, | 68 | unsigned long addr, unsigned long end, pgprot_t newprot, |
41 | int dirty_accountable, int prot_numa) | 69 | int dirty_accountable, int prot_numa) |
@@ -45,7 +73,10 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
45 | spinlock_t *ptl; | 73 | spinlock_t *ptl; |
46 | unsigned long pages = 0; | 74 | unsigned long pages = 0; |
47 | 75 | ||
48 | pte = pte_offset_map_lock(mm, pmd, addr, &ptl); | 76 | pte = lock_pte_protection(vma, pmd, addr, prot_numa, &ptl); |
77 | if (!pte) | ||
78 | return 0; | ||
79 | |||
49 | arch_enter_lazy_mmu_mode(); | 80 | arch_enter_lazy_mmu_mode(); |
50 | do { | 81 | do { |
51 | oldpte = *pte; | 82 | oldpte = *pte; |
@@ -109,15 +140,26 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, | |||
109 | pgprot_t newprot, int dirty_accountable, int prot_numa) | 140 | pgprot_t newprot, int dirty_accountable, int prot_numa) |
110 | { | 141 | { |
111 | pmd_t *pmd; | 142 | pmd_t *pmd; |
143 | struct mm_struct *mm = vma->vm_mm; | ||
112 | unsigned long next; | 144 | unsigned long next; |
113 | unsigned long pages = 0; | 145 | unsigned long pages = 0; |
114 | unsigned long nr_huge_updates = 0; | 146 | unsigned long nr_huge_updates = 0; |
147 | unsigned long mni_start = 0; | ||
115 | 148 | ||
116 | pmd = pmd_offset(pud, addr); | 149 | pmd = pmd_offset(pud, addr); |
117 | do { | 150 | do { |
118 | unsigned long this_pages; | 151 | unsigned long this_pages; |
119 | 152 | ||
120 | next = pmd_addr_end(addr, end); | 153 | next = pmd_addr_end(addr, end); |
154 | if (!pmd_trans_huge(*pmd) && pmd_none_or_clear_bad(pmd)) | ||
155 | continue; | ||
156 | |||
157 | /* invoke the mmu notifier if the pmd is populated */ | ||
158 | if (!mni_start) { | ||
159 | mni_start = addr; | ||
160 | mmu_notifier_invalidate_range_start(mm, mni_start, end); | ||
161 | } | ||
162 | |||
121 | if (pmd_trans_huge(*pmd)) { | 163 | if (pmd_trans_huge(*pmd)) { |
122 | if (next - addr != HPAGE_PMD_SIZE) | 164 | if (next - addr != HPAGE_PMD_SIZE) |
123 | split_huge_page_pmd(vma, addr, pmd); | 165 | split_huge_page_pmd(vma, addr, pmd); |
@@ -130,18 +172,21 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, | |||
130 | pages += HPAGE_PMD_NR; | 172 | pages += HPAGE_PMD_NR; |
131 | nr_huge_updates++; | 173 | nr_huge_updates++; |
132 | } | 174 | } |
175 | |||
176 | /* huge pmd was handled */ | ||
133 | continue; | 177 | continue; |
134 | } | 178 | } |
135 | } | 179 | } |
136 | /* fall through */ | 180 | /* fall through, the trans huge pmd just split */ |
137 | } | 181 | } |
138 | if (pmd_none_or_clear_bad(pmd)) | ||
139 | continue; | ||
140 | this_pages = change_pte_range(vma, pmd, addr, next, newprot, | 182 | this_pages = change_pte_range(vma, pmd, addr, next, newprot, |
141 | dirty_accountable, prot_numa); | 183 | dirty_accountable, prot_numa); |
142 | pages += this_pages; | 184 | pages += this_pages; |
143 | } while (pmd++, addr = next, addr != end); | 185 | } while (pmd++, addr = next, addr != end); |
144 | 186 | ||
187 | if (mni_start) | ||
188 | mmu_notifier_invalidate_range_end(mm, mni_start, end); | ||
189 | |||
145 | if (nr_huge_updates) | 190 | if (nr_huge_updates) |
146 | count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates); | 191 | count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates); |
147 | return pages; | 192 | return pages; |
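The hunk above starts the MMU-notifier range call lazily, on the first populated pmd, and ends it only if it was started, so entirely empty ranges generate no notification at all. A compact sketch of that lazy begin/end pattern, with printf() stand-ins for the notifier calls and a flag array standing in for the pmd entries:

        #include <stdio.h>

        static void range_start(unsigned long start, unsigned long end)
        {
                printf("notify start %#lx-%#lx\n", start, end);
        }

        static void range_end(unsigned long start, unsigned long end)
        {
                printf("notify end   %#lx-%#lx\n", start, end);
        }

        static void update_range(const int *populated, int n,
                                 unsigned long base, unsigned long end)
        {
                unsigned long lazy_start = 0;   /* 0 means "not started yet" */

                for (int i = 0; i < n; i++) {
                        unsigned long addr = base + (unsigned long)i * 0x1000;

                        if (!populated[i])
                                continue;       /* nothing mapped here, skip quietly */
                        if (!lazy_start) {
                                lazy_start = addr;
                                range_start(lazy_start, end);
                        }
                        printf("update entry at %#lx\n", addr);
                }
                if (lazy_start)
                        range_end(lazy_start, end);
        }

        int main(void)
        {
                int populated[] = { 0, 0, 1, 1, 0 };

                update_range(populated, 5, 0x10000, 0x15000);
                return 0;
        }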
@@ -201,15 +246,12 @@ unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, | |||
201 | unsigned long end, pgprot_t newprot, | 246 | unsigned long end, pgprot_t newprot, |
202 | int dirty_accountable, int prot_numa) | 247 | int dirty_accountable, int prot_numa) |
203 | { | 248 | { |
204 | struct mm_struct *mm = vma->vm_mm; | ||
205 | unsigned long pages; | 249 | unsigned long pages; |
206 | 250 | ||
207 | mmu_notifier_invalidate_range_start(mm, start, end); | ||
208 | if (is_vm_hugetlb_page(vma)) | 251 | if (is_vm_hugetlb_page(vma)) |
209 | pages = hugetlb_change_protection(vma, start, end, newprot); | 252 | pages = hugetlb_change_protection(vma, start, end, newprot); |
210 | else | 253 | else |
211 | pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa); | 254 | pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa); |
212 | mmu_notifier_invalidate_range_end(mm, start, end); | ||
213 | 255 | ||
214 | return pages; | 256 | return pages; |
215 | } | 257 | } |
diff --git a/mm/nommu.c b/mm/nommu.c index a554e5a451cd..85f8d6698d48 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -15,6 +15,7 @@ | |||
15 | 15 | ||
16 | #include <linux/export.h> | 16 | #include <linux/export.h> |
17 | #include <linux/mm.h> | 17 | #include <linux/mm.h> |
18 | #include <linux/vmacache.h> | ||
18 | #include <linux/mman.h> | 19 | #include <linux/mman.h> |
19 | #include <linux/swap.h> | 20 | #include <linux/swap.h> |
20 | #include <linux/file.h> | 21 | #include <linux/file.h> |
@@ -24,6 +25,7 @@ | |||
24 | #include <linux/vmalloc.h> | 25 | #include <linux/vmalloc.h> |
25 | #include <linux/blkdev.h> | 26 | #include <linux/blkdev.h> |
26 | #include <linux/backing-dev.h> | 27 | #include <linux/backing-dev.h> |
28 | #include <linux/compiler.h> | ||
27 | #include <linux/mount.h> | 29 | #include <linux/mount.h> |
28 | #include <linux/personality.h> | 30 | #include <linux/personality.h> |
29 | #include <linux/security.h> | 31 | #include <linux/security.h> |
@@ -296,7 +298,7 @@ long vwrite(char *buf, char *addr, unsigned long count) | |||
296 | count = -(unsigned long) addr; | 298 | count = -(unsigned long) addr; |
297 | 299 | ||
298 | memcpy(addr, buf, count); | 300 | memcpy(addr, buf, count); |
299 | return(count); | 301 | return count; |
300 | } | 302 | } |
301 | 303 | ||
302 | /* | 304 | /* |
@@ -459,7 +461,7 @@ EXPORT_SYMBOL_GPL(vm_unmap_aliases); | |||
459 | * Implement a stub for vmalloc_sync_all() if the architecture chose not to | 461 | * Implement a stub for vmalloc_sync_all() if the architecture chose not to |
460 | * have one. | 462 | * have one. |
461 | */ | 463 | */ |
462 | void __attribute__((weak)) vmalloc_sync_all(void) | 464 | void __weak vmalloc_sync_all(void) |
463 | { | 465 | { |
464 | } | 466 | } |
465 | 467 | ||
@@ -768,16 +770,23 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) | |||
768 | */ | 770 | */ |
769 | static void delete_vma_from_mm(struct vm_area_struct *vma) | 771 | static void delete_vma_from_mm(struct vm_area_struct *vma) |
770 | { | 772 | { |
773 | int i; | ||
771 | struct address_space *mapping; | 774 | struct address_space *mapping; |
772 | struct mm_struct *mm = vma->vm_mm; | 775 | struct mm_struct *mm = vma->vm_mm; |
776 | struct task_struct *curr = current; | ||
773 | 777 | ||
774 | kenter("%p", vma); | 778 | kenter("%p", vma); |
775 | 779 | ||
776 | protect_vma(vma, 0); | 780 | protect_vma(vma, 0); |
777 | 781 | ||
778 | mm->map_count--; | 782 | mm->map_count--; |
779 | if (mm->mmap_cache == vma) | 783 | for (i = 0; i < VMACACHE_SIZE; i++) { |
780 | mm->mmap_cache = NULL; | 784 | /* if the vma is cached, invalidate the entire cache */ |
785 | if (curr->vmacache[i] == vma) { | ||
786 | vmacache_invalidate(curr->mm); | ||
787 | break; | ||
788 | } | ||
789 | } | ||
781 | 790 | ||
782 | /* remove the VMA from the mapping */ | 791 | /* remove the VMA from the mapping */ |
783 | if (vma->vm_file) { | 792 | if (vma->vm_file) { |
@@ -825,8 +834,8 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) | |||
825 | struct vm_area_struct *vma; | 834 | struct vm_area_struct *vma; |
826 | 835 | ||
827 | /* check the cache first */ | 836 | /* check the cache first */ |
828 | vma = ACCESS_ONCE(mm->mmap_cache); | 837 | vma = vmacache_find(mm, addr); |
829 | if (vma && vma->vm_start <= addr && vma->vm_end > addr) | 838 | if (likely(vma)) |
830 | return vma; | 839 | return vma; |
831 | 840 | ||
832 | /* trawl the list (there may be multiple mappings in which addr | 841 | /* trawl the list (there may be multiple mappings in which addr |
@@ -835,7 +844,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) | |||
835 | if (vma->vm_start > addr) | 844 | if (vma->vm_start > addr) |
836 | return NULL; | 845 | return NULL; |
837 | if (vma->vm_end > addr) { | 846 | if (vma->vm_end > addr) { |
838 | mm->mmap_cache = vma; | 847 | vmacache_update(addr, vma); |
839 | return vma; | 848 | return vma; |
840 | } | 849 | } |
841 | } | 850 | } |
@@ -874,8 +883,8 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, | |||
874 | unsigned long end = addr + len; | 883 | unsigned long end = addr + len; |
875 | 884 | ||
876 | /* check the cache first */ | 885 | /* check the cache first */ |
877 | vma = mm->mmap_cache; | 886 | vma = vmacache_find_exact(mm, addr, end); |
878 | if (vma && vma->vm_start == addr && vma->vm_end == end) | 887 | if (vma) |
879 | return vma; | 888 | return vma; |
880 | 889 | ||
881 | /* trawl the list (there may be multiple mappings in which addr | 890 | /* trawl the list (there may be multiple mappings in which addr |
@@ -886,7 +895,7 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, | |||
886 | if (vma->vm_start > addr) | 895 | if (vma->vm_start > addr) |
887 | return NULL; | 896 | return NULL; |
888 | if (vma->vm_end == end) { | 897 | if (vma->vm_end == end) { |
889 | mm->mmap_cache = vma; | 898 | vmacache_update(addr, vma); |
890 | return vma; | 899 | return vma; |
891 | } | 900 | } |
892 | } | 901 | } |
@@ -1003,8 +1012,7 @@ static int validate_mmap_request(struct file *file, | |||
1003 | 1012 | ||
1004 | /* we mustn't privatise shared mappings */ | 1013 | /* we mustn't privatise shared mappings */ |
1005 | capabilities &= ~BDI_CAP_MAP_COPY; | 1014 | capabilities &= ~BDI_CAP_MAP_COPY; |
1006 | } | 1015 | } else { |
1007 | else { | ||
1008 | /* we're going to read the file into private memory we | 1016 | /* we're going to read the file into private memory we |
1009 | * allocate */ | 1017 | * allocate */ |
1010 | if (!(capabilities & BDI_CAP_MAP_COPY)) | 1018 | if (!(capabilities & BDI_CAP_MAP_COPY)) |
@@ -1035,23 +1043,20 @@ static int validate_mmap_request(struct file *file, | |||
1035 | if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) { | 1043 | if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) { |
1036 | if (prot & PROT_EXEC) | 1044 | if (prot & PROT_EXEC) |
1037 | return -EPERM; | 1045 | return -EPERM; |
1038 | } | 1046 | } else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) { |
1039 | else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) { | ||
1040 | /* handle implication of PROT_EXEC by PROT_READ */ | 1047 | /* handle implication of PROT_EXEC by PROT_READ */ |
1041 | if (current->personality & READ_IMPLIES_EXEC) { | 1048 | if (current->personality & READ_IMPLIES_EXEC) { |
1042 | if (capabilities & BDI_CAP_EXEC_MAP) | 1049 | if (capabilities & BDI_CAP_EXEC_MAP) |
1043 | prot |= PROT_EXEC; | 1050 | prot |= PROT_EXEC; |
1044 | } | 1051 | } |
1045 | } | 1052 | } else if ((prot & PROT_READ) && |
1046 | else if ((prot & PROT_READ) && | ||
1047 | (prot & PROT_EXEC) && | 1053 | (prot & PROT_EXEC) && |
1048 | !(capabilities & BDI_CAP_EXEC_MAP) | 1054 | !(capabilities & BDI_CAP_EXEC_MAP) |
1049 | ) { | 1055 | ) { |
1050 | /* backing file is not executable, try to copy */ | 1056 | /* backing file is not executable, try to copy */ |
1051 | capabilities &= ~BDI_CAP_MAP_DIRECT; | 1057 | capabilities &= ~BDI_CAP_MAP_DIRECT; |
1052 | } | 1058 | } |
1053 | } | 1059 | } else { |
1054 | else { | ||
1055 | /* anonymous mappings are always memory backed and can be | 1060 | /* anonymous mappings are always memory backed and can be |
1056 | * privately mapped | 1061 | * privately mapped |
1057 | */ | 1062 | */ |
@@ -1659,7 +1664,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) | |||
1659 | /* find the first potentially overlapping VMA */ | 1664 | /* find the first potentially overlapping VMA */ |
1660 | vma = find_vma(mm, start); | 1665 | vma = find_vma(mm, start); |
1661 | if (!vma) { | 1666 | if (!vma) { |
1662 | static int limit = 0; | 1667 | static int limit; |
1663 | if (limit < 5) { | 1668 | if (limit < 5) { |
1664 | printk(KERN_WARNING | 1669 | printk(KERN_WARNING |
1665 | "munmap of memory not mmapped by process %d" | 1670 | "munmap of memory not mmapped by process %d" |
@@ -1985,6 +1990,12 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1985 | } | 1990 | } |
1986 | EXPORT_SYMBOL(filemap_fault); | 1991 | EXPORT_SYMBOL(filemap_fault); |
1987 | 1992 | ||
1993 | void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf) | ||
1994 | { | ||
1995 | BUG(); | ||
1996 | } | ||
1997 | EXPORT_SYMBOL(filemap_map_pages); | ||
1998 | |||
1988 | int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr, | 1999 | int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr, |
1989 | unsigned long size, pgoff_t pgoff) | 2000 | unsigned long size, pgoff_t pgoff) |
1990 | { | 2001 | { |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7106cb1aca8e..ef413492a149 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -1562,9 +1562,9 @@ pause: | |||
1562 | bdi_start_background_writeback(bdi); | 1562 | bdi_start_background_writeback(bdi); |
1563 | } | 1563 | } |
1564 | 1564 | ||
1565 | void set_page_dirty_balance(struct page *page, int page_mkwrite) | 1565 | void set_page_dirty_balance(struct page *page) |
1566 | { | 1566 | { |
1567 | if (set_page_dirty(page) || page_mkwrite) { | 1567 | if (set_page_dirty(page)) { |
1568 | struct address_space *mapping = page_mapping(page); | 1568 | struct address_space *mapping = page_mapping(page); |
1569 | 1569 | ||
1570 | if (mapping) | 1570 | if (mapping) |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 979378deccbf..5dba2933c9c0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -295,7 +295,8 @@ static inline int bad_range(struct zone *zone, struct page *page) | |||
295 | } | 295 | } |
296 | #endif | 296 | #endif |
297 | 297 | ||
298 | static void bad_page(struct page *page, char *reason, unsigned long bad_flags) | 298 | static void bad_page(struct page *page, const char *reason, |
299 | unsigned long bad_flags) | ||
299 | { | 300 | { |
300 | static unsigned long resume; | 301 | static unsigned long resume; |
301 | static unsigned long nr_shown; | 302 | static unsigned long nr_shown; |
@@ -623,7 +624,7 @@ out: | |||
623 | 624 | ||
624 | static inline int free_pages_check(struct page *page) | 625 | static inline int free_pages_check(struct page *page) |
625 | { | 626 | { |
626 | char *bad_reason = NULL; | 627 | const char *bad_reason = NULL; |
627 | unsigned long bad_flags = 0; | 628 | unsigned long bad_flags = 0; |
628 | 629 | ||
629 | if (unlikely(page_mapcount(page))) | 630 | if (unlikely(page_mapcount(page))) |
@@ -859,7 +860,7 @@ static inline void expand(struct zone *zone, struct page *page, | |||
859 | */ | 860 | */ |
860 | static inline int check_new_page(struct page *page) | 861 | static inline int check_new_page(struct page *page) |
861 | { | 862 | { |
862 | char *bad_reason = NULL; | 863 | const char *bad_reason = NULL; |
863 | unsigned long bad_flags = 0; | 864 | unsigned long bad_flags = 0; |
864 | 865 | ||
865 | if (unlikely(page_mapcount(page))) | 866 | if (unlikely(page_mapcount(page))) |
@@ -1238,15 +1239,6 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) | |||
1238 | } | 1239 | } |
1239 | local_irq_restore(flags); | 1240 | local_irq_restore(flags); |
1240 | } | 1241 | } |
1241 | static bool gfp_thisnode_allocation(gfp_t gfp_mask) | ||
1242 | { | ||
1243 | return (gfp_mask & GFP_THISNODE) == GFP_THISNODE; | ||
1244 | } | ||
1245 | #else | ||
1246 | static bool gfp_thisnode_allocation(gfp_t gfp_mask) | ||
1247 | { | ||
1248 | return false; | ||
1249 | } | ||
1250 | #endif | 1242 | #endif |
1251 | 1243 | ||
1252 | /* | 1244 | /* |
@@ -1583,12 +1575,7 @@ again: | |||
1583 | get_pageblock_migratetype(page)); | 1575 | get_pageblock_migratetype(page)); |
1584 | } | 1576 | } |
1585 | 1577 | ||
1586 | /* | 1578 | __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); |
1587 | * NOTE: GFP_THISNODE allocations do not partake in the kswapd | ||
1588 | * aging protocol, so they can't be fair. | ||
1589 | */ | ||
1590 | if (!gfp_thisnode_allocation(gfp_flags)) | ||
1591 | __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); | ||
1592 | 1579 | ||
1593 | __count_zone_vm_events(PGALLOC, zone, 1 << order); | 1580 | __count_zone_vm_events(PGALLOC, zone, 1 << order); |
1594 | zone_statistics(preferred_zone, zone, gfp_flags); | 1581 | zone_statistics(preferred_zone, zone, gfp_flags); |
@@ -1870,7 +1857,7 @@ static void __paginginit init_zone_allows_reclaim(int nid) | |||
1870 | { | 1857 | { |
1871 | int i; | 1858 | int i; |
1872 | 1859 | ||
1873 | for_each_online_node(i) | 1860 | for_each_node_state(i, N_MEMORY) |
1874 | if (node_distance(nid, i) <= RECLAIM_DISTANCE) | 1861 | if (node_distance(nid, i) <= RECLAIM_DISTANCE) |
1875 | node_set(i, NODE_DATA(nid)->reclaim_nodes); | 1862 | node_set(i, NODE_DATA(nid)->reclaim_nodes); |
1876 | else | 1863 | else |
@@ -1954,23 +1941,12 @@ zonelist_scan: | |||
1954 | * zone size to ensure fair page aging. The zone a | 1941 | * zone size to ensure fair page aging. The zone a |
1955 | * page was allocated in should have no effect on the | 1942 | * page was allocated in should have no effect on the |
1956 | * time the page has in memory before being reclaimed. | 1943 | * time the page has in memory before being reclaimed. |
1957 | * | ||
1958 | * Try to stay in local zones in the fastpath. If | ||
1959 | * that fails, the slowpath is entered, which will do | ||
1960 | * another pass starting with the local zones, but | ||
1961 | * ultimately fall back to remote zones that do not | ||
1962 | * partake in the fairness round-robin cycle of this | ||
1963 | * zonelist. | ||
1964 | * | ||
1965 | * NOTE: GFP_THISNODE allocations do not partake in | ||
1966 | * the kswapd aging protocol, so they can't be fair. | ||
1967 | */ | 1944 | */ |
1968 | if ((alloc_flags & ALLOC_WMARK_LOW) && | 1945 | if (alloc_flags & ALLOC_FAIR) { |
1969 | !gfp_thisnode_allocation(gfp_mask)) { | ||
1970 | if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) | ||
1971 | continue; | ||
1972 | if (!zone_local(preferred_zone, zone)) | 1946 | if (!zone_local(preferred_zone, zone)) |
1973 | continue; | 1947 | continue; |
1948 | if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) | ||
1949 | continue; | ||
1974 | } | 1950 | } |
1975 | /* | 1951 | /* |
1976 | * When allocating a page cache page for writing, we | 1952 | * When allocating a page cache page for writing, we |
@@ -2408,32 +2384,40 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, | |||
2408 | return page; | 2384 | return page; |
2409 | } | 2385 | } |
2410 | 2386 | ||
2411 | static void prepare_slowpath(gfp_t gfp_mask, unsigned int order, | 2387 | static void reset_alloc_batches(struct zonelist *zonelist, |
2412 | struct zonelist *zonelist, | 2388 | enum zone_type high_zoneidx, |
2413 | enum zone_type high_zoneidx, | 2389 | struct zone *preferred_zone) |
2414 | struct zone *preferred_zone) | ||
2415 | { | 2390 | { |
2416 | struct zoneref *z; | 2391 | struct zoneref *z; |
2417 | struct zone *zone; | 2392 | struct zone *zone; |
2418 | 2393 | ||
2419 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { | 2394 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { |
2420 | if (!(gfp_mask & __GFP_NO_KSWAPD)) | ||
2421 | wakeup_kswapd(zone, order, zone_idx(preferred_zone)); | ||
2422 | /* | 2395 | /* |
2423 | * Only reset the batches of zones that were actually | 2396 | * Only reset the batches of zones that were actually |
2424 | * considered in the fast path, we don't want to | 2397 | * considered in the fairness pass, we don't want to |
2425 | * thrash fairness information for zones that are not | 2398 | * trash fairness information for zones that are not |
2426 | * actually part of this zonelist's round-robin cycle. | 2399 | * actually part of this zonelist's round-robin cycle. |
2427 | */ | 2400 | */ |
2428 | if (!zone_local(preferred_zone, zone)) | 2401 | if (!zone_local(preferred_zone, zone)) |
2429 | continue; | 2402 | continue; |
2430 | mod_zone_page_state(zone, NR_ALLOC_BATCH, | 2403 | mod_zone_page_state(zone, NR_ALLOC_BATCH, |
2431 | high_wmark_pages(zone) - | 2404 | high_wmark_pages(zone) - low_wmark_pages(zone) - |
2432 | low_wmark_pages(zone) - | 2405 | atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); |
2433 | zone_page_state(zone, NR_ALLOC_BATCH)); | ||
2434 | } | 2406 | } |
2435 | } | 2407 | } |
2436 | 2408 | ||
2409 | static void wake_all_kswapds(unsigned int order, | ||
2410 | struct zonelist *zonelist, | ||
2411 | enum zone_type high_zoneidx, | ||
2412 | struct zone *preferred_zone) | ||
2413 | { | ||
2414 | struct zoneref *z; | ||
2415 | struct zone *zone; | ||
2416 | |||
2417 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) | ||
2418 | wakeup_kswapd(zone, order, zone_idx(preferred_zone)); | ||
2419 | } | ||
2420 | |||
2437 | static inline int | 2421 | static inline int |
2438 | gfp_to_alloc_flags(gfp_t gfp_mask) | 2422 | gfp_to_alloc_flags(gfp_t gfp_mask) |
2439 | { | 2423 | { |
@@ -2522,12 +2506,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | |||
2522 | * allowed per node queues are empty and that nodes are | 2506 | * allowed per node queues are empty and that nodes are |
2523 | * over allocated. | 2507 | * over allocated. |
2524 | */ | 2508 | */ |
2525 | if (gfp_thisnode_allocation(gfp_mask)) | 2509 | if (IS_ENABLED(CONFIG_NUMA) && |
2510 | (gfp_mask & GFP_THISNODE) == GFP_THISNODE) | ||
2526 | goto nopage; | 2511 | goto nopage; |
2527 | 2512 | ||
2528 | restart: | 2513 | restart: |
2529 | prepare_slowpath(gfp_mask, order, zonelist, | 2514 | if (!(gfp_mask & __GFP_NO_KSWAPD)) |
2530 | high_zoneidx, preferred_zone); | 2515 | wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone); |
2531 | 2516 | ||
2532 | /* | 2517 | /* |
2533 | * OK, we're below the kswapd watermark and have kicked background | 2518 | * OK, we're below the kswapd watermark and have kicked background |
@@ -2711,7 +2696,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
2711 | struct page *page = NULL; | 2696 | struct page *page = NULL; |
2712 | int migratetype = allocflags_to_migratetype(gfp_mask); | 2697 | int migratetype = allocflags_to_migratetype(gfp_mask); |
2713 | unsigned int cpuset_mems_cookie; | 2698 | unsigned int cpuset_mems_cookie; |
2714 | int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET; | 2699 | int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; |
2715 | struct mem_cgroup *memcg = NULL; | 2700 | struct mem_cgroup *memcg = NULL; |
2716 | 2701 | ||
2717 | gfp_mask &= gfp_allowed_mask; | 2702 | gfp_mask &= gfp_allowed_mask; |
@@ -2752,12 +2737,29 @@ retry_cpuset: | |||
2752 | if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) | 2737 | if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) |
2753 | alloc_flags |= ALLOC_CMA; | 2738 | alloc_flags |= ALLOC_CMA; |
2754 | #endif | 2739 | #endif |
2740 | retry: | ||
2755 | /* First allocation attempt */ | 2741 | /* First allocation attempt */ |
2756 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, | 2742 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, |
2757 | zonelist, high_zoneidx, alloc_flags, | 2743 | zonelist, high_zoneidx, alloc_flags, |
2758 | preferred_zone, migratetype); | 2744 | preferred_zone, migratetype); |
2759 | if (unlikely(!page)) { | 2745 | if (unlikely(!page)) { |
2760 | /* | 2746 | /* |
2747 | * The first pass makes sure allocations are spread | ||
2748 | * fairly within the local node. However, the local | ||
2749 | * node might have free pages left after the fairness | ||
2750 | * batches are exhausted, and remote zones haven't | ||
2751 | * even been considered yet. Try once more without | ||
2752 | * fairness, and include remote zones now, before | ||
2753 | * entering the slowpath and waking kswapd: prefer | ||
2754 | * spilling to a remote zone over swapping locally. | ||
2755 | */ | ||
2756 | if (alloc_flags & ALLOC_FAIR) { | ||
2757 | reset_alloc_batches(zonelist, high_zoneidx, | ||
2758 | preferred_zone); | ||
2759 | alloc_flags &= ~ALLOC_FAIR; | ||
2760 | goto retry; | ||
2761 | } | ||
2762 | /* | ||
2761 | * Runtime PM, block IO and its error handling path | 2763 | * Runtime PM, block IO and its error handling path |
2762 | * can deadlock because I/O on the device might not | 2764 | * can deadlock because I/O on the device might not |
2763 | * complete. | 2765 | * complete. |
@@ -4919,7 +4921,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, | |||
4919 | 4921 | ||
4920 | pgdat->node_id = nid; | 4922 | pgdat->node_id = nid; |
4921 | pgdat->node_start_pfn = node_start_pfn; | 4923 | pgdat->node_start_pfn = node_start_pfn; |
4922 | init_zone_allows_reclaim(nid); | 4924 | if (node_state(nid, N_MEMORY)) |
4925 | init_zone_allows_reclaim(nid); | ||
4923 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | 4926 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
4924 | get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); | 4927 | get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); |
4925 | #endif | 4928 | #endif |
@@ -5070,7 +5073,7 @@ static void __init find_zone_movable_pfns_for_nodes(void) | |||
5070 | nodemask_t saved_node_state = node_states[N_MEMORY]; | 5073 | nodemask_t saved_node_state = node_states[N_MEMORY]; |
5071 | unsigned long totalpages = early_calculate_totalpages(); | 5074 | unsigned long totalpages = early_calculate_totalpages(); |
5072 | int usable_nodes = nodes_weight(node_states[N_MEMORY]); | 5075 | int usable_nodes = nodes_weight(node_states[N_MEMORY]); |
5073 | struct memblock_type *type = &memblock.memory; | 5076 | struct memblock_region *r; |
5074 | 5077 | ||
5075 | /* Need to find movable_zone earlier when movable_node is specified. */ | 5078 | /* Need to find movable_zone earlier when movable_node is specified. */ |
5076 | find_usable_zone_for_movable(); | 5079 | find_usable_zone_for_movable(); |
@@ -5080,13 +5083,13 @@ static void __init find_zone_movable_pfns_for_nodes(void) | |||
5080 | * options. | 5083 | * options. |
5081 | */ | 5084 | */ |
5082 | if (movable_node_is_enabled()) { | 5085 | if (movable_node_is_enabled()) { |
5083 | for (i = 0; i < type->cnt; i++) { | 5086 | for_each_memblock(memory, r) { |
5084 | if (!memblock_is_hotpluggable(&type->regions[i])) | 5087 | if (!memblock_is_hotpluggable(r)) |
5085 | continue; | 5088 | continue; |
5086 | 5089 | ||
5087 | nid = type->regions[i].nid; | 5090 | nid = r->nid; |
5088 | 5091 | ||
5089 | usable_startpfn = PFN_DOWN(type->regions[i].base); | 5092 | usable_startpfn = PFN_DOWN(r->base); |
5090 | zone_movable_pfn[nid] = zone_movable_pfn[nid] ? | 5093 | zone_movable_pfn[nid] = zone_movable_pfn[nid] ? |
5091 | min(usable_startpfn, zone_movable_pfn[nid]) : | 5094 | min(usable_startpfn, zone_movable_pfn[nid]) : |
5092 | usable_startpfn; | 5095 | usable_startpfn; |
@@ -6544,7 +6547,8 @@ static void dump_page_flags(unsigned long flags) | |||
6544 | printk(")\n"); | 6547 | printk(")\n"); |
6545 | } | 6548 | } |
6546 | 6549 | ||
6547 | void dump_page_badflags(struct page *page, char *reason, unsigned long badflags) | 6550 | void dump_page_badflags(struct page *page, const char *reason, |
6551 | unsigned long badflags) | ||
6548 | { | 6552 | { |
6549 | printk(KERN_ALERT | 6553 | printk(KERN_ALERT |
6550 | "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", | 6554 | "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", |
@@ -6560,8 +6564,8 @@ void dump_page_badflags(struct page *page, char *reason, unsigned long badflags) | |||
6560 | mem_cgroup_print_bad_page(page); | 6564 | mem_cgroup_print_bad_page(page); |
6561 | } | 6565 | } |
6562 | 6566 | ||
6563 | void dump_page(struct page *page, char *reason) | 6567 | void dump_page(struct page *page, const char *reason) |
6564 | { | 6568 | { |
6565 | dump_page_badflags(page, reason, 0); | 6569 | dump_page_badflags(page, reason, 0); |
6566 | } | 6570 | } |
6567 | EXPORT_SYMBOL_GPL(dump_page); | 6571 | EXPORT_SYMBOL(dump_page); |
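The page_alloc.c changes above split the old prepare_slowpath() into reset_alloc_batches() and wake_all_kswapds(), and make zone fairness an explicit ALLOC_FAIR pass: the first attempt only takes pages from local zones whose NR_ALLOC_BATCH is still positive, and if that fails the batches are reset and the attempt repeated without ALLOC_FAIR before entering the slowpath. A minimal user-space sketch of that two-pass idea (the zone and batch structures here are invented for illustration, not kernel code):

#include <stdbool.h>
#include <stdio.h>

struct zone {
        const char *name;
        bool local;        /* belongs to the preferred node */
        long batch;        /* remaining "fair" allocation budget */
        long free_pages;
};

/* One allocation attempt; when 'fair' is set, skip remote zones and
 * zones whose fairness batch is exhausted. */
static struct zone *try_alloc(struct zone *zones, int n, bool fair)
{
        for (int i = 0; i < n; i++) {
                struct zone *z = &zones[i];

                if (fair) {
                        if (!z->local)
                                continue;
                        if (z->batch <= 0)
                                continue;
                }
                if (z->free_pages > 0) {
                        z->free_pages--;
                        if (fair)
                                z->batch--;
                        return z;
                }
        }
        return NULL;
}

static void reset_batches(struct zone *zones, int n, long batch)
{
        for (int i = 0; i < n; i++)
                if (zones[i].local)   /* only zones in the round-robin cycle */
                        zones[i].batch = batch;
}

int main(void)
{
        struct zone zones[] = {
                { "local-normal", true,  0, 4 },   /* batch already exhausted */
                { "remote",       false, 0, 4 },
        };
        struct zone *z = try_alloc(zones, 2, true);     /* fair pass fails */

        if (!z) {
                reset_batches(zones, 2, 2);
                z = try_alloc(zones, 2, false);         /* unfair retry */
        }
        printf("allocated from %s\n", z ? z->name : "nowhere");
        return 0;
}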
diff --git a/mm/readahead.c b/mm/readahead.c index 29c5e1af5a0c..0ca36a7770b1 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
@@ -8,9 +8,7 @@ | |||
8 | */ | 8 | */ |
9 | 9 | ||
10 | #include <linux/kernel.h> | 10 | #include <linux/kernel.h> |
11 | #include <linux/fs.h> | ||
12 | #include <linux/gfp.h> | 11 | #include <linux/gfp.h> |
13 | #include <linux/mm.h> | ||
14 | #include <linux/export.h> | 12 | #include <linux/export.h> |
15 | #include <linux/blkdev.h> | 13 | #include <linux/blkdev.h> |
16 | #include <linux/backing-dev.h> | 14 | #include <linux/backing-dev.h> |
@@ -20,6 +18,8 @@ | |||
20 | #include <linux/syscalls.h> | 18 | #include <linux/syscalls.h> |
21 | #include <linux/file.h> | 19 | #include <linux/file.h> |
22 | 20 | ||
21 | #include "internal.h" | ||
22 | |||
23 | /* | 23 | /* |
24 | * Initialise a struct file's readahead state. Assumes that the caller has | 24 | * Initialise a struct file's readahead state. Assumes that the caller has |
25 | * memset *ra to zero. | 25 | * memset *ra to zero. |
@@ -149,8 +149,7 @@ out: | |||
149 | * | 149 | * |
150 | * Returns the number of pages requested, or the maximum amount of I/O allowed. | 150 | * Returns the number of pages requested, or the maximum amount of I/O allowed. |
151 | */ | 151 | */ |
152 | static int | 152 | int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, |
153 | __do_page_cache_readahead(struct address_space *mapping, struct file *filp, | ||
154 | pgoff_t offset, unsigned long nr_to_read, | 153 | pgoff_t offset, unsigned long nr_to_read, |
155 | unsigned long lookahead_size) | 154 | unsigned long lookahead_size) |
156 | { | 155 | { |
@@ -244,20 +243,6 @@ unsigned long max_sane_readahead(unsigned long nr) | |||
244 | } | 243 | } |
245 | 244 | ||
246 | /* | 245 | /* |
247 | * Submit IO for the read-ahead request in file_ra_state. | ||
248 | */ | ||
249 | unsigned long ra_submit(struct file_ra_state *ra, | ||
250 | struct address_space *mapping, struct file *filp) | ||
251 | { | ||
252 | int actual; | ||
253 | |||
254 | actual = __do_page_cache_readahead(mapping, filp, | ||
255 | ra->start, ra->size, ra->async_size); | ||
256 | |||
257 | return actual; | ||
258 | } | ||
259 | |||
260 | /* | ||
261 | * Set the initial window size, round to next power of 2 and square | 246 | * Set the initial window size, round to next power of 2 and square |
262 | * for small size, x 4 for medium, and x 2 for large | 247 | * for small size, x 4 for medium, and x 2 for large |
263 | * for 128k (32 page) max ra | 248 | * for 128k (32 page) max ra |
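readahead.c now includes mm/internal.h, __do_page_cache_readahead() loses its static qualifier, and the out-of-line ra_submit() wrapper disappears, presumably becoming an inline in the subsystem-private header so other mm/ files can issue readahead directly. A small user-space sketch of that "promote a static helper into a private header" pattern, with invented names:

#include <stdio.h>

/* What used to be a file-local helper is now declared in a private header
 * (think mm/internal.h) so sibling files can call it directly, while the
 * thin wrapper becomes a static inline next to the declaration. */

int do_work_range(int start, int len)        /* was: static int ... */
{
        printf("working on [%d, %d)\n", start, start + len);
        return len;
}

static inline int submit_work(int start)     /* the old out-of-line wrapper */
{
        return do_work_range(start, 8);
}

int main(void)
{
        return submit_work(100) == 8 ? 0 : 1;
}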
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
@@ -1332,9 +1332,19 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
1332 | BUG_ON(!page || PageAnon(page)); | 1332 | BUG_ON(!page || PageAnon(page)); |
1333 | 1333 | ||
1334 | if (locked_vma) { | 1334 | if (locked_vma) { |
1335 | mlock_vma_page(page); /* no-op if already mlocked */ | 1335 | if (page == check_page) { |
1336 | if (page == check_page) | 1336 | /* we know we have check_page locked */ |
1337 | mlock_vma_page(page); | ||
1337 | ret = SWAP_MLOCK; | 1338 | ret = SWAP_MLOCK; |
1339 | } else if (trylock_page(page)) { | ||
1340 | /* | ||
1341 | * If we can lock the page, perform mlock. | ||
1342 | * Otherwise leave the page alone, it will be | ||
1343 | * eventually encountered again later. | ||
1344 | */ | ||
1345 | mlock_vma_page(page); | ||
1346 | unlock_page(page); | ||
1347 | } | ||
1338 | continue; /* don't unmap */ | 1348 | continue; /* don't unmap */ |
1339 | } | 1349 | } |
1340 | 1350 | ||
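The try_to_unmap_cluster() hunk mlocks pages other than check_page only when their page lock can be taken with trylock_page(), and otherwise leaves them to be encountered again on a later pass. A user-space sketch of the same "trylock or defer" idiom (pthreads, hypothetical names):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct item {
        pthread_mutex_t lock;
        bool pinned;
};

/* Pin an item only if its lock can be taken without blocking; a caller
 * that already holds the lock (have_lock) may pin unconditionally. */
static void maybe_pin(struct item *it, bool have_lock)
{
        if (have_lock) {
                it->pinned = true;
                return;
        }
        if (pthread_mutex_trylock(&it->lock) == 0) {
                it->pinned = true;             /* safe: we own the lock now */
                pthread_mutex_unlock(&it->lock);
        }
        /* else: skip; the item will be revisited later */
}

int main(void)
{
        struct item it = { PTHREAD_MUTEX_INITIALIZER, false };

        maybe_pin(&it, false);
        printf("pinned: %d\n", it.pinned);
        return 0;
}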
diff --git a/mm/shmem.c b/mm/shmem.c index a3ba988ec946..70273f8df586 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -683,7 +683,7 @@ int shmem_unuse(swp_entry_t swap, struct page *page) | |||
683 | * the shmem_swaplist_mutex which might hold up shmem_writepage(). | 683 | * the shmem_swaplist_mutex which might hold up shmem_writepage(). |
684 | * Charged back to the user (not to caller) when swap account is used. | 684 | * Charged back to the user (not to caller) when swap account is used. |
685 | */ | 685 | */ |
686 | error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); | 686 | error = mem_cgroup_charge_file(page, current->mm, GFP_KERNEL); |
687 | if (error) | 687 | if (error) |
688 | goto out; | 688 | goto out; |
689 | /* No radix_tree_preload: swap entry keeps a place for page in tree */ | 689 | /* No radix_tree_preload: swap entry keeps a place for page in tree */ |
@@ -1080,7 +1080,7 @@ repeat: | |||
1080 | goto failed; | 1080 | goto failed; |
1081 | } | 1081 | } |
1082 | 1082 | ||
1083 | error = mem_cgroup_cache_charge(page, current->mm, | 1083 | error = mem_cgroup_charge_file(page, current->mm, |
1084 | gfp & GFP_RECLAIM_MASK); | 1084 | gfp & GFP_RECLAIM_MASK); |
1085 | if (!error) { | 1085 | if (!error) { |
1086 | error = shmem_add_to_page_cache(page, mapping, index, | 1086 | error = shmem_add_to_page_cache(page, mapping, index, |
@@ -1134,7 +1134,7 @@ repeat: | |||
1134 | 1134 | ||
1135 | SetPageSwapBacked(page); | 1135 | SetPageSwapBacked(page); |
1136 | __set_page_locked(page); | 1136 | __set_page_locked(page); |
1137 | error = mem_cgroup_cache_charge(page, current->mm, | 1137 | error = mem_cgroup_charge_file(page, current->mm, |
1138 | gfp & GFP_RECLAIM_MASK); | 1138 | gfp & GFP_RECLAIM_MASK); |
1139 | if (error) | 1139 | if (error) |
1140 | goto decused; | 1140 | goto decused; |
@@ -2723,6 +2723,7 @@ static const struct super_operations shmem_ops = { | |||
2723 | 2723 | ||
2724 | static const struct vm_operations_struct shmem_vm_ops = { | 2724 | static const struct vm_operations_struct shmem_vm_ops = { |
2725 | .fault = shmem_fault, | 2725 | .fault = shmem_fault, |
2726 | .map_pages = filemap_map_pages, | ||
2726 | #ifdef CONFIG_NUMA | 2727 | #ifdef CONFIG_NUMA |
2727 | .set_policy = shmem_set_policy, | 2728 | .set_policy = shmem_set_policy, |
2728 | .get_policy = shmem_get_policy, | 2729 | .get_policy = shmem_get_policy, |
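shmem_vm_ops gains a .map_pages callback pointing at filemap_map_pages() (the nommu build gets the BUG() stub shown in the mm/nommu.c hunk above). A tiny sketch of an ops table with an optional hook that the caller checks before invoking; the names below are illustrative only:

#include <stdio.h>

struct fault_ctx { unsigned long addr; };

struct vm_ops {
        int  (*fault)(struct fault_ctx *ctx);
        void (*map_pages)(struct fault_ctx *ctx);   /* optional: may be NULL */
};

static int demo_fault(struct fault_ctx *ctx)
{
        printf("fault at %#lx\n", ctx->addr);
        return 0;
}

static void demo_map_pages(struct fault_ctx *ctx)
{
        printf("prefaulting around %#lx\n", ctx->addr);
}

static int handle_fault(const struct vm_ops *ops, struct fault_ctx *ctx)
{
        if (ops->map_pages)            /* hook is optional */
                ops->map_pages(ctx);
        return ops->fault(ctx);
}

int main(void)
{
        const struct vm_ops ops = { .fault = demo_fault, .map_pages = demo_map_pages };
        struct fault_ctx ctx = { .addr = 0x1000 };

        return handle_fault(&ops, &ctx);
}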
diff --git a/mm/slab.c b/mm/slab.c --- a/mm/slab.c +++ b/mm/slab.c | |||
@@ -3027,7 +3027,7 @@ out: | |||
3027 | 3027 | ||
3028 | #ifdef CONFIG_NUMA | 3028 | #ifdef CONFIG_NUMA |
3029 | /* | 3029 | /* |
3030 | * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY. | 3030 | * Try allocating on another node if PF_SPREAD_SLAB is a mempolicy is set. |
3031 | * | 3031 | * |
3032 | * If we are in_interrupt, then process context, including cpusets and | 3032 | * If we are in_interrupt, then process context, including cpusets and |
3033 | * mempolicy, may not apply and should not be used for allocation policy. | 3033 | * mempolicy, may not apply and should not be used for allocation policy. |
@@ -3042,7 +3042,7 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
3042 | if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) | 3042 | if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) |
3043 | nid_alloc = cpuset_slab_spread_node(); | 3043 | nid_alloc = cpuset_slab_spread_node(); |
3044 | else if (current->mempolicy) | 3044 | else if (current->mempolicy) |
3045 | nid_alloc = slab_node(); | 3045 | nid_alloc = mempolicy_slab_node(); |
3046 | if (nid_alloc != nid_here) | 3046 | if (nid_alloc != nid_here) |
3047 | return ____cache_alloc_node(cachep, flags, nid_alloc); | 3047 | return ____cache_alloc_node(cachep, flags, nid_alloc); |
3048 | return NULL; | 3048 | return NULL; |
@@ -3074,7 +3074,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) | |||
3074 | 3074 | ||
3075 | retry_cpuset: | 3075 | retry_cpuset: |
3076 | cpuset_mems_cookie = read_mems_allowed_begin(); | 3076 | cpuset_mems_cookie = read_mems_allowed_begin(); |
3077 | zonelist = node_zonelist(slab_node(), flags); | 3077 | zonelist = node_zonelist(mempolicy_slab_node(), flags); |
3078 | 3078 | ||
3079 | retry: | 3079 | retry: |
3080 | /* | 3080 | /* |
@@ -3259,7 +3259,7 @@ __do_cache_alloc(struct kmem_cache *cache, gfp_t flags) | |||
3259 | { | 3259 | { |
3260 | void *objp; | 3260 | void *objp; |
3261 | 3261 | ||
3262 | if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) { | 3262 | if (current->mempolicy || unlikely(current->flags & PF_SPREAD_SLAB)) { |
3263 | objp = alternate_node_alloc(cache, flags); | 3263 | objp = alternate_node_alloc(cache, flags); |
3264 | if (objp) | 3264 | if (objp) |
3265 | goto out; | 3265 | goto out; |
diff --git a/mm/slab.h b/mm/slab.h --- a/mm/slab.h +++ b/mm/slab.h | |||
@@ -55,12 +55,12 @@ extern void create_boot_cache(struct kmem_cache *, const char *name, | |||
55 | struct mem_cgroup; | 55 | struct mem_cgroup; |
56 | #ifdef CONFIG_SLUB | 56 | #ifdef CONFIG_SLUB |
57 | struct kmem_cache * | 57 | struct kmem_cache * |
58 | __kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, | 58 | __kmem_cache_alias(const char *name, size_t size, size_t align, |
59 | size_t align, unsigned long flags, void (*ctor)(void *)); | 59 | unsigned long flags, void (*ctor)(void *)); |
60 | #else | 60 | #else |
61 | static inline struct kmem_cache * | 61 | static inline struct kmem_cache * |
62 | __kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, | 62 | __kmem_cache_alias(const char *name, size_t size, size_t align, |
63 | size_t align, unsigned long flags, void (*ctor)(void *)) | 63 | unsigned long flags, void (*ctor)(void *)) |
64 | { return NULL; } | 64 | { return NULL; } |
65 | #endif | 65 | #endif |
66 | 66 | ||
@@ -119,13 +119,6 @@ static inline bool is_root_cache(struct kmem_cache *s) | |||
119 | return !s->memcg_params || s->memcg_params->is_root_cache; | 119 | return !s->memcg_params || s->memcg_params->is_root_cache; |
120 | } | 120 | } |
121 | 121 | ||
122 | static inline bool cache_match_memcg(struct kmem_cache *cachep, | ||
123 | struct mem_cgroup *memcg) | ||
124 | { | ||
125 | return (is_root_cache(cachep) && !memcg) || | ||
126 | (cachep->memcg_params->memcg == memcg); | ||
127 | } | ||
128 | |||
129 | static inline void memcg_bind_pages(struct kmem_cache *s, int order) | 122 | static inline void memcg_bind_pages(struct kmem_cache *s, int order) |
130 | { | 123 | { |
131 | if (!is_root_cache(s)) | 124 | if (!is_root_cache(s)) |
@@ -204,12 +197,6 @@ static inline bool is_root_cache(struct kmem_cache *s) | |||
204 | return true; | 197 | return true; |
205 | } | 198 | } |
206 | 199 | ||
207 | static inline bool cache_match_memcg(struct kmem_cache *cachep, | ||
208 | struct mem_cgroup *memcg) | ||
209 | { | ||
210 | return true; | ||
211 | } | ||
212 | |||
213 | static inline void memcg_bind_pages(struct kmem_cache *s, int order) | 200 | static inline void memcg_bind_pages(struct kmem_cache *s, int order) |
214 | { | 201 | { |
215 | } | 202 | } |
diff --git a/mm/slab_common.c b/mm/slab_common.c index 1ec3c619ba04..f3cfccf76dda 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c | |||
@@ -29,8 +29,7 @@ DEFINE_MUTEX(slab_mutex); | |||
29 | struct kmem_cache *kmem_cache; | 29 | struct kmem_cache *kmem_cache; |
30 | 30 | ||
31 | #ifdef CONFIG_DEBUG_VM | 31 | #ifdef CONFIG_DEBUG_VM |
32 | static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name, | 32 | static int kmem_cache_sanity_check(const char *name, size_t size) |
33 | size_t size) | ||
34 | { | 33 | { |
35 | struct kmem_cache *s = NULL; | 34 | struct kmem_cache *s = NULL; |
36 | 35 | ||
@@ -57,13 +56,7 @@ static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name, | |||
57 | } | 56 | } |
58 | 57 | ||
59 | #if !defined(CONFIG_SLUB) || !defined(CONFIG_SLUB_DEBUG_ON) | 58 | #if !defined(CONFIG_SLUB) || !defined(CONFIG_SLUB_DEBUG_ON) |
60 | /* | 59 | if (!strcmp(s->name, name)) { |
61 | * For simplicity, we won't check this in the list of memcg | ||
62 | * caches. We have control over memcg naming, and if there | ||
63 | * aren't duplicates in the global list, there won't be any | ||
64 | * duplicates in the memcg lists as well. | ||
65 | */ | ||
66 | if (!memcg && !strcmp(s->name, name)) { | ||
67 | pr_err("%s (%s): Cache name already exists.\n", | 60 | pr_err("%s (%s): Cache name already exists.\n", |
68 | __func__, name); | 61 | __func__, name); |
69 | dump_stack(); | 62 | dump_stack(); |
@@ -77,8 +70,7 @@ static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name, | |||
77 | return 0; | 70 | return 0; |
78 | } | 71 | } |
79 | #else | 72 | #else |
80 | static inline int kmem_cache_sanity_check(struct mem_cgroup *memcg, | 73 | static inline int kmem_cache_sanity_check(const char *name, size_t size) |
81 | const char *name, size_t size) | ||
82 | { | 74 | { |
83 | return 0; | 75 | return 0; |
84 | } | 76 | } |
@@ -139,6 +131,46 @@ unsigned long calculate_alignment(unsigned long flags, | |||
139 | return ALIGN(align, sizeof(void *)); | 131 | return ALIGN(align, sizeof(void *)); |
140 | } | 132 | } |
141 | 133 | ||
134 | static struct kmem_cache * | ||
135 | do_kmem_cache_create(char *name, size_t object_size, size_t size, size_t align, | ||
136 | unsigned long flags, void (*ctor)(void *), | ||
137 | struct mem_cgroup *memcg, struct kmem_cache *root_cache) | ||
138 | { | ||
139 | struct kmem_cache *s; | ||
140 | int err; | ||
141 | |||
142 | err = -ENOMEM; | ||
143 | s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL); | ||
144 | if (!s) | ||
145 | goto out; | ||
146 | |||
147 | s->name = name; | ||
148 | s->object_size = object_size; | ||
149 | s->size = size; | ||
150 | s->align = align; | ||
151 | s->ctor = ctor; | ||
152 | |||
153 | err = memcg_alloc_cache_params(memcg, s, root_cache); | ||
154 | if (err) | ||
155 | goto out_free_cache; | ||
156 | |||
157 | err = __kmem_cache_create(s, flags); | ||
158 | if (err) | ||
159 | goto out_free_cache; | ||
160 | |||
161 | s->refcount = 1; | ||
162 | list_add(&s->list, &slab_caches); | ||
163 | memcg_register_cache(s); | ||
164 | out: | ||
165 | if (err) | ||
166 | return ERR_PTR(err); | ||
167 | return s; | ||
168 | |||
169 | out_free_cache: | ||
170 | memcg_free_cache_params(s); | ||
171 | kfree(s); | ||
172 | goto out; | ||
173 | } | ||
142 | 174 | ||
143 | /* | 175 | /* |
144 | * kmem_cache_create - Create a cache. | 176 | * kmem_cache_create - Create a cache. |
@@ -164,34 +196,21 @@ unsigned long calculate_alignment(unsigned long flags, | |||
164 | * cacheline. This can be beneficial if you're counting cycles as closely | 196 | * cacheline. This can be beneficial if you're counting cycles as closely |
165 | * as davem. | 197 | * as davem. |
166 | */ | 198 | */ |
167 | |||
168 | struct kmem_cache * | 199 | struct kmem_cache * |
169 | kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size, | 200 | kmem_cache_create(const char *name, size_t size, size_t align, |
170 | size_t align, unsigned long flags, void (*ctor)(void *), | 201 | unsigned long flags, void (*ctor)(void *)) |
171 | struct kmem_cache *parent_cache) | ||
172 | { | 202 | { |
173 | struct kmem_cache *s = NULL; | 203 | struct kmem_cache *s; |
204 | char *cache_name; | ||
174 | int err; | 205 | int err; |
175 | 206 | ||
176 | get_online_cpus(); | 207 | get_online_cpus(); |
177 | mutex_lock(&slab_mutex); | 208 | mutex_lock(&slab_mutex); |
178 | 209 | ||
179 | err = kmem_cache_sanity_check(memcg, name, size); | 210 | err = kmem_cache_sanity_check(name, size); |
180 | if (err) | 211 | if (err) |
181 | goto out_unlock; | 212 | goto out_unlock; |
182 | 213 | ||
183 | if (memcg) { | ||
184 | /* | ||
185 | * Since per-memcg caches are created asynchronously on first | ||
186 | * allocation (see memcg_kmem_get_cache()), several threads can | ||
187 | * try to create the same cache, but only one of them may | ||
188 | * succeed. Therefore if we get here and see the cache has | ||
189 | * already been created, we silently return NULL. | ||
190 | */ | ||
191 | if (cache_from_memcg_idx(parent_cache, memcg_cache_id(memcg))) | ||
192 | goto out_unlock; | ||
193 | } | ||
194 | |||
195 | /* | 214 | /* |
196 | * Some allocators will constraint the set of valid flags to a subset | 215 | * Some allocators will constraint the set of valid flags to a subset |
197 | * of all flags. We expect them to define CACHE_CREATE_MASK in this | 216 | * of all flags. We expect them to define CACHE_CREATE_MASK in this |
@@ -200,50 +219,29 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size, | |||
200 | */ | 219 | */ |
201 | flags &= CACHE_CREATE_MASK; | 220 | flags &= CACHE_CREATE_MASK; |
202 | 221 | ||
203 | s = __kmem_cache_alias(memcg, name, size, align, flags, ctor); | 222 | s = __kmem_cache_alias(name, size, align, flags, ctor); |
204 | if (s) | 223 | if (s) |
205 | goto out_unlock; | 224 | goto out_unlock; |
206 | 225 | ||
207 | err = -ENOMEM; | 226 | cache_name = kstrdup(name, GFP_KERNEL); |
208 | s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL); | 227 | if (!cache_name) { |
209 | if (!s) | 228 | err = -ENOMEM; |
210 | goto out_unlock; | 229 | goto out_unlock; |
230 | } | ||
211 | 231 | ||
212 | s->object_size = s->size = size; | 232 | s = do_kmem_cache_create(cache_name, size, size, |
213 | s->align = calculate_alignment(flags, align, size); | 233 | calculate_alignment(flags, align, size), |
214 | s->ctor = ctor; | 234 | flags, ctor, NULL, NULL); |
215 | 235 | if (IS_ERR(s)) { | |
216 | s->name = kstrdup(name, GFP_KERNEL); | 236 | err = PTR_ERR(s); |
217 | if (!s->name) | 237 | kfree(cache_name); |
218 | goto out_free_cache; | 238 | } |
219 | |||
220 | err = memcg_alloc_cache_params(memcg, s, parent_cache); | ||
221 | if (err) | ||
222 | goto out_free_cache; | ||
223 | |||
224 | err = __kmem_cache_create(s, flags); | ||
225 | if (err) | ||
226 | goto out_free_cache; | ||
227 | |||
228 | s->refcount = 1; | ||
229 | list_add(&s->list, &slab_caches); | ||
230 | memcg_register_cache(s); | ||
231 | 239 | ||
232 | out_unlock: | 240 | out_unlock: |
233 | mutex_unlock(&slab_mutex); | 241 | mutex_unlock(&slab_mutex); |
234 | put_online_cpus(); | 242 | put_online_cpus(); |
235 | 243 | ||
236 | if (err) { | 244 | if (err) { |
237 | /* | ||
238 | * There is no point in flooding logs with warnings or | ||
239 | * especially crashing the system if we fail to create a cache | ||
240 | * for a memcg. In this case we will be accounting the memcg | ||
241 | * allocation to the root cgroup until we succeed to create its | ||
242 | * own cache, but it isn't that critical. | ||
243 | */ | ||
244 | if (!memcg) | ||
245 | return NULL; | ||
246 | |||
247 | if (flags & SLAB_PANIC) | 245 | if (flags & SLAB_PANIC) |
248 | panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n", | 246 | panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n", |
249 | name, err); | 247 | name, err); |
@@ -255,52 +253,112 @@ out_unlock: | |||
255 | return NULL; | 253 | return NULL; |
256 | } | 254 | } |
257 | return s; | 255 | return s; |
256 | } | ||
257 | EXPORT_SYMBOL(kmem_cache_create); | ||
258 | 258 | ||
259 | out_free_cache: | 259 | #ifdef CONFIG_MEMCG_KMEM |
260 | memcg_free_cache_params(s); | 260 | /* |
261 | kfree(s->name); | 261 | * kmem_cache_create_memcg - Create a cache for a memory cgroup. |
262 | kmem_cache_free(kmem_cache, s); | 262 | * @memcg: The memory cgroup the new cache is for. |
263 | goto out_unlock; | 263 | * @root_cache: The parent of the new cache. |
264 | * | ||
265 | * This function attempts to create a kmem cache that will serve allocation | ||
266 | * requests going from @memcg to @root_cache. The new cache inherits properties | ||
267 | * from its parent. | ||
268 | */ | ||
269 | void kmem_cache_create_memcg(struct mem_cgroup *memcg, struct kmem_cache *root_cache) | ||
270 | { | ||
271 | struct kmem_cache *s; | ||
272 | char *cache_name; | ||
273 | |||
274 | get_online_cpus(); | ||
275 | mutex_lock(&slab_mutex); | ||
276 | |||
277 | /* | ||
278 | * Since per-memcg caches are created asynchronously on first | ||
279 | * allocation (see memcg_kmem_get_cache()), several threads can try to | ||
280 | * create the same cache, but only one of them may succeed. | ||
281 | */ | ||
282 | if (cache_from_memcg_idx(root_cache, memcg_cache_id(memcg))) | ||
283 | goto out_unlock; | ||
284 | |||
285 | cache_name = memcg_create_cache_name(memcg, root_cache); | ||
286 | if (!cache_name) | ||
287 | goto out_unlock; | ||
288 | |||
289 | s = do_kmem_cache_create(cache_name, root_cache->object_size, | ||
290 | root_cache->size, root_cache->align, | ||
291 | root_cache->flags, root_cache->ctor, | ||
292 | memcg, root_cache); | ||
293 | if (IS_ERR(s)) { | ||
294 | kfree(cache_name); | ||
295 | goto out_unlock; | ||
296 | } | ||
297 | |||
298 | s->allocflags |= __GFP_KMEMCG; | ||
299 | |||
300 | out_unlock: | ||
301 | mutex_unlock(&slab_mutex); | ||
302 | put_online_cpus(); | ||
264 | } | 303 | } |
265 | 304 | ||
266 | struct kmem_cache * | 305 | static int kmem_cache_destroy_memcg_children(struct kmem_cache *s) |
267 | kmem_cache_create(const char *name, size_t size, size_t align, | ||
268 | unsigned long flags, void (*ctor)(void *)) | ||
269 | { | 306 | { |
270 | return kmem_cache_create_memcg(NULL, name, size, align, flags, ctor, NULL); | 307 | int rc; |
308 | |||
309 | if (!s->memcg_params || | ||
310 | !s->memcg_params->is_root_cache) | ||
311 | return 0; | ||
312 | |||
313 | mutex_unlock(&slab_mutex); | ||
314 | rc = __kmem_cache_destroy_memcg_children(s); | ||
315 | mutex_lock(&slab_mutex); | ||
316 | |||
317 | return rc; | ||
271 | } | 318 | } |
272 | EXPORT_SYMBOL(kmem_cache_create); | 319 | #else |
320 | static int kmem_cache_destroy_memcg_children(struct kmem_cache *s) | ||
321 | { | ||
322 | return 0; | ||
323 | } | ||
324 | #endif /* CONFIG_MEMCG_KMEM */ | ||
273 | 325 | ||
274 | void kmem_cache_destroy(struct kmem_cache *s) | 326 | void kmem_cache_destroy(struct kmem_cache *s) |
275 | { | 327 | { |
276 | /* Destroy all the children caches if we aren't a memcg cache */ | ||
277 | kmem_cache_destroy_memcg_children(s); | ||
278 | |||
279 | get_online_cpus(); | 328 | get_online_cpus(); |
280 | mutex_lock(&slab_mutex); | 329 | mutex_lock(&slab_mutex); |
330 | |||
281 | s->refcount--; | 331 | s->refcount--; |
282 | if (!s->refcount) { | 332 | if (s->refcount) |
283 | list_del(&s->list); | 333 | goto out_unlock; |
284 | 334 | ||
285 | if (!__kmem_cache_shutdown(s)) { | 335 | if (kmem_cache_destroy_memcg_children(s) != 0) |
286 | memcg_unregister_cache(s); | 336 | goto out_unlock; |
287 | mutex_unlock(&slab_mutex); | 337 | |
288 | if (s->flags & SLAB_DESTROY_BY_RCU) | 338 | list_del(&s->list); |
289 | rcu_barrier(); | 339 | memcg_unregister_cache(s); |
290 | 340 | ||
291 | memcg_free_cache_params(s); | 341 | if (__kmem_cache_shutdown(s) != 0) { |
292 | kfree(s->name); | 342 | list_add(&s->list, &slab_caches); |
293 | kmem_cache_free(kmem_cache, s); | 343 | memcg_register_cache(s); |
294 | } else { | 344 | printk(KERN_ERR "kmem_cache_destroy %s: " |
295 | list_add(&s->list, &slab_caches); | 345 | "Slab cache still has objects\n", s->name); |
296 | mutex_unlock(&slab_mutex); | 346 | dump_stack(); |
297 | printk(KERN_ERR "kmem_cache_destroy %s: Slab cache still has objects\n", | 347 | goto out_unlock; |
298 | s->name); | ||
299 | dump_stack(); | ||
300 | } | ||
301 | } else { | ||
302 | mutex_unlock(&slab_mutex); | ||
303 | } | 348 | } |
349 | |||
350 | mutex_unlock(&slab_mutex); | ||
351 | if (s->flags & SLAB_DESTROY_BY_RCU) | ||
352 | rcu_barrier(); | ||
353 | |||
354 | memcg_free_cache_params(s); | ||
355 | kfree(s->name); | ||
356 | kmem_cache_free(kmem_cache, s); | ||
357 | goto out_put_cpus; | ||
358 | |||
359 | out_unlock: | ||
360 | mutex_unlock(&slab_mutex); | ||
361 | out_put_cpus: | ||
304 | put_online_cpus(); | 362 | put_online_cpus(); |
305 | } | 363 | } |
306 | EXPORT_SYMBOL(kmem_cache_destroy); | 364 | EXPORT_SYMBOL(kmem_cache_destroy); |
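kmem_cache_destroy() is rewritten from nested if/else into a straight-line sequence with goto labels: bail out under slab_mutex if the refcount is still positive or a memcg child cannot be destroyed, re-register the cache if __kmem_cache_shutdown() fails, and otherwise fall through to a single teardown path. A compact user-space sketch of that goto-based unwinding style (the cache structure is invented):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t registry_lock = PTHREAD_MUTEX_INITIALIZER;

struct cache {
        int refcount;
        char name[32];
};

/* Pretend shutdown can fail because the cache still holds live objects. */
static int shutdown_cache(struct cache *c)
{
        (void)c;
        return 0;                        /* 0 == success */
}

static void destroy_cache(struct cache *c)
{
        pthread_mutex_lock(&registry_lock);

        if (--c->refcount)               /* still referenced elsewhere */
                goto out_unlock;

        if (shutdown_cache(c) != 0) {
                fprintf(stderr, "%s: cache still has objects\n", c->name);
                goto out_unlock;         /* leave it registered */
        }

        pthread_mutex_unlock(&registry_lock);
        free(c);                         /* single teardown path */
        return;

out_unlock:
        pthread_mutex_unlock(&registry_lock);
}

int main(void)
{
        struct cache *c = malloc(sizeof(*c));

        if (!c)
                return 1;
        c->refcount = 1;
        snprintf(c->name, sizeof(c->name), "demo-cache");
        destroy_cache(c);
        return 0;
}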
diff --git a/mm/slub.c b/mm/slub.c --- a/mm/slub.c +++ b/mm/slub.c | |||
@@ -224,7 +224,11 @@ static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { } | |||
224 | static inline void stat(const struct kmem_cache *s, enum stat_item si) | 224 | static inline void stat(const struct kmem_cache *s, enum stat_item si) |
225 | { | 225 | { |
226 | #ifdef CONFIG_SLUB_STATS | 226 | #ifdef CONFIG_SLUB_STATS |
227 | __this_cpu_inc(s->cpu_slab->stat[si]); | 227 | /* |
228 | * The rmw is racy on a preemptible kernel but this is acceptable, so | ||
229 | * avoid this_cpu_add()'s irq-disable overhead. | ||
230 | */ | ||
231 | raw_cpu_inc(s->cpu_slab->stat[si]); | ||
228 | #endif | 232 | #endif |
229 | } | 233 | } |
230 | 234 | ||
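The stat() hunk switches SLUB's statistics counters from __this_cpu_inc() to raw_cpu_inc(), accepting a racy read-modify-write because a rarely lost statistics increment is cheaper than the protection this_cpu_add() would pay for. A rough user-space analogy of that trade-off, comparing an exact atomic counter with a plain unprotected one (this is not the kernel's per-cpu machinery):

#include <stdatomic.h>
#include <stdio.h>

static atomic_long exact_hits;    /* pays for an atomic read-modify-write */
static long relaxed_hits;         /* plain add: a rare lost update is tolerated */

static void count_hit(void)
{
        atomic_fetch_add_explicit(&exact_hits, 1, memory_order_relaxed);
        relaxed_hits++;            /* analogous in spirit to raw_cpu_inc() */
}

int main(void)
{
        for (int i = 0; i < 1000; i++)
                count_hit();
        printf("exact=%ld relaxed=%ld\n",
               atomic_load_explicit(&exact_hits, memory_order_relaxed),
               relaxed_hits);
        return 0;
}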
@@ -1685,7 +1689,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, | |||
1685 | 1689 | ||
1686 | do { | 1690 | do { |
1687 | cpuset_mems_cookie = read_mems_allowed_begin(); | 1691 | cpuset_mems_cookie = read_mems_allowed_begin(); |
1688 | zonelist = node_zonelist(slab_node(), flags); | 1692 | zonelist = node_zonelist(mempolicy_slab_node(), flags); |
1689 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { | 1693 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { |
1690 | struct kmem_cache_node *n; | 1694 | struct kmem_cache_node *n; |
1691 | 1695 | ||
@@ -3685,6 +3689,9 @@ static int slab_unmergeable(struct kmem_cache *s) | |||
3685 | if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE)) | 3689 | if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE)) |
3686 | return 1; | 3690 | return 1; |
3687 | 3691 | ||
3692 | if (!is_root_cache(s)) | ||
3693 | return 1; | ||
3694 | |||
3688 | if (s->ctor) | 3695 | if (s->ctor) |
3689 | return 1; | 3696 | return 1; |
3690 | 3697 | ||
@@ -3697,9 +3704,8 @@ static int slab_unmergeable(struct kmem_cache *s) | |||
3697 | return 0; | 3704 | return 0; |
3698 | } | 3705 | } |
3699 | 3706 | ||
3700 | static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size, | 3707 | static struct kmem_cache *find_mergeable(size_t size, size_t align, |
3701 | size_t align, unsigned long flags, const char *name, | 3708 | unsigned long flags, const char *name, void (*ctor)(void *)) |
3702 | void (*ctor)(void *)) | ||
3703 | { | 3709 | { |
3704 | struct kmem_cache *s; | 3710 | struct kmem_cache *s; |
3705 | 3711 | ||
@@ -3722,7 +3728,7 @@ static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size, | |||
3722 | continue; | 3728 | continue; |
3723 | 3729 | ||
3724 | if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME)) | 3730 | if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME)) |
3725 | continue; | 3731 | continue; |
3726 | /* | 3732 | /* |
3727 | * Check if alignment is compatible. | 3733 | * Check if alignment is compatible. |
3728 | * Courtesy of Adrian Drzewiecki | 3734 | * Courtesy of Adrian Drzewiecki |
@@ -3733,23 +3739,24 @@ static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size, | |||
3733 | if (s->size - size >= sizeof(void *)) | 3739 | if (s->size - size >= sizeof(void *)) |
3734 | continue; | 3740 | continue; |
3735 | 3741 | ||
3736 | if (!cache_match_memcg(s, memcg)) | ||
3737 | continue; | ||
3738 | |||
3739 | return s; | 3742 | return s; |
3740 | } | 3743 | } |
3741 | return NULL; | 3744 | return NULL; |
3742 | } | 3745 | } |
3743 | 3746 | ||
3744 | struct kmem_cache * | 3747 | struct kmem_cache * |
3745 | __kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, | 3748 | __kmem_cache_alias(const char *name, size_t size, size_t align, |
3746 | size_t align, unsigned long flags, void (*ctor)(void *)) | 3749 | unsigned long flags, void (*ctor)(void *)) |
3747 | { | 3750 | { |
3748 | struct kmem_cache *s; | 3751 | struct kmem_cache *s; |
3749 | 3752 | ||
3750 | s = find_mergeable(memcg, size, align, flags, name, ctor); | 3753 | s = find_mergeable(size, align, flags, name, ctor); |
3751 | if (s) { | 3754 | if (s) { |
3755 | int i; | ||
3756 | struct kmem_cache *c; | ||
3757 | |||
3752 | s->refcount++; | 3758 | s->refcount++; |
3759 | |||
3753 | /* | 3760 | /* |
3754 | * Adjust the object sizes so that we clear | 3761 | * Adjust the object sizes so that we clear |
3755 | * the complete object on kzalloc. | 3762 | * the complete object on kzalloc. |
@@ -3757,6 +3764,15 @@ __kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, | |||
3757 | s->object_size = max(s->object_size, (int)size); | 3764 | s->object_size = max(s->object_size, (int)size); |
3758 | s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); | 3765 | s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); |
3759 | 3766 | ||
3767 | for_each_memcg_cache_index(i) { | ||
3768 | c = cache_from_memcg_idx(s, i); | ||
3769 | if (!c) | ||
3770 | continue; | ||
3771 | c->object_size = s->object_size; | ||
3772 | c->inuse = max_t(int, c->inuse, | ||
3773 | ALIGN(size, sizeof(void *))); | ||
3774 | } | ||
3775 | |||
3760 | if (sysfs_slab_alias(s, name)) { | 3776 | if (sysfs_slab_alias(s, name)) { |
3761 | s->refcount--; | 3777 | s->refcount--; |
3762 | s = NULL; | 3778 | s = NULL; |
@@ -5126,6 +5142,15 @@ static const struct kset_uevent_ops slab_uevent_ops = { | |||
5126 | 5142 | ||
5127 | static struct kset *slab_kset; | 5143 | static struct kset *slab_kset; |
5128 | 5144 | ||
5145 | static inline struct kset *cache_kset(struct kmem_cache *s) | ||
5146 | { | ||
5147 | #ifdef CONFIG_MEMCG_KMEM | ||
5148 | if (!is_root_cache(s)) | ||
5149 | return s->memcg_params->root_cache->memcg_kset; | ||
5150 | #endif | ||
5151 | return slab_kset; | ||
5152 | } | ||
5153 | |||
5129 | #define ID_STR_LENGTH 64 | 5154 | #define ID_STR_LENGTH 64 |
5130 | 5155 | ||
5131 | /* Create a unique string id for a slab cache: | 5156 | /* Create a unique string id for a slab cache: |
@@ -5191,26 +5216,39 @@ static int sysfs_slab_add(struct kmem_cache *s) | |||
5191 | name = create_unique_id(s); | 5216 | name = create_unique_id(s); |
5192 | } | 5217 | } |
5193 | 5218 | ||
5194 | s->kobj.kset = slab_kset; | 5219 | s->kobj.kset = cache_kset(s); |
5195 | err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name); | 5220 | err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name); |
5196 | if (err) { | 5221 | if (err) |
5197 | kobject_put(&s->kobj); | 5222 | goto out_put_kobj; |
5198 | return err; | ||
5199 | } | ||
5200 | 5223 | ||
5201 | err = sysfs_create_group(&s->kobj, &slab_attr_group); | 5224 | err = sysfs_create_group(&s->kobj, &slab_attr_group); |
5202 | if (err) { | 5225 | if (err) |
5203 | kobject_del(&s->kobj); | 5226 | goto out_del_kobj; |
5204 | kobject_put(&s->kobj); | 5227 | |
5205 | return err; | 5228 | #ifdef CONFIG_MEMCG_KMEM |
5229 | if (is_root_cache(s)) { | ||
5230 | s->memcg_kset = kset_create_and_add("cgroup", NULL, &s->kobj); | ||
5231 | if (!s->memcg_kset) { | ||
5232 | err = -ENOMEM; | ||
5233 | goto out_del_kobj; | ||
5234 | } | ||
5206 | } | 5235 | } |
5236 | #endif | ||
5237 | |||
5207 | kobject_uevent(&s->kobj, KOBJ_ADD); | 5238 | kobject_uevent(&s->kobj, KOBJ_ADD); |
5208 | if (!unmergeable) { | 5239 | if (!unmergeable) { |
5209 | /* Setup first alias */ | 5240 | /* Setup first alias */ |
5210 | sysfs_slab_alias(s, s->name); | 5241 | sysfs_slab_alias(s, s->name); |
5211 | kfree(name); | ||
5212 | } | 5242 | } |
5213 | return 0; | 5243 | out: |
5244 | if (!unmergeable) | ||
5245 | kfree(name); | ||
5246 | return err; | ||
5247 | out_del_kobj: | ||
5248 | kobject_del(&s->kobj); | ||
5249 | out_put_kobj: | ||
5250 | kobject_put(&s->kobj); | ||
5251 | goto out; | ||
5214 | } | 5252 | } |
5215 | 5253 | ||
5216 | static void sysfs_slab_remove(struct kmem_cache *s) | 5254 | static void sysfs_slab_remove(struct kmem_cache *s) |
@@ -5222,6 +5260,9 @@ static void sysfs_slab_remove(struct kmem_cache *s) | |||
5222 | */ | 5260 | */ |
5223 | return; | 5261 | return; |
5224 | 5262 | ||
5263 | #ifdef CONFIG_MEMCG_KMEM | ||
5264 | kset_unregister(s->memcg_kset); | ||
5265 | #endif | ||
5225 | kobject_uevent(&s->kobj, KOBJ_REMOVE); | 5266 | kobject_uevent(&s->kobj, KOBJ_REMOVE); |
5226 | kobject_del(&s->kobj); | 5267 | kobject_del(&s->kobj); |
5227 | kobject_put(&s->kobj); | 5268 | kobject_put(&s->kobj); |
diff --git a/mm/sparse.c b/mm/sparse.c index 38cad8fd7397..d1b48b691ac8 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -5,10 +5,12 @@ | |||
5 | #include <linux/slab.h> | 5 | #include <linux/slab.h> |
6 | #include <linux/mmzone.h> | 6 | #include <linux/mmzone.h> |
7 | #include <linux/bootmem.h> | 7 | #include <linux/bootmem.h> |
8 | #include <linux/compiler.h> | ||
8 | #include <linux/highmem.h> | 9 | #include <linux/highmem.h> |
9 | #include <linux/export.h> | 10 | #include <linux/export.h> |
10 | #include <linux/spinlock.h> | 11 | #include <linux/spinlock.h> |
11 | #include <linux/vmalloc.h> | 12 | #include <linux/vmalloc.h> |
13 | |||
12 | #include "internal.h" | 14 | #include "internal.h" |
13 | #include <asm/dma.h> | 15 | #include <asm/dma.h> |
14 | #include <asm/pgalloc.h> | 16 | #include <asm/pgalloc.h> |
@@ -461,7 +463,7 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) | |||
461 | } | 463 | } |
462 | #endif | 464 | #endif |
463 | 465 | ||
464 | void __attribute__((weak)) __meminit vmemmap_populate_print_last(void) | 466 | void __weak __meminit vmemmap_populate_print_last(void) |
465 | { | 467 | { |
466 | } | 468 | } |
467 | 469 | ||
diff --git a/mm/util.c b/mm/util.c --- a/mm/util.c +++ b/mm/util.c | |||
@@ -1,6 +1,7 @@ | |||
1 | #include <linux/mm.h> | 1 | #include <linux/mm.h> |
2 | #include <linux/slab.h> | 2 | #include <linux/slab.h> |
3 | #include <linux/string.h> | 3 | #include <linux/string.h> |
4 | #include <linux/compiler.h> | ||
4 | #include <linux/export.h> | 5 | #include <linux/export.h> |
5 | #include <linux/err.h> | 6 | #include <linux/err.h> |
6 | #include <linux/sched.h> | 7 | #include <linux/sched.h> |
@@ -307,7 +308,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm) | |||
307 | * If the architecture not support this function, simply return with no | 308 | * If the architecture not support this function, simply return with no |
308 | * page pinned | 309 | * page pinned |
309 | */ | 310 | */ |
310 | int __attribute__((weak)) __get_user_pages_fast(unsigned long start, | 311 | int __weak __get_user_pages_fast(unsigned long start, |
311 | int nr_pages, int write, struct page **pages) | 312 | int nr_pages, int write, struct page **pages) |
312 | { | 313 | { |
313 | return 0; | 314 | return 0; |
@@ -338,7 +339,7 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast); | |||
338 | * callers need to carefully consider what to use. On many architectures, | 339 | * callers need to carefully consider what to use. On many architectures, |
339 | * get_user_pages_fast simply falls back to get_user_pages. | 340 | * get_user_pages_fast simply falls back to get_user_pages. |
340 | */ | 341 | */ |
341 | int __attribute__((weak)) get_user_pages_fast(unsigned long start, | 342 | int __weak get_user_pages_fast(unsigned long start, |
342 | int nr_pages, int write, struct page **pages) | 343 | int nr_pages, int write, struct page **pages) |
343 | { | 344 | { |
344 | struct mm_struct *mm = current->mm; | 345 | struct mm_struct *mm = current->mm; |
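The util.c hunks replace the spelled-out __attribute__((weak)) with the __weak shorthand; a weak definition is a default that any strong definition elsewhere overrides at link time. A single-file user-space demonstration (with no strong override present, the weak default is what runs):

#include <stdio.h>

#define __weak __attribute__((weak))   /* same shorthand the kernel uses */

/* Default, overridable implementation: another object file providing a
 * non-weak fast_lookup() would replace this at link time. */
__weak int fast_lookup(int key)
{
        (void)key;
        return -1;                     /* "not supported, use the slow path" */
}

int main(void)
{
        int v = fast_lookup(42);

        printf(v < 0 ? "falling back to slow path\n" : "fast path hit\n");
        return 0;
}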
diff --git a/mm/vmacache.c b/mm/vmacache.c new file mode 100644 index 000000000000..d4224b397c0e --- /dev/null +++ b/mm/vmacache.c | |||
@@ -0,0 +1,112 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2014 Davidlohr Bueso. | ||
3 | */ | ||
4 | #include <linux/sched.h> | ||
5 | #include <linux/mm.h> | ||
6 | #include <linux/vmacache.h> | ||
7 | |||
8 | /* | ||
9 | * Flush vma caches for threads that share a given mm. | ||
10 | * | ||
11 | * The operation is safe because the caller holds the mmap_sem | ||
12 | * exclusively and other threads accessing the vma cache will | ||
13 | * have mmap_sem held at least for read, so no extra locking | ||
14 | * is required to maintain the vma cache. | ||
15 | */ | ||
16 | void vmacache_flush_all(struct mm_struct *mm) | ||
17 | { | ||
18 | struct task_struct *g, *p; | ||
19 | |||
20 | rcu_read_lock(); | ||
21 | for_each_process_thread(g, p) { | ||
22 | /* | ||
23 | * Only flush the vmacache pointers as the | ||
24 | * mm seqnum is already set and curr's will | ||
25 | * be set upon invalidation when the next | ||
26 | * lookup is done. | ||
27 | */ | ||
28 | if (mm == p->mm) | ||
29 | vmacache_flush(p); | ||
30 | } | ||
31 | rcu_read_unlock(); | ||
32 | } | ||
33 | |||
34 | /* | ||
35 | * This task may be accessing a foreign mm via (for example) | ||
36 | * get_user_pages()->find_vma(). The vmacache is task-local and this | ||
37 | * task's vmacache pertains to a different mm (ie, its own). There is | ||
38 | * nothing we can do here. | ||
39 | * | ||
40 | * Also handle the case where a kernel thread has adopted this mm via use_mm(). | ||
41 | * That kernel thread's vmacache is not applicable to this mm. | ||
42 | */ | ||
43 | static bool vmacache_valid_mm(struct mm_struct *mm) | ||
44 | { | ||
45 | return current->mm == mm && !(current->flags & PF_KTHREAD); | ||
46 | } | ||
47 | |||
48 | void vmacache_update(unsigned long addr, struct vm_area_struct *newvma) | ||
49 | { | ||
50 | if (vmacache_valid_mm(newvma->vm_mm)) | ||
51 | current->vmacache[VMACACHE_HASH(addr)] = newvma; | ||
52 | } | ||
53 | |||
54 | static bool vmacache_valid(struct mm_struct *mm) | ||
55 | { | ||
56 | struct task_struct *curr; | ||
57 | |||
58 | if (!vmacache_valid_mm(mm)) | ||
59 | return false; | ||
60 | |||
61 | curr = current; | ||
62 | if (mm->vmacache_seqnum != curr->vmacache_seqnum) { | ||
63 | /* | ||
64 | * First attempt will always be invalid, initialize | ||
65 | * the new cache for this task here. | ||
66 | */ | ||
67 | curr->vmacache_seqnum = mm->vmacache_seqnum; | ||
68 | vmacache_flush(curr); | ||
69 | return false; | ||
70 | } | ||
71 | return true; | ||
72 | } | ||
73 | |||
74 | struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr) | ||
75 | { | ||
76 | int i; | ||
77 | |||
78 | if (!vmacache_valid(mm)) | ||
79 | return NULL; | ||
80 | |||
81 | for (i = 0; i < VMACACHE_SIZE; i++) { | ||
82 | struct vm_area_struct *vma = current->vmacache[i]; | ||
83 | |||
84 | if (vma && vma->vm_start <= addr && vma->vm_end > addr) { | ||
85 | BUG_ON(vma->vm_mm != mm); | ||
86 | return vma; | ||
87 | } | ||
88 | } | ||
89 | |||
90 | return NULL; | ||
91 | } | ||
92 | |||
93 | #ifndef CONFIG_MMU | ||
94 | struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm, | ||
95 | unsigned long start, | ||
96 | unsigned long end) | ||
97 | { | ||
98 | int i; | ||
99 | |||
100 | if (!vmacache_valid(mm)) | ||
101 | return NULL; | ||
102 | |||
103 | for (i = 0; i < VMACACHE_SIZE; i++) { | ||
104 | struct vm_area_struct *vma = current->vmacache[i]; | ||
105 | |||
106 | if (vma && vma->vm_start == start && vma->vm_end == end) | ||
107 | return vma; | ||
108 | } | ||
109 | |||
110 | return NULL; | ||
111 | } | ||
112 | #endif | ||
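The new mm/vmacache.c gives each thread a small direct-mapped cache of recently used VMAs, validated against a per-mm sequence number and consulted before the slower tree lookup. A user-space sketch of the lookup/update pattern a caller such as find_vma() would follow; the hash, table and cache size below are invented stand-ins, and the sequence-number validation is omitted:

#include <stdio.h>

#define CACHE_SIZE 4
#define CACHE_HASH(addr) (((addr) >> 12) & (CACHE_SIZE - 1))   /* page-granular */

struct range { unsigned long start, end; };

static struct range table[] = {           /* stands in for the mm's rbtree */
        { 0x1000, 0x5000 }, { 0x8000, 0x9000 },
};
static struct range *cache[CACHE_SIZE];   /* per-thread in the real thing */

static struct range *cache_find(unsigned long addr)
{
        for (int i = 0; i < CACHE_SIZE; i++) {
                struct range *r = cache[i];

                if (r && r->start <= addr && r->end > addr)
                        return r;
        }
        return NULL;
}

static void cache_update(unsigned long addr, struct range *r)
{
        cache[CACHE_HASH(addr)] = r;
}

static struct range *find_range(unsigned long addr)
{
        struct range *r = cache_find(addr);       /* fast path */

        if (r)
                return r;
        for (unsigned int i = 0; i < sizeof(table) / sizeof(table[0]); i++) {
                if (table[i].start <= addr && table[i].end > addr) {
                        cache_update(addr, &table[i]);   /* remember for next time */
                        return &table[i];
                }
        }
        return NULL;
}

int main(void)
{
        printf("first lookup:  %p\n", (void *)find_range(0x2000));  /* slow path */
        printf("second lookup: %p\n", (void *)find_range(0x2000));  /* cached */
        return 0;
}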
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 0fdf96803c5b..bf233b283319 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -27,7 +27,9 @@ | |||
27 | #include <linux/pfn.h> | 27 | #include <linux/pfn.h> |
28 | #include <linux/kmemleak.h> | 28 | #include <linux/kmemleak.h> |
29 | #include <linux/atomic.h> | 29 | #include <linux/atomic.h> |
30 | #include <linux/compiler.h> | ||
30 | #include <linux/llist.h> | 31 | #include <linux/llist.h> |
32 | |||
31 | #include <asm/uaccess.h> | 33 | #include <asm/uaccess.h> |
32 | #include <asm/tlbflush.h> | 34 | #include <asm/tlbflush.h> |
33 | #include <asm/shmparam.h> | 35 | #include <asm/shmparam.h> |
@@ -1083,6 +1085,12 @@ EXPORT_SYMBOL(vm_unmap_ram); | |||
1083 | * @node: prefer to allocate data structures on this node | 1085 | * @node: prefer to allocate data structures on this node |
1084 | * @prot: memory protection to use. PAGE_KERNEL for regular RAM | 1086 | * @prot: memory protection to use. PAGE_KERNEL for regular RAM |
1085 | * | 1087 | * |
1088 | * If you use this function for less than VMAP_MAX_ALLOC pages, it could be | ||
1089 | * faster than vmap so it's good. But if you mix long-life and short-life | ||
1090 | * objects with vm_map_ram(), it could consume lots of address space through | ||
1091 | * fragmentation (especially on a 32bit machine). You could see failures in | ||
1092 | * the end. Please use this function for short-lived objects. | ||
1093 | * | ||
1086 | * Returns: a pointer to the address that has been mapped, or %NULL on failure | 1094 | * Returns: a pointer to the address that has been mapped, or %NULL on failure |
1087 | */ | 1095 | */ |
1088 | void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot) | 1096 | void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot) |
@@ -2181,7 +2189,7 @@ EXPORT_SYMBOL(remap_vmalloc_range); | |||
2181 | * Implement a stub for vmalloc_sync_all() if the architecture chose not to | 2189 | * Implement a stub for vmalloc_sync_all() if the architecture chose not to |
2182 | * have one. | 2190 | * have one. |
2183 | */ | 2191 | */ |
2184 | void __attribute__((weak)) vmalloc_sync_all(void) | 2192 | void __weak vmalloc_sync_all(void) |
2185 | { | 2193 | { |
2186 | } | 2194 | } |
2187 | 2195 | ||
diff --git a/mm/vmscan.c b/mm/vmscan.c index 1f56a80a7c41..06879ead7380 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -2314,15 +2314,18 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) | |||
2314 | unsigned long lru_pages = 0; | 2314 | unsigned long lru_pages = 0; |
2315 | bool aborted_reclaim = false; | 2315 | bool aborted_reclaim = false; |
2316 | struct reclaim_state *reclaim_state = current->reclaim_state; | 2316 | struct reclaim_state *reclaim_state = current->reclaim_state; |
2317 | gfp_t orig_mask; | ||
2317 | struct shrink_control shrink = { | 2318 | struct shrink_control shrink = { |
2318 | .gfp_mask = sc->gfp_mask, | 2319 | .gfp_mask = sc->gfp_mask, |
2319 | }; | 2320 | }; |
2321 | enum zone_type requested_highidx = gfp_zone(sc->gfp_mask); | ||
2320 | 2322 | ||
2321 | /* | 2323 | /* |
2322 | * If the number of buffer_heads in the machine exceeds the maximum | 2324 | * If the number of buffer_heads in the machine exceeds the maximum |
2323 | * allowed level, force direct reclaim to scan the highmem zone as | 2325 | * allowed level, force direct reclaim to scan the highmem zone as |
2324 | * highmem pages could be pinning lowmem pages storing buffer_heads | 2326 | * highmem pages could be pinning lowmem pages storing buffer_heads |
2325 | */ | 2327 | */ |
2328 | orig_mask = sc->gfp_mask; | ||
2326 | if (buffer_heads_over_limit) | 2329 | if (buffer_heads_over_limit) |
2327 | sc->gfp_mask |= __GFP_HIGHMEM; | 2330 | sc->gfp_mask |= __GFP_HIGHMEM; |
2328 | 2331 | ||
@@ -2356,7 +2359,8 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) | |||
2356 | * noticeable problem, like transparent huge | 2359 | * noticeable problem, like transparent huge |
2357 | * page allocations. | 2360 | * page allocations. |
2358 | */ | 2361 | */ |
2359 | if (compaction_ready(zone, sc)) { | 2362 | if ((zonelist_zone_idx(z) <= requested_highidx) |
2363 | && compaction_ready(zone, sc)) { | ||
2360 | aborted_reclaim = true; | 2364 | aborted_reclaim = true; |
2361 | continue; | 2365 | continue; |
2362 | } | 2366 | } |
@@ -2393,6 +2397,12 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) | |||
2393 | } | 2397 | } |
2394 | } | 2398 | } |
2395 | 2399 | ||
2400 | /* | ||
2401 | * Restore to original mask to avoid the impact on the caller if we | ||
2402 | * promoted it to __GFP_HIGHMEM. | ||
2403 | */ | ||
2404 | sc->gfp_mask = orig_mask; | ||
2405 | |||
2396 | return aborted_reclaim; | 2406 | return aborted_reclaim; |
2397 | } | 2407 | } |
2398 | 2408 | ||
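Two things change in shrink_zones(): zones above the zone index implied by the caller's gfp_mask are no longer considered for the compaction_ready() early abort (the zonelist_zone_idx(z) <= requested_highidx check), and the temporary __GFP_HIGHMEM promotion made when buffer_heads_over_limit is set is now undone before returning, so the widened mask never leaks back to the caller. A minimal sketch of that save/restore pattern, with hypothetical names (flags_t, HIGHMEM_FLAG, scan_zones):

typedef unsigned int flags_t;
#define HIGHMEM_FLAG 0x1u

static int scan_zones(flags_t *mask, int buffers_over_limit)
{
	flags_t orig_mask = *mask;	/* remember what the caller asked for */
	int aborted = 0;

	if (buffers_over_limit)
		*mask |= HIGHMEM_FLAG;	/* internal, temporary promotion */

	/* ... walk the zones using *mask ... */

	*mask = orig_mask;		/* never hand the promotion back out */
	return aborted;
}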
diff --git a/mm/zswap.c b/mm/zswap.c index d7337fbf6605..aeaef0fb5624 100644 --- a/mm/zswap.c +++ b/mm/zswap.c | |||
@@ -89,6 +89,9 @@ static unsigned int zswap_max_pool_percent = 20; | |||
89 | module_param_named(max_pool_percent, | 89 | module_param_named(max_pool_percent, |
90 | zswap_max_pool_percent, uint, 0644); | 90 | zswap_max_pool_percent, uint, 0644); |
91 | 91 | ||
92 | /* zbud_pool is shared by all of zswap backend */ | ||
93 | static struct zbud_pool *zswap_pool; | ||
94 | |||
92 | /********************************* | 95 | /********************************* |
93 | * compression functions | 96 | * compression functions |
94 | **********************************/ | 97 | **********************************/ |
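From here on the zswap hunks all make the same substitution: the zbud pool becomes a single module-wide zswap_pool shared by every swap type instead of one pool hung off each struct zswap_tree. Accordingly, tree->pool references turn into zswap_pool, the per-tree pool field goes away, and the BUG_ON cross-check in the writeback path that compared the two becomes unnecessary.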
@@ -160,14 +163,14 @@ static void zswap_comp_exit(void) | |||
160 | * rbnode - links the entry into red-black tree for the appropriate swap type | 163 | * rbnode - links the entry into red-black tree for the appropriate swap type |
161 | * refcount - the number of outstanding reference to the entry. This is needed | 164 | * refcount - the number of outstanding reference to the entry. This is needed |
162 | * to protect against premature freeing of the entry by code | 165 | * to protect against premature freeing of the entry by code |
163 | * concurent calls to load, invalidate, and writeback. The lock | 166 | * concurrent calls to load, invalidate, and writeback. The lock |
164 | * for the zswap_tree structure that contains the entry must | 167 | * for the zswap_tree structure that contains the entry must |
165 | * be held while changing the refcount. Since the lock must | 168 | * be held while changing the refcount. Since the lock must |
166 | * be held, there is no reason to also make refcount atomic. | 169 | * be held, there is no reason to also make refcount atomic. |
167 | * offset - the swap offset for the entry. Index into the red-black tree. | 170 | * offset - the swap offset for the entry. Index into the red-black tree. |
168 | * handle - zsmalloc allocation handle that stores the compressed page data | 171 | * handle - zbud allocation handle that stores the compressed page data |
169 | * length - the length in bytes of the compressed page data. Needed during | 172 | * length - the length in bytes of the compressed page data. Needed during |
170 | * decompression | 173 | * decompression |
171 | */ | 174 | */ |
172 | struct zswap_entry { | 175 | struct zswap_entry { |
173 | struct rb_node rbnode; | 176 | struct rb_node rbnode; |
@@ -189,7 +192,6 @@ struct zswap_header { | |||
189 | struct zswap_tree { | 192 | struct zswap_tree { |
190 | struct rb_root rbroot; | 193 | struct rb_root rbroot; |
191 | spinlock_t lock; | 194 | spinlock_t lock; |
192 | struct zbud_pool *pool; | ||
193 | }; | 195 | }; |
194 | 196 | ||
195 | static struct zswap_tree *zswap_trees[MAX_SWAPFILES]; | 197 | static struct zswap_tree *zswap_trees[MAX_SWAPFILES]; |
@@ -202,7 +204,7 @@ static struct kmem_cache *zswap_entry_cache; | |||
202 | static int zswap_entry_cache_create(void) | 204 | static int zswap_entry_cache_create(void) |
203 | { | 205 | { |
204 | zswap_entry_cache = KMEM_CACHE(zswap_entry, 0); | 206 | zswap_entry_cache = KMEM_CACHE(zswap_entry, 0); |
205 | return (zswap_entry_cache == NULL); | 207 | return zswap_entry_cache == NULL; |
206 | } | 208 | } |
207 | 209 | ||
208 | static void zswap_entry_cache_destory(void) | 210 | static void zswap_entry_cache_destory(void) |
@@ -282,16 +284,15 @@ static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) | |||
282 | } | 284 | } |
283 | 285 | ||
284 | /* | 286 | /* |
285 | * Carries out the common pattern of freeing and entry's zsmalloc allocation, | 287 | * Carries out the common pattern of freeing and entry's zbud allocation, |
286 | * freeing the entry itself, and decrementing the number of stored pages. | 288 | * freeing the entry itself, and decrementing the number of stored pages. |
287 | */ | 289 | */ |
288 | static void zswap_free_entry(struct zswap_tree *tree, | 290 | static void zswap_free_entry(struct zswap_entry *entry) |
289 | struct zswap_entry *entry) | ||
290 | { | 291 | { |
291 | zbud_free(tree->pool, entry->handle); | 292 | zbud_free(zswap_pool, entry->handle); |
292 | zswap_entry_cache_free(entry); | 293 | zswap_entry_cache_free(entry); |
293 | atomic_dec(&zswap_stored_pages); | 294 | atomic_dec(&zswap_stored_pages); |
294 | zswap_pool_pages = zbud_get_pool_size(tree->pool); | 295 | zswap_pool_pages = zbud_get_pool_size(zswap_pool); |
295 | } | 296 | } |
296 | 297 | ||
297 | /* caller must hold the tree lock */ | 298 | /* caller must hold the tree lock */ |
@@ -311,7 +312,7 @@ static void zswap_entry_put(struct zswap_tree *tree, | |||
311 | BUG_ON(refcount < 0); | 312 | BUG_ON(refcount < 0); |
312 | if (refcount == 0) { | 313 | if (refcount == 0) { |
313 | zswap_rb_erase(&tree->rbroot, entry); | 314 | zswap_rb_erase(&tree->rbroot, entry); |
314 | zswap_free_entry(tree, entry); | 315 | zswap_free_entry(entry); |
315 | } | 316 | } |
316 | } | 317 | } |
317 | 318 | ||
@@ -407,8 +408,8 @@ cleanup: | |||
407 | **********************************/ | 408 | **********************************/ |
408 | static bool zswap_is_full(void) | 409 | static bool zswap_is_full(void) |
409 | { | 410 | { |
410 | return (totalram_pages * zswap_max_pool_percent / 100 < | 411 | return totalram_pages * zswap_max_pool_percent / 100 < |
411 | zswap_pool_pages); | 412 | zswap_pool_pages; |
412 | } | 413 | } |
413 | 414 | ||
414 | /********************************* | 415 | /********************************* |
@@ -545,7 +546,6 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle) | |||
545 | zbud_unmap(pool, handle); | 546 | zbud_unmap(pool, handle); |
546 | tree = zswap_trees[swp_type(swpentry)]; | 547 | tree = zswap_trees[swp_type(swpentry)]; |
547 | offset = swp_offset(swpentry); | 548 | offset = swp_offset(swpentry); |
548 | BUG_ON(pool != tree->pool); | ||
549 | 549 | ||
550 | /* find and ref zswap entry */ | 550 | /* find and ref zswap entry */ |
551 | spin_lock(&tree->lock); | 551 | spin_lock(&tree->lock); |
@@ -573,13 +573,13 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle) | |||
573 | case ZSWAP_SWAPCACHE_NEW: /* page is locked */ | 573 | case ZSWAP_SWAPCACHE_NEW: /* page is locked */ |
574 | /* decompress */ | 574 | /* decompress */ |
575 | dlen = PAGE_SIZE; | 575 | dlen = PAGE_SIZE; |
576 | src = (u8 *)zbud_map(tree->pool, entry->handle) + | 576 | src = (u8 *)zbud_map(zswap_pool, entry->handle) + |
577 | sizeof(struct zswap_header); | 577 | sizeof(struct zswap_header); |
578 | dst = kmap_atomic(page); | 578 | dst = kmap_atomic(page); |
579 | ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, | 579 | ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, |
580 | entry->length, dst, &dlen); | 580 | entry->length, dst, &dlen); |
581 | kunmap_atomic(dst); | 581 | kunmap_atomic(dst); |
582 | zbud_unmap(tree->pool, entry->handle); | 582 | zbud_unmap(zswap_pool, entry->handle); |
583 | BUG_ON(ret); | 583 | BUG_ON(ret); |
584 | BUG_ON(dlen != PAGE_SIZE); | 584 | BUG_ON(dlen != PAGE_SIZE); |
585 | 585 | ||
@@ -652,7 +652,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, | |||
652 | /* reclaim space if needed */ | 652 | /* reclaim space if needed */ |
653 | if (zswap_is_full()) { | 653 | if (zswap_is_full()) { |
654 | zswap_pool_limit_hit++; | 654 | zswap_pool_limit_hit++; |
655 | if (zbud_reclaim_page(tree->pool, 8)) { | 655 | if (zbud_reclaim_page(zswap_pool, 8)) { |
656 | zswap_reject_reclaim_fail++; | 656 | zswap_reject_reclaim_fail++; |
657 | ret = -ENOMEM; | 657 | ret = -ENOMEM; |
658 | goto reject; | 658 | goto reject; |
@@ -679,7 +679,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, | |||
679 | 679 | ||
680 | /* store */ | 680 | /* store */ |
681 | len = dlen + sizeof(struct zswap_header); | 681 | len = dlen + sizeof(struct zswap_header); |
682 | ret = zbud_alloc(tree->pool, len, __GFP_NORETRY | __GFP_NOWARN, | 682 | ret = zbud_alloc(zswap_pool, len, __GFP_NORETRY | __GFP_NOWARN, |
683 | &handle); | 683 | &handle); |
684 | if (ret == -ENOSPC) { | 684 | if (ret == -ENOSPC) { |
685 | zswap_reject_compress_poor++; | 685 | zswap_reject_compress_poor++; |
@@ -689,11 +689,11 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, | |||
689 | zswap_reject_alloc_fail++; | 689 | zswap_reject_alloc_fail++; |
690 | goto freepage; | 690 | goto freepage; |
691 | } | 691 | } |
692 | zhdr = zbud_map(tree->pool, handle); | 692 | zhdr = zbud_map(zswap_pool, handle); |
693 | zhdr->swpentry = swp_entry(type, offset); | 693 | zhdr->swpentry = swp_entry(type, offset); |
694 | buf = (u8 *)(zhdr + 1); | 694 | buf = (u8 *)(zhdr + 1); |
695 | memcpy(buf, dst, dlen); | 695 | memcpy(buf, dst, dlen); |
696 | zbud_unmap(tree->pool, handle); | 696 | zbud_unmap(zswap_pool, handle); |
697 | put_cpu_var(zswap_dstmem); | 697 | put_cpu_var(zswap_dstmem); |
698 | 698 | ||
699 | /* populate entry */ | 699 | /* populate entry */ |
@@ -716,7 +716,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, | |||
716 | 716 | ||
717 | /* update stats */ | 717 | /* update stats */ |
718 | atomic_inc(&zswap_stored_pages); | 718 | atomic_inc(&zswap_stored_pages); |
719 | zswap_pool_pages = zbud_get_pool_size(tree->pool); | 719 | zswap_pool_pages = zbud_get_pool_size(zswap_pool); |
720 | 720 | ||
721 | return 0; | 721 | return 0; |
722 | 722 | ||
@@ -752,13 +752,13 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, | |||
752 | 752 | ||
753 | /* decompress */ | 753 | /* decompress */ |
754 | dlen = PAGE_SIZE; | 754 | dlen = PAGE_SIZE; |
755 | src = (u8 *)zbud_map(tree->pool, entry->handle) + | 755 | src = (u8 *)zbud_map(zswap_pool, entry->handle) + |
756 | sizeof(struct zswap_header); | 756 | sizeof(struct zswap_header); |
757 | dst = kmap_atomic(page); | 757 | dst = kmap_atomic(page); |
758 | ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length, | 758 | ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length, |
759 | dst, &dlen); | 759 | dst, &dlen); |
760 | kunmap_atomic(dst); | 760 | kunmap_atomic(dst); |
761 | zbud_unmap(tree->pool, entry->handle); | 761 | zbud_unmap(zswap_pool, entry->handle); |
762 | BUG_ON(ret); | 762 | BUG_ON(ret); |
763 | 763 | ||
764 | spin_lock(&tree->lock); | 764 | spin_lock(&tree->lock); |
@@ -804,11 +804,9 @@ static void zswap_frontswap_invalidate_area(unsigned type) | |||
804 | /* walk the tree and free everything */ | 804 | /* walk the tree and free everything */ |
805 | spin_lock(&tree->lock); | 805 | spin_lock(&tree->lock); |
806 | rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode) | 806 | rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode) |
807 | zswap_free_entry(tree, entry); | 807 | zswap_free_entry(entry); |
808 | tree->rbroot = RB_ROOT; | 808 | tree->rbroot = RB_ROOT; |
809 | spin_unlock(&tree->lock); | 809 | spin_unlock(&tree->lock); |
810 | |||
811 | zbud_destroy_pool(tree->pool); | ||
812 | kfree(tree); | 810 | kfree(tree); |
813 | zswap_trees[type] = NULL; | 811 | zswap_trees[type] = NULL; |
814 | } | 812 | } |
@@ -822,20 +820,14 @@ static void zswap_frontswap_init(unsigned type) | |||
822 | struct zswap_tree *tree; | 820 | struct zswap_tree *tree; |
823 | 821 | ||
824 | tree = kzalloc(sizeof(struct zswap_tree), GFP_KERNEL); | 822 | tree = kzalloc(sizeof(struct zswap_tree), GFP_KERNEL); |
825 | if (!tree) | 823 | if (!tree) { |
826 | goto err; | 824 | pr_err("alloc failed, zswap disabled for swap type %d\n", type); |
827 | tree->pool = zbud_create_pool(GFP_KERNEL, &zswap_zbud_ops); | 825 | return; |
828 | if (!tree->pool) | 826 | } |
829 | goto freetree; | 827 | |
830 | tree->rbroot = RB_ROOT; | 828 | tree->rbroot = RB_ROOT; |
831 | spin_lock_init(&tree->lock); | 829 | spin_lock_init(&tree->lock); |
832 | zswap_trees[type] = tree; | 830 | zswap_trees[type] = tree; |
833 | return; | ||
834 | |||
835 | freetree: | ||
836 | kfree(tree); | ||
837 | err: | ||
838 | pr_err("alloc failed, zswap disabled for swap type %d\n", type); | ||
839 | } | 831 | } |
840 | 832 | ||
841 | static struct frontswap_ops zswap_frontswap_ops = { | 833 | static struct frontswap_ops zswap_frontswap_ops = { |
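With the pool no longer created per swap type, zswap_frontswap_init() has only one remaining failure point (the kzalloc of the tree), so the freetree/err label pair collapses into a single early return that logs the error and leaves zswap disabled for that type.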
@@ -907,9 +899,16 @@ static int __init init_zswap(void) | |||
907 | return 0; | 899 | return 0; |
908 | 900 | ||
909 | pr_info("loading zswap\n"); | 901 | pr_info("loading zswap\n"); |
902 | |||
903 | zswap_pool = zbud_create_pool(GFP_KERNEL, &zswap_zbud_ops); | ||
904 | if (!zswap_pool) { | ||
905 | pr_err("zbud pool creation failed\n"); | ||
906 | goto error; | ||
907 | } | ||
908 | |||
910 | if (zswap_entry_cache_create()) { | 909 | if (zswap_entry_cache_create()) { |
911 | pr_err("entry cache creation failed\n"); | 910 | pr_err("entry cache creation failed\n"); |
912 | goto error; | 911 | goto cachefail; |
913 | } | 912 | } |
914 | if (zswap_comp_init()) { | 913 | if (zswap_comp_init()) { |
915 | pr_err("compressor initialization failed\n"); | 914 | pr_err("compressor initialization failed\n"); |
@@ -919,6 +918,7 @@ static int __init init_zswap(void) | |||
919 | pr_err("per-cpu initialization failed\n"); | 918 | pr_err("per-cpu initialization failed\n"); |
920 | goto pcpufail; | 919 | goto pcpufail; |
921 | } | 920 | } |
921 | |||
922 | frontswap_register_ops(&zswap_frontswap_ops); | 922 | frontswap_register_ops(&zswap_frontswap_ops); |
923 | if (zswap_debugfs_init()) | 923 | if (zswap_debugfs_init()) |
924 | pr_warn("debugfs initialization failed\n"); | 924 | pr_warn("debugfs initialization failed\n"); |
@@ -927,6 +927,8 @@ pcpufail: | |||
927 | zswap_comp_exit(); | 927 | zswap_comp_exit(); |
928 | compfail: | 928 | compfail: |
929 | zswap_entry_cache_destory(); | 929 | zswap_entry_cache_destory(); |
930 | cachefail: | ||
931 | zbud_destroy_pool(zswap_pool); | ||
930 | error: | 932 | error: |
931 | return -ENOMEM; | 933 | return -ENOMEM; |
932 | } | 934 | } |
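Pool creation correspondingly moves into init_zswap(), which gains a cachefail label so that a failure in any later step tears down exactly what was set up before it, in reverse order. A generic sketch of that goto-unwind shape; the helpers (create_pool, create_cache, init_compressor and their destroy counterparts) are hypothetical stand-ins for zbud_create_pool(), zswap_entry_cache_create() and zswap_comp_init().

#include <errno.h>
#include <stddef.h>

struct pool;
struct pool *create_pool(void);
int create_cache(void);
int init_compressor(void);
void destroy_cache(void);
void destroy_pool(struct pool *p);

static struct pool *pool;

static int init_sketch(void)
{
	pool = create_pool();
	if (!pool)
		goto error;		/* nothing set up yet */

	if (create_cache())
		goto cachefail;		/* undo the pool only */

	if (init_compressor())
		goto compfail;		/* undo cache, then pool */

	return 0;

compfail:
	destroy_cache();
cachefail:
	destroy_pool(pool);
error:
	return -ENOMEM;
}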