author     Linus Torvalds <torvalds@linux-foundation.org>   2012-10-09 03:23:15 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2012-10-09 03:23:15 -0400
commit     9e2d8656f5e8aa214e66b462680cf86b210b74a8 (patch)
tree       f67d62e896cedf75599ea45f9ecf9999c6ad24cd /mm
parent     1ea4f4f8405cc1ceec23f2d261bc3775785e6712 (diff)
parent     9e695d2ecc8451cc2c1603d60b5c8e7f5581923a (diff)
Merge branch 'akpm' (Andrew's patch-bomb)
Merge patches from Andrew Morton:
"A few misc things and very nearly all of the MM tree. A tremendous
amount of stuff (again), including a significant rbtree library
rework."
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (160 commits)
sparc64: Support transparent huge pages.
mm: thp: Use more portable PMD clearing sequenece in zap_huge_pmd().
mm: Add and use update_mmu_cache_pmd() in transparent huge page code.
sparc64: Document PGD and PMD layout.
sparc64: Eliminate PTE table memory wastage.
sparc64: Halve the size of PTE tables
sparc64: Only support 4MB huge pages and 8KB base pages.
memory-hotplug: suppress "Trying to free nonexistent resource <XXXXXXXXXXXXXXXX-YYYYYYYYYYYYYYYY>" warning
mm: memcg: clean up mm_match_cgroup() signature
mm: document PageHuge somewhat
mm: use %pK for /proc/vmallocinfo
mm, thp: fix mlock statistics
mm, thp: fix mapped pages avoiding unevictable list on mlock
memory-hotplug: update memory block's state and notify userspace
memory-hotplug: preparation to notify memory block's state at memory hot remove
mm: avoid section mismatch warning for memblock_type_name
make GFP_NOTRACK definition unconditional
cma: decrease cc.nr_migratepages after reclaiming pagelist
CMA: migrate mlocked pages
kpageflags: fix wrong KPF_THP on non-huge compound pages
...
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig              3
-rw-r--r--  mm/Makefile             4
-rw-r--r--  mm/bootmem.c           10
-rw-r--r--  mm/compaction.c       562
-rw-r--r--  mm/filemap.c            6
-rw-r--r--  mm/filemap_xip.c       10
-rw-r--r--  mm/fremap.c            16
-rw-r--r--  mm/huge_memory.c      440
-rw-r--r--  mm/hugetlb.c           34
-rw-r--r--  mm/internal.h          52
-rw-r--r--  mm/interval_tree.c    112
-rw-r--r--  mm/kmemleak.c         100
-rw-r--r--  mm/ksm.c               40
-rw-r--r--  mm/madvise.c            8
-rw-r--r--  mm/memblock.c           5
-rw-r--r--  mm/memcontrol.c        22
-rw-r--r--  mm/memory-failure.c     8
-rw-r--r--  mm/memory.c           115
-rw-r--r--  mm/memory_hotplug.c    77
-rw-r--r--  mm/mempolicy.c        148
-rw-r--r--  mm/mlock.c             27
-rw-r--r--  mm/mmap.c             207
-rw-r--r--  mm/mmu_notifier.c     103
-rw-r--r--  mm/mremap.c            73
-rw-r--r--  mm/nobootmem.c          5
-rw-r--r--  mm/nommu.c             33
-rw-r--r--  mm/oom_kill.c           4
-rw-r--r--  mm/page_alloc.c       317
-rw-r--r--  mm/page_isolation.c    43
-rw-r--r--  mm/pgtable-generic.c   50
-rw-r--r--  mm/prio_tree.c        208
-rw-r--r--  mm/rmap.c             159
-rw-r--r--  mm/shmem.c              3
-rw-r--r--  mm/swap.c              13
-rw-r--r--  mm/truncate.c           3
-rw-r--r--  mm/vmalloc.c            5
-rw-r--r--  mm/vmscan.c           111
-rw-r--r--  mm/vmstat.c            14
38 files changed, 1830 insertions, 1320 deletions
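A note on the "significant rbtree library rework" mentioned in the merge message: the diffstat above drops mm/prio_tree.c (208 lines) and adds mm/interval_tree.c (112 lines), and the filemap_xip.c and fremap.c hunks below switch callers from vma_prio_tree_foreach()/vma_prio_tree_remove() to vma_interval_tree_foreach()/vma_interval_tree_remove(). In short, the file-backed VMAs hanging off address_space->i_mmap are now indexed by an rbtree-based interval tree. The sketch below only illustrates the overlap ("stab") query those helpers answer; it is self-contained user-space C with invented names and a plain linear scan, not the kernel's augmented-rbtree implementation.

#include <stdio.h>

/* Toy stand-in for a file-backed mapping covering pages [first, last]. */
struct mapping_range {
	unsigned long first, last;
	const char *name;
};

/*
 * Report every range overlapping the query [start, last].  This is the
 * kind of query vma_interval_tree_foreach() answers against i_mmap in
 * O(log n + k); a linear scan keeps the example trivial.
 */
static void for_each_overlap(const struct mapping_range *r, int n,
			     unsigned long start, unsigned long last)
{
	int i;

	for (i = 0; i < n; i++)
		if (r[i].first <= last && start <= r[i].last)
			printf("overlaps %s: [%lu, %lu]\n",
			       r[i].name, r[i].first, r[i].last);
}

int main(void)
{
	struct mapping_range vmas[] = {
		{  0, 15, "vma A" },
		{ 10, 31, "vma B" },
		{ 64, 95, "vma C" },
	};

	/* Which mappings cover file page 12? (cf. the pgoff, pgoff queries) */
	for_each_overlap(vmas, 3, 12, 12);
	return 0;
}

Running it reports vma A and vma B for page 12; the kernel gets the same answer without the linear scan by augmenting each rbtree node with the largest "last" value in its subtree, which is what lets __xip_unmap() in the filemap_xip.c hunk walk only the mappings that actually cover a given page offset.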
diff --git a/mm/Kconfig b/mm/Kconfig
index d5c8019c6627..a3f8dddaaab3 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -191,6 +191,7 @@ config SPLIT_PTLOCK_CPUS
 # support for memory compaction
 config COMPACTION
 	bool "Allow for memory compaction"
+	def_bool y
 	select MIGRATION
 	depends on MMU
 	help
@@ -318,7 +319,7 @@ config NOMMU_INITIAL_TRIM_EXCESS
 
 config TRANSPARENT_HUGEPAGE
 	bool "Transparent Hugepage Support"
-	depends on X86 && MMU
+	depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE
 	select COMPACTION
 	help
 	  Transparent Hugepages allows the kernel to use huge pages and
diff --git a/mm/Makefile b/mm/Makefile
index 92753e2d82da..6b025f80af34 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -14,9 +14,9 @@ endif
 obj-y			:= filemap.o mempool.o oom_kill.o fadvise.o \
 			   maccess.o page_alloc.o page-writeback.o \
 			   readahead.o swap.o truncate.o vmscan.o shmem.o \
-			   prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
+			   util.o mmzone.o vmstat.o backing-dev.o \
 			   mm_init.o mmu_context.o percpu.o slab_common.o \
-			   compaction.o $(mmu-y)
+			   compaction.o interval_tree.o $(mmu-y)
 
 obj-y += init-mm.o
 
diff --git a/mm/bootmem.c b/mm/bootmem.c
index f468185b3b28..434be4ae7a04 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -198,6 +198,8 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
 			int order = ilog2(BITS_PER_LONG);
 
 			__free_pages_bootmem(pfn_to_page(start), order);
+			fixup_zone_present_pages(page_to_nid(pfn_to_page(start)),
+					start, start + BITS_PER_LONG);
 			count += BITS_PER_LONG;
 			start += BITS_PER_LONG;
 		} else {
@@ -208,6 +210,9 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
 				if (vec & 1) {
 					page = pfn_to_page(start + off);
 					__free_pages_bootmem(page, 0);
+					fixup_zone_present_pages(
+						page_to_nid(page),
+						start + off, start + off + 1);
 					count++;
 				}
 				vec >>= 1;
@@ -221,8 +226,11 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
 	pages = bdata->node_low_pfn - bdata->node_min_pfn;
 	pages = bootmem_bootmap_pages(pages);
 	count += pages;
-	while (pages--)
+	while (pages--) {
+		fixup_zone_present_pages(page_to_nid(page),
+				page_to_pfn(page), page_to_pfn(page) + 1);
 		__free_pages_bootmem(page++, 0);
+	}
 
 	bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count);
 
diff --git a/mm/compaction.c b/mm/compaction.c
index 7fcd3a52e68d..2c4ce17651d8 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -50,6 +50,111 @@ static inline bool migrate_async_suitable(int migratetype)
50 | return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; | 50 | return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; |
51 | } | 51 | } |
52 | 52 | ||
53 | #ifdef CONFIG_COMPACTION | ||
54 | /* Returns true if the pageblock should be scanned for pages to isolate. */ | ||
55 | static inline bool isolation_suitable(struct compact_control *cc, | ||
56 | struct page *page) | ||
57 | { | ||
58 | if (cc->ignore_skip_hint) | ||
59 | return true; | ||
60 | |||
61 | return !get_pageblock_skip(page); | ||
62 | } | ||
63 | |||
64 | /* | ||
65 | * This function is called to clear all cached information on pageblocks that | ||
66 | * should be skipped for page isolation when the migrate and free page scanner | ||
67 | * meet. | ||
68 | */ | ||
69 | static void __reset_isolation_suitable(struct zone *zone) | ||
70 | { | ||
71 | unsigned long start_pfn = zone->zone_start_pfn; | ||
72 | unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages; | ||
73 | unsigned long pfn; | ||
74 | |||
75 | zone->compact_cached_migrate_pfn = start_pfn; | ||
76 | zone->compact_cached_free_pfn = end_pfn; | ||
77 | zone->compact_blockskip_flush = false; | ||
78 | |||
79 | /* Walk the zone and mark every pageblock as suitable for isolation */ | ||
80 | for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { | ||
81 | struct page *page; | ||
82 | |||
83 | cond_resched(); | ||
84 | |||
85 | if (!pfn_valid(pfn)) | ||
86 | continue; | ||
87 | |||
88 | page = pfn_to_page(pfn); | ||
89 | if (zone != page_zone(page)) | ||
90 | continue; | ||
91 | |||
92 | clear_pageblock_skip(page); | ||
93 | } | ||
94 | } | ||
95 | |||
96 | void reset_isolation_suitable(pg_data_t *pgdat) | ||
97 | { | ||
98 | int zoneid; | ||
99 | |||
100 | for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { | ||
101 | struct zone *zone = &pgdat->node_zones[zoneid]; | ||
102 | if (!populated_zone(zone)) | ||
103 | continue; | ||
104 | |||
105 | /* Only flush if a full compaction finished recently */ | ||
106 | if (zone->compact_blockskip_flush) | ||
107 | __reset_isolation_suitable(zone); | ||
108 | } | ||
109 | } | ||
110 | |||
111 | /* | ||
112 | * If no pages were isolated then mark this pageblock to be skipped in the | ||
113 | * future. The information is later cleared by __reset_isolation_suitable(). | ||
114 | */ | ||
115 | static void update_pageblock_skip(struct compact_control *cc, | ||
116 | struct page *page, unsigned long nr_isolated, | ||
117 | bool migrate_scanner) | ||
118 | { | ||
119 | struct zone *zone = cc->zone; | ||
120 | if (!page) | ||
121 | return; | ||
122 | |||
123 | if (!nr_isolated) { | ||
124 | unsigned long pfn = page_to_pfn(page); | ||
125 | set_pageblock_skip(page); | ||
126 | |||
127 | /* Update where compaction should restart */ | ||
128 | if (migrate_scanner) { | ||
129 | if (!cc->finished_update_migrate && | ||
130 | pfn > zone->compact_cached_migrate_pfn) | ||
131 | zone->compact_cached_migrate_pfn = pfn; | ||
132 | } else { | ||
133 | if (!cc->finished_update_free && | ||
134 | pfn < zone->compact_cached_free_pfn) | ||
135 | zone->compact_cached_free_pfn = pfn; | ||
136 | } | ||
137 | } | ||
138 | } | ||
139 | #else | ||
140 | static inline bool isolation_suitable(struct compact_control *cc, | ||
141 | struct page *page) | ||
142 | { | ||
143 | return true; | ||
144 | } | ||
145 | |||
146 | static void update_pageblock_skip(struct compact_control *cc, | ||
147 | struct page *page, unsigned long nr_isolated, | ||
148 | bool migrate_scanner) | ||
149 | { | ||
150 | } | ||
151 | #endif /* CONFIG_COMPACTION */ | ||
152 | |||
153 | static inline bool should_release_lock(spinlock_t *lock) | ||
154 | { | ||
155 | return need_resched() || spin_is_contended(lock); | ||
156 | } | ||
157 | |||
53 | /* | 158 | /* |
54 | * Compaction requires the taking of some coarse locks that are potentially | 159 | * Compaction requires the taking of some coarse locks that are potentially |
55 | * very heavily contended. Check if the process needs to be scheduled or | 160 | * very heavily contended. Check if the process needs to be scheduled or |
@@ -62,7 +167,7 @@ static inline bool migrate_async_suitable(int migratetype)
62 | static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, | 167 | static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, |
63 | bool locked, struct compact_control *cc) | 168 | bool locked, struct compact_control *cc) |
64 | { | 169 | { |
65 | if (need_resched() || spin_is_contended(lock)) { | 170 | if (should_release_lock(lock)) { |
66 | if (locked) { | 171 | if (locked) { |
67 | spin_unlock_irqrestore(lock, *flags); | 172 | spin_unlock_irqrestore(lock, *flags); |
68 | locked = false; | 173 | locked = false; |
@@ -70,14 +175,11 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
70 | 175 | ||
71 | /* async aborts if taking too long or contended */ | 176 | /* async aborts if taking too long or contended */ |
72 | if (!cc->sync) { | 177 | if (!cc->sync) { |
73 | if (cc->contended) | 178 | cc->contended = true; |
74 | *cc->contended = true; | ||
75 | return false; | 179 | return false; |
76 | } | 180 | } |
77 | 181 | ||
78 | cond_resched(); | 182 | cond_resched(); |
79 | if (fatal_signal_pending(current)) | ||
80 | return false; | ||
81 | } | 183 | } |
82 | 184 | ||
83 | if (!locked) | 185 | if (!locked) |
@@ -91,44 +193,139 @@ static inline bool compact_trylock_irqsave(spinlock_t *lock,
91 | return compact_checklock_irqsave(lock, flags, false, cc); | 193 | return compact_checklock_irqsave(lock, flags, false, cc); |
92 | } | 194 | } |
93 | 195 | ||
196 | /* Returns true if the page is within a block suitable for migration to */ | ||
197 | static bool suitable_migration_target(struct page *page) | ||
198 | { | ||
199 | int migratetype = get_pageblock_migratetype(page); | ||
200 | |||
201 | /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ | ||
202 | if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) | ||
203 | return false; | ||
204 | |||
205 | /* If the page is a large free page, then allow migration */ | ||
206 | if (PageBuddy(page) && page_order(page) >= pageblock_order) | ||
207 | return true; | ||
208 | |||
209 | /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ | ||
210 | if (migrate_async_suitable(migratetype)) | ||
211 | return true; | ||
212 | |||
213 | /* Otherwise skip the block */ | ||
214 | return false; | ||
215 | } | ||
216 | |||
217 | static void compact_capture_page(struct compact_control *cc) | ||
218 | { | ||
219 | unsigned long flags; | ||
220 | int mtype, mtype_low, mtype_high; | ||
221 | |||
222 | if (!cc->page || *cc->page) | ||
223 | return; | ||
224 | |||
225 | /* | ||
226 | * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP | ||
227 | * regardless of the migratetype of the freelist is is captured from. | ||
228 | * This is fine because the order for a high-order MIGRATE_MOVABLE | ||
229 | * allocation is typically at least a pageblock size and overall | ||
230 | * fragmentation is not impaired. Other allocation types must | ||
231 | * capture pages from their own migratelist because otherwise they | ||
232 | * could pollute other pageblocks like MIGRATE_MOVABLE with | ||
233 | * difficult to move pages and making fragmentation worse overall. | ||
234 | */ | ||
235 | if (cc->migratetype == MIGRATE_MOVABLE) { | ||
236 | mtype_low = 0; | ||
237 | mtype_high = MIGRATE_PCPTYPES; | ||
238 | } else { | ||
239 | mtype_low = cc->migratetype; | ||
240 | mtype_high = cc->migratetype + 1; | ||
241 | } | ||
242 | |||
243 | /* Speculatively examine the free lists without zone lock */ | ||
244 | for (mtype = mtype_low; mtype < mtype_high; mtype++) { | ||
245 | int order; | ||
246 | for (order = cc->order; order < MAX_ORDER; order++) { | ||
247 | struct page *page; | ||
248 | struct free_area *area; | ||
249 | area = &(cc->zone->free_area[order]); | ||
250 | if (list_empty(&area->free_list[mtype])) | ||
251 | continue; | ||
252 | |||
253 | /* Take the lock and attempt capture of the page */ | ||
254 | if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc)) | ||
255 | return; | ||
256 | if (!list_empty(&area->free_list[mtype])) { | ||
257 | page = list_entry(area->free_list[mtype].next, | ||
258 | struct page, lru); | ||
259 | if (capture_free_page(page, cc->order, mtype)) { | ||
260 | spin_unlock_irqrestore(&cc->zone->lock, | ||
261 | flags); | ||
262 | *cc->page = page; | ||
263 | return; | ||
264 | } | ||
265 | } | ||
266 | spin_unlock_irqrestore(&cc->zone->lock, flags); | ||
267 | } | ||
268 | } | ||
269 | } | ||
270 | |||
94 | /* | 271 | /* |
95 | * Isolate free pages onto a private freelist. Caller must hold zone->lock. | 272 | * Isolate free pages onto a private freelist. Caller must hold zone->lock. |
96 | * If @strict is true, will abort returning 0 on any invalid PFNs or non-free | 273 | * If @strict is true, will abort returning 0 on any invalid PFNs or non-free |
97 | * pages inside of the pageblock (even though it may still end up isolating | 274 | * pages inside of the pageblock (even though it may still end up isolating |
98 | * some pages). | 275 | * some pages). |
99 | */ | 276 | */ |
100 | static unsigned long isolate_freepages_block(unsigned long blockpfn, | 277 | static unsigned long isolate_freepages_block(struct compact_control *cc, |
278 | unsigned long blockpfn, | ||
101 | unsigned long end_pfn, | 279 | unsigned long end_pfn, |
102 | struct list_head *freelist, | 280 | struct list_head *freelist, |
103 | bool strict) | 281 | bool strict) |
104 | { | 282 | { |
105 | int nr_scanned = 0, total_isolated = 0; | 283 | int nr_scanned = 0, total_isolated = 0; |
106 | struct page *cursor; | 284 | struct page *cursor, *valid_page = NULL; |
285 | unsigned long nr_strict_required = end_pfn - blockpfn; | ||
286 | unsigned long flags; | ||
287 | bool locked = false; | ||
107 | 288 | ||
108 | cursor = pfn_to_page(blockpfn); | 289 | cursor = pfn_to_page(blockpfn); |
109 | 290 | ||
110 | /* Isolate free pages. This assumes the block is valid */ | 291 | /* Isolate free pages. */ |
111 | for (; blockpfn < end_pfn; blockpfn++, cursor++) { | 292 | for (; blockpfn < end_pfn; blockpfn++, cursor++) { |
112 | int isolated, i; | 293 | int isolated, i; |
113 | struct page *page = cursor; | 294 | struct page *page = cursor; |
114 | 295 | ||
115 | if (!pfn_valid_within(blockpfn)) { | ||
116 | if (strict) | ||
117 | return 0; | ||
118 | continue; | ||
119 | } | ||
120 | nr_scanned++; | 296 | nr_scanned++; |
297 | if (!pfn_valid_within(blockpfn)) | ||
298 | continue; | ||
299 | if (!valid_page) | ||
300 | valid_page = page; | ||
301 | if (!PageBuddy(page)) | ||
302 | continue; | ||
121 | 303 | ||
122 | if (!PageBuddy(page)) { | 304 | /* |
123 | if (strict) | 305 | * The zone lock must be held to isolate freepages. |
124 | return 0; | 306 | * Unfortunately this is a very coarse lock and can be |
307 | * heavily contended if there are parallel allocations | ||
308 | * or parallel compactions. For async compaction do not | ||
309 | * spin on the lock and we acquire the lock as late as | ||
310 | * possible. | ||
311 | */ | ||
312 | locked = compact_checklock_irqsave(&cc->zone->lock, &flags, | ||
313 | locked, cc); | ||
314 | if (!locked) | ||
315 | break; | ||
316 | |||
317 | /* Recheck this is a suitable migration target under lock */ | ||
318 | if (!strict && !suitable_migration_target(page)) | ||
319 | break; | ||
320 | |||
321 | /* Recheck this is a buddy page under lock */ | ||
322 | if (!PageBuddy(page)) | ||
125 | continue; | 323 | continue; |
126 | } | ||
127 | 324 | ||
128 | /* Found a free page, break it into order-0 pages */ | 325 | /* Found a free page, break it into order-0 pages */ |
129 | isolated = split_free_page(page); | 326 | isolated = split_free_page(page); |
130 | if (!isolated && strict) | 327 | if (!isolated && strict) |
131 | return 0; | 328 | break; |
132 | total_isolated += isolated; | 329 | total_isolated += isolated; |
133 | for (i = 0; i < isolated; i++) { | 330 | for (i = 0; i < isolated; i++) { |
134 | list_add(&page->lru, freelist); | 331 | list_add(&page->lru, freelist); |
@@ -143,6 +340,22 @@ static unsigned long isolate_freepages_block(unsigned long blockpfn,
143 | } | 340 | } |
144 | 341 | ||
145 | trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated); | 342 | trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated); |
343 | |||
344 | /* | ||
345 | * If strict isolation is requested by CMA then check that all the | ||
346 | * pages requested were isolated. If there were any failures, 0 is | ||
347 | * returned and CMA will fail. | ||
348 | */ | ||
349 | if (strict && nr_strict_required != total_isolated) | ||
350 | total_isolated = 0; | ||
351 | |||
352 | if (locked) | ||
353 | spin_unlock_irqrestore(&cc->zone->lock, flags); | ||
354 | |||
355 | /* Update the pageblock-skip if the whole pageblock was scanned */ | ||
356 | if (blockpfn == end_pfn) | ||
357 | update_pageblock_skip(cc, valid_page, total_isolated, false); | ||
358 | |||
146 | return total_isolated; | 359 | return total_isolated; |
147 | } | 360 | } |
148 | 361 | ||
@@ -160,17 +373,14 @@ static unsigned long isolate_freepages_block(unsigned long blockpfn,
160 | * a free page). | 373 | * a free page). |
161 | */ | 374 | */ |
162 | unsigned long | 375 | unsigned long |
163 | isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn) | 376 | isolate_freepages_range(struct compact_control *cc, |
377 | unsigned long start_pfn, unsigned long end_pfn) | ||
164 | { | 378 | { |
165 | unsigned long isolated, pfn, block_end_pfn, flags; | 379 | unsigned long isolated, pfn, block_end_pfn; |
166 | struct zone *zone = NULL; | ||
167 | LIST_HEAD(freelist); | 380 | LIST_HEAD(freelist); |
168 | 381 | ||
169 | if (pfn_valid(start_pfn)) | ||
170 | zone = page_zone(pfn_to_page(start_pfn)); | ||
171 | |||
172 | for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) { | 382 | for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) { |
173 | if (!pfn_valid(pfn) || zone != page_zone(pfn_to_page(pfn))) | 383 | if (!pfn_valid(pfn) || cc->zone != page_zone(pfn_to_page(pfn))) |
174 | break; | 384 | break; |
175 | 385 | ||
176 | /* | 386 | /* |
@@ -180,10 +390,8 @@ isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn)
180 | block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); | 390 | block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); |
181 | block_end_pfn = min(block_end_pfn, end_pfn); | 391 | block_end_pfn = min(block_end_pfn, end_pfn); |
182 | 392 | ||
183 | spin_lock_irqsave(&zone->lock, flags); | 393 | isolated = isolate_freepages_block(cc, pfn, block_end_pfn, |
184 | isolated = isolate_freepages_block(pfn, block_end_pfn, | ||
185 | &freelist, true); | 394 | &freelist, true); |
186 | spin_unlock_irqrestore(&zone->lock, flags); | ||
187 | 395 | ||
188 | /* | 396 | /* |
189 | * In strict mode, isolate_freepages_block() returns 0 if | 397 | * In strict mode, isolate_freepages_block() returns 0 if |
@@ -253,6 +461,7 @@ static bool too_many_isolated(struct zone *zone)
253 | * @cc: Compaction control structure. | 461 | * @cc: Compaction control structure. |
254 | * @low_pfn: The first PFN of the range. | 462 | * @low_pfn: The first PFN of the range. |
255 | * @end_pfn: The one-past-the-last PFN of the range. | 463 | * @end_pfn: The one-past-the-last PFN of the range. |
464 | * @unevictable: true if it allows to isolate unevictable pages | ||
256 | * | 465 | * |
257 | * Isolate all pages that can be migrated from the range specified by | 466 | * Isolate all pages that can be migrated from the range specified by |
258 | * [low_pfn, end_pfn). Returns zero if there is a fatal signal | 467 | * [low_pfn, end_pfn). Returns zero if there is a fatal signal |
@@ -268,7 +477,7 @@ static bool too_many_isolated(struct zone *zone)
268 | */ | 477 | */ |
269 | unsigned long | 478 | unsigned long |
270 | isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | 479 | isolate_migratepages_range(struct zone *zone, struct compact_control *cc, |
271 | unsigned long low_pfn, unsigned long end_pfn) | 480 | unsigned long low_pfn, unsigned long end_pfn, bool unevictable) |
272 | { | 481 | { |
273 | unsigned long last_pageblock_nr = 0, pageblock_nr; | 482 | unsigned long last_pageblock_nr = 0, pageblock_nr; |
274 | unsigned long nr_scanned = 0, nr_isolated = 0; | 483 | unsigned long nr_scanned = 0, nr_isolated = 0; |
@@ -276,7 +485,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
276 | isolate_mode_t mode = 0; | 485 | isolate_mode_t mode = 0; |
277 | struct lruvec *lruvec; | 486 | struct lruvec *lruvec; |
278 | unsigned long flags; | 487 | unsigned long flags; |
279 | bool locked; | 488 | bool locked = false; |
489 | struct page *page = NULL, *valid_page = NULL; | ||
280 | 490 | ||
281 | /* | 491 | /* |
282 | * Ensure that there are not too many pages isolated from the LRU | 492 | * Ensure that there are not too many pages isolated from the LRU |
@@ -296,23 +506,15 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
296 | 506 | ||
297 | /* Time to isolate some pages for migration */ | 507 | /* Time to isolate some pages for migration */ |
298 | cond_resched(); | 508 | cond_resched(); |
299 | spin_lock_irqsave(&zone->lru_lock, flags); | ||
300 | locked = true; | ||
301 | for (; low_pfn < end_pfn; low_pfn++) { | 509 | for (; low_pfn < end_pfn; low_pfn++) { |
302 | struct page *page; | ||
303 | |||
304 | /* give a chance to irqs before checking need_resched() */ | 510 | /* give a chance to irqs before checking need_resched() */ |
305 | if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) { | 511 | if (locked && !((low_pfn+1) % SWAP_CLUSTER_MAX)) { |
306 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 512 | if (should_release_lock(&zone->lru_lock)) { |
307 | locked = false; | 513 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
514 | locked = false; | ||
515 | } | ||
308 | } | 516 | } |
309 | 517 | ||
310 | /* Check if it is ok to still hold the lock */ | ||
311 | locked = compact_checklock_irqsave(&zone->lru_lock, &flags, | ||
312 | locked, cc); | ||
313 | if (!locked) | ||
314 | break; | ||
315 | |||
316 | /* | 518 | /* |
317 | * migrate_pfn does not necessarily start aligned to a | 519 | * migrate_pfn does not necessarily start aligned to a |
318 | * pageblock. Ensure that pfn_valid is called when moving | 520 | * pageblock. Ensure that pfn_valid is called when moving |
@@ -340,6 +542,14 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
340 | if (page_zone(page) != zone) | 542 | if (page_zone(page) != zone) |
341 | continue; | 543 | continue; |
342 | 544 | ||
545 | if (!valid_page) | ||
546 | valid_page = page; | ||
547 | |||
548 | /* If isolation recently failed, do not retry */ | ||
549 | pageblock_nr = low_pfn >> pageblock_order; | ||
550 | if (!isolation_suitable(cc, page)) | ||
551 | goto next_pageblock; | ||
552 | |||
343 | /* Skip if free */ | 553 | /* Skip if free */ |
344 | if (PageBuddy(page)) | 554 | if (PageBuddy(page)) |
345 | continue; | 555 | continue; |
@@ -349,24 +559,43 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
349 | * migration is optimistic to see if the minimum amount of work | 559 | * migration is optimistic to see if the minimum amount of work |
350 | * satisfies the allocation | 560 | * satisfies the allocation |
351 | */ | 561 | */ |
352 | pageblock_nr = low_pfn >> pageblock_order; | ||
353 | if (!cc->sync && last_pageblock_nr != pageblock_nr && | 562 | if (!cc->sync && last_pageblock_nr != pageblock_nr && |
354 | !migrate_async_suitable(get_pageblock_migratetype(page))) { | 563 | !migrate_async_suitable(get_pageblock_migratetype(page))) { |
355 | low_pfn += pageblock_nr_pages; | 564 | cc->finished_update_migrate = true; |
356 | low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; | 565 | goto next_pageblock; |
357 | last_pageblock_nr = pageblock_nr; | ||
358 | continue; | ||
359 | } | 566 | } |
360 | 567 | ||
568 | /* Check may be lockless but that's ok as we recheck later */ | ||
361 | if (!PageLRU(page)) | 569 | if (!PageLRU(page)) |
362 | continue; | 570 | continue; |
363 | 571 | ||
364 | /* | 572 | /* |
365 | * PageLRU is set, and lru_lock excludes isolation, | 573 | * PageLRU is set. lru_lock normally excludes isolation |
366 | * splitting and collapsing (collapsing has already | 574 | * splitting and collapsing (collapsing has already happened |
367 | * happened if PageLRU is set). | 575 | * if PageLRU is set) but the lock is not necessarily taken |
576 | * here and it is wasteful to take it just to check transhuge. | ||
577 | * Check TransHuge without lock and skip the whole pageblock if | ||
578 | * it's either a transhuge or hugetlbfs page, as calling | ||
579 | * compound_order() without preventing THP from splitting the | ||
580 | * page underneath us may return surprising results. | ||
368 | */ | 581 | */ |
369 | if (PageTransHuge(page)) { | 582 | if (PageTransHuge(page)) { |
583 | if (!locked) | ||
584 | goto next_pageblock; | ||
585 | low_pfn += (1 << compound_order(page)) - 1; | ||
586 | continue; | ||
587 | } | ||
588 | |||
589 | /* Check if it is ok to still hold the lock */ | ||
590 | locked = compact_checklock_irqsave(&zone->lru_lock, &flags, | ||
591 | locked, cc); | ||
592 | if (!locked || fatal_signal_pending(current)) | ||
593 | break; | ||
594 | |||
595 | /* Recheck PageLRU and PageTransHuge under lock */ | ||
596 | if (!PageLRU(page)) | ||
597 | continue; | ||
598 | if (PageTransHuge(page)) { | ||
370 | low_pfn += (1 << compound_order(page)) - 1; | 599 | low_pfn += (1 << compound_order(page)) - 1; |
371 | continue; | 600 | continue; |
372 | } | 601 | } |
@@ -374,6 +603,9 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
374 | if (!cc->sync) | 603 | if (!cc->sync) |
375 | mode |= ISOLATE_ASYNC_MIGRATE; | 604 | mode |= ISOLATE_ASYNC_MIGRATE; |
376 | 605 | ||
606 | if (unevictable) | ||
607 | mode |= ISOLATE_UNEVICTABLE; | ||
608 | |||
377 | lruvec = mem_cgroup_page_lruvec(page, zone); | 609 | lruvec = mem_cgroup_page_lruvec(page, zone); |
378 | 610 | ||
379 | /* Try isolate the page */ | 611 | /* Try isolate the page */ |
@@ -383,6 +615,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
383 | VM_BUG_ON(PageTransCompound(page)); | 615 | VM_BUG_ON(PageTransCompound(page)); |
384 | 616 | ||
385 | /* Successfully isolated */ | 617 | /* Successfully isolated */ |
618 | cc->finished_update_migrate = true; | ||
386 | del_page_from_lru_list(page, lruvec, page_lru(page)); | 619 | del_page_from_lru_list(page, lruvec, page_lru(page)); |
387 | list_add(&page->lru, migratelist); | 620 | list_add(&page->lru, migratelist); |
388 | cc->nr_migratepages++; | 621 | cc->nr_migratepages++; |
@@ -393,6 +626,13 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
393 | ++low_pfn; | 626 | ++low_pfn; |
394 | break; | 627 | break; |
395 | } | 628 | } |
629 | |||
630 | continue; | ||
631 | |||
632 | next_pageblock: | ||
633 | low_pfn += pageblock_nr_pages; | ||
634 | low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; | ||
635 | last_pageblock_nr = pageblock_nr; | ||
396 | } | 636 | } |
397 | 637 | ||
398 | acct_isolated(zone, locked, cc); | 638 | acct_isolated(zone, locked, cc); |
@@ -400,6 +640,10 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
400 | if (locked) | 640 | if (locked) |
401 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 641 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
402 | 642 | ||
643 | /* Update the pageblock-skip if the whole pageblock was scanned */ | ||
644 | if (low_pfn == end_pfn) | ||
645 | update_pageblock_skip(cc, valid_page, nr_isolated, true); | ||
646 | |||
403 | trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); | 647 | trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); |
404 | 648 | ||
405 | return low_pfn; | 649 | return low_pfn; |
@@ -407,43 +651,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
407 | 651 | ||
408 | #endif /* CONFIG_COMPACTION || CONFIG_CMA */ | 652 | #endif /* CONFIG_COMPACTION || CONFIG_CMA */ |
409 | #ifdef CONFIG_COMPACTION | 653 | #ifdef CONFIG_COMPACTION |
410 | |||
411 | /* Returns true if the page is within a block suitable for migration to */ | ||
412 | static bool suitable_migration_target(struct page *page) | ||
413 | { | ||
414 | |||
415 | int migratetype = get_pageblock_migratetype(page); | ||
416 | |||
417 | /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ | ||
418 | if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) | ||
419 | return false; | ||
420 | |||
421 | /* If the page is a large free page, then allow migration */ | ||
422 | if (PageBuddy(page) && page_order(page) >= pageblock_order) | ||
423 | return true; | ||
424 | |||
425 | /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ | ||
426 | if (migrate_async_suitable(migratetype)) | ||
427 | return true; | ||
428 | |||
429 | /* Otherwise skip the block */ | ||
430 | return false; | ||
431 | } | ||
432 | |||
433 | /* | ||
434 | * Returns the start pfn of the last page block in a zone. This is the starting | ||
435 | * point for full compaction of a zone. Compaction searches for free pages from | ||
436 | * the end of each zone, while isolate_freepages_block scans forward inside each | ||
437 | * page block. | ||
438 | */ | ||
439 | static unsigned long start_free_pfn(struct zone *zone) | ||
440 | { | ||
441 | unsigned long free_pfn; | ||
442 | free_pfn = zone->zone_start_pfn + zone->spanned_pages; | ||
443 | free_pfn &= ~(pageblock_nr_pages-1); | ||
444 | return free_pfn; | ||
445 | } | ||
446 | |||
447 | /* | 654 | /* |
448 | * Based on information in the current compact_control, find blocks | 655 | * Based on information in the current compact_control, find blocks |
449 | * suitable for isolating free pages from and then isolate them. | 656 | * suitable for isolating free pages from and then isolate them. |
@@ -453,7 +660,6 @@ static void isolate_freepages(struct zone *zone,
453 | { | 660 | { |
454 | struct page *page; | 661 | struct page *page; |
455 | unsigned long high_pfn, low_pfn, pfn, zone_end_pfn, end_pfn; | 662 | unsigned long high_pfn, low_pfn, pfn, zone_end_pfn, end_pfn; |
456 | unsigned long flags; | ||
457 | int nr_freepages = cc->nr_freepages; | 663 | int nr_freepages = cc->nr_freepages; |
458 | struct list_head *freelist = &cc->freepages; | 664 | struct list_head *freelist = &cc->freepages; |
459 | 665 | ||
@@ -501,30 +707,16 @@ static void isolate_freepages(struct zone *zone,
501 | if (!suitable_migration_target(page)) | 707 | if (!suitable_migration_target(page)) |
502 | continue; | 708 | continue; |
503 | 709 | ||
504 | /* | 710 | /* If isolation recently failed, do not retry */ |
505 | * Found a block suitable for isolating free pages from. Now | 711 | if (!isolation_suitable(cc, page)) |
506 | * we disabled interrupts, double check things are ok and | 712 | continue; |
507 | * isolate the pages. This is to minimise the time IRQs | ||
508 | * are disabled | ||
509 | */ | ||
510 | isolated = 0; | ||
511 | 713 | ||
512 | /* | 714 | /* Found a block suitable for isolating free pages from */ |
513 | * The zone lock must be held to isolate freepages. This | 715 | isolated = 0; |
514 | * unfortunately this is a very coarse lock and can be | 716 | end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn); |
515 | * heavily contended if there are parallel allocations | 717 | isolated = isolate_freepages_block(cc, pfn, end_pfn, |
516 | * or parallel compactions. For async compaction do not | 718 | freelist, false); |
517 | * spin on the lock | 719 | nr_freepages += isolated; |
518 | */ | ||
519 | if (!compact_trylock_irqsave(&zone->lock, &flags, cc)) | ||
520 | break; | ||
521 | if (suitable_migration_target(page)) { | ||
522 | end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn); | ||
523 | isolated = isolate_freepages_block(pfn, end_pfn, | ||
524 | freelist, false); | ||
525 | nr_freepages += isolated; | ||
526 | } | ||
527 | spin_unlock_irqrestore(&zone->lock, flags); | ||
528 | 720 | ||
529 | /* | 721 | /* |
530 | * Record the highest PFN we isolated pages from. When next | 722 | * Record the highest PFN we isolated pages from. When next |
@@ -532,17 +724,8 @@ static void isolate_freepages(struct zone *zone,
532 | * page migration may have returned some pages to the allocator | 724 | * page migration may have returned some pages to the allocator |
533 | */ | 725 | */ |
534 | if (isolated) { | 726 | if (isolated) { |
727 | cc->finished_update_free = true; | ||
535 | high_pfn = max(high_pfn, pfn); | 728 | high_pfn = max(high_pfn, pfn); |
536 | |||
537 | /* | ||
538 | * If the free scanner has wrapped, update | ||
539 | * compact_cached_free_pfn to point to the highest | ||
540 | * pageblock with free pages. This reduces excessive | ||
541 | * scanning of full pageblocks near the end of the | ||
542 | * zone | ||
543 | */ | ||
544 | if (cc->order > 0 && cc->wrapped) | ||
545 | zone->compact_cached_free_pfn = high_pfn; | ||
546 | } | 729 | } |
547 | } | 730 | } |
548 | 731 | ||
@@ -551,11 +734,6 @@ static void isolate_freepages(struct zone *zone,
551 | 734 | ||
552 | cc->free_pfn = high_pfn; | 735 | cc->free_pfn = high_pfn; |
553 | cc->nr_freepages = nr_freepages; | 736 | cc->nr_freepages = nr_freepages; |
554 | |||
555 | /* If compact_cached_free_pfn is reset then set it now */ | ||
556 | if (cc->order > 0 && !cc->wrapped && | ||
557 | zone->compact_cached_free_pfn == start_free_pfn(zone)) | ||
558 | zone->compact_cached_free_pfn = high_pfn; | ||
559 | } | 737 | } |
560 | 738 | ||
561 | /* | 739 | /* |
@@ -633,8 +811,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
633 | } | 811 | } |
634 | 812 | ||
635 | /* Perform the isolation */ | 813 | /* Perform the isolation */ |
636 | low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn); | 814 | low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn, false); |
637 | if (!low_pfn) | 815 | if (!low_pfn || cc->contended) |
638 | return ISOLATE_ABORT; | 816 | return ISOLATE_ABORT; |
639 | 817 | ||
640 | cc->migrate_pfn = low_pfn; | 818 | cc->migrate_pfn = low_pfn; |
@@ -645,33 +823,24 @@ static int compact_finished(struct zone *zone,
645 | static int compact_finished(struct zone *zone, | 823 | static int compact_finished(struct zone *zone, |
646 | struct compact_control *cc) | 824 | struct compact_control *cc) |
647 | { | 825 | { |
648 | unsigned int order; | ||
649 | unsigned long watermark; | 826 | unsigned long watermark; |
650 | 827 | ||
651 | if (fatal_signal_pending(current)) | 828 | if (fatal_signal_pending(current)) |
652 | return COMPACT_PARTIAL; | 829 | return COMPACT_PARTIAL; |
653 | 830 | ||
654 | /* | 831 | /* Compaction run completes if the migrate and free scanner meet */ |
655 | * A full (order == -1) compaction run starts at the beginning and | ||
656 | * end of a zone; it completes when the migrate and free scanner meet. | ||
657 | * A partial (order > 0) compaction can start with the free scanner | ||
658 | * at a random point in the zone, and may have to restart. | ||
659 | */ | ||
660 | if (cc->free_pfn <= cc->migrate_pfn) { | 832 | if (cc->free_pfn <= cc->migrate_pfn) { |
661 | if (cc->order > 0 && !cc->wrapped) { | 833 | /* |
662 | /* We started partway through; restart at the end. */ | 834 | * Mark that the PG_migrate_skip information should be cleared |
663 | unsigned long free_pfn = start_free_pfn(zone); | 835 | * by kswapd when it goes to sleep. kswapd does not set the |
664 | zone->compact_cached_free_pfn = free_pfn; | 836 | * flag itself as the decision to be clear should be directly |
665 | cc->free_pfn = free_pfn; | 837 | * based on an allocation request. |
666 | cc->wrapped = 1; | 838 | */ |
667 | return COMPACT_CONTINUE; | 839 | if (!current_is_kswapd()) |
668 | } | 840 | zone->compact_blockskip_flush = true; |
669 | return COMPACT_COMPLETE; | ||
670 | } | ||
671 | 841 | ||
672 | /* We wrapped around and ended up where we started. */ | ||
673 | if (cc->wrapped && cc->free_pfn <= cc->start_free_pfn) | ||
674 | return COMPACT_COMPLETE; | 842 | return COMPACT_COMPLETE; |
843 | } | ||
675 | 844 | ||
676 | /* | 845 | /* |
677 | * order == -1 is expected when compacting via | 846 | * order == -1 is expected when compacting via |
@@ -688,14 +857,22 @@ static int compact_finished(struct zone *zone,
688 | return COMPACT_CONTINUE; | 857 | return COMPACT_CONTINUE; |
689 | 858 | ||
690 | /* Direct compactor: Is a suitable page free? */ | 859 | /* Direct compactor: Is a suitable page free? */ |
691 | for (order = cc->order; order < MAX_ORDER; order++) { | 860 | if (cc->page) { |
692 | /* Job done if page is free of the right migratetype */ | 861 | /* Was a suitable page captured? */ |
693 | if (!list_empty(&zone->free_area[order].free_list[cc->migratetype])) | 862 | if (*cc->page) |
694 | return COMPACT_PARTIAL; | ||
695 | |||
696 | /* Job done if allocation would set block type */ | ||
697 | if (order >= pageblock_order && zone->free_area[order].nr_free) | ||
698 | return COMPACT_PARTIAL; | 863 | return COMPACT_PARTIAL; |
864 | } else { | ||
865 | unsigned int order; | ||
866 | for (order = cc->order; order < MAX_ORDER; order++) { | ||
867 | struct free_area *area = &zone->free_area[cc->order]; | ||
868 | /* Job done if page is free of the right migratetype */ | ||
869 | if (!list_empty(&area->free_list[cc->migratetype])) | ||
870 | return COMPACT_PARTIAL; | ||
871 | |||
872 | /* Job done if allocation would set block type */ | ||
873 | if (cc->order >= pageblock_order && area->nr_free) | ||
874 | return COMPACT_PARTIAL; | ||
875 | } | ||
699 | } | 876 | } |
700 | 877 | ||
701 | return COMPACT_CONTINUE; | 878 | return COMPACT_CONTINUE; |
@@ -754,6 +931,8 @@ unsigned long compaction_suitable(struct zone *zone, int order)
754 | static int compact_zone(struct zone *zone, struct compact_control *cc) | 931 | static int compact_zone(struct zone *zone, struct compact_control *cc) |
755 | { | 932 | { |
756 | int ret; | 933 | int ret; |
934 | unsigned long start_pfn = zone->zone_start_pfn; | ||
935 | unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages; | ||
757 | 936 | ||
758 | ret = compaction_suitable(zone, cc->order); | 937 | ret = compaction_suitable(zone, cc->order); |
759 | switch (ret) { | 938 | switch (ret) { |
@@ -766,18 +945,30 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
766 | ; | 945 | ; |
767 | } | 946 | } |
768 | 947 | ||
769 | /* Setup to move all movable pages to the end of the zone */ | 948 | /* |
770 | cc->migrate_pfn = zone->zone_start_pfn; | 949 | * Setup to move all movable pages to the end of the zone. Used cached |
771 | 950 | * information on where the scanners should start but check that it | |
772 | if (cc->order > 0) { | 951 | * is initialised by ensuring the values are within zone boundaries. |
773 | /* Incremental compaction. Start where the last one stopped. */ | 952 | */ |
774 | cc->free_pfn = zone->compact_cached_free_pfn; | 953 | cc->migrate_pfn = zone->compact_cached_migrate_pfn; |
775 | cc->start_free_pfn = cc->free_pfn; | 954 | cc->free_pfn = zone->compact_cached_free_pfn; |
776 | } else { | 955 | if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) { |
777 | /* Order == -1 starts at the end of the zone. */ | 956 | cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1); |
778 | cc->free_pfn = start_free_pfn(zone); | 957 | zone->compact_cached_free_pfn = cc->free_pfn; |
958 | } | ||
959 | if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) { | ||
960 | cc->migrate_pfn = start_pfn; | ||
961 | zone->compact_cached_migrate_pfn = cc->migrate_pfn; | ||
779 | } | 962 | } |
780 | 963 | ||
964 | /* | ||
965 | * Clear pageblock skip if there were failures recently and compaction | ||
966 | * is about to be retried after being deferred. kswapd does not do | ||
967 | * this reset as it'll reset the cached information when going to sleep. | ||
968 | */ | ||
969 | if (compaction_restarting(zone, cc->order) && !current_is_kswapd()) | ||
970 | __reset_isolation_suitable(zone); | ||
971 | |||
781 | migrate_prep_local(); | 972 | migrate_prep_local(); |
782 | 973 | ||
783 | while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { | 974 | while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { |
@@ -787,6 +978,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
787 | switch (isolate_migratepages(zone, cc)) { | 978 | switch (isolate_migratepages(zone, cc)) { |
788 | case ISOLATE_ABORT: | 979 | case ISOLATE_ABORT: |
789 | ret = COMPACT_PARTIAL; | 980 | ret = COMPACT_PARTIAL; |
981 | putback_lru_pages(&cc->migratepages); | ||
982 | cc->nr_migratepages = 0; | ||
790 | goto out; | 983 | goto out; |
791 | case ISOLATE_NONE: | 984 | case ISOLATE_NONE: |
792 | continue; | 985 | continue; |
@@ -817,6 +1010,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
817 | goto out; | 1010 | goto out; |
818 | } | 1011 | } |
819 | } | 1012 | } |
1013 | |||
1014 | /* Capture a page now if it is a suitable size */ | ||
1015 | compact_capture_page(cc); | ||
820 | } | 1016 | } |
821 | 1017 | ||
822 | out: | 1018 | out: |
@@ -829,8 +1025,10 @@ out:
829 | 1025 | ||
830 | static unsigned long compact_zone_order(struct zone *zone, | 1026 | static unsigned long compact_zone_order(struct zone *zone, |
831 | int order, gfp_t gfp_mask, | 1027 | int order, gfp_t gfp_mask, |
832 | bool sync, bool *contended) | 1028 | bool sync, bool *contended, |
1029 | struct page **page) | ||
833 | { | 1030 | { |
1031 | unsigned long ret; | ||
834 | struct compact_control cc = { | 1032 | struct compact_control cc = { |
835 | .nr_freepages = 0, | 1033 | .nr_freepages = 0, |
836 | .nr_migratepages = 0, | 1034 | .nr_migratepages = 0, |
@@ -838,12 +1036,18 @@ static unsigned long compact_zone_order(struct zone *zone,
838 | .migratetype = allocflags_to_migratetype(gfp_mask), | 1036 | .migratetype = allocflags_to_migratetype(gfp_mask), |
839 | .zone = zone, | 1037 | .zone = zone, |
840 | .sync = sync, | 1038 | .sync = sync, |
841 | .contended = contended, | 1039 | .page = page, |
842 | }; | 1040 | }; |
843 | INIT_LIST_HEAD(&cc.freepages); | 1041 | INIT_LIST_HEAD(&cc.freepages); |
844 | INIT_LIST_HEAD(&cc.migratepages); | 1042 | INIT_LIST_HEAD(&cc.migratepages); |
845 | 1043 | ||
846 | return compact_zone(zone, &cc); | 1044 | ret = compact_zone(zone, &cc); |
1045 | |||
1046 | VM_BUG_ON(!list_empty(&cc.freepages)); | ||
1047 | VM_BUG_ON(!list_empty(&cc.migratepages)); | ||
1048 | |||
1049 | *contended = cc.contended; | ||
1050 | return ret; | ||
847 | } | 1051 | } |
848 | 1052 | ||
849 | int sysctl_extfrag_threshold = 500; | 1053 | int sysctl_extfrag_threshold = 500; |
@@ -855,12 +1059,14 @@ int sysctl_extfrag_threshold = 500;
855 | * @gfp_mask: The GFP mask of the current allocation | 1059 | * @gfp_mask: The GFP mask of the current allocation |
856 | * @nodemask: The allowed nodes to allocate from | 1060 | * @nodemask: The allowed nodes to allocate from |
857 | * @sync: Whether migration is synchronous or not | 1061 | * @sync: Whether migration is synchronous or not |
1062 | * @contended: Return value that is true if compaction was aborted due to lock contention | ||
1063 | * @page: Optionally capture a free page of the requested order during compaction | ||
858 | * | 1064 | * |
859 | * This is the main entry point for direct page compaction. | 1065 | * This is the main entry point for direct page compaction. |
860 | */ | 1066 | */ |
861 | unsigned long try_to_compact_pages(struct zonelist *zonelist, | 1067 | unsigned long try_to_compact_pages(struct zonelist *zonelist, |
862 | int order, gfp_t gfp_mask, nodemask_t *nodemask, | 1068 | int order, gfp_t gfp_mask, nodemask_t *nodemask, |
863 | bool sync, bool *contended) | 1069 | bool sync, bool *contended, struct page **page) |
864 | { | 1070 | { |
865 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | 1071 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); |
866 | int may_enter_fs = gfp_mask & __GFP_FS; | 1072 | int may_enter_fs = gfp_mask & __GFP_FS; |
@@ -868,28 +1074,30 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
868 | struct zoneref *z; | 1074 | struct zoneref *z; |
869 | struct zone *zone; | 1075 | struct zone *zone; |
870 | int rc = COMPACT_SKIPPED; | 1076 | int rc = COMPACT_SKIPPED; |
1077 | int alloc_flags = 0; | ||
871 | 1078 | ||
872 | /* | 1079 | /* Check if the GFP flags allow compaction */ |
873 | * Check whether it is worth even starting compaction. The order check is | ||
874 | * made because an assumption is made that the page allocator can satisfy | ||
875 | * the "cheaper" orders without taking special steps | ||
876 | */ | ||
877 | if (!order || !may_enter_fs || !may_perform_io) | 1080 | if (!order || !may_enter_fs || !may_perform_io) |
878 | return rc; | 1081 | return rc; |
879 | 1082 | ||
880 | count_vm_event(COMPACTSTALL); | 1083 | count_vm_event(COMPACTSTALL); |
881 | 1084 | ||
1085 | #ifdef CONFIG_CMA | ||
1086 | if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) | ||
1087 | alloc_flags |= ALLOC_CMA; | ||
1088 | #endif | ||
882 | /* Compact each zone in the list */ | 1089 | /* Compact each zone in the list */ |
883 | for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, | 1090 | for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, |
884 | nodemask) { | 1091 | nodemask) { |
885 | int status; | 1092 | int status; |
886 | 1093 | ||
887 | status = compact_zone_order(zone, order, gfp_mask, sync, | 1094 | status = compact_zone_order(zone, order, gfp_mask, sync, |
888 | contended); | 1095 | contended, page); |
889 | rc = max(status, rc); | 1096 | rc = max(status, rc); |
890 | 1097 | ||
891 | /* If a normal allocation would succeed, stop compacting */ | 1098 | /* If a normal allocation would succeed, stop compacting */ |
892 | if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) | 1099 | if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, |
1100 | alloc_flags)) | ||
893 | break; | 1101 | break; |
894 | } | 1102 | } |
895 | 1103 | ||
@@ -940,6 +1148,7 @@ int compact_pgdat(pg_data_t *pgdat, int order)
940 | struct compact_control cc = { | 1148 | struct compact_control cc = { |
941 | .order = order, | 1149 | .order = order, |
942 | .sync = false, | 1150 | .sync = false, |
1151 | .page = NULL, | ||
943 | }; | 1152 | }; |
944 | 1153 | ||
945 | return __compact_pgdat(pgdat, &cc); | 1154 | return __compact_pgdat(pgdat, &cc); |
@@ -950,6 +1159,7 @@ static int compact_node(int nid)
950 | struct compact_control cc = { | 1159 | struct compact_control cc = { |
951 | .order = -1, | 1160 | .order = -1, |
952 | .sync = true, | 1161 | .sync = true, |
1162 | .page = NULL, | ||
953 | }; | 1163 | }; |
954 | 1164 | ||
955 | return __compact_pgdat(NODE_DATA(nid), &cc); | 1165 | return __compact_pgdat(NODE_DATA(nid), &cc); |
diff --git a/mm/filemap.c b/mm/filemap.c
index 384344575c37..83efee76a5c0 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1607,13 +1607,13 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	 * Do we have something in the page cache already?
 	 */
 	page = find_get_page(mapping, offset);
-	if (likely(page)) {
+	if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
 		/*
 		 * We found the page, so try async readahead before
 		 * waiting for the lock.
 		 */
 		do_async_mmap_readahead(vma, ra, file, page, offset);
-	} else {
+	} else if (!page) {
 		/* No page in the page cache at all */
 		do_sync_mmap_readahead(vma, ra, file, offset);
 		count_vm_event(PGMAJFAULT);
@@ -1737,6 +1737,7 @@ EXPORT_SYMBOL(filemap_page_mkwrite);
 const struct vm_operations_struct generic_file_vm_ops = {
 	.fault = filemap_fault,
 	.page_mkwrite = filemap_page_mkwrite,
+	.remap_pages = generic_file_remap_pages,
 };
 
 /* This is used for a general mmap of a disk file */
@@ -1749,7 +1750,6 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
 		return -ENOEXEC;
 	file_accessed(file);
 	vma->vm_ops = &generic_file_vm_ops;
-	vma->vm_flags |= VM_CAN_NONLINEAR;
 	return 0;
 }
 
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 13e013b1270c..a912da6ddfd4 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -167,7 +167,6 @@ __xip_unmap (struct address_space * mapping,
 {
 	struct vm_area_struct *vma;
 	struct mm_struct *mm;
-	struct prio_tree_iter iter;
 	unsigned long address;
 	pte_t *pte;
 	pte_t pteval;
@@ -184,7 +183,7 @@ __xip_unmap (struct address_space * mapping,
 
 retry:
 	mutex_lock(&mapping->i_mmap_mutex);
-	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
 		mm = vma->vm_mm;
 		address = vma->vm_start +
 			((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
@@ -193,11 +192,13 @@ retry:
 		if (pte) {
 			/* Nuke the page table entry. */
 			flush_cache_page(vma, address, pte_pfn(*pte));
-			pteval = ptep_clear_flush_notify(vma, address, pte);
+			pteval = ptep_clear_flush(vma, address, pte);
 			page_remove_rmap(page);
 			dec_mm_counter(mm, MM_FILEPAGES);
 			BUG_ON(pte_dirty(pteval));
 			pte_unmap_unlock(pte, ptl);
+			/* must invalidate_page _before_ freeing the page */
+			mmu_notifier_invalidate_page(mm, address);
 			page_cache_release(page);
 		}
 	}
@@ -305,6 +306,7 @@ out:
 static const struct vm_operations_struct xip_file_vm_ops = {
 	.fault = xip_file_fault,
 	.page_mkwrite = filemap_page_mkwrite,
+	.remap_pages = generic_file_remap_pages,
 };
 
 int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
@@ -313,7 +315,7 @@ int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
 
 	file_accessed(file);
 	vma->vm_ops = &xip_file_vm_ops;
-	vma->vm_flags |= VM_CAN_NONLINEAR | VM_MIXEDMAP;
+	vma->vm_flags |= VM_MIXEDMAP;
 	return 0;
 }
 EXPORT_SYMBOL_GPL(xip_file_mmap);
diff --git a/mm/fremap.c b/mm/fremap.c index 048659c0c03d..3899a86851ce 100644 --- a/mm/fremap.c +++ b/mm/fremap.c | |||
@@ -5,6 +5,7 @@ | |||
5 | * | 5 | * |
6 | * started by Ingo Molnar, Copyright (C) 2002, 2003 | 6 | * started by Ingo Molnar, Copyright (C) 2002, 2003 |
7 | */ | 7 | */ |
8 | #include <linux/export.h> | ||
8 | #include <linux/backing-dev.h> | 9 | #include <linux/backing-dev.h> |
9 | #include <linux/mm.h> | 10 | #include <linux/mm.h> |
10 | #include <linux/swap.h> | 11 | #include <linux/swap.h> |
@@ -80,9 +81,10 @@ out: | |||
80 | return err; | 81 | return err; |
81 | } | 82 | } |
82 | 83 | ||
83 | static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma, | 84 | int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr, |
84 | unsigned long addr, unsigned long size, pgoff_t pgoff) | 85 | unsigned long size, pgoff_t pgoff) |
85 | { | 86 | { |
87 | struct mm_struct *mm = vma->vm_mm; | ||
86 | int err; | 88 | int err; |
87 | 89 | ||
88 | do { | 90 | do { |
@@ -95,9 +97,9 @@ static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma, | |||
95 | pgoff++; | 97 | pgoff++; |
96 | } while (size); | 98 | } while (size); |
97 | 99 | ||
98 | return 0; | 100 | return 0; |
99 | |||
100 | } | 101 | } |
102 | EXPORT_SYMBOL(generic_file_remap_pages); | ||
101 | 103 | ||
102 | /** | 104 | /** |
103 | * sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma | 105 | * sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma |
@@ -167,7 +169,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, | |||
167 | if (vma->vm_private_data && !(vma->vm_flags & VM_NONLINEAR)) | 169 | if (vma->vm_private_data && !(vma->vm_flags & VM_NONLINEAR)) |
168 | goto out; | 170 | goto out; |
169 | 171 | ||
170 | if (!(vma->vm_flags & VM_CAN_NONLINEAR)) | 172 | if (!vma->vm_ops->remap_pages) |
171 | goto out; | 173 | goto out; |
172 | 174 | ||
173 | if (start < vma->vm_start || start + size > vma->vm_end) | 175 | if (start < vma->vm_start || start + size > vma->vm_end) |
@@ -212,7 +214,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, | |||
212 | mutex_lock(&mapping->i_mmap_mutex); | 214 | mutex_lock(&mapping->i_mmap_mutex); |
213 | flush_dcache_mmap_lock(mapping); | 215 | flush_dcache_mmap_lock(mapping); |
214 | vma->vm_flags |= VM_NONLINEAR; | 216 | vma->vm_flags |= VM_NONLINEAR; |
215 | vma_prio_tree_remove(vma, &mapping->i_mmap); | 217 | vma_interval_tree_remove(vma, &mapping->i_mmap); |
216 | vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); | 218 | vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); |
217 | flush_dcache_mmap_unlock(mapping); | 219 | flush_dcache_mmap_unlock(mapping); |
218 | mutex_unlock(&mapping->i_mmap_mutex); | 220 | mutex_unlock(&mapping->i_mmap_mutex); |
@@ -228,7 +230,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, | |||
228 | } | 230 | } |
229 | 231 | ||
230 | mmu_notifier_invalidate_range_start(mm, start, start + size); | 232 | mmu_notifier_invalidate_range_start(mm, start, start + size); |
231 | err = populate_range(mm, vma, start, size, pgoff); | 233 | err = vma->vm_ops->remap_pages(vma, start, size, pgoff); |
232 | mmu_notifier_invalidate_range_end(mm, start, start + size); | 234 | mmu_notifier_invalidate_range_end(mm, start, start + size); |
233 | if (!err && !(flags & MAP_NONBLOCK)) { | 235 | if (!err && !(flags & MAP_NONBLOCK)) { |
234 | if (vma->vm_flags & VM_LOCKED) { | 236 | if (vma->vm_flags & VM_LOCKED) { |
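Taken together with the filemap_xip hunks above, the fremap.c change establishes the new opt-in for remap_file_pages(): instead of an ->mmap() handler setting VM_CAN_NONLINEAR, a mapping advertises support by providing a .remap_pages method, normally the newly exported generic_file_remap_pages(), and the syscall dispatches through vma->vm_ops->remap_pages. A minimal sketch of an ->mmap() implementation under the new convention (the foo_* names are hypothetical, not from this patch):

#include <linux/fs.h>
#include <linux/mm.h>

/* Sketch only: foo_file_vm_ops and foo_file_mmap are illustrative names. */
static const struct vm_operations_struct foo_file_vm_ops = {
	.fault		= filemap_fault,
	.page_mkwrite	= filemap_page_mkwrite,
	.remap_pages	= generic_file_remap_pages,	/* was: VM_CAN_NONLINEAR */
};

static int foo_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	file_accessed(file);
	vma->vm_ops = &foo_file_vm_ops;
	/* no vma->vm_flags |= VM_CAN_NONLINEAR needed any more */
	return 0;
}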
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 141dbb695097..a863af26c79c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
@@ -102,10 +102,7 @@ static int set_recommended_min_free_kbytes(void) | |||
102 | unsigned long recommended_min; | 102 | unsigned long recommended_min; |
103 | extern int min_free_kbytes; | 103 | extern int min_free_kbytes; |
104 | 104 | ||
105 | if (!test_bit(TRANSPARENT_HUGEPAGE_FLAG, | 105 | if (!khugepaged_enabled()) |
106 | &transparent_hugepage_flags) && | ||
107 | !test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, | ||
108 | &transparent_hugepage_flags)) | ||
109 | return 0; | 106 | return 0; |
110 | 107 | ||
111 | for_each_populated_zone(zone) | 108 | for_each_populated_zone(zone) |
@@ -139,12 +136,6 @@ static int start_khugepaged(void) | |||
139 | { | 136 | { |
140 | int err = 0; | 137 | int err = 0; |
141 | if (khugepaged_enabled()) { | 138 | if (khugepaged_enabled()) { |
142 | int wakeup; | ||
143 | if (unlikely(!mm_slot_cache || !mm_slots_hash)) { | ||
144 | err = -ENOMEM; | ||
145 | goto out; | ||
146 | } | ||
147 | mutex_lock(&khugepaged_mutex); | ||
148 | if (!khugepaged_thread) | 139 | if (!khugepaged_thread) |
149 | khugepaged_thread = kthread_run(khugepaged, NULL, | 140 | khugepaged_thread = kthread_run(khugepaged, NULL, |
150 | "khugepaged"); | 141 | "khugepaged"); |
@@ -154,16 +145,16 @@ static int start_khugepaged(void) | |||
154 | err = PTR_ERR(khugepaged_thread); | 145 | err = PTR_ERR(khugepaged_thread); |
155 | khugepaged_thread = NULL; | 146 | khugepaged_thread = NULL; |
156 | } | 147 | } |
157 | wakeup = !list_empty(&khugepaged_scan.mm_head); | 148 | |
158 | mutex_unlock(&khugepaged_mutex); | 149 | if (!list_empty(&khugepaged_scan.mm_head)) |
159 | if (wakeup) | ||
160 | wake_up_interruptible(&khugepaged_wait); | 150 | wake_up_interruptible(&khugepaged_wait); |
161 | 151 | ||
162 | set_recommended_min_free_kbytes(); | 152 | set_recommended_min_free_kbytes(); |
163 | } else | 153 | } else if (khugepaged_thread) { |
164 | /* wakeup to exit */ | 154 | kthread_stop(khugepaged_thread); |
165 | wake_up_interruptible(&khugepaged_wait); | 155 | khugepaged_thread = NULL; |
166 | out: | 156 | } |
157 | |||
167 | return err; | 158 | return err; |
168 | } | 159 | } |
169 | 160 | ||
@@ -224,18 +215,16 @@ static ssize_t enabled_store(struct kobject *kobj, | |||
224 | TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); | 215 | TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); |
225 | 216 | ||
226 | if (ret > 0) { | 217 | if (ret > 0) { |
227 | int err = start_khugepaged(); | 218 | int err; |
219 | |||
220 | mutex_lock(&khugepaged_mutex); | ||
221 | err = start_khugepaged(); | ||
222 | mutex_unlock(&khugepaged_mutex); | ||
223 | |||
228 | if (err) | 224 | if (err) |
229 | ret = err; | 225 | ret = err; |
230 | } | 226 | } |
231 | 227 | ||
232 | if (ret > 0 && | ||
233 | (test_bit(TRANSPARENT_HUGEPAGE_FLAG, | ||
234 | &transparent_hugepage_flags) || | ||
235 | test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, | ||
236 | &transparent_hugepage_flags))) | ||
237 | set_recommended_min_free_kbytes(); | ||
238 | |||
239 | return ret; | 228 | return ret; |
240 | } | 229 | } |
241 | static struct kobj_attribute enabled_attr = | 230 | static struct kobj_attribute enabled_attr = |
@@ -570,8 +559,6 @@ static int __init hugepage_init(void) | |||
570 | 559 | ||
571 | start_khugepaged(); | 560 | start_khugepaged(); |
572 | 561 | ||
573 | set_recommended_min_free_kbytes(); | ||
574 | |||
575 | return 0; | 562 | return 0; |
576 | out: | 563 | out: |
577 | hugepage_exit_sysfs(hugepage_kobj); | 564 | hugepage_exit_sysfs(hugepage_kobj); |
@@ -611,19 +598,6 @@ out: | |||
611 | } | 598 | } |
612 | __setup("transparent_hugepage=", setup_transparent_hugepage); | 599 | __setup("transparent_hugepage=", setup_transparent_hugepage); |
613 | 600 | ||
614 | static void prepare_pmd_huge_pte(pgtable_t pgtable, | ||
615 | struct mm_struct *mm) | ||
616 | { | ||
617 | assert_spin_locked(&mm->page_table_lock); | ||
618 | |||
619 | /* FIFO */ | ||
620 | if (!mm->pmd_huge_pte) | ||
621 | INIT_LIST_HEAD(&pgtable->lru); | ||
622 | else | ||
623 | list_add(&pgtable->lru, &mm->pmd_huge_pte->lru); | ||
624 | mm->pmd_huge_pte = pgtable; | ||
625 | } | ||
626 | |||
627 | static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) | 601 | static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) |
628 | { | 602 | { |
629 | if (likely(vma->vm_flags & VM_WRITE)) | 603 | if (likely(vma->vm_flags & VM_WRITE)) |
@@ -665,7 +639,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, | |||
665 | */ | 639 | */ |
666 | page_add_new_anon_rmap(page, vma, haddr); | 640 | page_add_new_anon_rmap(page, vma, haddr); |
667 | set_pmd_at(mm, haddr, pmd, entry); | 641 | set_pmd_at(mm, haddr, pmd, entry); |
668 | prepare_pmd_huge_pte(pgtable, mm); | 642 | pgtable_trans_huge_deposit(mm, pgtable); |
669 | add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); | 643 | add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); |
670 | mm->nr_ptes++; | 644 | mm->nr_ptes++; |
671 | spin_unlock(&mm->page_table_lock); | 645 | spin_unlock(&mm->page_table_lock); |
@@ -791,7 +765,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
791 | pmdp_set_wrprotect(src_mm, addr, src_pmd); | 765 | pmdp_set_wrprotect(src_mm, addr, src_pmd); |
792 | pmd = pmd_mkold(pmd_wrprotect(pmd)); | 766 | pmd = pmd_mkold(pmd_wrprotect(pmd)); |
793 | set_pmd_at(dst_mm, addr, dst_pmd, pmd); | 767 | set_pmd_at(dst_mm, addr, dst_pmd, pmd); |
794 | prepare_pmd_huge_pte(pgtable, dst_mm); | 768 | pgtable_trans_huge_deposit(dst_mm, pgtable); |
795 | dst_mm->nr_ptes++; | 769 | dst_mm->nr_ptes++; |
796 | 770 | ||
797 | ret = 0; | 771 | ret = 0; |
@@ -802,25 +776,6 @@ out: | |||
802 | return ret; | 776 | return ret; |
803 | } | 777 | } |
804 | 778 | ||
805 | /* no "address" argument so destroys page coloring of some arch */ | ||
806 | pgtable_t get_pmd_huge_pte(struct mm_struct *mm) | ||
807 | { | ||
808 | pgtable_t pgtable; | ||
809 | |||
810 | assert_spin_locked(&mm->page_table_lock); | ||
811 | |||
812 | /* FIFO */ | ||
813 | pgtable = mm->pmd_huge_pte; | ||
814 | if (list_empty(&pgtable->lru)) | ||
815 | mm->pmd_huge_pte = NULL; | ||
816 | else { | ||
817 | mm->pmd_huge_pte = list_entry(pgtable->lru.next, | ||
818 | struct page, lru); | ||
819 | list_del(&pgtable->lru); | ||
820 | } | ||
821 | return pgtable; | ||
822 | } | ||
823 | |||
824 | static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, | 779 | static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, |
825 | struct vm_area_struct *vma, | 780 | struct vm_area_struct *vma, |
826 | unsigned long address, | 781 | unsigned long address, |
@@ -832,6 +787,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, | |||
832 | pmd_t _pmd; | 787 | pmd_t _pmd; |
833 | int ret = 0, i; | 788 | int ret = 0, i; |
834 | struct page **pages; | 789 | struct page **pages; |
790 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
791 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
835 | 792 | ||
836 | pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR, | 793 | pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR, |
837 | GFP_KERNEL); | 794 | GFP_KERNEL); |
@@ -868,15 +825,19 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, | |||
868 | cond_resched(); | 825 | cond_resched(); |
869 | } | 826 | } |
870 | 827 | ||
828 | mmun_start = haddr; | ||
829 | mmun_end = haddr + HPAGE_PMD_SIZE; | ||
830 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
831 | |||
871 | spin_lock(&mm->page_table_lock); | 832 | spin_lock(&mm->page_table_lock); |
872 | if (unlikely(!pmd_same(*pmd, orig_pmd))) | 833 | if (unlikely(!pmd_same(*pmd, orig_pmd))) |
873 | goto out_free_pages; | 834 | goto out_free_pages; |
874 | VM_BUG_ON(!PageHead(page)); | 835 | VM_BUG_ON(!PageHead(page)); |
875 | 836 | ||
876 | pmdp_clear_flush_notify(vma, haddr, pmd); | 837 | pmdp_clear_flush(vma, haddr, pmd); |
877 | /* leave pmd empty until pte is filled */ | 838 | /* leave pmd empty until pte is filled */ |
878 | 839 | ||
879 | pgtable = get_pmd_huge_pte(mm); | 840 | pgtable = pgtable_trans_huge_withdraw(mm); |
880 | pmd_populate(mm, &_pmd, pgtable); | 841 | pmd_populate(mm, &_pmd, pgtable); |
881 | 842 | ||
882 | for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { | 843 | for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { |
@@ -896,6 +857,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, | |||
896 | page_remove_rmap(page); | 857 | page_remove_rmap(page); |
897 | spin_unlock(&mm->page_table_lock); | 858 | spin_unlock(&mm->page_table_lock); |
898 | 859 | ||
860 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
861 | |||
899 | ret |= VM_FAULT_WRITE; | 862 | ret |= VM_FAULT_WRITE; |
900 | put_page(page); | 863 | put_page(page); |
901 | 864 | ||
@@ -904,6 +867,7 @@ out: | |||
904 | 867 | ||
905 | out_free_pages: | 868 | out_free_pages: |
906 | spin_unlock(&mm->page_table_lock); | 869 | spin_unlock(&mm->page_table_lock); |
870 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
907 | mem_cgroup_uncharge_start(); | 871 | mem_cgroup_uncharge_start(); |
908 | for (i = 0; i < HPAGE_PMD_NR; i++) { | 872 | for (i = 0; i < HPAGE_PMD_NR; i++) { |
909 | mem_cgroup_uncharge_page(pages[i]); | 873 | mem_cgroup_uncharge_page(pages[i]); |
@@ -920,6 +884,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
920 | int ret = 0; | 884 | int ret = 0; |
921 | struct page *page, *new_page; | 885 | struct page *page, *new_page; |
922 | unsigned long haddr; | 886 | unsigned long haddr; |
887 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
888 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
923 | 889 | ||
924 | VM_BUG_ON(!vma->anon_vma); | 890 | VM_BUG_ON(!vma->anon_vma); |
925 | spin_lock(&mm->page_table_lock); | 891 | spin_lock(&mm->page_table_lock); |
@@ -934,7 +900,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
934 | entry = pmd_mkyoung(orig_pmd); | 900 | entry = pmd_mkyoung(orig_pmd); |
935 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | 901 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); |
936 | if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1)) | 902 | if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1)) |
937 | update_mmu_cache(vma, address, entry); | 903 | update_mmu_cache_pmd(vma, address, pmd); |
938 | ret |= VM_FAULT_WRITE; | 904 | ret |= VM_FAULT_WRITE; |
939 | goto out_unlock; | 905 | goto out_unlock; |
940 | } | 906 | } |
@@ -970,38 +936,47 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
970 | copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); | 936 | copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); |
971 | __SetPageUptodate(new_page); | 937 | __SetPageUptodate(new_page); |
972 | 938 | ||
939 | mmun_start = haddr; | ||
940 | mmun_end = haddr + HPAGE_PMD_SIZE; | ||
941 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
942 | |||
973 | spin_lock(&mm->page_table_lock); | 943 | spin_lock(&mm->page_table_lock); |
974 | put_page(page); | 944 | put_page(page); |
975 | if (unlikely(!pmd_same(*pmd, orig_pmd))) { | 945 | if (unlikely(!pmd_same(*pmd, orig_pmd))) { |
976 | spin_unlock(&mm->page_table_lock); | 946 | spin_unlock(&mm->page_table_lock); |
977 | mem_cgroup_uncharge_page(new_page); | 947 | mem_cgroup_uncharge_page(new_page); |
978 | put_page(new_page); | 948 | put_page(new_page); |
979 | goto out; | 949 | goto out_mn; |
980 | } else { | 950 | } else { |
981 | pmd_t entry; | 951 | pmd_t entry; |
982 | VM_BUG_ON(!PageHead(page)); | 952 | VM_BUG_ON(!PageHead(page)); |
983 | entry = mk_pmd(new_page, vma->vm_page_prot); | 953 | entry = mk_pmd(new_page, vma->vm_page_prot); |
984 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | 954 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); |
985 | entry = pmd_mkhuge(entry); | 955 | entry = pmd_mkhuge(entry); |
986 | pmdp_clear_flush_notify(vma, haddr, pmd); | 956 | pmdp_clear_flush(vma, haddr, pmd); |
987 | page_add_new_anon_rmap(new_page, vma, haddr); | 957 | page_add_new_anon_rmap(new_page, vma, haddr); |
988 | set_pmd_at(mm, haddr, pmd, entry); | 958 | set_pmd_at(mm, haddr, pmd, entry); |
989 | update_mmu_cache(vma, address, entry); | 959 | update_mmu_cache_pmd(vma, address, pmd); |
990 | page_remove_rmap(page); | 960 | page_remove_rmap(page); |
991 | put_page(page); | 961 | put_page(page); |
992 | ret |= VM_FAULT_WRITE; | 962 | ret |= VM_FAULT_WRITE; |
993 | } | 963 | } |
994 | out_unlock: | ||
995 | spin_unlock(&mm->page_table_lock); | 964 | spin_unlock(&mm->page_table_lock); |
965 | out_mn: | ||
966 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
996 | out: | 967 | out: |
997 | return ret; | 968 | return ret; |
969 | out_unlock: | ||
970 | spin_unlock(&mm->page_table_lock); | ||
971 | return ret; | ||
998 | } | 972 | } |
999 | 973 | ||
1000 | struct page *follow_trans_huge_pmd(struct mm_struct *mm, | 974 | struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, |
1001 | unsigned long addr, | 975 | unsigned long addr, |
1002 | pmd_t *pmd, | 976 | pmd_t *pmd, |
1003 | unsigned int flags) | 977 | unsigned int flags) |
1004 | { | 978 | { |
979 | struct mm_struct *mm = vma->vm_mm; | ||
1005 | struct page *page = NULL; | 980 | struct page *page = NULL; |
1006 | 981 | ||
1007 | assert_spin_locked(&mm->page_table_lock); | 982 | assert_spin_locked(&mm->page_table_lock); |
@@ -1024,6 +999,14 @@ struct page *follow_trans_huge_pmd(struct mm_struct *mm, | |||
1024 | _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); | 999 | _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); |
1025 | set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd); | 1000 | set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd); |
1026 | } | 1001 | } |
1002 | if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { | ||
1003 | if (page->mapping && trylock_page(page)) { | ||
1004 | lru_add_drain(); | ||
1005 | if (page->mapping) | ||
1006 | mlock_vma_page(page); | ||
1007 | unlock_page(page); | ||
1008 | } | ||
1009 | } | ||
1027 | page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; | 1010 | page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; |
1028 | VM_BUG_ON(!PageCompound(page)); | 1011 | VM_BUG_ON(!PageCompound(page)); |
1029 | if (flags & FOLL_GET) | 1012 | if (flags & FOLL_GET) |
@@ -1041,9 +1024,10 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
1041 | if (__pmd_trans_huge_lock(pmd, vma) == 1) { | 1024 | if (__pmd_trans_huge_lock(pmd, vma) == 1) { |
1042 | struct page *page; | 1025 | struct page *page; |
1043 | pgtable_t pgtable; | 1026 | pgtable_t pgtable; |
1044 | pgtable = get_pmd_huge_pte(tlb->mm); | 1027 | pmd_t orig_pmd; |
1045 | page = pmd_page(*pmd); | 1028 | pgtable = pgtable_trans_huge_withdraw(tlb->mm); |
1046 | pmd_clear(pmd); | 1029 | orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd); |
1030 | page = pmd_page(orig_pmd); | ||
1047 | tlb_remove_pmd_tlb_entry(tlb, pmd, addr); | 1031 | tlb_remove_pmd_tlb_entry(tlb, pmd, addr); |
1048 | page_remove_rmap(page); | 1032 | page_remove_rmap(page); |
1049 | VM_BUG_ON(page_mapcount(page) < 0); | 1033 | VM_BUG_ON(page_mapcount(page) < 0); |
@@ -1207,7 +1191,11 @@ static int __split_huge_page_splitting(struct page *page, | |||
1207 | struct mm_struct *mm = vma->vm_mm; | 1191 | struct mm_struct *mm = vma->vm_mm; |
1208 | pmd_t *pmd; | 1192 | pmd_t *pmd; |
1209 | int ret = 0; | 1193 | int ret = 0; |
1194 | /* For mmu_notifiers */ | ||
1195 | const unsigned long mmun_start = address; | ||
1196 | const unsigned long mmun_end = address + HPAGE_PMD_SIZE; | ||
1210 | 1197 | ||
1198 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
1211 | spin_lock(&mm->page_table_lock); | 1199 | spin_lock(&mm->page_table_lock); |
1212 | pmd = page_check_address_pmd(page, mm, address, | 1200 | pmd = page_check_address_pmd(page, mm, address, |
1213 | PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG); | 1201 | PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG); |
@@ -1219,10 +1207,11 @@ static int __split_huge_page_splitting(struct page *page, | |||
1219 | * and it won't wait on the anon_vma->root->mutex to | 1207 | * and it won't wait on the anon_vma->root->mutex to |
1220 | * serialize against split_huge_page*. | 1208 | * serialize against split_huge_page*. |
1221 | */ | 1209 | */ |
1222 | pmdp_splitting_flush_notify(vma, address, pmd); | 1210 | pmdp_splitting_flush(vma, address, pmd); |
1223 | ret = 1; | 1211 | ret = 1; |
1224 | } | 1212 | } |
1225 | spin_unlock(&mm->page_table_lock); | 1213 | spin_unlock(&mm->page_table_lock); |
1214 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
1226 | 1215 | ||
1227 | return ret; | 1216 | return ret; |
1228 | } | 1217 | } |
@@ -1358,11 +1347,11 @@ static int __split_huge_page_map(struct page *page, | |||
1358 | pmd = page_check_address_pmd(page, mm, address, | 1347 | pmd = page_check_address_pmd(page, mm, address, |
1359 | PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG); | 1348 | PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG); |
1360 | if (pmd) { | 1349 | if (pmd) { |
1361 | pgtable = get_pmd_huge_pte(mm); | 1350 | pgtable = pgtable_trans_huge_withdraw(mm); |
1362 | pmd_populate(mm, &_pmd, pgtable); | 1351 | pmd_populate(mm, &_pmd, pgtable); |
1363 | 1352 | ||
1364 | for (i = 0, haddr = address; i < HPAGE_PMD_NR; | 1353 | haddr = address; |
1365 | i++, haddr += PAGE_SIZE) { | 1354 | for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { |
1366 | pte_t *pte, entry; | 1355 | pte_t *pte, entry; |
1367 | BUG_ON(PageCompound(page+i)); | 1356 | BUG_ON(PageCompound(page+i)); |
1368 | entry = mk_pte(page + i, vma->vm_page_prot); | 1357 | entry = mk_pte(page + i, vma->vm_page_prot); |
@@ -1406,8 +1395,7 @@ static int __split_huge_page_map(struct page *page, | |||
1406 | * SMP TLB and finally we write the non-huge version | 1395 | * SMP TLB and finally we write the non-huge version |
1407 | * of the pmd entry with pmd_populate. | 1396 | * of the pmd entry with pmd_populate. |
1408 | */ | 1397 | */ |
1409 | set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd)); | 1398 | pmdp_invalidate(vma, address, pmd); |
1410 | flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); | ||
1411 | pmd_populate(mm, pmd, pgtable); | 1399 | pmd_populate(mm, pmd, pgtable); |
1412 | ret = 1; | 1400 | ret = 1; |
1413 | } | 1401 | } |
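The last hunk folds the open-coded "mark the pmd not present, then flush the range" sequence into pmdp_invalidate(), a new helper that presumably lives alongside the other generic pmdp_* fallbacks in mm/pgtable-generic.c, with room for architectures to override it. Judging by the two deleted lines, a sketch of the generic version would be just that sequence wrapped up:

/* Sketch of a generic pmdp_invalidate(), mirroring the deleted lines. */
void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
		     pmd_t *pmdp)
{
	set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(*pmdp));
	flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
}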
@@ -1421,18 +1409,17 @@ static void __split_huge_page(struct page *page, | |||
1421 | struct anon_vma *anon_vma) | 1409 | struct anon_vma *anon_vma) |
1422 | { | 1410 | { |
1423 | int mapcount, mapcount2; | 1411 | int mapcount, mapcount2; |
1412 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
1424 | struct anon_vma_chain *avc; | 1413 | struct anon_vma_chain *avc; |
1425 | 1414 | ||
1426 | BUG_ON(!PageHead(page)); | 1415 | BUG_ON(!PageHead(page)); |
1427 | BUG_ON(PageTail(page)); | 1416 | BUG_ON(PageTail(page)); |
1428 | 1417 | ||
1429 | mapcount = 0; | 1418 | mapcount = 0; |
1430 | list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { | 1419 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { |
1431 | struct vm_area_struct *vma = avc->vma; | 1420 | struct vm_area_struct *vma = avc->vma; |
1432 | unsigned long addr = vma_address(page, vma); | 1421 | unsigned long addr = vma_address(page, vma); |
1433 | BUG_ON(is_vma_temporary_stack(vma)); | 1422 | BUG_ON(is_vma_temporary_stack(vma)); |
1434 | if (addr == -EFAULT) | ||
1435 | continue; | ||
1436 | mapcount += __split_huge_page_splitting(page, vma, addr); | 1423 | mapcount += __split_huge_page_splitting(page, vma, addr); |
1437 | } | 1424 | } |
1438 | /* | 1425 | /* |
@@ -1453,12 +1440,10 @@ static void __split_huge_page(struct page *page, | |||
1453 | __split_huge_page_refcount(page); | 1440 | __split_huge_page_refcount(page); |
1454 | 1441 | ||
1455 | mapcount2 = 0; | 1442 | mapcount2 = 0; |
1456 | list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { | 1443 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { |
1457 | struct vm_area_struct *vma = avc->vma; | 1444 | struct vm_area_struct *vma = avc->vma; |
1458 | unsigned long addr = vma_address(page, vma); | 1445 | unsigned long addr = vma_address(page, vma); |
1459 | BUG_ON(is_vma_temporary_stack(vma)); | 1446 | BUG_ON(is_vma_temporary_stack(vma)); |
1460 | if (addr == -EFAULT) | ||
1461 | continue; | ||
1462 | mapcount2 += __split_huge_page_map(page, vma, addr); | 1447 | mapcount2 += __split_huge_page_map(page, vma, addr); |
1463 | } | 1448 | } |
1464 | if (mapcount != mapcount2) | 1449 | if (mapcount != mapcount2) |
@@ -1491,12 +1476,13 @@ out: | |||
1491 | return ret; | 1476 | return ret; |
1492 | } | 1477 | } |
1493 | 1478 | ||
1494 | #define VM_NO_THP (VM_SPECIAL|VM_INSERTPAGE|VM_MIXEDMAP|VM_SAO| \ | 1479 | #define VM_NO_THP (VM_SPECIAL|VM_MIXEDMAP|VM_HUGETLB|VM_SHARED|VM_MAYSHARE) |
1495 | VM_HUGETLB|VM_SHARED|VM_MAYSHARE) | ||
1496 | 1480 | ||
1497 | int hugepage_madvise(struct vm_area_struct *vma, | 1481 | int hugepage_madvise(struct vm_area_struct *vma, |
1498 | unsigned long *vm_flags, int advice) | 1482 | unsigned long *vm_flags, int advice) |
1499 | { | 1483 | { |
1484 | struct mm_struct *mm = vma->vm_mm; | ||
1485 | |||
1500 | switch (advice) { | 1486 | switch (advice) { |
1501 | case MADV_HUGEPAGE: | 1487 | case MADV_HUGEPAGE: |
1502 | /* | 1488 | /* |
@@ -1504,6 +1490,8 @@ int hugepage_madvise(struct vm_area_struct *vma, | |||
1504 | */ | 1490 | */ |
1505 | if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP)) | 1491 | if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP)) |
1506 | return -EINVAL; | 1492 | return -EINVAL; |
1493 | if (mm->def_flags & VM_NOHUGEPAGE) | ||
1494 | return -EINVAL; | ||
1507 | *vm_flags &= ~VM_NOHUGEPAGE; | 1495 | *vm_flags &= ~VM_NOHUGEPAGE; |
1508 | *vm_flags |= VM_HUGEPAGE; | 1496 | *vm_flags |= VM_HUGEPAGE; |
1509 | /* | 1497 | /* |
@@ -1655,11 +1643,7 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma) | |||
1655 | if (vma->vm_ops) | 1643 | if (vma->vm_ops) |
1656 | /* khugepaged not yet working on file or special mappings */ | 1644 | /* khugepaged not yet working on file or special mappings */ |
1657 | return 0; | 1645 | return 0; |
1658 | /* | 1646 | VM_BUG_ON(vma->vm_flags & VM_NO_THP); |
1659 | * If is_pfn_mapping() is true is_learn_pfn_mapping() must be | ||
1660 | * true too, verify it here. | ||
1661 | */ | ||
1662 | VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP); | ||
1663 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; | 1647 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; |
1664 | hend = vma->vm_end & HPAGE_PMD_MASK; | 1648 | hend = vma->vm_end & HPAGE_PMD_MASK; |
1665 | if (hstart < hend) | 1649 | if (hstart < hend) |
@@ -1833,28 +1817,35 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, | |||
1833 | } | 1817 | } |
1834 | } | 1818 | } |
1835 | 1819 | ||
1836 | static void collapse_huge_page(struct mm_struct *mm, | 1820 | static void khugepaged_alloc_sleep(void) |
1837 | unsigned long address, | ||
1838 | struct page **hpage, | ||
1839 | struct vm_area_struct *vma, | ||
1840 | int node) | ||
1841 | { | 1821 | { |
1842 | pgd_t *pgd; | 1822 | wait_event_freezable_timeout(khugepaged_wait, false, |
1843 | pud_t *pud; | 1823 | msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); |
1844 | pmd_t *pmd, _pmd; | 1824 | } |
1845 | pte_t *pte; | ||
1846 | pgtable_t pgtable; | ||
1847 | struct page *new_page; | ||
1848 | spinlock_t *ptl; | ||
1849 | int isolated; | ||
1850 | unsigned long hstart, hend; | ||
1851 | 1825 | ||
1852 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | 1826 | #ifdef CONFIG_NUMA |
1853 | #ifndef CONFIG_NUMA | 1827 | static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) |
1854 | up_read(&mm->mmap_sem); | 1828 | { |
1855 | VM_BUG_ON(!*hpage); | 1829 | if (IS_ERR(*hpage)) { |
1856 | new_page = *hpage; | 1830 | if (!*wait) |
1857 | #else | 1831 | return false; |
1832 | |||
1833 | *wait = false; | ||
1834 | *hpage = NULL; | ||
1835 | khugepaged_alloc_sleep(); | ||
1836 | } else if (*hpage) { | ||
1837 | put_page(*hpage); | ||
1838 | *hpage = NULL; | ||
1839 | } | ||
1840 | |||
1841 | return true; | ||
1842 | } | ||
1843 | |||
1844 | static struct page | ||
1845 | *khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, | ||
1846 | struct vm_area_struct *vma, unsigned long address, | ||
1847 | int node) | ||
1848 | { | ||
1858 | VM_BUG_ON(*hpage); | 1849 | VM_BUG_ON(*hpage); |
1859 | /* | 1850 | /* |
1860 | * Allocate the page while the vma is still valid and under | 1851 | * Allocate the page while the vma is still valid and under |
@@ -1866,7 +1857,7 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
1866 | * mmap_sem in read mode is good idea also to allow greater | 1857 | * mmap_sem in read mode is good idea also to allow greater |
1867 | * scalability. | 1858 | * scalability. |
1868 | */ | 1859 | */ |
1869 | new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address, | 1860 | *hpage = alloc_hugepage_vma(khugepaged_defrag(), vma, address, |
1870 | node, __GFP_OTHER_NODE); | 1861 | node, __GFP_OTHER_NODE); |
1871 | 1862 | ||
1872 | /* | 1863 | /* |
@@ -1874,20 +1865,85 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
1874 | * preparation for taking it in write mode. | 1865 | * preparation for taking it in write mode. |
1875 | */ | 1866 | */ |
1876 | up_read(&mm->mmap_sem); | 1867 | up_read(&mm->mmap_sem); |
1877 | if (unlikely(!new_page)) { | 1868 | if (unlikely(!*hpage)) { |
1878 | count_vm_event(THP_COLLAPSE_ALLOC_FAILED); | 1869 | count_vm_event(THP_COLLAPSE_ALLOC_FAILED); |
1879 | *hpage = ERR_PTR(-ENOMEM); | 1870 | *hpage = ERR_PTR(-ENOMEM); |
1880 | return; | 1871 | return NULL; |
1881 | } | 1872 | } |
1882 | #endif | ||
1883 | 1873 | ||
1884 | count_vm_event(THP_COLLAPSE_ALLOC); | 1874 | count_vm_event(THP_COLLAPSE_ALLOC); |
1885 | if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { | 1875 | return *hpage; |
1886 | #ifdef CONFIG_NUMA | 1876 | } |
1887 | put_page(new_page); | 1877 | #else |
1878 | static struct page *khugepaged_alloc_hugepage(bool *wait) | ||
1879 | { | ||
1880 | struct page *hpage; | ||
1881 | |||
1882 | do { | ||
1883 | hpage = alloc_hugepage(khugepaged_defrag()); | ||
1884 | if (!hpage) { | ||
1885 | count_vm_event(THP_COLLAPSE_ALLOC_FAILED); | ||
1886 | if (!*wait) | ||
1887 | return NULL; | ||
1888 | |||
1889 | *wait = false; | ||
1890 | khugepaged_alloc_sleep(); | ||
1891 | } else | ||
1892 | count_vm_event(THP_COLLAPSE_ALLOC); | ||
1893 | } while (unlikely(!hpage) && likely(khugepaged_enabled())); | ||
1894 | |||
1895 | return hpage; | ||
1896 | } | ||
1897 | |||
1898 | static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) | ||
1899 | { | ||
1900 | if (!*hpage) | ||
1901 | *hpage = khugepaged_alloc_hugepage(wait); | ||
1902 | |||
1903 | if (unlikely(!*hpage)) | ||
1904 | return false; | ||
1905 | |||
1906 | return true; | ||
1907 | } | ||
1908 | |||
1909 | static struct page | ||
1910 | *khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, | ||
1911 | struct vm_area_struct *vma, unsigned long address, | ||
1912 | int node) | ||
1913 | { | ||
1914 | up_read(&mm->mmap_sem); | ||
1915 | VM_BUG_ON(!*hpage); | ||
1916 | return *hpage; | ||
1917 | } | ||
1888 | #endif | 1918 | #endif |
1919 | |||
1920 | static void collapse_huge_page(struct mm_struct *mm, | ||
1921 | unsigned long address, | ||
1922 | struct page **hpage, | ||
1923 | struct vm_area_struct *vma, | ||
1924 | int node) | ||
1925 | { | ||
1926 | pgd_t *pgd; | ||
1927 | pud_t *pud; | ||
1928 | pmd_t *pmd, _pmd; | ||
1929 | pte_t *pte; | ||
1930 | pgtable_t pgtable; | ||
1931 | struct page *new_page; | ||
1932 | spinlock_t *ptl; | ||
1933 | int isolated; | ||
1934 | unsigned long hstart, hend; | ||
1935 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
1936 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
1937 | |||
1938 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | ||
1939 | |||
1940 | /* release the mmap_sem read lock. */ | ||
1941 | new_page = khugepaged_alloc_page(hpage, mm, vma, address, node); | ||
1942 | if (!new_page) | ||
1943 | return; | ||
1944 | |||
1945 | if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) | ||
1889 | return; | 1946 | return; |
1890 | } | ||
1891 | 1947 | ||
1892 | /* | 1948 | /* |
1893 | * Prevent all access to pagetables with the exception of | 1949 | * Prevent all access to pagetables with the exception of |
@@ -1912,11 +1968,7 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
1912 | goto out; | 1968 | goto out; |
1913 | if (is_vma_temporary_stack(vma)) | 1969 | if (is_vma_temporary_stack(vma)) |
1914 | goto out; | 1970 | goto out; |
1915 | /* | 1971 | VM_BUG_ON(vma->vm_flags & VM_NO_THP); |
1916 | * If is_pfn_mapping() is true is_learn_pfn_mapping() must be | ||
1917 | * true too, verify it here. | ||
1918 | */ | ||
1919 | VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP); | ||
1920 | 1972 | ||
1921 | pgd = pgd_offset(mm, address); | 1973 | pgd = pgd_offset(mm, address); |
1922 | if (!pgd_present(*pgd)) | 1974 | if (!pgd_present(*pgd)) |
@@ -1936,6 +1988,9 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
1936 | pte = pte_offset_map(pmd, address); | 1988 | pte = pte_offset_map(pmd, address); |
1937 | ptl = pte_lockptr(mm, pmd); | 1989 | ptl = pte_lockptr(mm, pmd); |
1938 | 1990 | ||
1991 | mmun_start = address; | ||
1992 | mmun_end = address + HPAGE_PMD_SIZE; | ||
1993 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
1939 | spin_lock(&mm->page_table_lock); /* probably unnecessary */ | 1994 | spin_lock(&mm->page_table_lock); /* probably unnecessary */ |
1940 | /* | 1995 | /* |
1941 | * After this gup_fast can't run anymore. This also removes | 1996 | * After this gup_fast can't run anymore. This also removes |
@@ -1943,8 +1998,9 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
1943 | * huge and small TLB entries for the same virtual address | 1998 | * huge and small TLB entries for the same virtual address |
1944 | * to avoid the risk of CPU bugs in that area. | 1999 | * to avoid the risk of CPU bugs in that area. |
1945 | */ | 2000 | */ |
1946 | _pmd = pmdp_clear_flush_notify(vma, address, pmd); | 2001 | _pmd = pmdp_clear_flush(vma, address, pmd); |
1947 | spin_unlock(&mm->page_table_lock); | 2002 | spin_unlock(&mm->page_table_lock); |
2003 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
1948 | 2004 | ||
1949 | spin_lock(ptl); | 2005 | spin_lock(ptl); |
1950 | isolated = __collapse_huge_page_isolate(vma, address, pte); | 2006 | isolated = __collapse_huge_page_isolate(vma, address, pte); |
@@ -1970,8 +2026,6 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
1970 | pte_unmap(pte); | 2026 | pte_unmap(pte); |
1971 | __SetPageUptodate(new_page); | 2027 | __SetPageUptodate(new_page); |
1972 | pgtable = pmd_pgtable(_pmd); | 2028 | pgtable = pmd_pgtable(_pmd); |
1973 | VM_BUG_ON(page_count(pgtable) != 1); | ||
1974 | VM_BUG_ON(page_mapcount(pgtable) != 0); | ||
1975 | 2029 | ||
1976 | _pmd = mk_pmd(new_page, vma->vm_page_prot); | 2030 | _pmd = mk_pmd(new_page, vma->vm_page_prot); |
1977 | _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); | 2031 | _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); |
@@ -1988,13 +2042,12 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
1988 | BUG_ON(!pmd_none(*pmd)); | 2042 | BUG_ON(!pmd_none(*pmd)); |
1989 | page_add_new_anon_rmap(new_page, vma, address); | 2043 | page_add_new_anon_rmap(new_page, vma, address); |
1990 | set_pmd_at(mm, address, pmd, _pmd); | 2044 | set_pmd_at(mm, address, pmd, _pmd); |
1991 | update_mmu_cache(vma, address, _pmd); | 2045 | update_mmu_cache_pmd(vma, address, pmd); |
1992 | prepare_pmd_huge_pte(pgtable, mm); | 2046 | pgtable_trans_huge_deposit(mm, pgtable); |
1993 | spin_unlock(&mm->page_table_lock); | 2047 | spin_unlock(&mm->page_table_lock); |
1994 | 2048 | ||
1995 | #ifndef CONFIG_NUMA | ||
1996 | *hpage = NULL; | 2049 | *hpage = NULL; |
1997 | #endif | 2050 | |
1998 | khugepaged_pages_collapsed++; | 2051 | khugepaged_pages_collapsed++; |
1999 | out_up_write: | 2052 | out_up_write: |
2000 | up_write(&mm->mmap_sem); | 2053 | up_write(&mm->mmap_sem); |
@@ -2002,9 +2055,6 @@ out_up_write: | |||
2002 | 2055 | ||
2003 | out: | 2056 | out: |
2004 | mem_cgroup_uncharge_page(new_page); | 2057 | mem_cgroup_uncharge_page(new_page); |
2005 | #ifdef CONFIG_NUMA | ||
2006 | put_page(new_page); | ||
2007 | #endif | ||
2008 | goto out_up_write; | 2058 | goto out_up_write; |
2009 | } | 2059 | } |
2010 | 2060 | ||
@@ -2154,12 +2204,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, | |||
2154 | goto skip; | 2204 | goto skip; |
2155 | if (is_vma_temporary_stack(vma)) | 2205 | if (is_vma_temporary_stack(vma)) |
2156 | goto skip; | 2206 | goto skip; |
2157 | /* | 2207 | VM_BUG_ON(vma->vm_flags & VM_NO_THP); |
2158 | * If is_pfn_mapping() is true is_learn_pfn_mapping() | ||
2159 | * must be true too, verify it here. | ||
2160 | */ | ||
2161 | VM_BUG_ON(is_linear_pfn_mapping(vma) || | ||
2162 | vma->vm_flags & VM_NO_THP); | ||
2163 | 2208 | ||
2164 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; | 2209 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; |
2165 | hend = vma->vm_end & HPAGE_PMD_MASK; | 2210 | hend = vma->vm_end & HPAGE_PMD_MASK; |
@@ -2234,32 +2279,23 @@ static int khugepaged_has_work(void) | |||
2234 | static int khugepaged_wait_event(void) | 2279 | static int khugepaged_wait_event(void) |
2235 | { | 2280 | { |
2236 | return !list_empty(&khugepaged_scan.mm_head) || | 2281 | return !list_empty(&khugepaged_scan.mm_head) || |
2237 | !khugepaged_enabled(); | 2282 | kthread_should_stop(); |
2238 | } | 2283 | } |
2239 | 2284 | ||
2240 | static void khugepaged_do_scan(struct page **hpage) | 2285 | static void khugepaged_do_scan(void) |
2241 | { | 2286 | { |
2287 | struct page *hpage = NULL; | ||
2242 | unsigned int progress = 0, pass_through_head = 0; | 2288 | unsigned int progress = 0, pass_through_head = 0; |
2243 | unsigned int pages = khugepaged_pages_to_scan; | 2289 | unsigned int pages = khugepaged_pages_to_scan; |
2290 | bool wait = true; | ||
2244 | 2291 | ||
2245 | barrier(); /* write khugepaged_pages_to_scan to local stack */ | 2292 | barrier(); /* write khugepaged_pages_to_scan to local stack */ |
2246 | 2293 | ||
2247 | while (progress < pages) { | 2294 | while (progress < pages) { |
2248 | cond_resched(); | 2295 | if (!khugepaged_prealloc_page(&hpage, &wait)) |
2249 | |||
2250 | #ifndef CONFIG_NUMA | ||
2251 | if (!*hpage) { | ||
2252 | *hpage = alloc_hugepage(khugepaged_defrag()); | ||
2253 | if (unlikely(!*hpage)) { | ||
2254 | count_vm_event(THP_COLLAPSE_ALLOC_FAILED); | ||
2255 | break; | ||
2256 | } | ||
2257 | count_vm_event(THP_COLLAPSE_ALLOC); | ||
2258 | } | ||
2259 | #else | ||
2260 | if (IS_ERR(*hpage)) | ||
2261 | break; | 2296 | break; |
2262 | #endif | 2297 | |
2298 | cond_resched(); | ||
2263 | 2299 | ||
2264 | if (unlikely(kthread_should_stop() || freezing(current))) | 2300 | if (unlikely(kthread_should_stop() || freezing(current))) |
2265 | break; | 2301 | break; |
@@ -2270,73 +2306,32 @@ static void khugepaged_do_scan(struct page **hpage) | |||
2270 | if (khugepaged_has_work() && | 2306 | if (khugepaged_has_work() && |
2271 | pass_through_head < 2) | 2307 | pass_through_head < 2) |
2272 | progress += khugepaged_scan_mm_slot(pages - progress, | 2308 | progress += khugepaged_scan_mm_slot(pages - progress, |
2273 | hpage); | 2309 | &hpage); |
2274 | else | 2310 | else |
2275 | progress = pages; | 2311 | progress = pages; |
2276 | spin_unlock(&khugepaged_mm_lock); | 2312 | spin_unlock(&khugepaged_mm_lock); |
2277 | } | 2313 | } |
2278 | } | ||
2279 | 2314 | ||
2280 | static void khugepaged_alloc_sleep(void) | 2315 | if (!IS_ERR_OR_NULL(hpage)) |
2281 | { | 2316 | put_page(hpage); |
2282 | wait_event_freezable_timeout(khugepaged_wait, false, | ||
2283 | msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); | ||
2284 | } | 2317 | } |
2285 | 2318 | ||
2286 | #ifndef CONFIG_NUMA | 2319 | static void khugepaged_wait_work(void) |
2287 | static struct page *khugepaged_alloc_hugepage(void) | ||
2288 | { | 2320 | { |
2289 | struct page *hpage; | 2321 | try_to_freeze(); |
2290 | |||
2291 | do { | ||
2292 | hpage = alloc_hugepage(khugepaged_defrag()); | ||
2293 | if (!hpage) { | ||
2294 | count_vm_event(THP_COLLAPSE_ALLOC_FAILED); | ||
2295 | khugepaged_alloc_sleep(); | ||
2296 | } else | ||
2297 | count_vm_event(THP_COLLAPSE_ALLOC); | ||
2298 | } while (unlikely(!hpage) && | ||
2299 | likely(khugepaged_enabled())); | ||
2300 | return hpage; | ||
2301 | } | ||
2302 | #endif | ||
2303 | 2322 | ||
2304 | static void khugepaged_loop(void) | 2323 | if (khugepaged_has_work()) { |
2305 | { | 2324 | if (!khugepaged_scan_sleep_millisecs) |
2306 | struct page *hpage; | 2325 | return; |
2307 | 2326 | ||
2308 | #ifdef CONFIG_NUMA | 2327 | wait_event_freezable_timeout(khugepaged_wait, |
2309 | hpage = NULL; | 2328 | kthread_should_stop(), |
2310 | #endif | 2329 | msecs_to_jiffies(khugepaged_scan_sleep_millisecs)); |
2311 | while (likely(khugepaged_enabled())) { | 2330 | return; |
2312 | #ifndef CONFIG_NUMA | ||
2313 | hpage = khugepaged_alloc_hugepage(); | ||
2314 | if (unlikely(!hpage)) | ||
2315 | break; | ||
2316 | #else | ||
2317 | if (IS_ERR(hpage)) { | ||
2318 | khugepaged_alloc_sleep(); | ||
2319 | hpage = NULL; | ||
2320 | } | ||
2321 | #endif | ||
2322 | |||
2323 | khugepaged_do_scan(&hpage); | ||
2324 | #ifndef CONFIG_NUMA | ||
2325 | if (hpage) | ||
2326 | put_page(hpage); | ||
2327 | #endif | ||
2328 | try_to_freeze(); | ||
2329 | if (unlikely(kthread_should_stop())) | ||
2330 | break; | ||
2331 | if (khugepaged_has_work()) { | ||
2332 | if (!khugepaged_scan_sleep_millisecs) | ||
2333 | continue; | ||
2334 | wait_event_freezable_timeout(khugepaged_wait, false, | ||
2335 | msecs_to_jiffies(khugepaged_scan_sleep_millisecs)); | ||
2336 | } else if (khugepaged_enabled()) | ||
2337 | wait_event_freezable(khugepaged_wait, | ||
2338 | khugepaged_wait_event()); | ||
2339 | } | 2331 | } |
2332 | |||
2333 | if (khugepaged_enabled()) | ||
2334 | wait_event_freezable(khugepaged_wait, khugepaged_wait_event()); | ||
2340 | } | 2335 | } |
2341 | 2336 | ||
2342 | static int khugepaged(void *none) | 2337 | static int khugepaged(void *none) |
@@ -2346,20 +2341,9 @@ static int khugepaged(void *none) | |||
2346 | set_freezable(); | 2341 | set_freezable(); |
2347 | set_user_nice(current, 19); | 2342 | set_user_nice(current, 19); |
2348 | 2343 | ||
2349 | /* serialize with start_khugepaged() */ | 2344 | while (!kthread_should_stop()) { |
2350 | mutex_lock(&khugepaged_mutex); | 2345 | khugepaged_do_scan(); |
2351 | 2346 | khugepaged_wait_work(); | |
2352 | for (;;) { | ||
2353 | mutex_unlock(&khugepaged_mutex); | ||
2354 | VM_BUG_ON(khugepaged_thread != current); | ||
2355 | khugepaged_loop(); | ||
2356 | VM_BUG_ON(khugepaged_thread != current); | ||
2357 | |||
2358 | mutex_lock(&khugepaged_mutex); | ||
2359 | if (!khugepaged_enabled()) | ||
2360 | break; | ||
2361 | if (unlikely(kthread_should_stop())) | ||
2362 | break; | ||
2363 | } | 2347 | } |
2364 | 2348 | ||
2365 | spin_lock(&khugepaged_mm_lock); | 2349 | spin_lock(&khugepaged_mm_lock); |
@@ -2368,10 +2352,6 @@ static int khugepaged(void *none) | |||
2368 | if (mm_slot) | 2352 | if (mm_slot) |
2369 | collect_mm_slot(mm_slot); | 2353 | collect_mm_slot(mm_slot); |
2370 | spin_unlock(&khugepaged_mm_lock); | 2354 | spin_unlock(&khugepaged_mm_lock); |
2371 | |||
2372 | khugepaged_thread = NULL; | ||
2373 | mutex_unlock(&khugepaged_mutex); | ||
2374 | |||
2375 | return 0; | 2355 | return 0; |
2376 | } | 2356 | } |
2377 | 2357 | ||
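Two themes run through the huge_memory.c changes. First, every pmd-level update is now bracketed explicitly by mmu_notifier_invalidate_range_start()/end() on a range kept in local mmun_start/mmun_end variables, which is why the *_notify variants (pmdp_clear_flush_notify(), pmdp_splitting_flush_notify()) could be dropped in favour of the plain helpers. Second, the per-mm stash of preallocated PTE pages is now handled through pgtable_trans_huge_deposit()/pgtable_trans_huge_withdraw() rather than the deleted prepare_pmd_huge_pte()/get_pmd_huge_pte(). Judging by the code removed above, the generic deposit/withdraw pair is the same FIFO hung off mm->pmd_huge_pte; a sketch mirroring the deleted helpers (the in-tree generic versions may differ in detail, and architectures can provide their own):

void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable)
{
	assert_spin_locked(&mm->page_table_lock);

	/* FIFO */
	if (!mm->pmd_huge_pte)
		INIT_LIST_HEAD(&pgtable->lru);
	else
		list_add(&pgtable->lru, &mm->pmd_huge_pte->lru);
	mm->pmd_huge_pte = pgtable;
}

pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm)
{
	pgtable_t pgtable;

	assert_spin_locked(&mm->page_table_lock);

	/* FIFO */
	pgtable = mm->pmd_huge_pte;
	if (list_empty(&pgtable->lru))
		mm->pmd_huge_pte = NULL;
	else {
		mm->pmd_huge_pte = list_entry(pgtable->lru.next,
					      struct page, lru);
		list_del(&pgtable->lru);
	}
	return pgtable;
}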
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bc727122dd44..59a0059b39e2 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -30,7 +30,6 @@ | |||
30 | #include <linux/hugetlb.h> | 30 | #include <linux/hugetlb.h> |
31 | #include <linux/hugetlb_cgroup.h> | 31 | #include <linux/hugetlb_cgroup.h> |
32 | #include <linux/node.h> | 32 | #include <linux/node.h> |
33 | #include <linux/hugetlb_cgroup.h> | ||
34 | #include "internal.h" | 33 | #include "internal.h" |
35 | 34 | ||
36 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; | 35 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; |
@@ -637,6 +636,7 @@ static void free_huge_page(struct page *page) | |||
637 | h->surplus_huge_pages--; | 636 | h->surplus_huge_pages--; |
638 | h->surplus_huge_pages_node[nid]--; | 637 | h->surplus_huge_pages_node[nid]--; |
639 | } else { | 638 | } else { |
639 | arch_clear_hugepage_flags(page); | ||
640 | enqueue_huge_page(h, page); | 640 | enqueue_huge_page(h, page); |
641 | } | 641 | } |
642 | spin_unlock(&hugetlb_lock); | 642 | spin_unlock(&hugetlb_lock); |
@@ -671,6 +671,11 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order) | |||
671 | } | 671 | } |
672 | } | 672 | } |
673 | 673 | ||
674 | /* | ||
675 | * PageHuge() only returns true for hugetlbfs pages, but not for normal or | ||
676 | * transparent huge pages. See the PageTransHuge() documentation for more | ||
677 | * details. | ||
678 | */ | ||
674 | int PageHuge(struct page *page) | 679 | int PageHuge(struct page *page) |
675 | { | 680 | { |
676 | compound_page_dtor *dtor; | 681 | compound_page_dtor *dtor; |
@@ -2355,13 +2360,15 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
2355 | struct page *page; | 2360 | struct page *page; |
2356 | struct hstate *h = hstate_vma(vma); | 2361 | struct hstate *h = hstate_vma(vma); |
2357 | unsigned long sz = huge_page_size(h); | 2362 | unsigned long sz = huge_page_size(h); |
2363 | const unsigned long mmun_start = start; /* For mmu_notifiers */ | ||
2364 | const unsigned long mmun_end = end; /* For mmu_notifiers */ | ||
2358 | 2365 | ||
2359 | WARN_ON(!is_vm_hugetlb_page(vma)); | 2366 | WARN_ON(!is_vm_hugetlb_page(vma)); |
2360 | BUG_ON(start & ~huge_page_mask(h)); | 2367 | BUG_ON(start & ~huge_page_mask(h)); |
2361 | BUG_ON(end & ~huge_page_mask(h)); | 2368 | BUG_ON(end & ~huge_page_mask(h)); |
2362 | 2369 | ||
2363 | tlb_start_vma(tlb, vma); | 2370 | tlb_start_vma(tlb, vma); |
2364 | mmu_notifier_invalidate_range_start(mm, start, end); | 2371 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); |
2365 | again: | 2372 | again: |
2366 | spin_lock(&mm->page_table_lock); | 2373 | spin_lock(&mm->page_table_lock); |
2367 | for (address = start; address < end; address += sz) { | 2374 | for (address = start; address < end; address += sz) { |
@@ -2425,7 +2432,7 @@ again: | |||
2425 | if (address < end && !ref_page) | 2432 | if (address < end && !ref_page) |
2426 | goto again; | 2433 | goto again; |
2427 | } | 2434 | } |
2428 | mmu_notifier_invalidate_range_end(mm, start, end); | 2435 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
2429 | tlb_end_vma(tlb, vma); | 2436 | tlb_end_vma(tlb, vma); |
2430 | } | 2437 | } |
2431 | 2438 | ||
@@ -2473,7 +2480,6 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2473 | struct hstate *h = hstate_vma(vma); | 2480 | struct hstate *h = hstate_vma(vma); |
2474 | struct vm_area_struct *iter_vma; | 2481 | struct vm_area_struct *iter_vma; |
2475 | struct address_space *mapping; | 2482 | struct address_space *mapping; |
2476 | struct prio_tree_iter iter; | ||
2477 | pgoff_t pgoff; | 2483 | pgoff_t pgoff; |
2478 | 2484 | ||
2479 | /* | 2485 | /* |
@@ -2481,7 +2487,8 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2481 | * from page cache lookup which is in HPAGE_SIZE units. | 2487 | * from page cache lookup which is in HPAGE_SIZE units. |
2482 | */ | 2488 | */ |
2483 | address = address & huge_page_mask(h); | 2489 | address = address & huge_page_mask(h); |
2484 | pgoff = vma_hugecache_offset(h, vma, address); | 2490 | pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + |
2491 | vma->vm_pgoff; | ||
2485 | mapping = vma->vm_file->f_dentry->d_inode->i_mapping; | 2492 | mapping = vma->vm_file->f_dentry->d_inode->i_mapping; |
2486 | 2493 | ||
2487 | /* | 2494 | /* |
@@ -2490,7 +2497,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2490 | * __unmap_hugepage_range() is called as the lock is already held | 2497 | * __unmap_hugepage_range() is called as the lock is already held |
2491 | */ | 2498 | */ |
2492 | mutex_lock(&mapping->i_mmap_mutex); | 2499 | mutex_lock(&mapping->i_mmap_mutex); |
2493 | vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 2500 | vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) { |
2494 | /* Do not unmap the current VMA */ | 2501 | /* Do not unmap the current VMA */ |
2495 | if (iter_vma == vma) | 2502 | if (iter_vma == vma) |
2496 | continue; | 2503 | continue; |
@@ -2525,6 +2532,8 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2525 | struct page *old_page, *new_page; | 2532 | struct page *old_page, *new_page; |
2526 | int avoidcopy; | 2533 | int avoidcopy; |
2527 | int outside_reserve = 0; | 2534 | int outside_reserve = 0; |
2535 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
2536 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
2528 | 2537 | ||
2529 | old_page = pte_page(pte); | 2538 | old_page = pte_page(pte); |
2530 | 2539 | ||
@@ -2611,6 +2620,9 @@ retry_avoidcopy: | |||
2611 | pages_per_huge_page(h)); | 2620 | pages_per_huge_page(h)); |
2612 | __SetPageUptodate(new_page); | 2621 | __SetPageUptodate(new_page); |
2613 | 2622 | ||
2623 | mmun_start = address & huge_page_mask(h); | ||
2624 | mmun_end = mmun_start + huge_page_size(h); | ||
2625 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
2614 | /* | 2626 | /* |
2615 | * Retake the page_table_lock to check for racing updates | 2627 | * Retake the page_table_lock to check for racing updates |
2616 | * before the page tables are altered | 2628 | * before the page tables are altered |
@@ -2619,9 +2631,6 @@ retry_avoidcopy: | |||
2619 | ptep = huge_pte_offset(mm, address & huge_page_mask(h)); | 2631 | ptep = huge_pte_offset(mm, address & huge_page_mask(h)); |
2620 | if (likely(pte_same(huge_ptep_get(ptep), pte))) { | 2632 | if (likely(pte_same(huge_ptep_get(ptep), pte))) { |
2621 | /* Break COW */ | 2633 | /* Break COW */ |
2622 | mmu_notifier_invalidate_range_start(mm, | ||
2623 | address & huge_page_mask(h), | ||
2624 | (address & huge_page_mask(h)) + huge_page_size(h)); | ||
2625 | huge_ptep_clear_flush(vma, address, ptep); | 2634 | huge_ptep_clear_flush(vma, address, ptep); |
2626 | set_huge_pte_at(mm, address, ptep, | 2635 | set_huge_pte_at(mm, address, ptep, |
2627 | make_huge_pte(vma, new_page, 1)); | 2636 | make_huge_pte(vma, new_page, 1)); |
@@ -2629,10 +2638,11 @@ retry_avoidcopy: | |||
2629 | hugepage_add_new_anon_rmap(new_page, vma, address); | 2638 | hugepage_add_new_anon_rmap(new_page, vma, address); |
2630 | /* Make the old page be freed below */ | 2639 | /* Make the old page be freed below */ |
2631 | new_page = old_page; | 2640 | new_page = old_page; |
2632 | mmu_notifier_invalidate_range_end(mm, | ||
2633 | address & huge_page_mask(h), | ||
2634 | (address & huge_page_mask(h)) + huge_page_size(h)); | ||
2635 | } | 2641 | } |
2642 | spin_unlock(&mm->page_table_lock); | ||
2643 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
2644 | /* Caller expects lock to be held */ | ||
2645 | spin_lock(&mm->page_table_lock); | ||
2636 | page_cache_release(new_page); | 2646 | page_cache_release(new_page); |
2637 | page_cache_release(old_page); | 2647 | page_cache_release(old_page); |
2638 | return 0; | 2648 | return 0; |
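The hugetlb_cow() hunks follow the same notifier discipline with one extra wrinkle: the invalidate range is computed and invalidate_range_start() issued before the page-table lock is retaken, and invalidate_range_end() is only issued after that lock has been dropped, after which the lock is re-acquired because the caller expects it held on return. In outline (a sketch of the control flow above, not a complete function):

	mmun_start = address & huge_page_mask(h);
	mmun_end   = mmun_start + huge_page_size(h);
	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);

	spin_lock(&mm->page_table_lock);
	/* re-check the pte, then break COW:
	 * huge_ptep_clear_flush() + set_huge_pte_at() + rmap update */
	spin_unlock(&mm->page_table_lock);

	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);

	/* caller expects the page-table lock to be held on return */
	spin_lock(&mm->page_table_lock);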
diff --git a/mm/internal.h b/mm/internal.h index b8c91b342e24..a4fa284f6bc2 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -118,26 +118,27 @@ struct compact_control { | |||
118 | unsigned long nr_freepages; /* Number of isolated free pages */ | 118 | unsigned long nr_freepages; /* Number of isolated free pages */ |
119 | unsigned long nr_migratepages; /* Number of pages to migrate */ | 119 | unsigned long nr_migratepages; /* Number of pages to migrate */ |
120 | unsigned long free_pfn; /* isolate_freepages search base */ | 120 | unsigned long free_pfn; /* isolate_freepages search base */ |
121 | unsigned long start_free_pfn; /* where we started the search */ | ||
122 | unsigned long migrate_pfn; /* isolate_migratepages search base */ | 121 | unsigned long migrate_pfn; /* isolate_migratepages search base */ |
123 | bool sync; /* Synchronous migration */ | 122 | bool sync; /* Synchronous migration */ |
124 | bool wrapped; /* Order > 0 compactions are | 123 | bool ignore_skip_hint; /* Scan blocks even if marked skip */ |
125 | incremental, once free_pfn | 124 | bool finished_update_free; /* True when the zone cached pfns are |
126 | and migrate_pfn meet, we restart | 125 | * no longer being updated |
127 | from the top of the zone; | 126 | */ |
128 | remember we wrapped around. */ | 127 | bool finished_update_migrate; |
129 | 128 | ||
130 | int order; /* order a direct compactor needs */ | 129 | int order; /* order a direct compactor needs */ |
131 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ | 130 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ |
132 | struct zone *zone; | 131 | struct zone *zone; |
133 | bool *contended; /* True if a lock was contended */ | 132 | bool contended; /* True if a lock was contended */ |
133 | struct page **page; /* Page captured of requested size */ | ||
134 | }; | 134 | }; |
135 | 135 | ||
136 | unsigned long | 136 | unsigned long |
137 | isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn); | 137 | isolate_freepages_range(struct compact_control *cc, |
138 | unsigned long start_pfn, unsigned long end_pfn); | ||
138 | unsigned long | 139 | unsigned long |
139 | isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | 140 | isolate_migratepages_range(struct zone *zone, struct compact_control *cc, |
140 | unsigned long low_pfn, unsigned long end_pfn); | 141 | unsigned long low_pfn, unsigned long end_pfn, bool unevictable); |
141 | 142 | ||
142 | #endif | 143 | #endif |
143 | 144 | ||
@@ -167,9 +168,8 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) | |||
167 | } | 168 | } |
168 | 169 | ||
169 | /* | 170 | /* |
170 | * Called only in fault path via page_evictable() for a new page | 171 | * Called only in fault path, to determine if a new page is being |
171 | * to determine if it's being mapped into a LOCKED vma. | 172 | * mapped into a LOCKED vma. If it is, mark page as mlocked. |
172 | * If so, mark page as mlocked. | ||
173 | */ | 173 | */ |
174 | static inline int mlocked_vma_newpage(struct vm_area_struct *vma, | 174 | static inline int mlocked_vma_newpage(struct vm_area_struct *vma, |
175 | struct page *page) | 175 | struct page *page) |
@@ -180,7 +180,8 @@ static inline int mlocked_vma_newpage(struct vm_area_struct *vma, | |||
180 | return 0; | 180 | return 0; |
181 | 181 | ||
182 | if (!TestSetPageMlocked(page)) { | 182 | if (!TestSetPageMlocked(page)) { |
183 | inc_zone_page_state(page, NR_MLOCK); | 183 | mod_zone_page_state(page_zone(page), NR_MLOCK, |
184 | hpage_nr_pages(page)); | ||
184 | count_vm_event(UNEVICTABLE_PGMLOCKED); | 185 | count_vm_event(UNEVICTABLE_PGMLOCKED); |
185 | } | 186 | } |
186 | return 1; | 187 | return 1; |
@@ -201,12 +202,7 @@ extern void munlock_vma_page(struct page *page); | |||
201 | * If called for a page that is still mapped by mlocked vmas, all we do | 202 | * If called for a page that is still mapped by mlocked vmas, all we do |
202 | * is revert to lazy LRU behaviour -- semantics are not broken. | 203 | * is revert to lazy LRU behaviour -- semantics are not broken. |
203 | */ | 204 | */ |
204 | extern void __clear_page_mlock(struct page *page); | 205 | extern void clear_page_mlock(struct page *page); |
205 | static inline void clear_page_mlock(struct page *page) | ||
206 | { | ||
207 | if (unlikely(TestClearPageMlocked(page))) | ||
208 | __clear_page_mlock(page); | ||
209 | } | ||
210 | 206 | ||
211 | /* | 207 | /* |
212 | * mlock_migrate_page - called only from migrate_page_copy() to | 208 | * mlock_migrate_page - called only from migrate_page_copy() to |
@@ -340,7 +336,6 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, | |||
340 | #define ZONE_RECLAIM_FULL -1 | 336 | #define ZONE_RECLAIM_FULL -1 |
341 | #define ZONE_RECLAIM_SOME 0 | 337 | #define ZONE_RECLAIM_SOME 0 |
342 | #define ZONE_RECLAIM_SUCCESS 1 | 338 | #define ZONE_RECLAIM_SUCCESS 1 |
343 | #endif | ||
344 | 339 | ||
345 | extern int hwpoison_filter(struct page *p); | 340 | extern int hwpoison_filter(struct page *p); |
346 | 341 | ||
@@ -356,3 +351,20 @@ extern unsigned long vm_mmap_pgoff(struct file *, unsigned long, | |||
356 | unsigned long, unsigned long); | 351 | unsigned long, unsigned long); |
357 | 352 | ||
358 | extern void set_pageblock_order(void); | 353 | extern void set_pageblock_order(void); |
354 | unsigned long reclaim_clean_pages_from_list(struct zone *zone, | ||
355 | struct list_head *page_list); | ||
356 | /* The ALLOC_WMARK bits are used as an index to zone->watermark */ | ||
357 | #define ALLOC_WMARK_MIN WMARK_MIN | ||
358 | #define ALLOC_WMARK_LOW WMARK_LOW | ||
359 | #define ALLOC_WMARK_HIGH WMARK_HIGH | ||
360 | #define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */ | ||
361 | |||
362 | /* Mask to get the watermark bits */ | ||
363 | #define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1) | ||
364 | |||
365 | #define ALLOC_HARDER 0x10 /* try to alloc harder */ | ||
366 | #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ | ||
367 | #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ | ||
368 | #define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ | ||
369 | |||
370 | #endif /* __MM_INTERNAL_H */ | ||
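The block appended to mm/internal.h pulls the page allocator's ALLOC_* flags out of mm/page_alloc.c, presumably so the compaction and CMA work elsewhere in this merge can share them (note the new ALLOC_CMA bit). The low bits double as an index into zone->watermark[], which is exactly what ALLOC_WMARK_MASK recovers; a small sketch of the usual lookup (the helper name is illustrative, not from this patch):

/* Sketch: pick the watermark selected by the ALLOC_WMARK_* bits. */
static unsigned long wmark_for_alloc(struct zone *zone, int alloc_flags)
{
	return zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
}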
diff --git a/mm/interval_tree.c b/mm/interval_tree.c new file mode 100644 index 000000000000..4a5822a586e6 --- /dev/null +++ b/mm/interval_tree.c | |||
@@ -0,0 +1,112 @@ | |||
1 | /* | ||
2 | * mm/interval_tree.c - interval tree for mapping->i_mmap | ||
3 | * | ||
4 | * Copyright (C) 2012, Michel Lespinasse <walken@google.com> | ||
5 | * | ||
6 | * This file is released under the GPL v2. | ||
7 | */ | ||
8 | |||
9 | #include <linux/mm.h> | ||
10 | #include <linux/fs.h> | ||
11 | #include <linux/rmap.h> | ||
12 | #include <linux/interval_tree_generic.h> | ||
13 | |||
14 | static inline unsigned long vma_start_pgoff(struct vm_area_struct *v) | ||
15 | { | ||
16 | return v->vm_pgoff; | ||
17 | } | ||
18 | |||
19 | static inline unsigned long vma_last_pgoff(struct vm_area_struct *v) | ||
20 | { | ||
21 | return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1; | ||
22 | } | ||
23 | |||
24 | INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.linear.rb, | ||
25 | unsigned long, shared.linear.rb_subtree_last, | ||
26 | vma_start_pgoff, vma_last_pgoff,, vma_interval_tree) | ||
27 | |||
28 | /* Insert node immediately after prev in the interval tree */ | ||
29 | void vma_interval_tree_insert_after(struct vm_area_struct *node, | ||
30 | struct vm_area_struct *prev, | ||
31 | struct rb_root *root) | ||
32 | { | ||
33 | struct rb_node **link; | ||
34 | struct vm_area_struct *parent; | ||
35 | unsigned long last = vma_last_pgoff(node); | ||
36 | |||
37 | VM_BUG_ON(vma_start_pgoff(node) != vma_start_pgoff(prev)); | ||
38 | |||
39 | if (!prev->shared.linear.rb.rb_right) { | ||
40 | parent = prev; | ||
41 | link = &prev->shared.linear.rb.rb_right; | ||
42 | } else { | ||
43 | parent = rb_entry(prev->shared.linear.rb.rb_right, | ||
44 | struct vm_area_struct, shared.linear.rb); | ||
45 | if (parent->shared.linear.rb_subtree_last < last) | ||
46 | parent->shared.linear.rb_subtree_last = last; | ||
47 | while (parent->shared.linear.rb.rb_left) { | ||
48 | parent = rb_entry(parent->shared.linear.rb.rb_left, | ||
49 | struct vm_area_struct, shared.linear.rb); | ||
50 | if (parent->shared.linear.rb_subtree_last < last) | ||
51 | parent->shared.linear.rb_subtree_last = last; | ||
52 | } | ||
53 | link = &parent->shared.linear.rb.rb_left; | ||
54 | } | ||
55 | |||
56 | node->shared.linear.rb_subtree_last = last; | ||
57 | rb_link_node(&node->shared.linear.rb, &parent->shared.linear.rb, link); | ||
58 | rb_insert_augmented(&node->shared.linear.rb, root, | ||
59 | &vma_interval_tree_augment); | ||
60 | } | ||
61 | |||
62 | static inline unsigned long avc_start_pgoff(struct anon_vma_chain *avc) | ||
63 | { | ||
64 | return vma_start_pgoff(avc->vma); | ||
65 | } | ||
66 | |||
67 | static inline unsigned long avc_last_pgoff(struct anon_vma_chain *avc) | ||
68 | { | ||
69 | return vma_last_pgoff(avc->vma); | ||
70 | } | ||
71 | |||
72 | INTERVAL_TREE_DEFINE(struct anon_vma_chain, rb, unsigned long, rb_subtree_last, | ||
73 | avc_start_pgoff, avc_last_pgoff, | ||
74 | static inline, __anon_vma_interval_tree) | ||
75 | |||
76 | void anon_vma_interval_tree_insert(struct anon_vma_chain *node, | ||
77 | struct rb_root *root) | ||
78 | { | ||
79 | #ifdef CONFIG_DEBUG_VM_RB | ||
80 | node->cached_vma_start = avc_start_pgoff(node); | ||
81 | node->cached_vma_last = avc_last_pgoff(node); | ||
82 | #endif | ||
83 | __anon_vma_interval_tree_insert(node, root); | ||
84 | } | ||
85 | |||
86 | void anon_vma_interval_tree_remove(struct anon_vma_chain *node, | ||
87 | struct rb_root *root) | ||
88 | { | ||
89 | __anon_vma_interval_tree_remove(node, root); | ||
90 | } | ||
91 | |||
92 | struct anon_vma_chain * | ||
93 | anon_vma_interval_tree_iter_first(struct rb_root *root, | ||
94 | unsigned long first, unsigned long last) | ||
95 | { | ||
96 | return __anon_vma_interval_tree_iter_first(root, first, last); | ||
97 | } | ||
98 | |||
99 | struct anon_vma_chain * | ||
100 | anon_vma_interval_tree_iter_next(struct anon_vma_chain *node, | ||
101 | unsigned long first, unsigned long last) | ||
102 | { | ||
103 | return __anon_vma_interval_tree_iter_next(node, first, last); | ||
104 | } | ||
105 | |||
106 | #ifdef CONFIG_DEBUG_VM_RB | ||
107 | void anon_vma_interval_tree_verify(struct anon_vma_chain *node) | ||
108 | { | ||
109 | WARN_ON_ONCE(node->cached_vma_start != avc_start_pgoff(node)); | ||
110 | WARN_ON_ONCE(node->cached_vma_last != avc_last_pgoff(node)); | ||
111 | } | ||
112 | #endif | ||
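For reference, the two callbacks above define a closed interval of file page offsets per VMA: [vm_pgoff, vm_pgoff + pages - 1]. A small stand-alone sketch (fake struct, assumed 4K pages, not the kernel types) of that interval math and the overlap test the generated iterators rely on:

```c
#include <stdio.h>

#define PAGE_SHIFT 12	/* assume 4K pages for the example */

/* Minimal stand-in for the fields the interval tree actually uses. */
struct fake_vma {
	unsigned long vm_start, vm_end;	/* byte range, vm_end exclusive */
	unsigned long vm_pgoff;		/* file offset in pages */
};

static unsigned long vma_start_pgoff(const struct fake_vma *v)
{
	return v->vm_pgoff;
}

static unsigned long vma_last_pgoff(const struct fake_vma *v)
{
	/* last page covered, inclusive -- hence the "- 1" */
	return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1;
}

/* The condition under which a query [first, last] reports this vma. */
static int vma_overlaps(const struct fake_vma *v,
			unsigned long first, unsigned long last)
{
	return vma_start_pgoff(v) <= last && vma_last_pgoff(v) >= first;
}

int main(void)
{
	/* 16 pages mapped at page 100 of the file. */
	struct fake_vma v = { .vm_start = 0x10000, .vm_end = 0x20000, .vm_pgoff = 100 };

	printf("covers file pages [%lu, %lu]\n",
	       vma_start_pgoff(&v), vma_last_pgoff(&v));
	printf("overlaps [110, 200]? %d\n", vma_overlaps(&v, 110, 200));	/* 1 */
	printf("overlaps [116, 200]? %d\n", vma_overlaps(&v, 116, 200));	/* 0 */
	return 0;
}
```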
diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 0de83b4541e9..a217cc544060 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c | |||
@@ -29,7 +29,7 @@ | |||
29 | * - kmemleak_lock (rwlock): protects the object_list modifications and | 29 | * - kmemleak_lock (rwlock): protects the object_list modifications and |
30 | * accesses to the object_tree_root. The object_list is the main list | 30 | * accesses to the object_tree_root. The object_list is the main list |
31 | * holding the metadata (struct kmemleak_object) for the allocated memory | 31 | * holding the metadata (struct kmemleak_object) for the allocated memory |
32 | * blocks. The object_tree_root is a priority search tree used to look-up | 32 | * blocks. The object_tree_root is a red black tree used to look-up |
33 | * metadata based on a pointer to the corresponding memory block. The | 33 | * metadata based on a pointer to the corresponding memory block. The |
34 | * kmemleak_object structures are added to the object_list and | 34 | * kmemleak_object structures are added to the object_list and |
35 | * object_tree_root in the create_object() function called from the | 35 | * object_tree_root in the create_object() function called from the |
@@ -71,7 +71,7 @@ | |||
71 | #include <linux/delay.h> | 71 | #include <linux/delay.h> |
72 | #include <linux/export.h> | 72 | #include <linux/export.h> |
73 | #include <linux/kthread.h> | 73 | #include <linux/kthread.h> |
74 | #include <linux/prio_tree.h> | 74 | #include <linux/rbtree.h> |
75 | #include <linux/fs.h> | 75 | #include <linux/fs.h> |
76 | #include <linux/debugfs.h> | 76 | #include <linux/debugfs.h> |
77 | #include <linux/seq_file.h> | 77 | #include <linux/seq_file.h> |
@@ -132,7 +132,7 @@ struct kmemleak_scan_area { | |||
132 | * Structure holding the metadata for each allocated memory block. | 132 | * Structure holding the metadata for each allocated memory block. |
133 | * Modifications to such objects should be made while holding the | 133 | * Modifications to such objects should be made while holding the |
134 | * object->lock. Insertions or deletions from object_list, gray_list or | 134 | * object->lock. Insertions or deletions from object_list, gray_list or |
135 | * tree_node are already protected by the corresponding locks or mutex (see | 135 | * rb_node are already protected by the corresponding locks or mutex (see |
136 | * the notes on locking above). These objects are reference-counted | 136 | * the notes on locking above). These objects are reference-counted |
137 | * (use_count) and freed using the RCU mechanism. | 137 | * (use_count) and freed using the RCU mechanism. |
138 | */ | 138 | */ |
@@ -141,7 +141,7 @@ struct kmemleak_object { | |||
141 | unsigned long flags; /* object status flags */ | 141 | unsigned long flags; /* object status flags */ |
142 | struct list_head object_list; | 142 | struct list_head object_list; |
143 | struct list_head gray_list; | 143 | struct list_head gray_list; |
144 | struct prio_tree_node tree_node; | 144 | struct rb_node rb_node; |
145 | struct rcu_head rcu; /* object_list lockless traversal */ | 145 | struct rcu_head rcu; /* object_list lockless traversal */ |
146 | /* object usage count; object freed when use_count == 0 */ | 146 | /* object usage count; object freed when use_count == 0 */ |
147 | atomic_t use_count; | 147 | atomic_t use_count; |
@@ -182,9 +182,9 @@ struct kmemleak_object { | |||
182 | static LIST_HEAD(object_list); | 182 | static LIST_HEAD(object_list); |
183 | /* the list of gray-colored objects (see color_gray comment below) */ | 183 | /* the list of gray-colored objects (see color_gray comment below) */ |
184 | static LIST_HEAD(gray_list); | 184 | static LIST_HEAD(gray_list); |
185 | /* prio search tree for object boundaries */ | 185 | /* search tree for object boundaries */ |
186 | static struct prio_tree_root object_tree_root; | 186 | static struct rb_root object_tree_root = RB_ROOT; |
187 | /* rw_lock protecting the access to object_list and prio_tree_root */ | 187 | /* rw_lock protecting the access to object_list and object_tree_root */ |
188 | static DEFINE_RWLOCK(kmemleak_lock); | 188 | static DEFINE_RWLOCK(kmemleak_lock); |
189 | 189 | ||
190 | /* allocation caches for kmemleak internal data */ | 190 | /* allocation caches for kmemleak internal data */ |
@@ -380,7 +380,7 @@ static void dump_object_info(struct kmemleak_object *object) | |||
380 | trace.entries = object->trace; | 380 | trace.entries = object->trace; |
381 | 381 | ||
382 | pr_notice("Object 0x%08lx (size %zu):\n", | 382 | pr_notice("Object 0x%08lx (size %zu):\n", |
383 | object->tree_node.start, object->size); | 383 | object->pointer, object->size); |
384 | pr_notice(" comm \"%s\", pid %d, jiffies %lu\n", | 384 | pr_notice(" comm \"%s\", pid %d, jiffies %lu\n", |
385 | object->comm, object->pid, object->jiffies); | 385 | object->comm, object->pid, object->jiffies); |
386 | pr_notice(" min_count = %d\n", object->min_count); | 386 | pr_notice(" min_count = %d\n", object->min_count); |
@@ -392,32 +392,32 @@ static void dump_object_info(struct kmemleak_object *object) | |||
392 | } | 392 | } |
393 | 393 | ||
394 | /* | 394 | /* |
395 | * Look-up a memory block metadata (kmemleak_object) in the priority search | 395 | * Look-up a memory block metadata (kmemleak_object) in the object search |
396 | * tree based on a pointer value. If alias is 0, only values pointing to the | 396 | * tree based on a pointer value. If alias is 0, only values pointing to the |
397 | * beginning of the memory block are allowed. The kmemleak_lock must be held | 397 | * beginning of the memory block are allowed. The kmemleak_lock must be held |
398 | * when calling this function. | 398 | * when calling this function. |
399 | */ | 399 | */ |
400 | static struct kmemleak_object *lookup_object(unsigned long ptr, int alias) | 400 | static struct kmemleak_object *lookup_object(unsigned long ptr, int alias) |
401 | { | 401 | { |
402 | struct prio_tree_node *node; | 402 | struct rb_node *rb = object_tree_root.rb_node; |
403 | struct prio_tree_iter iter; | 403 | |
404 | struct kmemleak_object *object; | 404 | while (rb) { |
405 | 405 | struct kmemleak_object *object = | |
406 | prio_tree_iter_init(&iter, &object_tree_root, ptr, ptr); | 406 | rb_entry(rb, struct kmemleak_object, rb_node); |
407 | node = prio_tree_next(&iter); | 407 | if (ptr < object->pointer) |
408 | if (node) { | 408 | rb = object->rb_node.rb_left; |
409 | object = prio_tree_entry(node, struct kmemleak_object, | 409 | else if (object->pointer + object->size <= ptr) |
410 | tree_node); | 410 | rb = object->rb_node.rb_right; |
411 | if (!alias && object->pointer != ptr) { | 411 | else if (object->pointer == ptr || alias) |
412 | return object; | ||
413 | else { | ||
412 | kmemleak_warn("Found object by alias at 0x%08lx\n", | 414 | kmemleak_warn("Found object by alias at 0x%08lx\n", |
413 | ptr); | 415 | ptr); |
414 | dump_object_info(object); | 416 | dump_object_info(object); |
415 | object = NULL; | 417 | break; |
416 | } | 418 | } |
417 | } else | 419 | } |
418 | object = NULL; | 420 | return NULL; |
419 | |||
420 | return object; | ||
421 | } | 421 | } |
422 | 422 | ||
423 | /* | 423 | /* |
@@ -471,7 +471,7 @@ static void put_object(struct kmemleak_object *object) | |||
471 | } | 471 | } |
472 | 472 | ||
473 | /* | 473 | /* |
474 | * Look up an object in the prio search tree and increase its use_count. | 474 | * Look up an object in the object search tree and increase its use_count. |
475 | */ | 475 | */ |
476 | static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias) | 476 | static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias) |
477 | { | 477 | { |
@@ -516,8 +516,8 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, | |||
516 | int min_count, gfp_t gfp) | 516 | int min_count, gfp_t gfp) |
517 | { | 517 | { |
518 | unsigned long flags; | 518 | unsigned long flags; |
519 | struct kmemleak_object *object; | 519 | struct kmemleak_object *object, *parent; |
520 | struct prio_tree_node *node; | 520 | struct rb_node **link, *rb_parent; |
521 | 521 | ||
522 | object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp)); | 522 | object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp)); |
523 | if (!object) { | 523 | if (!object) { |
@@ -560,31 +560,34 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, | |||
560 | /* kernel backtrace */ | 560 | /* kernel backtrace */ |
561 | object->trace_len = __save_stack_trace(object->trace); | 561 | object->trace_len = __save_stack_trace(object->trace); |
562 | 562 | ||
563 | INIT_PRIO_TREE_NODE(&object->tree_node); | ||
564 | object->tree_node.start = ptr; | ||
565 | object->tree_node.last = ptr + size - 1; | ||
566 | |||
567 | write_lock_irqsave(&kmemleak_lock, flags); | 563 | write_lock_irqsave(&kmemleak_lock, flags); |
568 | 564 | ||
569 | min_addr = min(min_addr, ptr); | 565 | min_addr = min(min_addr, ptr); |
570 | max_addr = max(max_addr, ptr + size); | 566 | max_addr = max(max_addr, ptr + size); |
571 | node = prio_tree_insert(&object_tree_root, &object->tree_node); | 567 | link = &object_tree_root.rb_node; |
572 | /* | 568 | rb_parent = NULL; |
573 | * The code calling the kernel does not yet have the pointer to the | 569 | while (*link) { |
574 | * memory block to be able to free it. However, we still hold the | 570 | rb_parent = *link; |
575 | * kmemleak_lock here in case parts of the kernel started freeing | 571 | parent = rb_entry(rb_parent, struct kmemleak_object, rb_node); |
576 | * random memory blocks. | 572 | if (ptr + size <= parent->pointer) |
577 | */ | 573 | link = &parent->rb_node.rb_left; |
578 | if (node != &object->tree_node) { | 574 | else if (parent->pointer + parent->size <= ptr) |
579 | kmemleak_stop("Cannot insert 0x%lx into the object search tree " | 575 | link = &parent->rb_node.rb_right; |
580 | "(already existing)\n", ptr); | 576 | else { |
581 | object = lookup_object(ptr, 1); | 577 | kmemleak_stop("Cannot insert 0x%lx into the object " |
582 | spin_lock(&object->lock); | 578 | "search tree (overlaps existing)\n", |
583 | dump_object_info(object); | 579 | ptr); |
584 | spin_unlock(&object->lock); | 580 | kmem_cache_free(object_cache, object); |
585 | 581 | object = parent; | |
586 | goto out; | 582 | spin_lock(&object->lock); |
583 | dump_object_info(object); | ||
584 | spin_unlock(&object->lock); | ||
585 | goto out; | ||
586 | } | ||
587 | } | 587 | } |
588 | rb_link_node(&object->rb_node, rb_parent, link); | ||
589 | rb_insert_color(&object->rb_node, &object_tree_root); | ||
590 | |||
588 | list_add_tail_rcu(&object->object_list, &object_list); | 591 | list_add_tail_rcu(&object->object_list, &object_list); |
589 | out: | 592 | out: |
590 | write_unlock_irqrestore(&kmemleak_lock, flags); | 593 | write_unlock_irqrestore(&kmemleak_lock, flags); |
@@ -600,7 +603,7 @@ static void __delete_object(struct kmemleak_object *object) | |||
600 | unsigned long flags; | 603 | unsigned long flags; |
601 | 604 | ||
602 | write_lock_irqsave(&kmemleak_lock, flags); | 605 | write_lock_irqsave(&kmemleak_lock, flags); |
603 | prio_tree_remove(&object_tree_root, &object->tree_node); | 606 | rb_erase(&object->rb_node, &object_tree_root); |
604 | list_del_rcu(&object->object_list); | 607 | list_del_rcu(&object->object_list); |
605 | write_unlock_irqrestore(&kmemleak_lock, flags); | 608 | write_unlock_irqrestore(&kmemleak_lock, flags); |
606 | 609 | ||
@@ -1766,7 +1769,6 @@ void __init kmemleak_init(void) | |||
1766 | 1769 | ||
1767 | object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE); | 1770 | object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE); |
1768 | scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE); | 1771 | scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE); |
1769 | INIT_PRIO_TREE_ROOT(&object_tree_root); | ||
1770 | 1772 | ||
1771 | if (crt_early_log >= ARRAY_SIZE(early_log)) | 1773 | if (crt_early_log >= ARRAY_SIZE(early_log)) |
1772 | pr_warning("Early log buffer exceeded (%d), please increase " | 1774 | pr_warning("Early log buffer exceeded (%d), please increase " |
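The rewritten lookup_object() above descends a tree keyed by the half-open range [pointer, pointer + size). A stand-alone sketch of the same descend-by-range logic over a plain, unbalanced binary tree (the red-black balancing and the alias warning are omitted; this is an illustration, not the kernel code):

```c
#include <stdio.h>
#include <stddef.h>

/* Toy stand-in for struct kmemleak_object: a node covering
 * the half-open address range [pointer, pointer + size). */
struct obj {
	unsigned long pointer;
	size_t size;
	struct obj *left, *right;
};

static struct obj *lookup(struct obj *node, unsigned long ptr, int alias)
{
	while (node) {
		if (ptr < node->pointer)
			node = node->left;
		else if (node->pointer + node->size <= ptr)
			node = node->right;
		else if (node->pointer == ptr || alias)
			return node;	/* exact hit, or alias lookups allowed */
		else
			return NULL;	/* inside a block but not at its start */
	}
	return NULL;
}

int main(void)
{
	struct obj a = { 0x1000, 0x80, NULL, NULL };
	struct obj c = { 0x3000, 0x40, NULL, NULL };
	struct obj b = { 0x2000, 0x100, &a, &c };

	printf("0x1000           -> %p\n", (void *)lookup(&b, 0x1000, 0));	/* &a */
	printf("0x1010 (alias=0) -> %p\n", (void *)lookup(&b, 0x1010, 0));	/* NULL */
	printf("0x1010 (alias=1) -> %p\n", (void *)lookup(&b, 0x1010, 1));	/* &a */
	return 0;
}
```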
diff --git a/mm/ksm.c b/mm/ksm.c --- a/mm/ksm.c +++ b/mm/ksm.c | |||
@@ -709,15 +709,22 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, | |||
709 | spinlock_t *ptl; | 709 | spinlock_t *ptl; |
710 | int swapped; | 710 | int swapped; |
711 | int err = -EFAULT; | 711 | int err = -EFAULT; |
712 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
713 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
712 | 714 | ||
713 | addr = page_address_in_vma(page, vma); | 715 | addr = page_address_in_vma(page, vma); |
714 | if (addr == -EFAULT) | 716 | if (addr == -EFAULT) |
715 | goto out; | 717 | goto out; |
716 | 718 | ||
717 | BUG_ON(PageTransCompound(page)); | 719 | BUG_ON(PageTransCompound(page)); |
720 | |||
721 | mmun_start = addr; | ||
722 | mmun_end = addr + PAGE_SIZE; | ||
723 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
724 | |||
718 | ptep = page_check_address(page, mm, addr, &ptl, 0); | 725 | ptep = page_check_address(page, mm, addr, &ptl, 0); |
719 | if (!ptep) | 726 | if (!ptep) |
720 | goto out; | 727 | goto out_mn; |
721 | 728 | ||
722 | if (pte_write(*ptep) || pte_dirty(*ptep)) { | 729 | if (pte_write(*ptep) || pte_dirty(*ptep)) { |
723 | pte_t entry; | 730 | pte_t entry; |
@@ -752,6 +759,8 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, | |||
752 | 759 | ||
753 | out_unlock: | 760 | out_unlock: |
754 | pte_unmap_unlock(ptep, ptl); | 761 | pte_unmap_unlock(ptep, ptl); |
762 | out_mn: | ||
763 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
755 | out: | 764 | out: |
756 | return err; | 765 | return err; |
757 | } | 766 | } |
@@ -776,6 +785,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, | |||
776 | spinlock_t *ptl; | 785 | spinlock_t *ptl; |
777 | unsigned long addr; | 786 | unsigned long addr; |
778 | int err = -EFAULT; | 787 | int err = -EFAULT; |
788 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
789 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
779 | 790 | ||
780 | addr = page_address_in_vma(page, vma); | 791 | addr = page_address_in_vma(page, vma); |
781 | if (addr == -EFAULT) | 792 | if (addr == -EFAULT) |
@@ -794,10 +805,14 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, | |||
794 | if (!pmd_present(*pmd)) | 805 | if (!pmd_present(*pmd)) |
795 | goto out; | 806 | goto out; |
796 | 807 | ||
808 | mmun_start = addr; | ||
809 | mmun_end = addr + PAGE_SIZE; | ||
810 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
811 | |||
797 | ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); | 812 | ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); |
798 | if (!pte_same(*ptep, orig_pte)) { | 813 | if (!pte_same(*ptep, orig_pte)) { |
799 | pte_unmap_unlock(ptep, ptl); | 814 | pte_unmap_unlock(ptep, ptl); |
800 | goto out; | 815 | goto out_mn; |
801 | } | 816 | } |
802 | 817 | ||
803 | get_page(kpage); | 818 | get_page(kpage); |
@@ -814,6 +829,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, | |||
814 | 829 | ||
815 | pte_unmap_unlock(ptep, ptl); | 830 | pte_unmap_unlock(ptep, ptl); |
816 | err = 0; | 831 | err = 0; |
832 | out_mn: | ||
833 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
817 | out: | 834 | out: |
818 | return err; | 835 | return err; |
819 | } | 836 | } |
@@ -1469,10 +1486,14 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, | |||
1469 | */ | 1486 | */ |
1470 | if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | | 1487 | if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | |
1471 | VM_PFNMAP | VM_IO | VM_DONTEXPAND | | 1488 | VM_PFNMAP | VM_IO | VM_DONTEXPAND | |
1472 | VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | | 1489 | VM_HUGETLB | VM_NONLINEAR | VM_MIXEDMAP)) |
1473 | VM_NONLINEAR | VM_MIXEDMAP | VM_SAO)) | ||
1474 | return 0; /* just ignore the advice */ | 1490 | return 0; /* just ignore the advice */ |
1475 | 1491 | ||
1492 | #ifdef VM_SAO | ||
1493 | if (*vm_flags & VM_SAO) | ||
1494 | return 0; | ||
1495 | #endif | ||
1496 | |||
1476 | if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { | 1497 | if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { |
1477 | err = __ksm_enter(mm); | 1498 | err = __ksm_enter(mm); |
1478 | if (err) | 1499 | if (err) |
@@ -1582,7 +1603,7 @@ struct page *ksm_does_need_to_copy(struct page *page, | |||
1582 | SetPageSwapBacked(new_page); | 1603 | SetPageSwapBacked(new_page); |
1583 | __set_page_locked(new_page); | 1604 | __set_page_locked(new_page); |
1584 | 1605 | ||
1585 | if (page_evictable(new_page, vma)) | 1606 | if (!mlocked_vma_newpage(vma, new_page)) |
1586 | lru_cache_add_lru(new_page, LRU_ACTIVE_ANON); | 1607 | lru_cache_add_lru(new_page, LRU_ACTIVE_ANON); |
1587 | else | 1608 | else |
1588 | add_page_to_unevictable_list(new_page); | 1609 | add_page_to_unevictable_list(new_page); |
@@ -1614,7 +1635,8 @@ again: | |||
1614 | struct vm_area_struct *vma; | 1635 | struct vm_area_struct *vma; |
1615 | 1636 | ||
1616 | anon_vma_lock(anon_vma); | 1637 | anon_vma_lock(anon_vma); |
1617 | list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { | 1638 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, |
1639 | 0, ULONG_MAX) { | ||
1618 | vma = vmac->vma; | 1640 | vma = vmac->vma; |
1619 | if (rmap_item->address < vma->vm_start || | 1641 | if (rmap_item->address < vma->vm_start || |
1620 | rmap_item->address >= vma->vm_end) | 1642 | rmap_item->address >= vma->vm_end) |
@@ -1667,7 +1689,8 @@ again: | |||
1667 | struct vm_area_struct *vma; | 1689 | struct vm_area_struct *vma; |
1668 | 1690 | ||
1669 | anon_vma_lock(anon_vma); | 1691 | anon_vma_lock(anon_vma); |
1670 | list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { | 1692 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, |
1693 | 0, ULONG_MAX) { | ||
1671 | vma = vmac->vma; | 1694 | vma = vmac->vma; |
1672 | if (rmap_item->address < vma->vm_start || | 1695 | if (rmap_item->address < vma->vm_start || |
1673 | rmap_item->address >= vma->vm_end) | 1696 | rmap_item->address >= vma->vm_end) |
@@ -1719,7 +1742,8 @@ again: | |||
1719 | struct vm_area_struct *vma; | 1742 | struct vm_area_struct *vma; |
1720 | 1743 | ||
1721 | anon_vma_lock(anon_vma); | 1744 | anon_vma_lock(anon_vma); |
1722 | list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { | 1745 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, |
1746 | 0, ULONG_MAX) { | ||
1723 | vma = vmac->vma; | 1747 | vma = vmac->vma; |
1724 | if (rmap_item->address < vma->vm_start || | 1748 | if (rmap_item->address < vma->vm_start || |
1725 | rmap_item->address >= vma->vm_end) | 1749 | rmap_item->address >= vma->vm_end) |
diff --git a/mm/madvise.c b/mm/madvise.c index 14d260fa0d17..03dfa5c7adb3 100644 --- a/mm/madvise.c +++ b/mm/madvise.c | |||
@@ -69,10 +69,14 @@ static long madvise_behavior(struct vm_area_struct * vma, | |||
69 | new_flags &= ~VM_DONTCOPY; | 69 | new_flags &= ~VM_DONTCOPY; |
70 | break; | 70 | break; |
71 | case MADV_DONTDUMP: | 71 | case MADV_DONTDUMP: |
72 | new_flags |= VM_NODUMP; | 72 | new_flags |= VM_DONTDUMP; |
73 | break; | 73 | break; |
74 | case MADV_DODUMP: | 74 | case MADV_DODUMP: |
75 | new_flags &= ~VM_NODUMP; | 75 | if (new_flags & VM_SPECIAL) { |
76 | error = -EINVAL; | ||
77 | goto out; | ||
78 | } | ||
79 | new_flags &= ~VM_DONTDUMP; | ||
76 | break; | 80 | break; |
77 | case MADV_MERGEABLE: | 81 | case MADV_MERGEABLE: |
78 | case MADV_UNMERGEABLE: | 82 | case MADV_UNMERGEABLE: |
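From user space the interface above is unchanged by the VM_NODUMP to VM_DONTDUMP rename; MADV_DONTDUMP and MADV_DODUMP have been available since Linux 3.4. A minimal example of excluding an anonymous buffer from core dumps and opting it back in:

```c
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 4096 * 16;
	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Ask the kernel to omit this range from core dumps ... */
	if (madvise(buf, len, MADV_DONTDUMP))
		perror("madvise(MADV_DONTDUMP)");

	/* ... and later opt it back in.  With this patch MADV_DODUMP is
	 * rejected (-EINVAL) on VM_SPECIAL mappings, which a plain
	 * anonymous mapping like this one is not. */
	if (madvise(buf, len, MADV_DODUMP))
		perror("madvise(MADV_DODUMP)");

	munmap(buf, len);
	return 0;
}
```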
diff --git a/mm/memblock.c b/mm/memblock.c index 82aa349d2f7a..931eef145af5 100644 --- a/mm/memblock.c +++ b/mm/memblock.c | |||
@@ -41,7 +41,8 @@ static int memblock_memory_in_slab __initdata_memblock = 0; | |||
41 | static int memblock_reserved_in_slab __initdata_memblock = 0; | 41 | static int memblock_reserved_in_slab __initdata_memblock = 0; |
42 | 42 | ||
43 | /* inline so we don't get a warning when pr_debug is compiled out */ | 43 | /* inline so we don't get a warning when pr_debug is compiled out */ |
44 | static inline const char *memblock_type_name(struct memblock_type *type) | 44 | static __init_memblock const char * |
45 | memblock_type_name(struct memblock_type *type) | ||
45 | { | 46 | { |
46 | if (type == &memblock.memory) | 47 | if (type == &memblock.memory) |
47 | return "memory"; | 48 | return "memory"; |
@@ -756,7 +757,7 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, | |||
756 | return ret; | 757 | return ret; |
757 | 758 | ||
758 | for (i = start_rgn; i < end_rgn; i++) | 759 | for (i = start_rgn; i < end_rgn; i++) |
759 | type->regions[i].nid = nid; | 760 | memblock_set_region_node(&type->regions[i], nid); |
760 | 761 | ||
761 | memblock_merge_regions(type); | 762 | memblock_merge_regions(type); |
762 | return 0; | 763 | return 0; |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a72f2ffdc3d0..7acf43bf04a2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -51,6 +51,7 @@ | |||
51 | #include <linux/oom.h> | 51 | #include <linux/oom.h> |
52 | #include "internal.h" | 52 | #include "internal.h" |
53 | #include <net/sock.h> | 53 | #include <net/sock.h> |
54 | #include <net/ip.h> | ||
54 | #include <net/tcp_memcontrol.h> | 55 | #include <net/tcp_memcontrol.h> |
55 | 56 | ||
56 | #include <asm/uaccess.h> | 57 | #include <asm/uaccess.h> |
@@ -326,7 +327,7 @@ struct mem_cgroup { | |||
326 | struct mem_cgroup_stat_cpu nocpu_base; | 327 | struct mem_cgroup_stat_cpu nocpu_base; |
327 | spinlock_t pcp_counter_lock; | 328 | spinlock_t pcp_counter_lock; |
328 | 329 | ||
329 | #ifdef CONFIG_INET | 330 | #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) |
330 | struct tcp_memcontrol tcp_mem; | 331 | struct tcp_memcontrol tcp_mem; |
331 | #endif | 332 | #endif |
332 | }; | 333 | }; |
@@ -411,12 +412,14 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s) | |||
411 | return container_of(s, struct mem_cgroup, css); | 412 | return container_of(s, struct mem_cgroup, css); |
412 | } | 413 | } |
413 | 414 | ||
415 | static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) | ||
416 | { | ||
417 | return (memcg == root_mem_cgroup); | ||
418 | } | ||
419 | |||
414 | /* Writing them here to avoid exposing memcg's inner layout */ | 420 | /* Writing them here to avoid exposing memcg's inner layout */ |
415 | #ifdef CONFIG_MEMCG_KMEM | 421 | #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) |
416 | #include <net/sock.h> | ||
417 | #include <net/ip.h> | ||
418 | 422 | ||
419 | static bool mem_cgroup_is_root(struct mem_cgroup *memcg); | ||
420 | void sock_update_memcg(struct sock *sk) | 423 | void sock_update_memcg(struct sock *sk) |
421 | { | 424 | { |
422 | if (mem_cgroup_sockets_enabled) { | 425 | if (mem_cgroup_sockets_enabled) { |
@@ -461,7 +464,6 @@ void sock_release_memcg(struct sock *sk) | |||
461 | } | 464 | } |
462 | } | 465 | } |
463 | 466 | ||
464 | #ifdef CONFIG_INET | ||
465 | struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) | 467 | struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) |
466 | { | 468 | { |
467 | if (!memcg || mem_cgroup_is_root(memcg)) | 469 | if (!memcg || mem_cgroup_is_root(memcg)) |
@@ -470,10 +472,7 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) | |||
470 | return &memcg->tcp_mem.cg_proto; | 472 | return &memcg->tcp_mem.cg_proto; |
471 | } | 473 | } |
472 | EXPORT_SYMBOL(tcp_proto_cgroup); | 474 | EXPORT_SYMBOL(tcp_proto_cgroup); |
473 | #endif /* CONFIG_INET */ | ||
474 | #endif /* CONFIG_MEMCG_KMEM */ | ||
475 | 475 | ||
476 | #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) | ||
477 | static void disarm_sock_keys(struct mem_cgroup *memcg) | 476 | static void disarm_sock_keys(struct mem_cgroup *memcg) |
478 | { | 477 | { |
479 | if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto)) | 478 | if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto)) |
@@ -1016,11 +1015,6 @@ void mem_cgroup_iter_break(struct mem_cgroup *root, | |||
1016 | iter != NULL; \ | 1015 | iter != NULL; \ |
1017 | iter = mem_cgroup_iter(NULL, iter, NULL)) | 1016 | iter = mem_cgroup_iter(NULL, iter, NULL)) |
1018 | 1017 | ||
1019 | static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) | ||
1020 | { | ||
1021 | return (memcg == root_mem_cgroup); | ||
1022 | } | ||
1023 | |||
1024 | void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) | 1018 | void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) |
1025 | { | 1019 | { |
1026 | struct mem_cgroup *memcg; | 1020 | struct mem_cgroup *memcg; |
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index a6e2141a6610..6c5899b9034a 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -400,18 +400,21 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, | |||
400 | struct vm_area_struct *vma; | 400 | struct vm_area_struct *vma; |
401 | struct task_struct *tsk; | 401 | struct task_struct *tsk; |
402 | struct anon_vma *av; | 402 | struct anon_vma *av; |
403 | pgoff_t pgoff; | ||
403 | 404 | ||
404 | av = page_lock_anon_vma(page); | 405 | av = page_lock_anon_vma(page); |
405 | if (av == NULL) /* Not actually mapped anymore */ | 406 | if (av == NULL) /* Not actually mapped anymore */ |
406 | return; | 407 | return; |
407 | 408 | ||
409 | pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
408 | read_lock(&tasklist_lock); | 410 | read_lock(&tasklist_lock); |
409 | for_each_process (tsk) { | 411 | for_each_process (tsk) { |
410 | struct anon_vma_chain *vmac; | 412 | struct anon_vma_chain *vmac; |
411 | 413 | ||
412 | if (!task_early_kill(tsk)) | 414 | if (!task_early_kill(tsk)) |
413 | continue; | 415 | continue; |
414 | list_for_each_entry(vmac, &av->head, same_anon_vma) { | 416 | anon_vma_interval_tree_foreach(vmac, &av->rb_root, |
417 | pgoff, pgoff) { | ||
415 | vma = vmac->vma; | 418 | vma = vmac->vma; |
416 | if (!page_mapped_in_vma(page, vma)) | 419 | if (!page_mapped_in_vma(page, vma)) |
417 | continue; | 420 | continue; |
@@ -431,7 +434,6 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, | |||
431 | { | 434 | { |
432 | struct vm_area_struct *vma; | 435 | struct vm_area_struct *vma; |
433 | struct task_struct *tsk; | 436 | struct task_struct *tsk; |
434 | struct prio_tree_iter iter; | ||
435 | struct address_space *mapping = page->mapping; | 437 | struct address_space *mapping = page->mapping; |
436 | 438 | ||
437 | mutex_lock(&mapping->i_mmap_mutex); | 439 | mutex_lock(&mapping->i_mmap_mutex); |
@@ -442,7 +444,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, | |||
442 | if (!task_early_kill(tsk)) | 444 | if (!task_early_kill(tsk)) |
443 | continue; | 445 | continue; |
444 | 446 | ||
445 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, | 447 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, |
446 | pgoff) { | 448 | pgoff) { |
447 | /* | 449 | /* |
448 | * Send early kill signal to tasks where a vma covers | 450 | * Send early kill signal to tasks where a vma covers |
diff --git a/mm/memory.c b/mm/memory.c index 57361708d1a5..fb135ba4aba9 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -712,7 +712,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, | |||
712 | add_taint(TAINT_BAD_PAGE); | 712 | add_taint(TAINT_BAD_PAGE); |
713 | } | 713 | } |
714 | 714 | ||
715 | static inline int is_cow_mapping(vm_flags_t flags) | 715 | static inline bool is_cow_mapping(vm_flags_t flags) |
716 | { | 716 | { |
717 | return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; | 717 | return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; |
718 | } | 718 | } |
@@ -1039,6 +1039,9 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
1039 | unsigned long next; | 1039 | unsigned long next; |
1040 | unsigned long addr = vma->vm_start; | 1040 | unsigned long addr = vma->vm_start; |
1041 | unsigned long end = vma->vm_end; | 1041 | unsigned long end = vma->vm_end; |
1042 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
1043 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
1044 | bool is_cow; | ||
1042 | int ret; | 1045 | int ret; |
1043 | 1046 | ||
1044 | /* | 1047 | /* |
@@ -1047,7 +1050,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
1047 | * readonly mappings. The tradeoff is that copy_page_range is more | 1050 | * readonly mappings. The tradeoff is that copy_page_range is more |
1048 | * efficient than faulting. | 1051 | * efficient than faulting. |
1049 | */ | 1052 | */ |
1050 | if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) { | 1053 | if (!(vma->vm_flags & (VM_HUGETLB | VM_NONLINEAR | |
1054 | VM_PFNMAP | VM_MIXEDMAP))) { | ||
1051 | if (!vma->anon_vma) | 1055 | if (!vma->anon_vma) |
1052 | return 0; | 1056 | return 0; |
1053 | } | 1057 | } |
@@ -1055,12 +1059,12 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
1055 | if (is_vm_hugetlb_page(vma)) | 1059 | if (is_vm_hugetlb_page(vma)) |
1056 | return copy_hugetlb_page_range(dst_mm, src_mm, vma); | 1060 | return copy_hugetlb_page_range(dst_mm, src_mm, vma); |
1057 | 1061 | ||
1058 | if (unlikely(is_pfn_mapping(vma))) { | 1062 | if (unlikely(vma->vm_flags & VM_PFNMAP)) { |
1059 | /* | 1063 | /* |
1060 | * We do not free on error cases below as remove_vma | 1064 | * We do not free on error cases below as remove_vma |
1061 | * gets called on error from higher level routine | 1065 | * gets called on error from higher level routine |
1062 | */ | 1066 | */ |
1063 | ret = track_pfn_vma_copy(vma); | 1067 | ret = track_pfn_copy(vma); |
1064 | if (ret) | 1068 | if (ret) |
1065 | return ret; | 1069 | return ret; |
1066 | } | 1070 | } |
@@ -1071,8 +1075,12 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
1071 | * parent mm. And a permission downgrade will only happen if | 1075 | * parent mm. And a permission downgrade will only happen if |
1072 | * is_cow_mapping() returns true. | 1076 | * is_cow_mapping() returns true. |
1073 | */ | 1077 | */ |
1074 | if (is_cow_mapping(vma->vm_flags)) | 1078 | is_cow = is_cow_mapping(vma->vm_flags); |
1075 | mmu_notifier_invalidate_range_start(src_mm, addr, end); | 1079 | mmun_start = addr; |
1080 | mmun_end = end; | ||
1081 | if (is_cow) | ||
1082 | mmu_notifier_invalidate_range_start(src_mm, mmun_start, | ||
1083 | mmun_end); | ||
1076 | 1084 | ||
1077 | ret = 0; | 1085 | ret = 0; |
1078 | dst_pgd = pgd_offset(dst_mm, addr); | 1086 | dst_pgd = pgd_offset(dst_mm, addr); |
@@ -1088,9 +1096,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
1088 | } | 1096 | } |
1089 | } while (dst_pgd++, src_pgd++, addr = next, addr != end); | 1097 | } while (dst_pgd++, src_pgd++, addr = next, addr != end); |
1090 | 1098 | ||
1091 | if (is_cow_mapping(vma->vm_flags)) | 1099 | if (is_cow) |
1092 | mmu_notifier_invalidate_range_end(src_mm, | 1100 | mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end); |
1093 | vma->vm_start, end); | ||
1094 | return ret; | 1101 | return ret; |
1095 | } | 1102 | } |
1096 | 1103 | ||
@@ -1327,8 +1334,8 @@ static void unmap_single_vma(struct mmu_gather *tlb, | |||
1327 | if (vma->vm_file) | 1334 | if (vma->vm_file) |
1328 | uprobe_munmap(vma, start, end); | 1335 | uprobe_munmap(vma, start, end); |
1329 | 1336 | ||
1330 | if (unlikely(is_pfn_mapping(vma))) | 1337 | if (unlikely(vma->vm_flags & VM_PFNMAP)) |
1331 | untrack_pfn_vma(vma, 0, 0); | 1338 | untrack_pfn(vma, 0, 0); |
1332 | 1339 | ||
1333 | if (start != end) { | 1340 | if (start != end) { |
1334 | if (unlikely(is_vm_hugetlb_page(vma))) { | 1341 | if (unlikely(is_vm_hugetlb_page(vma))) { |
@@ -1521,7 +1528,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, | |||
1521 | spin_unlock(&mm->page_table_lock); | 1528 | spin_unlock(&mm->page_table_lock); |
1522 | wait_split_huge_page(vma->anon_vma, pmd); | 1529 | wait_split_huge_page(vma->anon_vma, pmd); |
1523 | } else { | 1530 | } else { |
1524 | page = follow_trans_huge_pmd(mm, address, | 1531 | page = follow_trans_huge_pmd(vma, address, |
1525 | pmd, flags); | 1532 | pmd, flags); |
1526 | spin_unlock(&mm->page_table_lock); | 1533 | spin_unlock(&mm->page_table_lock); |
1527 | goto out; | 1534 | goto out; |
@@ -1576,12 +1583,12 @@ split_fallthrough: | |||
1576 | if (page->mapping && trylock_page(page)) { | 1583 | if (page->mapping && trylock_page(page)) { |
1577 | lru_add_drain(); /* push cached pages to LRU */ | 1584 | lru_add_drain(); /* push cached pages to LRU */ |
1578 | /* | 1585 | /* |
1579 | * Because we lock page here and migration is | 1586 | * Because we lock page here, and migration is |
1580 | * blocked by the pte's page reference, we need | 1587 | * blocked by the pte's page reference, and we |
1581 | * only check for file-cache page truncation. | 1588 | * know the page is still mapped, we don't even |
1589 | * need to check for file-cache page truncation. | ||
1582 | */ | 1590 | */ |
1583 | if (page->mapping) | 1591 | mlock_vma_page(page); |
1584 | mlock_vma_page(page); | ||
1585 | unlock_page(page); | 1592 | unlock_page(page); |
1586 | } | 1593 | } |
1587 | } | 1594 | } |
@@ -2085,6 +2092,11 @@ out: | |||
2085 | * ask for a shared writable mapping! | 2092 | * ask for a shared writable mapping! |
2086 | * | 2093 | * |
2087 | * The page does not need to be reserved. | 2094 | * The page does not need to be reserved. |
2095 | * | ||
2096 | * Usually this function is called from f_op->mmap() handler | ||
2097 | * under mm->mmap_sem write-lock, so it can change vma->vm_flags. | ||
2098 | * Caller must set VM_MIXEDMAP on vma if it wants to call this | ||
2099 | * function from other places, for example from page-fault handler. | ||
2088 | */ | 2100 | */ |
2089 | int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, | 2101 | int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, |
2090 | struct page *page) | 2102 | struct page *page) |
@@ -2093,7 +2105,11 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, | |||
2093 | return -EFAULT; | 2105 | return -EFAULT; |
2094 | if (!page_count(page)) | 2106 | if (!page_count(page)) |
2095 | return -EINVAL; | 2107 | return -EINVAL; |
2096 | vma->vm_flags |= VM_INSERTPAGE; | 2108 | if (!(vma->vm_flags & VM_MIXEDMAP)) { |
2109 | BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem)); | ||
2110 | BUG_ON(vma->vm_flags & VM_PFNMAP); | ||
2111 | vma->vm_flags |= VM_MIXEDMAP; | ||
2112 | } | ||
2097 | return insert_page(vma, addr, page, vma->vm_page_prot); | 2113 | return insert_page(vma, addr, page, vma->vm_page_prot); |
2098 | } | 2114 | } |
2099 | EXPORT_SYMBOL(vm_insert_page); | 2115 | EXPORT_SYMBOL(vm_insert_page); |
@@ -2132,7 +2148,7 @@ out: | |||
2132 | * @addr: target user address of this page | 2148 | * @addr: target user address of this page |
2133 | * @pfn: source kernel pfn | 2149 | * @pfn: source kernel pfn |
2134 | * | 2150 | * |
2135 | * Similar to vm_inert_page, this allows drivers to insert individual pages | 2151 | * Similar to vm_insert_page, this allows drivers to insert individual pages |
2136 | * they've allocated into a user vma. Same comments apply. | 2152 | * they've allocated into a user vma. Same comments apply. |
2137 | * | 2153 | * |
2138 | * This function should only be called from a vm_ops->fault handler, and | 2154 | * This function should only be called from a vm_ops->fault handler, and |
@@ -2162,14 +2178,11 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, | |||
2162 | 2178 | ||
2163 | if (addr < vma->vm_start || addr >= vma->vm_end) | 2179 | if (addr < vma->vm_start || addr >= vma->vm_end) |
2164 | return -EFAULT; | 2180 | return -EFAULT; |
2165 | if (track_pfn_vma_new(vma, &pgprot, pfn, PAGE_SIZE)) | 2181 | if (track_pfn_insert(vma, &pgprot, pfn)) |
2166 | return -EINVAL; | 2182 | return -EINVAL; |
2167 | 2183 | ||
2168 | ret = insert_pfn(vma, addr, pfn, pgprot); | 2184 | ret = insert_pfn(vma, addr, pfn, pgprot); |
2169 | 2185 | ||
2170 | if (ret) | ||
2171 | untrack_pfn_vma(vma, pfn, PAGE_SIZE); | ||
2172 | |||
2173 | return ret; | 2186 | return ret; |
2174 | } | 2187 | } |
2175 | EXPORT_SYMBOL(vm_insert_pfn); | 2188 | EXPORT_SYMBOL(vm_insert_pfn); |
@@ -2290,37 +2303,30 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, | |||
2290 | * rest of the world about it: | 2303 | * rest of the world about it: |
2291 | * VM_IO tells people not to look at these pages | 2304 | * VM_IO tells people not to look at these pages |
2292 | * (accesses can have side effects). | 2305 | * (accesses can have side effects). |
2293 | * VM_RESERVED is specified all over the place, because | ||
2294 | * in 2.4 it kept swapout's vma scan off this vma; but | ||
2295 | * in 2.6 the LRU scan won't even find its pages, so this | ||
2296 | * flag means no more than count its pages in reserved_vm, | ||
2297 | * and omit it from core dump, even when VM_IO turned off. | ||
2298 | * VM_PFNMAP tells the core MM that the base pages are just | 2306 | * VM_PFNMAP tells the core MM that the base pages are just |
2299 | * raw PFN mappings, and do not have a "struct page" associated | 2307 | * raw PFN mappings, and do not have a "struct page" associated |
2300 | * with them. | 2308 | * with them. |
2309 | * VM_DONTEXPAND | ||
2310 | * Disable vma merging and expanding with mremap(). | ||
2311 | * VM_DONTDUMP | ||
2312 | * Omit vma from core dump, even when VM_IO turned off. | ||
2301 | * | 2313 | * |
2302 | * There's a horrible special case to handle copy-on-write | 2314 | * There's a horrible special case to handle copy-on-write |
2303 | * behaviour that some programs depend on. We mark the "original" | 2315 | * behaviour that some programs depend on. We mark the "original" |
2304 | * un-COW'ed pages by matching them up with "vma->vm_pgoff". | 2316 | * un-COW'ed pages by matching them up with "vma->vm_pgoff". |
2317 | * See vm_normal_page() for details. | ||
2305 | */ | 2318 | */ |
2306 | if (addr == vma->vm_start && end == vma->vm_end) { | 2319 | if (is_cow_mapping(vma->vm_flags)) { |
2320 | if (addr != vma->vm_start || end != vma->vm_end) | ||
2321 | return -EINVAL; | ||
2307 | vma->vm_pgoff = pfn; | 2322 | vma->vm_pgoff = pfn; |
2308 | vma->vm_flags |= VM_PFN_AT_MMAP; | 2323 | } |
2309 | } else if (is_cow_mapping(vma->vm_flags)) | ||
2310 | return -EINVAL; | ||
2311 | |||
2312 | vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; | ||
2313 | 2324 | ||
2314 | err = track_pfn_vma_new(vma, &prot, pfn, PAGE_ALIGN(size)); | 2325 | err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size)); |
2315 | if (err) { | 2326 | if (err) |
2316 | /* | ||
2317 | * To indicate that track_pfn related cleanup is not | ||
2318 | * needed from higher level routine calling unmap_vmas | ||
2319 | */ | ||
2320 | vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP); | ||
2321 | vma->vm_flags &= ~VM_PFN_AT_MMAP; | ||
2322 | return -EINVAL; | 2327 | return -EINVAL; |
2323 | } | 2328 | |
2329 | vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; | ||
2324 | 2330 | ||
2325 | BUG_ON(addr >= end); | 2331 | BUG_ON(addr >= end); |
2326 | pfn -= addr >> PAGE_SHIFT; | 2332 | pfn -= addr >> PAGE_SHIFT; |
@@ -2335,7 +2341,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, | |||
2335 | } while (pgd++, addr = next, addr != end); | 2341 | } while (pgd++, addr = next, addr != end); |
2336 | 2342 | ||
2337 | if (err) | 2343 | if (err) |
2338 | untrack_pfn_vma(vma, pfn, PAGE_ALIGN(size)); | 2344 | untrack_pfn(vma, pfn, PAGE_ALIGN(size)); |
2339 | 2345 | ||
2340 | return err; | 2346 | return err; |
2341 | } | 2347 | } |
@@ -2516,11 +2522,14 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2516 | spinlock_t *ptl, pte_t orig_pte) | 2522 | spinlock_t *ptl, pte_t orig_pte) |
2517 | __releases(ptl) | 2523 | __releases(ptl) |
2518 | { | 2524 | { |
2519 | struct page *old_page, *new_page; | 2525 | struct page *old_page, *new_page = NULL; |
2520 | pte_t entry; | 2526 | pte_t entry; |
2521 | int ret = 0; | 2527 | int ret = 0; |
2522 | int page_mkwrite = 0; | 2528 | int page_mkwrite = 0; |
2523 | struct page *dirty_page = NULL; | 2529 | struct page *dirty_page = NULL; |
2530 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
2531 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
2532 | bool mmun_called = false; /* For mmu_notifiers */ | ||
2524 | 2533 | ||
2525 | old_page = vm_normal_page(vma, address, orig_pte); | 2534 | old_page = vm_normal_page(vma, address, orig_pte); |
2526 | if (!old_page) { | 2535 | if (!old_page) { |
@@ -2698,6 +2707,11 @@ gotten: | |||
2698 | if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) | 2707 | if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) |
2699 | goto oom_free_new; | 2708 | goto oom_free_new; |
2700 | 2709 | ||
2710 | mmun_start = address & PAGE_MASK; | ||
2711 | mmun_end = (address & PAGE_MASK) + PAGE_SIZE; | ||
2712 | mmun_called = true; | ||
2713 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
2714 | |||
2701 | /* | 2715 | /* |
2702 | * Re-check the pte - we dropped the lock | 2716 | * Re-check the pte - we dropped the lock |
2703 | */ | 2717 | */ |
@@ -2764,6 +2778,8 @@ gotten: | |||
2764 | page_cache_release(new_page); | 2778 | page_cache_release(new_page); |
2765 | unlock: | 2779 | unlock: |
2766 | pte_unmap_unlock(page_table, ptl); | 2780 | pte_unmap_unlock(page_table, ptl); |
2781 | if (mmun_called) | ||
2782 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
2767 | if (old_page) { | 2783 | if (old_page) { |
2768 | /* | 2784 | /* |
2769 | * Don't let another task, with possibly unlocked vma, | 2785 | * Don't let another task, with possibly unlocked vma, |
@@ -2801,14 +2817,13 @@ static void unmap_mapping_range_vma(struct vm_area_struct *vma, | |||
2801 | zap_page_range_single(vma, start_addr, end_addr - start_addr, details); | 2817 | zap_page_range_single(vma, start_addr, end_addr - start_addr, details); |
2802 | } | 2818 | } |
2803 | 2819 | ||
2804 | static inline void unmap_mapping_range_tree(struct prio_tree_root *root, | 2820 | static inline void unmap_mapping_range_tree(struct rb_root *root, |
2805 | struct zap_details *details) | 2821 | struct zap_details *details) |
2806 | { | 2822 | { |
2807 | struct vm_area_struct *vma; | 2823 | struct vm_area_struct *vma; |
2808 | struct prio_tree_iter iter; | ||
2809 | pgoff_t vba, vea, zba, zea; | 2824 | pgoff_t vba, vea, zba, zea; |
2810 | 2825 | ||
2811 | vma_prio_tree_foreach(vma, &iter, root, | 2826 | vma_interval_tree_foreach(vma, root, |
2812 | details->first_index, details->last_index) { | 2827 | details->first_index, details->last_index) { |
2813 | 2828 | ||
2814 | vba = vma->vm_pgoff; | 2829 | vba = vma->vm_pgoff; |
@@ -2839,7 +2854,7 @@ static inline void unmap_mapping_range_list(struct list_head *head, | |||
2839 | * across *all* the pages in each nonlinear VMA, not just the pages | 2854 | * across *all* the pages in each nonlinear VMA, not just the pages |
2840 | * whose virtual address lies outside the file truncation point. | 2855 | * whose virtual address lies outside the file truncation point. |
2841 | */ | 2856 | */ |
2842 | list_for_each_entry(vma, head, shared.vm_set.list) { | 2857 | list_for_each_entry(vma, head, shared.nonlinear) { |
2843 | details->nonlinear_vma = vma; | 2858 | details->nonlinear_vma = vma; |
2844 | unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details); | 2859 | unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details); |
2845 | } | 2860 | } |
@@ -2883,7 +2898,7 @@ void unmap_mapping_range(struct address_space *mapping, | |||
2883 | 2898 | ||
2884 | 2899 | ||
2885 | mutex_lock(&mapping->i_mmap_mutex); | 2900 | mutex_lock(&mapping->i_mmap_mutex); |
2886 | if (unlikely(!prio_tree_empty(&mapping->i_mmap))) | 2901 | if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) |
2887 | unmap_mapping_range_tree(&mapping->i_mmap, &details); | 2902 | unmap_mapping_range_tree(&mapping->i_mmap, &details); |
2888 | if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) | 2903 | if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) |
2889 | unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); | 2904 | unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); |
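is_cow_mapping(), switched to bool above and reused by the remap_pfn_range() rewrite, classifies a mapping as copy-on-write when it may be written but is not shared. A tiny sketch with assumed stand-in values for the VM_* bits (the real constants live in <linux/mm.h>; only the relative bits matter):

```c
#include <stdio.h>
#include <stdbool.h>

/* Assumed stand-ins for the relevant VM_* bits. */
#define VM_SHARED	0x00000008UL
#define VM_MAYWRITE	0x00000020UL

typedef unsigned long vm_flags_t;

static inline bool is_cow_mapping(vm_flags_t flags)
{
	/* COW means "may be written" but "not shared with other users". */
	return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
}

int main(void)
{
	printf("private + maywrite : %d\n", is_cow_mapping(VM_MAYWRITE));		/* 1 */
	printf("shared  + maywrite : %d\n", is_cow_mapping(VM_SHARED | VM_MAYWRITE));	/* 0 */
	printf("private read-only  : %d\n", is_cow_mapping(0));			/* 0 */
	return 0;
}
```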
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6a5b90d0cfd7..56b758ae57d2 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -106,6 +106,7 @@ static void get_page_bootmem(unsigned long info, struct page *page, | |||
106 | void __ref put_page_bootmem(struct page *page) | 106 | void __ref put_page_bootmem(struct page *page) |
107 | { | 107 | { |
108 | unsigned long type; | 108 | unsigned long type; |
109 | struct zone *zone; | ||
109 | 110 | ||
110 | type = (unsigned long) page->lru.next; | 111 | type = (unsigned long) page->lru.next; |
111 | BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || | 112 | BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || |
@@ -116,6 +117,12 @@ void __ref put_page_bootmem(struct page *page) | |||
116 | set_page_private(page, 0); | 117 | set_page_private(page, 0); |
117 | INIT_LIST_HEAD(&page->lru); | 118 | INIT_LIST_HEAD(&page->lru); |
118 | __free_pages_bootmem(page, 0); | 119 | __free_pages_bootmem(page, 0); |
120 | |||
121 | zone = page_zone(page); | ||
122 | zone_span_writelock(zone); | ||
123 | zone->present_pages++; | ||
124 | zone_span_writeunlock(zone); | ||
125 | totalram_pages++; | ||
119 | } | 126 | } |
120 | 127 | ||
121 | } | 128 | } |
@@ -362,11 +369,11 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, | |||
362 | BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); | 369 | BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); |
363 | BUG_ON(nr_pages % PAGES_PER_SECTION); | 370 | BUG_ON(nr_pages % PAGES_PER_SECTION); |
364 | 371 | ||
372 | release_mem_region(phys_start_pfn << PAGE_SHIFT, nr_pages * PAGE_SIZE); | ||
373 | |||
365 | sections_to_remove = nr_pages / PAGES_PER_SECTION; | 374 | sections_to_remove = nr_pages / PAGES_PER_SECTION; |
366 | for (i = 0; i < sections_to_remove; i++) { | 375 | for (i = 0; i < sections_to_remove; i++) { |
367 | unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; | 376 | unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; |
368 | release_mem_region(pfn << PAGE_SHIFT, | ||
369 | PAGES_PER_SECTION << PAGE_SHIFT); | ||
370 | ret = __remove_section(zone, __pfn_to_section(pfn)); | 377 | ret = __remove_section(zone, __pfn_to_section(pfn)); |
371 | if (ret) | 378 | if (ret) |
372 | break; | 379 | break; |
@@ -756,13 +763,6 @@ static unsigned long scan_lru_pages(unsigned long start, unsigned long end) | |||
756 | return 0; | 763 | return 0; |
757 | } | 764 | } |
758 | 765 | ||
759 | static struct page * | ||
760 | hotremove_migrate_alloc(struct page *page, unsigned long private, int **x) | ||
761 | { | ||
762 | /* This should be improooooved!! */ | ||
763 | return alloc_page(GFP_HIGHUSER_MOVABLE); | ||
764 | } | ||
765 | |||
766 | #define NR_OFFLINE_AT_ONCE_PAGES (256) | 766 | #define NR_OFFLINE_AT_ONCE_PAGES (256) |
767 | static int | 767 | static int |
768 | do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | 768 | do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) |
@@ -813,8 +813,12 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | |||
813 | putback_lru_pages(&source); | 813 | putback_lru_pages(&source); |
814 | goto out; | 814 | goto out; |
815 | } | 815 | } |
816 | /* this function returns # of failed pages */ | 816 | |
817 | ret = migrate_pages(&source, hotremove_migrate_alloc, 0, | 817 | /* |
818 | * alloc_migrate_target should be improooooved!! | ||
819 | * migrate_pages returns # of failed pages. | ||
820 | */ | ||
821 | ret = migrate_pages(&source, alloc_migrate_target, 0, | ||
818 | true, MIGRATE_SYNC); | 822 | true, MIGRATE_SYNC); |
819 | if (ret) | 823 | if (ret) |
820 | putback_lru_pages(&source); | 824 | putback_lru_pages(&source); |
@@ -870,7 +874,7 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) | |||
870 | return offlined; | 874 | return offlined; |
871 | } | 875 | } |
872 | 876 | ||
873 | static int __ref offline_pages(unsigned long start_pfn, | 877 | static int __ref __offline_pages(unsigned long start_pfn, |
874 | unsigned long end_pfn, unsigned long timeout) | 878 | unsigned long end_pfn, unsigned long timeout) |
875 | { | 879 | { |
876 | unsigned long pfn, nr_pages, expire; | 880 | unsigned long pfn, nr_pages, expire; |
@@ -970,8 +974,13 @@ repeat: | |||
970 | 974 | ||
971 | init_per_zone_wmark_min(); | 975 | init_per_zone_wmark_min(); |
972 | 976 | ||
973 | if (!populated_zone(zone)) | 977 | if (!populated_zone(zone)) { |
974 | zone_pcp_reset(zone); | 978 | zone_pcp_reset(zone); |
979 | mutex_lock(&zonelists_mutex); | ||
980 | build_all_zonelists(NULL, NULL); | ||
981 | mutex_unlock(&zonelists_mutex); | ||
982 | } else | ||
983 | zone_pcp_update(zone); | ||
975 | 984 | ||
976 | if (!node_present_pages(node)) { | 985 | if (!node_present_pages(node)) { |
977 | node_clear_state(node, N_HIGH_MEMORY); | 986 | node_clear_state(node, N_HIGH_MEMORY); |
@@ -998,15 +1007,55 @@ out: | |||
998 | return ret; | 1007 | return ret; |
999 | } | 1008 | } |
1000 | 1009 | ||
1010 | int offline_pages(unsigned long start_pfn, unsigned long nr_pages) | ||
1011 | { | ||
1012 | return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); | ||
1013 | } | ||
1014 | |||
1001 | int remove_memory(u64 start, u64 size) | 1015 | int remove_memory(u64 start, u64 size) |
1002 | { | 1016 | { |
1017 | struct memory_block *mem = NULL; | ||
1018 | struct mem_section *section; | ||
1003 | unsigned long start_pfn, end_pfn; | 1019 | unsigned long start_pfn, end_pfn; |
1020 | unsigned long pfn, section_nr; | ||
1021 | int ret; | ||
1004 | 1022 | ||
1005 | start_pfn = PFN_DOWN(start); | 1023 | start_pfn = PFN_DOWN(start); |
1006 | end_pfn = start_pfn + PFN_DOWN(size); | 1024 | end_pfn = start_pfn + PFN_DOWN(size); |
1007 | return offline_pages(start_pfn, end_pfn, 120 * HZ); | 1025 | |
1026 | for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { | ||
1027 | section_nr = pfn_to_section_nr(pfn); | ||
1028 | if (!present_section_nr(section_nr)) | ||
1029 | continue; | ||
1030 | |||
1031 | section = __nr_to_section(section_nr); | ||
1032 | /* same memblock? */ | ||
1033 | if (mem) | ||
1034 | if ((section_nr >= mem->start_section_nr) && | ||
1035 | (section_nr <= mem->end_section_nr)) | ||
1036 | continue; | ||
1037 | |||
1038 | mem = find_memory_block_hinted(section, mem); | ||
1039 | if (!mem) | ||
1040 | continue; | ||
1041 | |||
1042 | ret = offline_memory_block(mem); | ||
1043 | if (ret) { | ||
1044 | kobject_put(&mem->dev.kobj); | ||
1045 | return ret; | ||
1046 | } | ||
1047 | } | ||
1048 | |||
1049 | if (mem) | ||
1050 | kobject_put(&mem->dev.kobj); | ||
1051 | |||
1052 | return 0; | ||
1008 | } | 1053 | } |
1009 | #else | 1054 | #else |
1055 | int offline_pages(unsigned long start_pfn, unsigned long nr_pages) | ||
1056 | { | ||
1057 | return -EINVAL; | ||
1058 | } | ||
1010 | int remove_memory(u64 start, u64 size) | 1059 | int remove_memory(u64 start, u64 size) |
1011 | { | 1060 | { |
1012 | return -EINVAL; | 1061 | return -EINVAL; |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4ada3be6e252..0b78fb9ea65b 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -607,6 +607,42 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, | |||
607 | return first; | 607 | return first; |
608 | } | 608 | } |
609 | 609 | ||
610 | /* | ||
611 | * Apply policy to a single VMA | ||
612 | * This must be called with the mmap_sem held for writing. | ||
613 | */ | ||
614 | static int vma_replace_policy(struct vm_area_struct *vma, | ||
615 | struct mempolicy *pol) | ||
616 | { | ||
617 | int err; | ||
618 | struct mempolicy *old; | ||
619 | struct mempolicy *new; | ||
620 | |||
621 | pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", | ||
622 | vma->vm_start, vma->vm_end, vma->vm_pgoff, | ||
623 | vma->vm_ops, vma->vm_file, | ||
624 | vma->vm_ops ? vma->vm_ops->set_policy : NULL); | ||
625 | |||
626 | new = mpol_dup(pol); | ||
627 | if (IS_ERR(new)) | ||
628 | return PTR_ERR(new); | ||
629 | |||
630 | if (vma->vm_ops && vma->vm_ops->set_policy) { | ||
631 | err = vma->vm_ops->set_policy(vma, new); | ||
632 | if (err) | ||
633 | goto err_out; | ||
634 | } | ||
635 | |||
636 | old = vma->vm_policy; | ||
637 | vma->vm_policy = new; /* protected by mmap_sem */ | ||
638 | mpol_put(old); | ||
639 | |||
640 | return 0; | ||
641 | err_out: | ||
642 | mpol_put(new); | ||
643 | return err; | ||
644 | } | ||
645 | |||
610 | /* Step 2: apply policy to a range and do splits. */ | 646 | /* Step 2: apply policy to a range and do splits. */ |
611 | static int mbind_range(struct mm_struct *mm, unsigned long start, | 647 | static int mbind_range(struct mm_struct *mm, unsigned long start, |
612 | unsigned long end, struct mempolicy *new_pol) | 648 | unsigned long end, struct mempolicy *new_pol) |
@@ -655,23 +691,9 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, | |||
655 | if (err) | 691 | if (err) |
656 | goto out; | 692 | goto out; |
657 | } | 693 | } |
658 | 694 | err = vma_replace_policy(vma, new_pol); | |
659 | /* | 695 | if (err) |
660 | * Apply policy to a single VMA. The reference counting of | 696 | goto out; |
661 | * policy for vma_policy linkages has already been handled by | ||
662 | * vma_merge and split_vma as necessary. If this is a shared | ||
663 | * policy then ->set_policy will increment the reference count | ||
664 | * for an sp node. | ||
665 | */ | ||
666 | pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", | ||
667 | vma->vm_start, vma->vm_end, vma->vm_pgoff, | ||
668 | vma->vm_ops, vma->vm_file, | ||
669 | vma->vm_ops ? vma->vm_ops->set_policy : NULL); | ||
670 | if (vma->vm_ops && vma->vm_ops->set_policy) { | ||
671 | err = vma->vm_ops->set_policy(vma, new_pol); | ||
672 | if (err) | ||
673 | goto out; | ||
674 | } | ||
675 | } | 697 | } |
676 | 698 | ||
677 | out: | 699 | out: |
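The hunk above folds the per-VMA policy application that used to live inline in mbind_range() into vma_replace_policy(): duplicate the policy, give the vma's set_policy hook a chance to reject it, and only then publish the copy and drop the old reference. A hedged userspace sketch of that replace-with-rollback shape follows; struct area, its hook, and the refcounting helpers are invented stand-ins, not the kernel API.

#include <stdlib.h>
#include <errno.h>

struct policy { int refcount; int mode; };

static struct policy *policy_dup(const struct policy *p)
{
    struct policy *n = malloc(sizeof(*n));

    if (!n)
        return NULL;
    *n = *p;
    n->refcount = 1;
    return n;
}

static void policy_put(struct policy *p)
{
    if (p && --p->refcount == 0)
        free(p);
}

struct area {
    struct policy *policy;
    /* optional validation hook, like vma->vm_ops->set_policy */
    int (*set_policy)(struct area *a, struct policy *pol);
};

int area_replace_policy(struct area *a, const struct policy *pol)
{
    struct policy *new, *old;
    int err;

    new = policy_dup(pol);
    if (!new)
        return -ENOMEM;

    if (a->set_policy) {
        err = a->set_policy(a, new);
        if (err) {
            policy_put(new);    /* roll back; the area is untouched */
            return err;
        }
    }

    old = a->policy;            /* publish only after the hook succeeded */
    a->policy = new;
    policy_put(old);
    return 0;
}

The point of the ordering is that a failure in the hook leaves the old policy fully intact.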
@@ -924,15 +946,18 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, | |||
924 | nodemask_t nmask; | 946 | nodemask_t nmask; |
925 | LIST_HEAD(pagelist); | 947 | LIST_HEAD(pagelist); |
926 | int err = 0; | 948 | int err = 0; |
927 | struct vm_area_struct *vma; | ||
928 | 949 | ||
929 | nodes_clear(nmask); | 950 | nodes_clear(nmask); |
930 | node_set(source, nmask); | 951 | node_set(source, nmask); |
931 | 952 | ||
932 | vma = check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, | 953 | /* |
954 | * This does not "check" the range but isolates all pages that | ||
955 | * need migration. Between passing in the full user address | ||
956 | * space range and MPOL_MF_DISCONTIG_OK, this call can not fail. | ||
957 | */ | ||
958 | VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); | ||
959 | check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, | ||
933 | flags | MPOL_MF_DISCONTIG_OK, &pagelist); | 960 | flags | MPOL_MF_DISCONTIG_OK, &pagelist); |
934 | if (IS_ERR(vma)) | ||
935 | return PTR_ERR(vma); | ||
936 | 961 | ||
937 | if (!list_empty(&pagelist)) { | 962 | if (!list_empty(&pagelist)) { |
938 | err = migrate_pages(&pagelist, new_node_page, dest, | 963 | err = migrate_pages(&pagelist, new_node_page, dest, |
@@ -1530,8 +1555,18 @@ struct mempolicy *get_vma_policy(struct task_struct *task, | |||
1530 | addr); | 1555 | addr); |
1531 | if (vpol) | 1556 | if (vpol) |
1532 | pol = vpol; | 1557 | pol = vpol; |
1533 | } else if (vma->vm_policy) | 1558 | } else if (vma->vm_policy) { |
1534 | pol = vma->vm_policy; | 1559 | pol = vma->vm_policy; |
1560 | |||
1561 | /* | ||
1562 | * shmem_alloc_page() passes MPOL_F_SHARED policy with | ||
1563 | * a pseudo vma whose vma->vm_ops=NULL. Take a reference | ||
1564 | * count on these policies which will be dropped by | ||
1565 | * mpol_cond_put() later | ||
1566 | */ | ||
1567 | if (mpol_needs_cond_ref(pol)) | ||
1568 | mpol_get(pol); | ||
1569 | } | ||
1535 | } | 1570 | } |
1536 | if (!pol) | 1571 | if (!pol) |
1537 | pol = &default_policy; | 1572 | pol = &default_policy; |
@@ -2061,7 +2096,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) | |||
2061 | */ | 2096 | */ |
2062 | 2097 | ||
2063 | /* lookup first element intersecting start-end */ | 2098 | /* lookup first element intersecting start-end */ |
2064 | /* Caller holds sp->lock */ | 2099 | /* Caller holds sp->mutex */ |
2065 | static struct sp_node * | 2100 | static struct sp_node * |
2066 | sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) | 2101 | sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) |
2067 | { | 2102 | { |
@@ -2125,36 +2160,50 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) | |||
2125 | 2160 | ||
2126 | if (!sp->root.rb_node) | 2161 | if (!sp->root.rb_node) |
2127 | return NULL; | 2162 | return NULL; |
2128 | spin_lock(&sp->lock); | 2163 | mutex_lock(&sp->mutex); |
2129 | sn = sp_lookup(sp, idx, idx+1); | 2164 | sn = sp_lookup(sp, idx, idx+1); |
2130 | if (sn) { | 2165 | if (sn) { |
2131 | mpol_get(sn->policy); | 2166 | mpol_get(sn->policy); |
2132 | pol = sn->policy; | 2167 | pol = sn->policy; |
2133 | } | 2168 | } |
2134 | spin_unlock(&sp->lock); | 2169 | mutex_unlock(&sp->mutex); |
2135 | return pol; | 2170 | return pol; |
2136 | } | 2171 | } |
2137 | 2172 | ||
2173 | static void sp_free(struct sp_node *n) | ||
2174 | { | ||
2175 | mpol_put(n->policy); | ||
2176 | kmem_cache_free(sn_cache, n); | ||
2177 | } | ||
2178 | |||
2138 | static void sp_delete(struct shared_policy *sp, struct sp_node *n) | 2179 | static void sp_delete(struct shared_policy *sp, struct sp_node *n) |
2139 | { | 2180 | { |
2140 | pr_debug("deleting %lx-l%lx\n", n->start, n->end); | 2181 | pr_debug("deleting %lx-l%lx\n", n->start, n->end); |
2141 | rb_erase(&n->nd, &sp->root); | 2182 | rb_erase(&n->nd, &sp->root); |
2142 | mpol_put(n->policy); | 2183 | sp_free(n); |
2143 | kmem_cache_free(sn_cache, n); | ||
2144 | } | 2184 | } |
2145 | 2185 | ||
2146 | static struct sp_node *sp_alloc(unsigned long start, unsigned long end, | 2186 | static struct sp_node *sp_alloc(unsigned long start, unsigned long end, |
2147 | struct mempolicy *pol) | 2187 | struct mempolicy *pol) |
2148 | { | 2188 | { |
2149 | struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL); | 2189 | struct sp_node *n; |
2190 | struct mempolicy *newpol; | ||
2150 | 2191 | ||
2192 | n = kmem_cache_alloc(sn_cache, GFP_KERNEL); | ||
2151 | if (!n) | 2193 | if (!n) |
2152 | return NULL; | 2194 | return NULL; |
2195 | |||
2196 | newpol = mpol_dup(pol); | ||
2197 | if (IS_ERR(newpol)) { | ||
2198 | kmem_cache_free(sn_cache, n); | ||
2199 | return NULL; | ||
2200 | } | ||
2201 | newpol->flags |= MPOL_F_SHARED; | ||
2202 | |||
2153 | n->start = start; | 2203 | n->start = start; |
2154 | n->end = end; | 2204 | n->end = end; |
2155 | mpol_get(pol); | 2205 | n->policy = newpol; |
2156 | pol->flags |= MPOL_F_SHARED; /* for unref */ | 2206 | |
2157 | n->policy = pol; | ||
2158 | return n; | 2207 | return n; |
2159 | } | 2208 | } |
2160 | 2209 | ||
@@ -2162,10 +2211,10 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end, | |||
2162 | static int shared_policy_replace(struct shared_policy *sp, unsigned long start, | 2211 | static int shared_policy_replace(struct shared_policy *sp, unsigned long start, |
2163 | unsigned long end, struct sp_node *new) | 2212 | unsigned long end, struct sp_node *new) |
2164 | { | 2213 | { |
2165 | struct sp_node *n, *new2 = NULL; | 2214 | struct sp_node *n; |
2215 | int ret = 0; | ||
2166 | 2216 | ||
2167 | restart: | 2217 | mutex_lock(&sp->mutex); |
2168 | spin_lock(&sp->lock); | ||
2169 | n = sp_lookup(sp, start, end); | 2218 | n = sp_lookup(sp, start, end); |
2170 | /* Take care of old policies in the same range. */ | 2219 | /* Take care of old policies in the same range. */ |
2171 | while (n && n->start < end) { | 2220 | while (n && n->start < end) { |
@@ -2178,16 +2227,14 @@ restart: | |||
2178 | } else { | 2227 | } else { |
2179 | /* Old policy spanning whole new range. */ | 2228 | /* Old policy spanning whole new range. */ |
2180 | if (n->end > end) { | 2229 | if (n->end > end) { |
2230 | struct sp_node *new2; | ||
2231 | new2 = sp_alloc(end, n->end, n->policy); | ||
2181 | if (!new2) { | 2232 | if (!new2) { |
2182 | spin_unlock(&sp->lock); | 2233 | ret = -ENOMEM; |
2183 | new2 = sp_alloc(end, n->end, n->policy); | 2234 | goto out; |
2184 | if (!new2) | ||
2185 | return -ENOMEM; | ||
2186 | goto restart; | ||
2187 | } | 2235 | } |
2188 | n->end = start; | 2236 | n->end = start; |
2189 | sp_insert(sp, new2); | 2237 | sp_insert(sp, new2); |
2190 | new2 = NULL; | ||
2191 | break; | 2238 | break; |
2192 | } else | 2239 | } else |
2193 | n->end = start; | 2240 | n->end = start; |
@@ -2198,12 +2245,9 @@ restart: | |||
2198 | } | 2245 | } |
2199 | if (new) | 2246 | if (new) |
2200 | sp_insert(sp, new); | 2247 | sp_insert(sp, new); |
2201 | spin_unlock(&sp->lock); | 2248 | out: |
2202 | if (new2) { | 2249 | mutex_unlock(&sp->mutex); |
2203 | mpol_put(new2->policy); | 2250 | return ret; |
2204 | kmem_cache_free(sn_cache, new2); | ||
2205 | } | ||
2206 | return 0; | ||
2207 | } | 2251 | } |
2208 | 2252 | ||
2209 | /** | 2253 | /** |
@@ -2221,7 +2265,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) | |||
2221 | int ret; | 2265 | int ret; |
2222 | 2266 | ||
2223 | sp->root = RB_ROOT; /* empty tree == default mempolicy */ | 2267 | sp->root = RB_ROOT; /* empty tree == default mempolicy */ |
2224 | spin_lock_init(&sp->lock); | 2268 | mutex_init(&sp->mutex); |
2225 | 2269 | ||
2226 | if (mpol) { | 2270 | if (mpol) { |
2227 | struct vm_area_struct pvma; | 2271 | struct vm_area_struct pvma; |
@@ -2275,7 +2319,7 @@ int mpol_set_shared_policy(struct shared_policy *info, | |||
2275 | } | 2319 | } |
2276 | err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new); | 2320 | err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new); |
2277 | if (err && new) | 2321 | if (err && new) |
2278 | kmem_cache_free(sn_cache, new); | 2322 | sp_free(new); |
2279 | return err; | 2323 | return err; |
2280 | } | 2324 | } |
2281 | 2325 | ||
@@ -2287,16 +2331,14 @@ void mpol_free_shared_policy(struct shared_policy *p) | |||
2287 | 2331 | ||
2288 | if (!p->root.rb_node) | 2332 | if (!p->root.rb_node) |
2289 | return; | 2333 | return; |
2290 | spin_lock(&p->lock); | 2334 | mutex_lock(&p->mutex); |
2291 | next = rb_first(&p->root); | 2335 | next = rb_first(&p->root); |
2292 | while (next) { | 2336 | while (next) { |
2293 | n = rb_entry(next, struct sp_node, nd); | 2337 | n = rb_entry(next, struct sp_node, nd); |
2294 | next = rb_next(&n->nd); | 2338 | next = rb_next(&n->nd); |
2295 | rb_erase(&n->nd, &p->root); | 2339 | sp_delete(p, n); |
2296 | mpol_put(n->policy); | ||
2297 | kmem_cache_free(sn_cache, n); | ||
2298 | } | 2340 | } |
2299 | spin_unlock(&p->lock); | 2341 | mutex_unlock(&p->mutex); |
2300 | } | 2342 | } |
2301 | 2343 | ||
2302 | /* assumes fs == KERNEL_DS */ | 2344 | /* assumes fs == KERNEL_DS */ |
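With sp->lock turned into sp->mutex above, shared_policy_replace() can allocate the node needed to split a spanning range while still holding the lock, so the old unlock/allocate/restart loop goes away, and sp_alloc()/sp_free() now pair mpol_dup() with mpol_put(). The reduced userspace analogue below shows only the "allocate under a sleeping lock instead of restarting" part; the single hard-coded range and all names are invented for illustration.

#include <pthread.h>
#include <stdlib.h>
#include <errno.h>

struct node { unsigned long start, end; };

static pthread_mutex_t sp_mutex = PTHREAD_MUTEX_INITIALIZER;
static struct node existing = { 0, 100 };   /* pretend this sits in the tree */
static struct node *split_tail;             /* pretend insertion target */

int replace_range(unsigned long start, unsigned long end)
{
    int ret = 0;

    pthread_mutex_lock(&sp_mutex);
    if (existing.start < start && existing.end > end) {
        /* Old range spans the whole new one: carve out the tail.  A
         * GFP_KERNEL-style allocation is fine right here because
         * sp_mutex is a sleeping lock, not a spinlock. */
        struct node *new2 = malloc(sizeof(*new2));

        if (!new2) {
            ret = -ENOMEM;
            goto out;
        }
        new2->start = end;
        new2->end = existing.end;
        existing.end = start;
        split_tail = new2;      /* stands in for sp_insert(sp, new2) */
    }
out:
    pthread_mutex_unlock(&sp_mutex);
    return ret;
}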
diff --git a/mm/mlock.c b/mm/mlock.c index ef726e8aa8e9..f0b9ce572fc7 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
@@ -51,15 +51,13 @@ EXPORT_SYMBOL(can_do_mlock); | |||
51 | /* | 51 | /* |
52 | * LRU accounting for clear_page_mlock() | 52 | * LRU accounting for clear_page_mlock() |
53 | */ | 53 | */ |
54 | void __clear_page_mlock(struct page *page) | 54 | void clear_page_mlock(struct page *page) |
55 | { | 55 | { |
56 | VM_BUG_ON(!PageLocked(page)); | 56 | if (!TestClearPageMlocked(page)) |
57 | |||
58 | if (!page->mapping) { /* truncated ? */ | ||
59 | return; | 57 | return; |
60 | } | ||
61 | 58 | ||
62 | dec_zone_page_state(page, NR_MLOCK); | 59 | mod_zone_page_state(page_zone(page), NR_MLOCK, |
60 | -hpage_nr_pages(page)); | ||
63 | count_vm_event(UNEVICTABLE_PGCLEARED); | 61 | count_vm_event(UNEVICTABLE_PGCLEARED); |
64 | if (!isolate_lru_page(page)) { | 62 | if (!isolate_lru_page(page)) { |
65 | putback_lru_page(page); | 63 | putback_lru_page(page); |
@@ -81,7 +79,8 @@ void mlock_vma_page(struct page *page) | |||
81 | BUG_ON(!PageLocked(page)); | 79 | BUG_ON(!PageLocked(page)); |
82 | 80 | ||
83 | if (!TestSetPageMlocked(page)) { | 81 | if (!TestSetPageMlocked(page)) { |
84 | inc_zone_page_state(page, NR_MLOCK); | 82 | mod_zone_page_state(page_zone(page), NR_MLOCK, |
83 | hpage_nr_pages(page)); | ||
85 | count_vm_event(UNEVICTABLE_PGMLOCKED); | 84 | count_vm_event(UNEVICTABLE_PGMLOCKED); |
86 | if (!isolate_lru_page(page)) | 85 | if (!isolate_lru_page(page)) |
87 | putback_lru_page(page); | 86 | putback_lru_page(page); |
@@ -108,7 +107,8 @@ void munlock_vma_page(struct page *page) | |||
108 | BUG_ON(!PageLocked(page)); | 107 | BUG_ON(!PageLocked(page)); |
109 | 108 | ||
110 | if (TestClearPageMlocked(page)) { | 109 | if (TestClearPageMlocked(page)) { |
111 | dec_zone_page_state(page, NR_MLOCK); | 110 | mod_zone_page_state(page_zone(page), NR_MLOCK, |
111 | -hpage_nr_pages(page)); | ||
112 | if (!isolate_lru_page(page)) { | 112 | if (!isolate_lru_page(page)) { |
113 | int ret = SWAP_AGAIN; | 113 | int ret = SWAP_AGAIN; |
114 | 114 | ||
@@ -227,7 +227,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma, | |||
227 | if (vma->vm_flags & (VM_IO | VM_PFNMAP)) | 227 | if (vma->vm_flags & (VM_IO | VM_PFNMAP)) |
228 | goto no_mlock; | 228 | goto no_mlock; |
229 | 229 | ||
230 | if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || | 230 | if (!((vma->vm_flags & VM_DONTEXPAND) || |
231 | is_vm_hugetlb_page(vma) || | 231 | is_vm_hugetlb_page(vma) || |
232 | vma == get_gate_vma(current->mm))) { | 232 | vma == get_gate_vma(current->mm))) { |
233 | 233 | ||
@@ -290,14 +290,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, | |||
290 | page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); | 290 | page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); |
291 | if (page && !IS_ERR(page)) { | 291 | if (page && !IS_ERR(page)) { |
292 | lock_page(page); | 292 | lock_page(page); |
293 | /* | 293 | munlock_vma_page(page); |
294 | * Like in __mlock_vma_pages_range(), | ||
295 | * because we lock page here and migration is | ||
296 | * blocked by the elevated reference, we need | ||
297 | * only check for file-cache page truncation. | ||
298 | */ | ||
299 | if (page->mapping) | ||
300 | munlock_vma_page(page); | ||
301 | unlock_page(page); | 294 | unlock_page(page); |
302 | put_page(page); | 295 | put_page(page); |
303 | } | 296 | } |
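The mlock.c hunks above switch the NR_MLOCK updates from inc/dec_zone_page_state() to mod_zone_page_state() scaled by hpage_nr_pages(), so mlocking or munlocking a transparent huge page moves the counter by every base page it covers instead of by one. A toy model of that accounting rule follows; the page and zone structures, and the 512-pages-per-huge-page figure, are assumptions made for the example.

#include <stdio.h>
#include <stdbool.h>

#define HPAGE_PMD_NR 512          /* assumed base pages per huge page */

struct zone { long nr_mlock; };
struct page { bool huge; bool mlocked; struct zone *zone; };

static int page_nr_pages(const struct page *page)
{
    return page->huge ? HPAGE_PMD_NR : 1;
}

static void mlock_page(struct page *page)
{
    if (!page->mlocked) {                /* models TestSetPageMlocked() */
        page->mlocked = true;
        page->zone->nr_mlock += page_nr_pages(page);
    }
}

static void munlock_page(struct page *page)
{
    if (page->mlocked) {                 /* models TestClearPageMlocked() */
        page->mlocked = false;
        page->zone->nr_mlock -= page_nr_pages(page);
    }
}

int main(void)
{
    struct zone z = { 0 };
    struct page small = { false, false, &z };
    struct page huge  = { true,  false, &z };

    mlock_page(&small);
    mlock_page(&huge);
    printf("NR_MLOCK = %ld\n", z.nr_mlock);   /* 1 + 512 = 513 */
    munlock_page(&huge);
    printf("NR_MLOCK = %ld\n", z.nr_mlock);   /* back to 1 */
    return 0;
}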
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
@@ -51,12 +51,6 @@ static void unmap_region(struct mm_struct *mm, | |||
51 | struct vm_area_struct *vma, struct vm_area_struct *prev, | 51 | struct vm_area_struct *vma, struct vm_area_struct *prev, |
52 | unsigned long start, unsigned long end); | 52 | unsigned long start, unsigned long end); |
53 | 53 | ||
54 | /* | ||
55 | * WARNING: the debugging will use recursive algorithms so never enable this | ||
56 | * unless you know what you are doing. | ||
57 | */ | ||
58 | #undef DEBUG_MM_RB | ||
59 | |||
60 | /* description of effects of mapping type and prot in current implementation. | 54 | /* description of effects of mapping type and prot in current implementation. |
61 | * this is due to the limited x86 page protection hardware. The expected | 55 | * this is due to the limited x86 page protection hardware. The expected |
62 | * behavior is in parens: | 56 | * behavior is in parens: |
@@ -199,14 +193,14 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma, | |||
199 | 193 | ||
200 | flush_dcache_mmap_lock(mapping); | 194 | flush_dcache_mmap_lock(mapping); |
201 | if (unlikely(vma->vm_flags & VM_NONLINEAR)) | 195 | if (unlikely(vma->vm_flags & VM_NONLINEAR)) |
202 | list_del_init(&vma->shared.vm_set.list); | 196 | list_del_init(&vma->shared.nonlinear); |
203 | else | 197 | else |
204 | vma_prio_tree_remove(vma, &mapping->i_mmap); | 198 | vma_interval_tree_remove(vma, &mapping->i_mmap); |
205 | flush_dcache_mmap_unlock(mapping); | 199 | flush_dcache_mmap_unlock(mapping); |
206 | } | 200 | } |
207 | 201 | ||
208 | /* | 202 | /* |
209 | * Unlink a file-based vm structure from its prio_tree, to hide | 203 | * Unlink a file-based vm structure from its interval tree, to hide |
210 | * vma from rmap and vmtruncate before freeing its page tables. | 204 | * vma from rmap and vmtruncate before freeing its page tables. |
211 | */ | 205 | */ |
212 | void unlink_file_vma(struct vm_area_struct *vma) | 206 | void unlink_file_vma(struct vm_area_struct *vma) |
@@ -231,11 +225,8 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) | |||
231 | might_sleep(); | 225 | might_sleep(); |
232 | if (vma->vm_ops && vma->vm_ops->close) | 226 | if (vma->vm_ops && vma->vm_ops->close) |
233 | vma->vm_ops->close(vma); | 227 | vma->vm_ops->close(vma); |
234 | if (vma->vm_file) { | 228 | if (vma->vm_file) |
235 | fput(vma->vm_file); | 229 | fput(vma->vm_file); |
236 | if (vma->vm_flags & VM_EXECUTABLE) | ||
237 | removed_exe_file_vma(vma->vm_mm); | ||
238 | } | ||
239 | mpol_put(vma_policy(vma)); | 230 | mpol_put(vma_policy(vma)); |
240 | kmem_cache_free(vm_area_cachep, vma); | 231 | kmem_cache_free(vm_area_cachep, vma); |
241 | return next; | 232 | return next; |
@@ -306,7 +297,7 @@ out: | |||
306 | return retval; | 297 | return retval; |
307 | } | 298 | } |
308 | 299 | ||
309 | #ifdef DEBUG_MM_RB | 300 | #ifdef CONFIG_DEBUG_VM_RB |
310 | static int browse_rb(struct rb_root *root) | 301 | static int browse_rb(struct rb_root *root) |
311 | { | 302 | { |
312 | int i = 0, j; | 303 | int i = 0, j; |
@@ -340,9 +331,12 @@ void validate_mm(struct mm_struct *mm) | |||
340 | { | 331 | { |
341 | int bug = 0; | 332 | int bug = 0; |
342 | int i = 0; | 333 | int i = 0; |
343 | struct vm_area_struct *tmp = mm->mmap; | 334 | struct vm_area_struct *vma = mm->mmap; |
344 | while (tmp) { | 335 | while (vma) { |
345 | tmp = tmp->vm_next; | 336 | struct anon_vma_chain *avc; |
337 | list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) | ||
338 | anon_vma_interval_tree_verify(avc); | ||
339 | vma = vma->vm_next; | ||
346 | i++; | 340 | i++; |
347 | } | 341 | } |
348 | if (i != mm->map_count) | 342 | if (i != mm->map_count) |
@@ -356,17 +350,46 @@ void validate_mm(struct mm_struct *mm) | |||
356 | #define validate_mm(mm) do { } while (0) | 350 | #define validate_mm(mm) do { } while (0) |
357 | #endif | 351 | #endif |
358 | 352 | ||
359 | static struct vm_area_struct * | 353 | /* |
360 | find_vma_prepare(struct mm_struct *mm, unsigned long addr, | 354 | * vma has some anon_vma assigned, and is already inserted on that |
361 | struct vm_area_struct **pprev, struct rb_node ***rb_link, | 355 | * anon_vma's interval trees. |
362 | struct rb_node ** rb_parent) | 356 | * |
357 | * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the | ||
358 | * vma must be removed from the anon_vma's interval trees using | ||
359 | * anon_vma_interval_tree_pre_update_vma(). | ||
360 | * | ||
361 | * After the update, the vma will be reinserted using | ||
362 | * anon_vma_interval_tree_post_update_vma(). | ||
363 | * | ||
364 | * The entire update must be protected by exclusive mmap_sem and by | ||
365 | * the root anon_vma's mutex. | ||
366 | */ | ||
367 | static inline void | ||
368 | anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma) | ||
363 | { | 369 | { |
364 | struct vm_area_struct * vma; | 370 | struct anon_vma_chain *avc; |
365 | struct rb_node ** __rb_link, * __rb_parent, * rb_prev; | 371 | |
372 | list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) | ||
373 | anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root); | ||
374 | } | ||
375 | |||
376 | static inline void | ||
377 | anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) | ||
378 | { | ||
379 | struct anon_vma_chain *avc; | ||
380 | |||
381 | list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) | ||
382 | anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); | ||
383 | } | ||
384 | |||
385 | static int find_vma_links(struct mm_struct *mm, unsigned long addr, | ||
386 | unsigned long end, struct vm_area_struct **pprev, | ||
387 | struct rb_node ***rb_link, struct rb_node **rb_parent) | ||
388 | { | ||
389 | struct rb_node **__rb_link, *__rb_parent, *rb_prev; | ||
366 | 390 | ||
367 | __rb_link = &mm->mm_rb.rb_node; | 391 | __rb_link = &mm->mm_rb.rb_node; |
368 | rb_prev = __rb_parent = NULL; | 392 | rb_prev = __rb_parent = NULL; |
369 | vma = NULL; | ||
370 | 393 | ||
371 | while (*__rb_link) { | 394 | while (*__rb_link) { |
372 | struct vm_area_struct *vma_tmp; | 395 | struct vm_area_struct *vma_tmp; |
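The comment block added in the hunk above states the protocol for the new anon_vma interval trees: before vm_start, vm_end or vm_pgoff change, the vma is removed from every interval tree it sits on, and it is reinserted afterwards, all under exclusive mmap_sem plus the anon_vma lock. The sketch below models that remove/update/reinsert discipline against a generic sorted index; the list stands in for the augmented rb-tree, locking is left out, and every name is invented.

struct vma { unsigned long start, end; struct vma *next; };

static struct vma *index_head;                 /* sorted by ->start */

static void index_remove(struct vma *v)
{
    struct vma **p = &index_head;

    while (*p && *p != v)
        p = &(*p)->next;
    if (*p)
        *p = v->next;
}

static void index_insert(struct vma *v)
{
    struct vma **p = &index_head;

    while (*p && (*p)->start < v->start)
        p = &(*p)->next;
    v->next = *p;
    *p = v;
}

/* Grow a mapping downwards, as expand_downwards() does above, keeping
 * the index consistent across the field change. */
void vma_set_start(struct vma *v, unsigned long new_start)
{
    index_remove(v);    /* anon_vma_interval_tree_pre_update_vma()  */
    v->start = new_start;
    index_insert(v);    /* anon_vma_interval_tree_post_update_vma() */
}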
@@ -375,9 +398,9 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr, | |||
375 | vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb); | 398 | vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb); |
376 | 399 | ||
377 | if (vma_tmp->vm_end > addr) { | 400 | if (vma_tmp->vm_end > addr) { |
378 | vma = vma_tmp; | 401 | /* Fail if an existing vma overlaps the area */ |
379 | if (vma_tmp->vm_start <= addr) | 402 | if (vma_tmp->vm_start < end) |
380 | break; | 403 | return -ENOMEM; |
381 | __rb_link = &__rb_parent->rb_left; | 404 | __rb_link = &__rb_parent->rb_left; |
382 | } else { | 405 | } else { |
383 | rb_prev = __rb_parent; | 406 | rb_prev = __rb_parent; |
@@ -390,7 +413,7 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr, | |||
390 | *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); | 413 | *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); |
391 | *rb_link = __rb_link; | 414 | *rb_link = __rb_link; |
392 | *rb_parent = __rb_parent; | 415 | *rb_parent = __rb_parent; |
393 | return vma; | 416 | return 0; |
394 | } | 417 | } |
395 | 418 | ||
396 | void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, | 419 | void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, |
@@ -417,7 +440,7 @@ static void __vma_link_file(struct vm_area_struct *vma) | |||
417 | if (unlikely(vma->vm_flags & VM_NONLINEAR)) | 440 | if (unlikely(vma->vm_flags & VM_NONLINEAR)) |
418 | vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); | 441 | vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); |
419 | else | 442 | else |
420 | vma_prio_tree_insert(vma, &mapping->i_mmap); | 443 | vma_interval_tree_insert(vma, &mapping->i_mmap); |
421 | flush_dcache_mmap_unlock(mapping); | 444 | flush_dcache_mmap_unlock(mapping); |
422 | } | 445 | } |
423 | } | 446 | } |
@@ -455,15 +478,16 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, | |||
455 | 478 | ||
456 | /* | 479 | /* |
457 | * Helper for vma_adjust() in the split_vma insert case: insert a vma into the | 480 | * Helper for vma_adjust() in the split_vma insert case: insert a vma into the |
458 | * mm's list and rbtree. It has already been inserted into the prio_tree. | 481 | * mm's list and rbtree. It has already been inserted into the interval tree. |
459 | */ | 482 | */ |
460 | static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) | 483 | static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) |
461 | { | 484 | { |
462 | struct vm_area_struct *__vma, *prev; | 485 | struct vm_area_struct *prev; |
463 | struct rb_node **rb_link, *rb_parent; | 486 | struct rb_node **rb_link, *rb_parent; |
464 | 487 | ||
465 | __vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent); | 488 | if (find_vma_links(mm, vma->vm_start, vma->vm_end, |
466 | BUG_ON(__vma && __vma->vm_start < vma->vm_end); | 489 | &prev, &rb_link, &rb_parent)) |
490 | BUG(); | ||
467 | __vma_link(mm, vma, prev, rb_link, rb_parent); | 491 | __vma_link(mm, vma, prev, rb_link, rb_parent); |
468 | mm->map_count++; | 492 | mm->map_count++; |
469 | } | 493 | } |
@@ -496,7 +520,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, | |||
496 | struct vm_area_struct *next = vma->vm_next; | 520 | struct vm_area_struct *next = vma->vm_next; |
497 | struct vm_area_struct *importer = NULL; | 521 | struct vm_area_struct *importer = NULL; |
498 | struct address_space *mapping = NULL; | 522 | struct address_space *mapping = NULL; |
499 | struct prio_tree_root *root = NULL; | 523 | struct rb_root *root = NULL; |
500 | struct anon_vma *anon_vma = NULL; | 524 | struct anon_vma *anon_vma = NULL; |
501 | struct file *file = vma->vm_file; | 525 | struct file *file = vma->vm_file; |
502 | long adjust_next = 0; | 526 | long adjust_next = 0; |
@@ -559,7 +583,7 @@ again: remove_next = 1 + (end > next->vm_end); | |||
559 | mutex_lock(&mapping->i_mmap_mutex); | 583 | mutex_lock(&mapping->i_mmap_mutex); |
560 | if (insert) { | 584 | if (insert) { |
561 | /* | 585 | /* |
562 | * Put into prio_tree now, so instantiated pages | 586 | * Put into interval tree now, so instantiated pages |
563 | * are visible to arm/parisc __flush_dcache_page | 587 | * are visible to arm/parisc __flush_dcache_page |
564 | * throughout; but we cannot insert into address | 588 | * throughout; but we cannot insert into address |
565 | * space until vma start or end is updated. | 589 | * space until vma start or end is updated. |
@@ -570,22 +594,23 @@ again: remove_next = 1 + (end > next->vm_end); | |||
570 | 594 | ||
571 | vma_adjust_trans_huge(vma, start, end, adjust_next); | 595 | vma_adjust_trans_huge(vma, start, end, adjust_next); |
572 | 596 | ||
573 | /* | 597 | anon_vma = vma->anon_vma; |
574 | * When changing only vma->vm_end, we don't really need anon_vma | 598 | if (!anon_vma && adjust_next) |
575 | * lock. This is a fairly rare case by itself, but the anon_vma | 599 | anon_vma = next->anon_vma; |
576 | * lock may be shared between many sibling processes. Skipping | 600 | if (anon_vma) { |
577 | * the lock for brk adjustments makes a difference sometimes. | 601 | VM_BUG_ON(adjust_next && next->anon_vma && |
578 | */ | 602 | anon_vma != next->anon_vma); |
579 | if (vma->anon_vma && (importer || start != vma->vm_start)) { | ||
580 | anon_vma = vma->anon_vma; | ||
581 | anon_vma_lock(anon_vma); | 603 | anon_vma_lock(anon_vma); |
604 | anon_vma_interval_tree_pre_update_vma(vma); | ||
605 | if (adjust_next) | ||
606 | anon_vma_interval_tree_pre_update_vma(next); | ||
582 | } | 607 | } |
583 | 608 | ||
584 | if (root) { | 609 | if (root) { |
585 | flush_dcache_mmap_lock(mapping); | 610 | flush_dcache_mmap_lock(mapping); |
586 | vma_prio_tree_remove(vma, root); | 611 | vma_interval_tree_remove(vma, root); |
587 | if (adjust_next) | 612 | if (adjust_next) |
588 | vma_prio_tree_remove(next, root); | 613 | vma_interval_tree_remove(next, root); |
589 | } | 614 | } |
590 | 615 | ||
591 | vma->vm_start = start; | 616 | vma->vm_start = start; |
@@ -598,8 +623,8 @@ again: remove_next = 1 + (end > next->vm_end); | |||
598 | 623 | ||
599 | if (root) { | 624 | if (root) { |
600 | if (adjust_next) | 625 | if (adjust_next) |
601 | vma_prio_tree_insert(next, root); | 626 | vma_interval_tree_insert(next, root); |
602 | vma_prio_tree_insert(vma, root); | 627 | vma_interval_tree_insert(vma, root); |
603 | flush_dcache_mmap_unlock(mapping); | 628 | flush_dcache_mmap_unlock(mapping); |
604 | } | 629 | } |
605 | 630 | ||
@@ -620,8 +645,12 @@ again: remove_next = 1 + (end > next->vm_end); | |||
620 | __insert_vm_struct(mm, insert); | 645 | __insert_vm_struct(mm, insert); |
621 | } | 646 | } |
622 | 647 | ||
623 | if (anon_vma) | 648 | if (anon_vma) { |
649 | anon_vma_interval_tree_post_update_vma(vma); | ||
650 | if (adjust_next) | ||
651 | anon_vma_interval_tree_post_update_vma(next); | ||
624 | anon_vma_unlock(anon_vma); | 652 | anon_vma_unlock(anon_vma); |
653 | } | ||
625 | if (mapping) | 654 | if (mapping) |
626 | mutex_unlock(&mapping->i_mmap_mutex); | 655 | mutex_unlock(&mapping->i_mmap_mutex); |
627 | 656 | ||
@@ -636,8 +665,6 @@ again: remove_next = 1 + (end > next->vm_end); | |||
636 | if (file) { | 665 | if (file) { |
637 | uprobe_munmap(next, next->vm_start, next->vm_end); | 666 | uprobe_munmap(next, next->vm_start, next->vm_end); |
638 | fput(file); | 667 | fput(file); |
639 | if (next->vm_flags & VM_EXECUTABLE) | ||
640 | removed_exe_file_vma(mm); | ||
641 | } | 668 | } |
642 | if (next->anon_vma) | 669 | if (next->anon_vma) |
643 | anon_vma_merge(vma, next); | 670 | anon_vma_merge(vma, next); |
@@ -669,8 +696,7 @@ again: remove_next = 1 + (end > next->vm_end); | |||
669 | static inline int is_mergeable_vma(struct vm_area_struct *vma, | 696 | static inline int is_mergeable_vma(struct vm_area_struct *vma, |
670 | struct file *file, unsigned long vm_flags) | 697 | struct file *file, unsigned long vm_flags) |
671 | { | 698 | { |
672 | /* VM_CAN_NONLINEAR may get set later by f_op->mmap() */ | 699 | if (vma->vm_flags ^ vm_flags) |
673 | if ((vma->vm_flags ^ vm_flags) & ~VM_CAN_NONLINEAR) | ||
674 | return 0; | 700 | return 0; |
675 | if (vma->vm_file != file) | 701 | if (vma->vm_file != file) |
676 | return 0; | 702 | return 0; |
@@ -951,8 +977,6 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags, | |||
951 | mm->exec_vm += pages; | 977 | mm->exec_vm += pages; |
952 | } else if (flags & stack_flags) | 978 | } else if (flags & stack_flags) |
953 | mm->stack_vm += pages; | 979 | mm->stack_vm += pages; |
954 | if (flags & (VM_RESERVED|VM_IO)) | ||
955 | mm->reserved_vm += pages; | ||
956 | } | 980 | } |
957 | #endif /* CONFIG_PROC_FS */ | 981 | #endif /* CONFIG_PROC_FS */ |
958 | 982 | ||
@@ -1190,7 +1214,7 @@ int vma_wants_writenotify(struct vm_area_struct *vma) | |||
1190 | return 0; | 1214 | return 0; |
1191 | 1215 | ||
1192 | /* Specialty mapping? */ | 1216 | /* Specialty mapping? */ |
1193 | if (vm_flags & (VM_PFNMAP|VM_INSERTPAGE)) | 1217 | if (vm_flags & VM_PFNMAP) |
1194 | return 0; | 1218 | return 0; |
1195 | 1219 | ||
1196 | /* Can the mapping track the dirty pages? */ | 1220 | /* Can the mapping track the dirty pages? */ |
@@ -1229,8 +1253,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, | |||
1229 | /* Clear old maps */ | 1253 | /* Clear old maps */ |
1230 | error = -ENOMEM; | 1254 | error = -ENOMEM; |
1231 | munmap_back: | 1255 | munmap_back: |
1232 | vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); | 1256 | if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { |
1233 | if (vma && vma->vm_start < addr + len) { | ||
1234 | if (do_munmap(mm, addr, len)) | 1257 | if (do_munmap(mm, addr, len)) |
1235 | return -ENOMEM; | 1258 | return -ENOMEM; |
1236 | goto munmap_back; | 1259 | goto munmap_back; |
@@ -1305,8 +1328,6 @@ munmap_back: | |||
1305 | error = file->f_op->mmap(file, vma); | 1328 | error = file->f_op->mmap(file, vma); |
1306 | if (error) | 1329 | if (error) |
1307 | goto unmap_and_free_vma; | 1330 | goto unmap_and_free_vma; |
1308 | if (vm_flags & VM_EXECUTABLE) | ||
1309 | added_exe_file_vma(mm); | ||
1310 | 1331 | ||
1311 | /* Can addr have changed?? | 1332 | /* Can addr have changed?? |
1312 | * | 1333 | * |
@@ -1757,13 +1778,16 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) | |||
1757 | if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { | 1778 | if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { |
1758 | error = acct_stack_growth(vma, size, grow); | 1779 | error = acct_stack_growth(vma, size, grow); |
1759 | if (!error) { | 1780 | if (!error) { |
1781 | anon_vma_interval_tree_pre_update_vma(vma); | ||
1760 | vma->vm_end = address; | 1782 | vma->vm_end = address; |
1783 | anon_vma_interval_tree_post_update_vma(vma); | ||
1761 | perf_event_mmap(vma); | 1784 | perf_event_mmap(vma); |
1762 | } | 1785 | } |
1763 | } | 1786 | } |
1764 | } | 1787 | } |
1765 | vma_unlock_anon_vma(vma); | 1788 | vma_unlock_anon_vma(vma); |
1766 | khugepaged_enter_vma_merge(vma); | 1789 | khugepaged_enter_vma_merge(vma); |
1790 | validate_mm(vma->vm_mm); | ||
1767 | return error; | 1791 | return error; |
1768 | } | 1792 | } |
1769 | #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ | 1793 | #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ |
@@ -1807,14 +1831,17 @@ int expand_downwards(struct vm_area_struct *vma, | |||
1807 | if (grow <= vma->vm_pgoff) { | 1831 | if (grow <= vma->vm_pgoff) { |
1808 | error = acct_stack_growth(vma, size, grow); | 1832 | error = acct_stack_growth(vma, size, grow); |
1809 | if (!error) { | 1833 | if (!error) { |
1834 | anon_vma_interval_tree_pre_update_vma(vma); | ||
1810 | vma->vm_start = address; | 1835 | vma->vm_start = address; |
1811 | vma->vm_pgoff -= grow; | 1836 | vma->vm_pgoff -= grow; |
1837 | anon_vma_interval_tree_post_update_vma(vma); | ||
1812 | perf_event_mmap(vma); | 1838 | perf_event_mmap(vma); |
1813 | } | 1839 | } |
1814 | } | 1840 | } |
1815 | } | 1841 | } |
1816 | vma_unlock_anon_vma(vma); | 1842 | vma_unlock_anon_vma(vma); |
1817 | khugepaged_enter_vma_merge(vma); | 1843 | khugepaged_enter_vma_merge(vma); |
1844 | validate_mm(vma->vm_mm); | ||
1818 | return error; | 1845 | return error; |
1819 | } | 1846 | } |
1820 | 1847 | ||
@@ -1988,11 +2015,8 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, | |||
1988 | if (anon_vma_clone(new, vma)) | 2015 | if (anon_vma_clone(new, vma)) |
1989 | goto out_free_mpol; | 2016 | goto out_free_mpol; |
1990 | 2017 | ||
1991 | if (new->vm_file) { | 2018 | if (new->vm_file) |
1992 | get_file(new->vm_file); | 2019 | get_file(new->vm_file); |
1993 | if (vma->vm_flags & VM_EXECUTABLE) | ||
1994 | added_exe_file_vma(mm); | ||
1995 | } | ||
1996 | 2020 | ||
1997 | if (new->vm_ops && new->vm_ops->open) | 2021 | if (new->vm_ops && new->vm_ops->open) |
1998 | new->vm_ops->open(new); | 2022 | new->vm_ops->open(new); |
@@ -2010,11 +2034,8 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, | |||
2010 | /* Clean everything up if vma_adjust failed. */ | 2034 | /* Clean everything up if vma_adjust failed. */ |
2011 | if (new->vm_ops && new->vm_ops->close) | 2035 | if (new->vm_ops && new->vm_ops->close) |
2012 | new->vm_ops->close(new); | 2036 | new->vm_ops->close(new); |
2013 | if (new->vm_file) { | 2037 | if (new->vm_file) |
2014 | if (vma->vm_flags & VM_EXECUTABLE) | ||
2015 | removed_exe_file_vma(mm); | ||
2016 | fput(new->vm_file); | 2038 | fput(new->vm_file); |
2017 | } | ||
2018 | unlink_anon_vmas(new); | 2039 | unlink_anon_vmas(new); |
2019 | out_free_mpol: | 2040 | out_free_mpol: |
2020 | mpol_put(pol); | 2041 | mpol_put(pol); |
@@ -2199,8 +2220,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) | |||
2199 | * Clear old maps. this also does some error checking for us | 2220 | * Clear old maps. this also does some error checking for us |
2200 | */ | 2221 | */ |
2201 | munmap_back: | 2222 | munmap_back: |
2202 | vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); | 2223 | if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { |
2203 | if (vma && vma->vm_start < addr + len) { | ||
2204 | if (do_munmap(mm, addr, len)) | 2224 | if (do_munmap(mm, addr, len)) |
2205 | return -ENOMEM; | 2225 | return -ENOMEM; |
2206 | goto munmap_back; | 2226 | goto munmap_back; |
@@ -2314,10 +2334,10 @@ void exit_mmap(struct mm_struct *mm) | |||
2314 | * and into the inode's i_mmap tree. If vm_file is non-NULL | 2334 | * and into the inode's i_mmap tree. If vm_file is non-NULL |
2315 | * then i_mmap_mutex is taken here. | 2335 | * then i_mmap_mutex is taken here. |
2316 | */ | 2336 | */ |
2317 | int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) | 2337 | int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) |
2318 | { | 2338 | { |
2319 | struct vm_area_struct * __vma, * prev; | 2339 | struct vm_area_struct *prev; |
2320 | struct rb_node ** rb_link, * rb_parent; | 2340 | struct rb_node **rb_link, *rb_parent; |
2321 | 2341 | ||
2322 | /* | 2342 | /* |
2323 | * The vm_pgoff of a purely anonymous vma should be irrelevant | 2343 | * The vm_pgoff of a purely anonymous vma should be irrelevant |
@@ -2335,8 +2355,8 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) | |||
2335 | BUG_ON(vma->anon_vma); | 2355 | BUG_ON(vma->anon_vma); |
2336 | vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; | 2356 | vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; |
2337 | } | 2357 | } |
2338 | __vma = find_vma_prepare(mm,vma->vm_start,&prev,&rb_link,&rb_parent); | 2358 | if (find_vma_links(mm, vma->vm_start, vma->vm_end, |
2339 | if (__vma && __vma->vm_start < vma->vm_end) | 2359 | &prev, &rb_link, &rb_parent)) |
2340 | return -ENOMEM; | 2360 | return -ENOMEM; |
2341 | if ((vma->vm_flags & VM_ACCOUNT) && | 2361 | if ((vma->vm_flags & VM_ACCOUNT) && |
2342 | security_vm_enough_memory_mm(mm, vma_pages(vma))) | 2362 | security_vm_enough_memory_mm(mm, vma_pages(vma))) |
@@ -2351,7 +2371,8 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) | |||
2351 | * prior to moving page table entries, to effect an mremap move. | 2371 | * prior to moving page table entries, to effect an mremap move. |
2352 | */ | 2372 | */ |
2353 | struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, | 2373 | struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, |
2354 | unsigned long addr, unsigned long len, pgoff_t pgoff) | 2374 | unsigned long addr, unsigned long len, pgoff_t pgoff, |
2375 | bool *need_rmap_locks) | ||
2355 | { | 2376 | { |
2356 | struct vm_area_struct *vma = *vmap; | 2377 | struct vm_area_struct *vma = *vmap; |
2357 | unsigned long vma_start = vma->vm_start; | 2378 | unsigned long vma_start = vma->vm_start; |
@@ -2370,7 +2391,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, | |||
2370 | faulted_in_anon_vma = false; | 2391 | faulted_in_anon_vma = false; |
2371 | } | 2392 | } |
2372 | 2393 | ||
2373 | find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); | 2394 | if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) |
2395 | return NULL; /* should never get here */ | ||
2374 | new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, | 2396 | new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, |
2375 | vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); | 2397 | vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); |
2376 | if (new_vma) { | 2398 | if (new_vma) { |
@@ -2392,32 +2414,29 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, | |||
2392 | * linear if there are no pages mapped yet. | 2414 | * linear if there are no pages mapped yet. |
2393 | */ | 2415 | */ |
2394 | VM_BUG_ON(faulted_in_anon_vma); | 2416 | VM_BUG_ON(faulted_in_anon_vma); |
2395 | *vmap = new_vma; | 2417 | *vmap = vma = new_vma; |
2396 | } else | 2418 | } |
2397 | anon_vma_moveto_tail(new_vma); | 2419 | *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); |
2398 | } else { | 2420 | } else { |
2399 | new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); | 2421 | new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); |
2400 | if (new_vma) { | 2422 | if (new_vma) { |
2401 | *new_vma = *vma; | 2423 | *new_vma = *vma; |
2424 | new_vma->vm_start = addr; | ||
2425 | new_vma->vm_end = addr + len; | ||
2426 | new_vma->vm_pgoff = pgoff; | ||
2402 | pol = mpol_dup(vma_policy(vma)); | 2427 | pol = mpol_dup(vma_policy(vma)); |
2403 | if (IS_ERR(pol)) | 2428 | if (IS_ERR(pol)) |
2404 | goto out_free_vma; | 2429 | goto out_free_vma; |
2430 | vma_set_policy(new_vma, pol); | ||
2405 | INIT_LIST_HEAD(&new_vma->anon_vma_chain); | 2431 | INIT_LIST_HEAD(&new_vma->anon_vma_chain); |
2406 | if (anon_vma_clone(new_vma, vma)) | 2432 | if (anon_vma_clone(new_vma, vma)) |
2407 | goto out_free_mempol; | 2433 | goto out_free_mempol; |
2408 | vma_set_policy(new_vma, pol); | 2434 | if (new_vma->vm_file) |
2409 | new_vma->vm_start = addr; | ||
2410 | new_vma->vm_end = addr + len; | ||
2411 | new_vma->vm_pgoff = pgoff; | ||
2412 | if (new_vma->vm_file) { | ||
2413 | get_file(new_vma->vm_file); | 2435 | get_file(new_vma->vm_file); |
2414 | |||
2415 | if (vma->vm_flags & VM_EXECUTABLE) | ||
2416 | added_exe_file_vma(mm); | ||
2417 | } | ||
2418 | if (new_vma->vm_ops && new_vma->vm_ops->open) | 2436 | if (new_vma->vm_ops && new_vma->vm_ops->open) |
2419 | new_vma->vm_ops->open(new_vma); | 2437 | new_vma->vm_ops->open(new_vma); |
2420 | vma_link(mm, new_vma, prev, rb_link, rb_parent); | 2438 | vma_link(mm, new_vma, prev, rb_link, rb_parent); |
2439 | *need_rmap_locks = false; | ||
2421 | } | 2440 | } |
2422 | } | 2441 | } |
2423 | return new_vma; | 2442 | return new_vma; |
@@ -2535,7 +2554,7 @@ static DEFINE_MUTEX(mm_all_locks_mutex); | |||
2535 | 2554 | ||
2536 | static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) | 2555 | static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) |
2537 | { | 2556 | { |
2538 | if (!test_bit(0, (unsigned long *) &anon_vma->root->head.next)) { | 2557 | if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) { |
2539 | /* | 2558 | /* |
2540 | * The LSB of head.next can't change from under us | 2559 | * The LSB of head.next can't change from under us |
2541 | * because we hold the mm_all_locks_mutex. | 2560 | * because we hold the mm_all_locks_mutex. |
@@ -2551,7 +2570,7 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) | |||
2551 | * anon_vma->root->mutex. | 2570 | * anon_vma->root->mutex. |
2552 | */ | 2571 | */ |
2553 | if (__test_and_set_bit(0, (unsigned long *) | 2572 | if (__test_and_set_bit(0, (unsigned long *) |
2554 | &anon_vma->root->head.next)) | 2573 | &anon_vma->root->rb_root.rb_node)) |
2555 | BUG(); | 2574 | BUG(); |
2556 | } | 2575 | } |
2557 | } | 2576 | } |
@@ -2592,7 +2611,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) | |||
2592 | * A single task can't take more than one mm_take_all_locks() in a row | 2611 | * A single task can't take more than one mm_take_all_locks() in a row |
2593 | * or it would deadlock. | 2612 | * or it would deadlock. |
2594 | * | 2613 | * |
2595 | * The LSB in anon_vma->head.next and the AS_MM_ALL_LOCKS bitflag in | 2614 | * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in |
2596 | * mapping->flags avoid to take the same lock twice, if more than one | 2615 | * mapping->flags avoid to take the same lock twice, if more than one |
2597 | * vma in this mm is backed by the same anon_vma or address_space. | 2616 | * vma in this mm is backed by the same anon_vma or address_space. |
2598 | * | 2617 | * |
@@ -2639,13 +2658,13 @@ out_unlock: | |||
2639 | 2658 | ||
2640 | static void vm_unlock_anon_vma(struct anon_vma *anon_vma) | 2659 | static void vm_unlock_anon_vma(struct anon_vma *anon_vma) |
2641 | { | 2660 | { |
2642 | if (test_bit(0, (unsigned long *) &anon_vma->root->head.next)) { | 2661 | if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) { |
2643 | /* | 2662 | /* |
2644 | * The LSB of head.next can't change to 0 from under | 2663 | * The LSB of head.next can't change to 0 from under |
2645 | * us because we hold the mm_all_locks_mutex. | 2664 | * us because we hold the mm_all_locks_mutex. |
2646 | * | 2665 | * |
2647 | * We must however clear the bitflag before unlocking | 2666 | * We must however clear the bitflag before unlocking |
2648 | * the vma so the users using the anon_vma->head will | 2667 | * the vma so the users using the anon_vma->rb_root will |
2649 | * never see our bitflag. | 2668 | * never see our bitflag. |
2650 | * | 2669 | * |
2651 | * No need of atomic instructions here, head.next | 2670 | * No need of atomic instructions here, head.next |
@@ -2653,7 +2672,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma) | |||
2653 | * anon_vma->root->mutex. | 2672 | * anon_vma->root->mutex. |
2654 | */ | 2673 | */ |
2655 | if (!__test_and_clear_bit(0, (unsigned long *) | 2674 | if (!__test_and_clear_bit(0, (unsigned long *) |
2656 | &anon_vma->root->head.next)) | 2675 | &anon_vma->root->rb_root.rb_node)) |
2657 | BUG(); | 2676 | BUG(); |
2658 | anon_vma_unlock(anon_vma); | 2677 | anon_vma_unlock(anon_vma); |
2659 | } | 2678 | } |
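vm_lock_anon_vma() and vm_unlock_anon_vma() above move the mm_take_all_locks() marker bit from anon_vma->head.next to anon_vma->root->rb_root.rb_node; the trick works because an rb-tree node pointer is at least word aligned, so its low bit is always free. A userspace sketch of that pointer-tagging idea with invented types; the kernel uses the bitop variants and relies on mm_all_locks_mutex for serialization, which the sketch does not model.

#include <assert.h>
#include <stdint.h>
#include <stdbool.h>

struct rb_root { void *rb_node; };
struct anon_vma_root { struct rb_root rb_root; };

bool already_marked(const struct anon_vma_root *root)
{
    return (uintptr_t)root->rb_root.rb_node & 1UL;
}

void mark_locked(struct anon_vma_root *root)
{
    uintptr_t v = (uintptr_t)root->rb_root.rb_node;

    assert(!(v & 1UL));       /* only one mm_take_all_locks() pass at a time */
    root->rb_root.rb_node = (void *)(v | 1UL);
}

void clear_locked(struct anon_vma_root *root)
{
    uintptr_t v = (uintptr_t)root->rb_root.rb_node;

    assert(v & 1UL);
    root->rb_root.rb_node = (void *)(v & ~1UL);
}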
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 862b60822d9f..479a1e751a73 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c | |||
@@ -14,10 +14,14 @@ | |||
14 | #include <linux/export.h> | 14 | #include <linux/export.h> |
15 | #include <linux/mm.h> | 15 | #include <linux/mm.h> |
16 | #include <linux/err.h> | 16 | #include <linux/err.h> |
17 | #include <linux/srcu.h> | ||
17 | #include <linux/rcupdate.h> | 18 | #include <linux/rcupdate.h> |
18 | #include <linux/sched.h> | 19 | #include <linux/sched.h> |
19 | #include <linux/slab.h> | 20 | #include <linux/slab.h> |
20 | 21 | ||
22 | /* global SRCU for all MMs */ | ||
23 | static struct srcu_struct srcu; | ||
24 | |||
21 | /* | 25 | /* |
22 | * This function can't run concurrently against mmu_notifier_register | 26 | * This function can't run concurrently against mmu_notifier_register |
23 | * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap | 27 | * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap |
@@ -25,8 +29,8 @@ | |||
25 | * in parallel despite there being no task using this mm any more, | 29 | * in parallel despite there being no task using this mm any more, |
26 | * through the vmas outside of the exit_mmap context, such as with | 30 | * through the vmas outside of the exit_mmap context, such as with |
27 | * vmtruncate. This serializes against mmu_notifier_unregister with | 31 | * vmtruncate. This serializes against mmu_notifier_unregister with |
28 | * the mmu_notifier_mm->lock in addition to RCU and it serializes | 32 | * the mmu_notifier_mm->lock in addition to SRCU and it serializes |
29 | * against the other mmu notifiers with RCU. struct mmu_notifier_mm | 33 | * against the other mmu notifiers with SRCU. struct mmu_notifier_mm |
30 | * can't go away from under us as exit_mmap holds an mm_count pin | 34 | * can't go away from under us as exit_mmap holds an mm_count pin |
31 | * itself. | 35 | * itself. |
32 | */ | 36 | */ |
@@ -34,12 +38,13 @@ void __mmu_notifier_release(struct mm_struct *mm) | |||
34 | { | 38 | { |
35 | struct mmu_notifier *mn; | 39 | struct mmu_notifier *mn; |
36 | struct hlist_node *n; | 40 | struct hlist_node *n; |
41 | int id; | ||
37 | 42 | ||
38 | /* | 43 | /* |
39 | * RCU here will block mmu_notifier_unregister until | 44 | * SRCU here will block mmu_notifier_unregister until |
40 | * ->release returns. | 45 | * ->release returns. |
41 | */ | 46 | */ |
42 | rcu_read_lock(); | 47 | id = srcu_read_lock(&srcu); |
43 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) | 48 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) |
44 | /* | 49 | /* |
45 | * if ->release runs before mmu_notifier_unregister it | 50 | * if ->release runs before mmu_notifier_unregister it |
@@ -50,7 +55,7 @@ void __mmu_notifier_release(struct mm_struct *mm) | |||
50 | */ | 55 | */ |
51 | if (mn->ops->release) | 56 | if (mn->ops->release) |
52 | mn->ops->release(mn, mm); | 57 | mn->ops->release(mn, mm); |
53 | rcu_read_unlock(); | 58 | srcu_read_unlock(&srcu, id); |
54 | 59 | ||
55 | spin_lock(&mm->mmu_notifier_mm->lock); | 60 | spin_lock(&mm->mmu_notifier_mm->lock); |
56 | while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { | 61 | while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { |
@@ -68,7 +73,7 @@ void __mmu_notifier_release(struct mm_struct *mm) | |||
68 | spin_unlock(&mm->mmu_notifier_mm->lock); | 73 | spin_unlock(&mm->mmu_notifier_mm->lock); |
69 | 74 | ||
70 | /* | 75 | /* |
71 | * synchronize_rcu here prevents mmu_notifier_release to | 76 | * synchronize_srcu here prevents mmu_notifier_release to |
72 | * return to exit_mmap (which would proceed freeing all pages | 77 | * return to exit_mmap (which would proceed freeing all pages |
73 | * in the mm) until the ->release method returns, if it was | 78 | * in the mm) until the ->release method returns, if it was |
74 | * invoked by mmu_notifier_unregister. | 79 | * invoked by mmu_notifier_unregister. |
@@ -76,7 +81,7 @@ void __mmu_notifier_release(struct mm_struct *mm) | |||
76 | * The mmu_notifier_mm can't go away from under us because one | 81 | * The mmu_notifier_mm can't go away from under us because one |
77 | * mm_count is hold by exit_mmap. | 82 | * mm_count is hold by exit_mmap. |
78 | */ | 83 | */ |
79 | synchronize_rcu(); | 84 | synchronize_srcu(&srcu); |
80 | } | 85 | } |
81 | 86 | ||
82 | /* | 87 | /* |
@@ -89,14 +94,14 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm, | |||
89 | { | 94 | { |
90 | struct mmu_notifier *mn; | 95 | struct mmu_notifier *mn; |
91 | struct hlist_node *n; | 96 | struct hlist_node *n; |
92 | int young = 0; | 97 | int young = 0, id; |
93 | 98 | ||
94 | rcu_read_lock(); | 99 | id = srcu_read_lock(&srcu); |
95 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { | 100 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { |
96 | if (mn->ops->clear_flush_young) | 101 | if (mn->ops->clear_flush_young) |
97 | young |= mn->ops->clear_flush_young(mn, mm, address); | 102 | young |= mn->ops->clear_flush_young(mn, mm, address); |
98 | } | 103 | } |
99 | rcu_read_unlock(); | 104 | srcu_read_unlock(&srcu, id); |
100 | 105 | ||
101 | return young; | 106 | return young; |
102 | } | 107 | } |
@@ -106,9 +111,9 @@ int __mmu_notifier_test_young(struct mm_struct *mm, | |||
106 | { | 111 | { |
107 | struct mmu_notifier *mn; | 112 | struct mmu_notifier *mn; |
108 | struct hlist_node *n; | 113 | struct hlist_node *n; |
109 | int young = 0; | 114 | int young = 0, id; |
110 | 115 | ||
111 | rcu_read_lock(); | 116 | id = srcu_read_lock(&srcu); |
112 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { | 117 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { |
113 | if (mn->ops->test_young) { | 118 | if (mn->ops->test_young) { |
114 | young = mn->ops->test_young(mn, mm, address); | 119 | young = mn->ops->test_young(mn, mm, address); |
@@ -116,7 +121,7 @@ int __mmu_notifier_test_young(struct mm_struct *mm, | |||
116 | break; | 121 | break; |
117 | } | 122 | } |
118 | } | 123 | } |
119 | rcu_read_unlock(); | 124 | srcu_read_unlock(&srcu, id); |
120 | 125 | ||
121 | return young; | 126 | return young; |
122 | } | 127 | } |
@@ -126,19 +131,14 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, | |||
126 | { | 131 | { |
127 | struct mmu_notifier *mn; | 132 | struct mmu_notifier *mn; |
128 | struct hlist_node *n; | 133 | struct hlist_node *n; |
134 | int id; | ||
129 | 135 | ||
130 | rcu_read_lock(); | 136 | id = srcu_read_lock(&srcu); |
131 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { | 137 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { |
132 | if (mn->ops->change_pte) | 138 | if (mn->ops->change_pte) |
133 | mn->ops->change_pte(mn, mm, address, pte); | 139 | mn->ops->change_pte(mn, mm, address, pte); |
134 | /* | ||
135 | * Some drivers don't have change_pte, | ||
136 | * so we must call invalidate_page in that case. | ||
137 | */ | ||
138 | else if (mn->ops->invalidate_page) | ||
139 | mn->ops->invalidate_page(mn, mm, address); | ||
140 | } | 140 | } |
141 | rcu_read_unlock(); | 141 | srcu_read_unlock(&srcu, id); |
142 | } | 142 | } |
143 | 143 | ||
144 | void __mmu_notifier_invalidate_page(struct mm_struct *mm, | 144 | void __mmu_notifier_invalidate_page(struct mm_struct *mm, |
@@ -146,13 +146,14 @@ void __mmu_notifier_invalidate_page(struct mm_struct *mm, | |||
146 | { | 146 | { |
147 | struct mmu_notifier *mn; | 147 | struct mmu_notifier *mn; |
148 | struct hlist_node *n; | 148 | struct hlist_node *n; |
149 | int id; | ||
149 | 150 | ||
150 | rcu_read_lock(); | 151 | id = srcu_read_lock(&srcu); |
151 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { | 152 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { |
152 | if (mn->ops->invalidate_page) | 153 | if (mn->ops->invalidate_page) |
153 | mn->ops->invalidate_page(mn, mm, address); | 154 | mn->ops->invalidate_page(mn, mm, address); |
154 | } | 155 | } |
155 | rcu_read_unlock(); | 156 | srcu_read_unlock(&srcu, id); |
156 | } | 157 | } |
157 | 158 | ||
158 | void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, | 159 | void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, |
@@ -160,13 +161,14 @@ void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, | |||
160 | { | 161 | { |
161 | struct mmu_notifier *mn; | 162 | struct mmu_notifier *mn; |
162 | struct hlist_node *n; | 163 | struct hlist_node *n; |
164 | int id; | ||
163 | 165 | ||
164 | rcu_read_lock(); | 166 | id = srcu_read_lock(&srcu); |
165 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { | 167 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { |
166 | if (mn->ops->invalidate_range_start) | 168 | if (mn->ops->invalidate_range_start) |
167 | mn->ops->invalidate_range_start(mn, mm, start, end); | 169 | mn->ops->invalidate_range_start(mn, mm, start, end); |
168 | } | 170 | } |
169 | rcu_read_unlock(); | 171 | srcu_read_unlock(&srcu, id); |
170 | } | 172 | } |
171 | 173 | ||
172 | void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, | 174 | void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, |
@@ -174,13 +176,14 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, | |||
174 | { | 176 | { |
175 | struct mmu_notifier *mn; | 177 | struct mmu_notifier *mn; |
176 | struct hlist_node *n; | 178 | struct hlist_node *n; |
179 | int id; | ||
177 | 180 | ||
178 | rcu_read_lock(); | 181 | id = srcu_read_lock(&srcu); |
179 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { | 182 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { |
180 | if (mn->ops->invalidate_range_end) | 183 | if (mn->ops->invalidate_range_end) |
181 | mn->ops->invalidate_range_end(mn, mm, start, end); | 184 | mn->ops->invalidate_range_end(mn, mm, start, end); |
182 | } | 185 | } |
183 | rcu_read_unlock(); | 186 | srcu_read_unlock(&srcu, id); |
184 | } | 187 | } |
185 | 188 | ||
186 | static int do_mmu_notifier_register(struct mmu_notifier *mn, | 189 | static int do_mmu_notifier_register(struct mmu_notifier *mn, |
@@ -192,22 +195,29 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn, | |||
192 | 195 | ||
193 | BUG_ON(atomic_read(&mm->mm_users) <= 0); | 196 | BUG_ON(atomic_read(&mm->mm_users) <= 0); |
194 | 197 | ||
195 | ret = -ENOMEM; | 198 | /* |
196 | mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL); | 199 | * Verify that mmu_notifier_init() already run and the global srcu is |
197 | if (unlikely(!mmu_notifier_mm)) | 200 | * initialized. |
198 | goto out; | 201 | */ |
202 | BUG_ON(!srcu.per_cpu_ref); | ||
199 | 203 | ||
200 | if (take_mmap_sem) | 204 | if (take_mmap_sem) |
201 | down_write(&mm->mmap_sem); | 205 | down_write(&mm->mmap_sem); |
202 | ret = mm_take_all_locks(mm); | 206 | ret = mm_take_all_locks(mm); |
203 | if (unlikely(ret)) | 207 | if (unlikely(ret)) |
204 | goto out_cleanup; | 208 | goto out; |
205 | 209 | ||
206 | if (!mm_has_notifiers(mm)) { | 210 | if (!mm_has_notifiers(mm)) { |
211 | mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), | ||
212 | GFP_KERNEL); | ||
213 | if (unlikely(!mmu_notifier_mm)) { | ||
214 | ret = -ENOMEM; | ||
215 | goto out_of_mem; | ||
216 | } | ||
207 | INIT_HLIST_HEAD(&mmu_notifier_mm->list); | 217 | INIT_HLIST_HEAD(&mmu_notifier_mm->list); |
208 | spin_lock_init(&mmu_notifier_mm->lock); | 218 | spin_lock_init(&mmu_notifier_mm->lock); |
219 | |||
209 | mm->mmu_notifier_mm = mmu_notifier_mm; | 220 | mm->mmu_notifier_mm = mmu_notifier_mm; |
210 | mmu_notifier_mm = NULL; | ||
211 | } | 221 | } |
212 | atomic_inc(&mm->mm_count); | 222 | atomic_inc(&mm->mm_count); |
213 | 223 | ||
@@ -223,13 +233,12 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn, | |||
223 | hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list); | 233 | hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list); |
224 | spin_unlock(&mm->mmu_notifier_mm->lock); | 234 | spin_unlock(&mm->mmu_notifier_mm->lock); |
225 | 235 | ||
236 | out_of_mem: | ||
226 | mm_drop_all_locks(mm); | 237 | mm_drop_all_locks(mm); |
227 | out_cleanup: | 238 | out: |
228 | if (take_mmap_sem) | 239 | if (take_mmap_sem) |
229 | up_write(&mm->mmap_sem); | 240 | up_write(&mm->mmap_sem); |
230 | /* kfree() does nothing if mmu_notifier_mm is NULL */ | 241 | |
231 | kfree(mmu_notifier_mm); | ||
232 | out: | ||
233 | BUG_ON(atomic_read(&mm->mm_users) <= 0); | 242 | BUG_ON(atomic_read(&mm->mm_users) <= 0); |
234 | return ret; | 243 | return ret; |
235 | } | 244 | } |
@@ -274,8 +283,8 @@ void __mmu_notifier_mm_destroy(struct mm_struct *mm) | |||
274 | /* | 283 | /* |
275 | * This releases the mm_count pin automatically and frees the mm | 284 | * This releases the mm_count pin automatically and frees the mm |
276 | * structure if it was the last user of it. It serializes against | 285 | * structure if it was the last user of it. It serializes against |
277 | * running mmu notifiers with RCU and against mmu_notifier_unregister | 286 | * running mmu notifiers with SRCU and against mmu_notifier_unregister |
278 | * with the unregister lock + RCU. All sptes must be dropped before | 287 | * with the unregister lock + SRCU. All sptes must be dropped before |
279 | * calling mmu_notifier_unregister. ->release or any other notifier | 288 | * calling mmu_notifier_unregister. ->release or any other notifier |
280 | * method may be invoked concurrently with mmu_notifier_unregister, | 289 | * method may be invoked concurrently with mmu_notifier_unregister, |
281 | * and only after mmu_notifier_unregister returned we're guaranteed | 290 | * and only after mmu_notifier_unregister returned we're guaranteed |
@@ -287,11 +296,12 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) | |||
287 | 296 | ||
288 | if (!hlist_unhashed(&mn->hlist)) { | 297 | if (!hlist_unhashed(&mn->hlist)) { |
289 | /* | 298 | /* |
290 | * RCU here will force exit_mmap to wait ->release to finish | 299 | * SRCU here will force exit_mmap to wait for ->release to finish |
291 | * before freeing the pages. | 300 | * before freeing the pages. |
292 | */ | 301 | */ |
293 | rcu_read_lock(); | 302 | int id; |
294 | 303 | ||
304 | id = srcu_read_lock(&srcu); | ||
295 | /* | 305 | /* |
296 | * exit_mmap will block in mmu_notifier_release to | 306 | * exit_mmap will block in mmu_notifier_release to |
297 | * guarantee ->release is called before freeing the | 307 | * guarantee ->release is called before freeing the |
@@ -299,7 +309,7 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) | |||
299 | */ | 309 | */ |
300 | if (mn->ops->release) | 310 | if (mn->ops->release) |
301 | mn->ops->release(mn, mm); | 311 | mn->ops->release(mn, mm); |
302 | rcu_read_unlock(); | 312 | srcu_read_unlock(&srcu, id); |
303 | 313 | ||
304 | spin_lock(&mm->mmu_notifier_mm->lock); | 314 | spin_lock(&mm->mmu_notifier_mm->lock); |
305 | hlist_del_rcu(&mn->hlist); | 315 | hlist_del_rcu(&mn->hlist); |
@@ -310,10 +320,17 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) | |||
310 | * Wait for any running method to finish, of course including | 320 | * Wait for any running method to finish, of course including |
311 | * ->release if it was run by mmu_notifier_release instead of us. | 321 | * ->release if it was run by mmu_notifier_release instead of us. |
312 | */ | 322 | */ |
313 | synchronize_rcu(); | 323 | synchronize_srcu(&srcu); |
314 | 324 | ||
315 | BUG_ON(atomic_read(&mm->mm_count) <= 0); | 325 | BUG_ON(atomic_read(&mm->mm_count) <= 0); |
316 | 326 | ||
317 | mmdrop(mm); | 327 | mmdrop(mm); |
318 | } | 328 | } |
319 | EXPORT_SYMBOL_GPL(mmu_notifier_unregister); | 329 | EXPORT_SYMBOL_GPL(mmu_notifier_unregister); |
330 | |||
331 | static int __init mmu_notifier_init(void) | ||
332 | { | ||
333 | return init_srcu_struct(&srcu); | ||
334 | } | ||
335 | |||
336 | module_init(mmu_notifier_init); | ||
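The switch above from rcu_read_lock()/synchronize_rcu() to SRCU is what lets ->release() and the invalidation callbacks sleep while unregister can still wait for every callback that is already running. As a rough userspace illustration of that grace-period guarantee only (this is not the kernel's SRCU implementation, and the fake_* names are invented for the sketch), a reader count plus a condition variable gives the same "wait until in-flight read-side sections drain" behaviour:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t drained = PTHREAD_COND_INITIALIZER;
static int readers;			/* read-side sections in flight */

static void fake_read_enter(void)	/* stands in for srcu_read_lock() */
{
	pthread_mutex_lock(&lock);
	readers++;
	pthread_mutex_unlock(&lock);
}

static void fake_read_exit(void)	/* stands in for srcu_read_unlock() */
{
	pthread_mutex_lock(&lock);
	if (--readers == 0)
		pthread_cond_broadcast(&drained);
	pthread_mutex_unlock(&lock);
}

static void fake_synchronize(void)	/* stands in for synchronize_srcu() */
{
	pthread_mutex_lock(&lock);
	while (readers)			/* may sleep, which is the point */
		pthread_cond_wait(&drained, &lock);
	pthread_mutex_unlock(&lock);
}

static void *notifier_callback(void *arg)
{
	(void)arg;
	fake_read_enter();
	/* ->release() or ->invalidate_range_*() would run here */
	fake_read_exit();
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, notifier_callback, NULL);
	fake_synchronize();	/* every section entered before this call has finished */
	pthread_join(t, NULL);
	printf("safe to tear down the notifier\n");
	return 0;
}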
diff --git a/mm/mremap.c b/mm/mremap.c index cc06d0e48d05..1b61c2d3307a 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -71,22 +71,41 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma, | |||
71 | static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, | 71 | static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, |
72 | unsigned long old_addr, unsigned long old_end, | 72 | unsigned long old_addr, unsigned long old_end, |
73 | struct vm_area_struct *new_vma, pmd_t *new_pmd, | 73 | struct vm_area_struct *new_vma, pmd_t *new_pmd, |
74 | unsigned long new_addr) | 74 | unsigned long new_addr, bool need_rmap_locks) |
75 | { | 75 | { |
76 | struct address_space *mapping = NULL; | 76 | struct address_space *mapping = NULL; |
77 | struct anon_vma *anon_vma = NULL; | ||
77 | struct mm_struct *mm = vma->vm_mm; | 78 | struct mm_struct *mm = vma->vm_mm; |
78 | pte_t *old_pte, *new_pte, pte; | 79 | pte_t *old_pte, *new_pte, pte; |
79 | spinlock_t *old_ptl, *new_ptl; | 80 | spinlock_t *old_ptl, *new_ptl; |
80 | 81 | ||
81 | if (vma->vm_file) { | 82 | /* |
82 | /* | 83 | * When need_rmap_locks is true, we take the i_mmap_mutex and anon_vma |
83 | * Subtle point from Rajesh Venkatasubramanian: before | 84 | * locks to ensure that rmap will always observe either the old or the |
84 | * moving file-based ptes, we must lock truncate_pagecache | 85 | * new ptes. This is the easiest way to avoid races with |
85 | * out, since it might clean the dst vma before the src vma, | 86 | * truncate_pagecache(), page migration, etc... |
86 | * and we propagate stale pages into the dst afterward. | 87 | * |
87 | */ | 88 | * When need_rmap_locks is false, we use other ways to avoid |
88 | mapping = vma->vm_file->f_mapping; | 89 | * such races: |
89 | mutex_lock(&mapping->i_mmap_mutex); | 90 | * |
91 | * - During exec() shift_arg_pages(), we use a specially tagged vma | ||
92 | * which rmap call sites look for using is_vma_temporary_stack(). | ||
93 | * | ||
94 | * - During mremap(), new_vma is often known to be placed after vma | ||
95 | * in rmap traversal order. This ensures rmap will always observe | ||
96 | * either the old pte, or the new pte, or both (the page table locks | ||
97 | * serialize access to individual ptes, but only rmap traversal | ||
98 | * order guarantees that we won't miss both the old and new ptes). | ||
99 | */ | ||
100 | if (need_rmap_locks) { | ||
101 | if (vma->vm_file) { | ||
102 | mapping = vma->vm_file->f_mapping; | ||
103 | mutex_lock(&mapping->i_mmap_mutex); | ||
104 | } | ||
105 | if (vma->anon_vma) { | ||
106 | anon_vma = vma->anon_vma; | ||
107 | anon_vma_lock(anon_vma); | ||
108 | } | ||
90 | } | 109 | } |
91 | 110 | ||
92 | /* | 111 | /* |
@@ -114,6 +133,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, | |||
114 | spin_unlock(new_ptl); | 133 | spin_unlock(new_ptl); |
115 | pte_unmap(new_pte - 1); | 134 | pte_unmap(new_pte - 1); |
116 | pte_unmap_unlock(old_pte - 1, old_ptl); | 135 | pte_unmap_unlock(old_pte - 1, old_ptl); |
136 | if (anon_vma) | ||
137 | anon_vma_unlock(anon_vma); | ||
117 | if (mapping) | 138 | if (mapping) |
118 | mutex_unlock(&mapping->i_mmap_mutex); | 139 | mutex_unlock(&mapping->i_mmap_mutex); |
119 | } | 140 | } |
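The new comment above move_ptes() spells out when the rmap locks must be held: whenever the caller cannot rely on rmap traversal order or a temporary-stack vma to keep walkers from missing both the old and the new pte. The conditional lock/unlock shape it describes, reduced to a hedged userspace sketch (pthread mutexes stand in for i_mmap_mutex and the anon_vma lock; nothing here is kernel API):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t i_mmap_lock = PTHREAD_MUTEX_INITIALIZER;	/* file rmap */
static pthread_mutex_t anon_lock   = PTHREAD_MUTEX_INITIALIZER;	/* anon rmap */

static void move_ptes_model(bool has_file, bool has_anon, bool need_rmap_locks)
{
	/* Take the rmap locks only when nothing else closes the race. */
	if (need_rmap_locks) {
		if (has_file)
			pthread_mutex_lock(&i_mmap_lock);
		if (has_anon)
			pthread_mutex_lock(&anon_lock);
	}

	/* ... copy and clear the page table entries here ... */

	if (need_rmap_locks) {
		if (has_anon)
			pthread_mutex_unlock(&anon_lock);
		if (has_file)
			pthread_mutex_unlock(&i_mmap_lock);
	}
	printf("ptes moved, rmap locks %staken\n", need_rmap_locks ? "" : "not ");
}

int main(void)
{
	move_ptes_model(true, true, true);	/* e.g. new_vma placed before vma */
	move_ptes_model(true, true, false);	/* e.g. exec()'s shift_arg_pages() */
	return 0;
}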
@@ -122,16 +143,21 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, | |||
122 | 143 | ||
123 | unsigned long move_page_tables(struct vm_area_struct *vma, | 144 | unsigned long move_page_tables(struct vm_area_struct *vma, |
124 | unsigned long old_addr, struct vm_area_struct *new_vma, | 145 | unsigned long old_addr, struct vm_area_struct *new_vma, |
125 | unsigned long new_addr, unsigned long len) | 146 | unsigned long new_addr, unsigned long len, |
147 | bool need_rmap_locks) | ||
126 | { | 148 | { |
127 | unsigned long extent, next, old_end; | 149 | unsigned long extent, next, old_end; |
128 | pmd_t *old_pmd, *new_pmd; | 150 | pmd_t *old_pmd, *new_pmd; |
129 | bool need_flush = false; | 151 | bool need_flush = false; |
152 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
153 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
130 | 154 | ||
131 | old_end = old_addr + len; | 155 | old_end = old_addr + len; |
132 | flush_cache_range(vma, old_addr, old_end); | 156 | flush_cache_range(vma, old_addr, old_end); |
133 | 157 | ||
134 | mmu_notifier_invalidate_range_start(vma->vm_mm, old_addr, old_end); | 158 | mmun_start = old_addr; |
159 | mmun_end = old_end; | ||
160 | mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); | ||
135 | 161 | ||
136 | for (; old_addr < old_end; old_addr += extent, new_addr += extent) { | 162 | for (; old_addr < old_end; old_addr += extent, new_addr += extent) { |
137 | cond_resched(); | 163 | cond_resched(); |
@@ -169,13 +195,13 @@ unsigned long move_page_tables(struct vm_area_struct *vma, | |||
169 | if (extent > LATENCY_LIMIT) | 195 | if (extent > LATENCY_LIMIT) |
170 | extent = LATENCY_LIMIT; | 196 | extent = LATENCY_LIMIT; |
171 | move_ptes(vma, old_pmd, old_addr, old_addr + extent, | 197 | move_ptes(vma, old_pmd, old_addr, old_addr + extent, |
172 | new_vma, new_pmd, new_addr); | 198 | new_vma, new_pmd, new_addr, need_rmap_locks); |
173 | need_flush = true; | 199 | need_flush = true; |
174 | } | 200 | } |
175 | if (likely(need_flush)) | 201 | if (likely(need_flush)) |
176 | flush_tlb_range(vma, old_end-len, old_addr); | 202 | flush_tlb_range(vma, old_end-len, old_addr); |
177 | 203 | ||
178 | mmu_notifier_invalidate_range_end(vma->vm_mm, old_end-len, old_end); | 204 | mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); |
179 | 205 | ||
180 | return len + old_addr - old_end; /* how much done */ | 206 | return len + old_addr - old_end; /* how much done */ |
181 | } | 207 | } |
@@ -193,6 +219,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, | |||
193 | unsigned long hiwater_vm; | 219 | unsigned long hiwater_vm; |
194 | int split = 0; | 220 | int split = 0; |
195 | int err; | 221 | int err; |
222 | bool need_rmap_locks; | ||
196 | 223 | ||
197 | /* | 224 | /* |
198 | * We'd prefer to avoid failure later on in do_munmap: | 225 | * We'd prefer to avoid failure later on in do_munmap: |
@@ -214,27 +241,21 @@ static unsigned long move_vma(struct vm_area_struct *vma, | |||
214 | return err; | 241 | return err; |
215 | 242 | ||
216 | new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); | 243 | new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); |
217 | new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff); | 244 | new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff, |
245 | &need_rmap_locks); | ||
218 | if (!new_vma) | 246 | if (!new_vma) |
219 | return -ENOMEM; | 247 | return -ENOMEM; |
220 | 248 | ||
221 | moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len); | 249 | moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len, |
250 | need_rmap_locks); | ||
222 | if (moved_len < old_len) { | 251 | if (moved_len < old_len) { |
223 | /* | 252 | /* |
224 | * Before moving the page tables from the new vma to | ||
225 | * the old vma, we need to be sure the old vma is | ||
226 | * queued after new vma in the same_anon_vma list to | ||
227 | * prevent SMP races with rmap_walk (that could lead | ||
228 | * rmap_walk to miss some page table). | ||
229 | */ | ||
230 | anon_vma_moveto_tail(vma); | ||
231 | |||
232 | /* | ||
233 | * On error, move entries back from new area to old, | 253 | * On error, move entries back from new area to old, |
234 | * which will succeed since page tables still there, | 254 | * which will succeed since page tables still there, |
235 | * and then proceed to unmap new area instead of old. | 255 | * and then proceed to unmap new area instead of old. |
236 | */ | 256 | */ |
237 | move_page_tables(new_vma, new_addr, vma, old_addr, moved_len); | 257 | move_page_tables(new_vma, new_addr, vma, old_addr, moved_len, |
258 | true); | ||
238 | vma = new_vma; | 259 | vma = new_vma; |
239 | old_len = new_len; | 260 | old_len = new_len; |
240 | old_addr = new_addr; | 261 | old_addr = new_addr; |
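When move_page_tables() comes up short (moved_len < old_len), the error path above replays the move in the opposite direction, with the rmap locks held, so no entry is lost before the new area is unmapped. A loose standalone analogy of that rollback, using an array in place of page tables (move_entries() and its 'limit' failure knob are invented for this sketch):

#include <stdio.h>

/* Move up to len entries from src to dst; 'limit' simulates the point where
 * the real code would hit an allocation failure and stop early. */
static size_t move_entries(long *dst, long *src, size_t len, size_t limit)
{
	size_t moved = len < limit ? len : limit;

	for (size_t i = 0; i < moved; i++) {
		dst[i] = src[i];
		src[i] = 0;		/* the entry now lives only at dst */
	}
	return moved;
}

int main(void)
{
	long old_area[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
	long new_area[8] = { 0 };
	size_t moved;

	moved = move_entries(new_area, old_area, 8, 5);		/* "fails" early */
	if (moved < 8)
		/* Error path: put the moved prefix back, then give up on new_area. */
		move_entries(old_area, new_area, moved, moved);

	for (size_t i = 0; i < 8; i++)
		printf("%ld ", old_area[i]);
	printf("\n");			/* prints 1..8: nothing was lost */
	return 0;
}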
diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 405573010f99..714d5d650470 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c | |||
@@ -116,6 +116,8 @@ static unsigned long __init __free_memory_core(phys_addr_t start, | |||
116 | return 0; | 116 | return 0; |
117 | 117 | ||
118 | __free_pages_memory(start_pfn, end_pfn); | 118 | __free_pages_memory(start_pfn, end_pfn); |
119 | fixup_zone_present_pages(pfn_to_nid(start >> PAGE_SHIFT), | ||
120 | start_pfn, end_pfn); | ||
119 | 121 | ||
120 | return end_pfn - start_pfn; | 122 | return end_pfn - start_pfn; |
121 | } | 123 | } |
@@ -126,6 +128,7 @@ unsigned long __init free_low_memory_core_early(int nodeid) | |||
126 | phys_addr_t start, end, size; | 128 | phys_addr_t start, end, size; |
127 | u64 i; | 129 | u64 i; |
128 | 130 | ||
131 | reset_zone_present_pages(); | ||
129 | for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) | 132 | for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) |
130 | count += __free_memory_core(start, end); | 133 | count += __free_memory_core(start, end); |
131 | 134 | ||
@@ -162,8 +165,6 @@ unsigned long __init free_all_bootmem(void) | |||
162 | * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id | 165 | * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id |
163 | * because in some cases, like when Node0 doesn't have RAM installed, | 166 | * because in some cases, like when Node0 doesn't have RAM installed, |
164 | * low RAM will be on Node1 | 167 | * low RAM will be on Node1 |
165 | * Use MAX_NUMNODES will make sure all ranges in early_node_map[] | ||
166 | * will be used instead of only Node0 related | ||
167 | */ | 168 | */ |
168 | return free_low_memory_core_early(MAX_NUMNODES); | 169 | return free_low_memory_core_early(MAX_NUMNODES); |
169 | } | 170 | } |
diff --git a/mm/nommu.c b/mm/nommu.c index dee2ff89fd58..45131b41bcdb 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -698,7 +698,7 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) | |||
698 | 698 | ||
699 | mutex_lock(&mapping->i_mmap_mutex); | 699 | mutex_lock(&mapping->i_mmap_mutex); |
700 | flush_dcache_mmap_lock(mapping); | 700 | flush_dcache_mmap_lock(mapping); |
701 | vma_prio_tree_insert(vma, &mapping->i_mmap); | 701 | vma_interval_tree_insert(vma, &mapping->i_mmap); |
702 | flush_dcache_mmap_unlock(mapping); | 702 | flush_dcache_mmap_unlock(mapping); |
703 | mutex_unlock(&mapping->i_mmap_mutex); | 703 | mutex_unlock(&mapping->i_mmap_mutex); |
704 | } | 704 | } |
@@ -764,7 +764,7 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) | |||
764 | 764 | ||
765 | mutex_lock(&mapping->i_mmap_mutex); | 765 | mutex_lock(&mapping->i_mmap_mutex); |
766 | flush_dcache_mmap_lock(mapping); | 766 | flush_dcache_mmap_lock(mapping); |
767 | vma_prio_tree_remove(vma, &mapping->i_mmap); | 767 | vma_interval_tree_remove(vma, &mapping->i_mmap); |
768 | flush_dcache_mmap_unlock(mapping); | 768 | flush_dcache_mmap_unlock(mapping); |
769 | mutex_unlock(&mapping->i_mmap_mutex); | 769 | mutex_unlock(&mapping->i_mmap_mutex); |
770 | } | 770 | } |
@@ -789,11 +789,8 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma) | |||
789 | kenter("%p", vma); | 789 | kenter("%p", vma); |
790 | if (vma->vm_ops && vma->vm_ops->close) | 790 | if (vma->vm_ops && vma->vm_ops->close) |
791 | vma->vm_ops->close(vma); | 791 | vma->vm_ops->close(vma); |
792 | if (vma->vm_file) { | 792 | if (vma->vm_file) |
793 | fput(vma->vm_file); | 793 | fput(vma->vm_file); |
794 | if (vma->vm_flags & VM_EXECUTABLE) | ||
795 | removed_exe_file_vma(mm); | ||
796 | } | ||
797 | put_nommu_region(vma->vm_region); | 794 | put_nommu_region(vma->vm_region); |
798 | kmem_cache_free(vm_area_cachep, vma); | 795 | kmem_cache_free(vm_area_cachep, vma); |
799 | } | 796 | } |
@@ -1284,10 +1281,6 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
1284 | if (file) { | 1281 | if (file) { |
1285 | region->vm_file = get_file(file); | 1282 | region->vm_file = get_file(file); |
1286 | vma->vm_file = get_file(file); | 1283 | vma->vm_file = get_file(file); |
1287 | if (vm_flags & VM_EXECUTABLE) { | ||
1288 | added_exe_file_vma(current->mm); | ||
1289 | vma->vm_mm = current->mm; | ||
1290 | } | ||
1291 | } | 1284 | } |
1292 | 1285 | ||
1293 | down_write(&nommu_region_sem); | 1286 | down_write(&nommu_region_sem); |
@@ -1440,8 +1433,6 @@ error: | |||
1440 | kmem_cache_free(vm_region_jar, region); | 1433 | kmem_cache_free(vm_region_jar, region); |
1441 | if (vma->vm_file) | 1434 | if (vma->vm_file) |
1442 | fput(vma->vm_file); | 1435 | fput(vma->vm_file); |
1443 | if (vma->vm_flags & VM_EXECUTABLE) | ||
1444 | removed_exe_file_vma(vma->vm_mm); | ||
1445 | kmem_cache_free(vm_area_cachep, vma); | 1436 | kmem_cache_free(vm_area_cachep, vma); |
1446 | kleave(" = %d", ret); | 1437 | kleave(" = %d", ret); |
1447 | return ret; | 1438 | return ret; |
@@ -1820,7 +1811,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, | |||
1820 | if (addr != (pfn << PAGE_SHIFT)) | 1811 | if (addr != (pfn << PAGE_SHIFT)) |
1821 | return -EINVAL; | 1812 | return -EINVAL; |
1822 | 1813 | ||
1823 | vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; | 1814 | vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; |
1824 | return 0; | 1815 | return 0; |
1825 | } | 1816 | } |
1826 | EXPORT_SYMBOL(remap_pfn_range); | 1817 | EXPORT_SYMBOL(remap_pfn_range); |
@@ -1961,6 +1952,14 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1961 | } | 1952 | } |
1962 | EXPORT_SYMBOL(filemap_fault); | 1953 | EXPORT_SYMBOL(filemap_fault); |
1963 | 1954 | ||
1955 | int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr, | ||
1956 | unsigned long size, pgoff_t pgoff) | ||
1957 | { | ||
1958 | BUG(); | ||
1959 | return 0; | ||
1960 | } | ||
1961 | EXPORT_SYMBOL(generic_file_remap_pages); | ||
1962 | |||
1964 | static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, | 1963 | static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, |
1965 | unsigned long addr, void *buf, int len, int write) | 1964 | unsigned long addr, void *buf, int len, int write) |
1966 | { | 1965 | { |
@@ -2045,7 +2044,6 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, | |||
2045 | size_t newsize) | 2044 | size_t newsize) |
2046 | { | 2045 | { |
2047 | struct vm_area_struct *vma; | 2046 | struct vm_area_struct *vma; |
2048 | struct prio_tree_iter iter; | ||
2049 | struct vm_region *region; | 2047 | struct vm_region *region; |
2050 | pgoff_t low, high; | 2048 | pgoff_t low, high; |
2051 | size_t r_size, r_top; | 2049 | size_t r_size, r_top; |
@@ -2057,8 +2055,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, | |||
2057 | mutex_lock(&inode->i_mapping->i_mmap_mutex); | 2055 | mutex_lock(&inode->i_mapping->i_mmap_mutex); |
2058 | 2056 | ||
2059 | /* search for VMAs that fall within the dead zone */ | 2057 | /* search for VMAs that fall within the dead zone */ |
2060 | vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, | 2058 | vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, low, high) { |
2061 | low, high) { | ||
2062 | /* found one - only interested if it's shared out of the page | 2059 | /* found one - only interested if it's shared out of the page |
2063 | * cache */ | 2060 | * cache */ |
2064 | if (vma->vm_flags & VM_SHARED) { | 2061 | if (vma->vm_flags & VM_SHARED) { |
@@ -2074,8 +2071,8 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, | |||
2074 | * we don't check for any regions that start beyond the EOF as there | 2071 | * we don't check for any regions that start beyond the EOF as there |
2075 | * shouldn't be any | 2072 | * shouldn't be any |
2076 | */ | 2073 | */ |
2077 | vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, | 2074 | vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, |
2078 | 0, ULONG_MAX) { | 2075 | 0, ULONG_MAX) { |
2079 | if (!(vma->vm_flags & VM_SHARED)) | 2076 | if (!(vma->vm_flags & VM_SHARED)) |
2080 | continue; | 2077 | continue; |
2081 | 2078 | ||
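The vma_prio_tree_foreach() to vma_interval_tree_foreach() conversions above keep the same contract: visit every VMA whose file range overlaps [low, high]. The overlap predicate the interval tree accelerates, shown as a plain linear scan over a cut-down stand-in for the vma (not the kernel's struct, and with invented sample ranges):

#include <stdio.h>

struct vma_stub {
	unsigned long pgoff_start;	/* first file page index mapped */
	unsigned long pgoff_end;	/* last file page index mapped (inclusive) */
};

/* True if the vma maps any page index in [low, high]. */
static int vma_overlaps(const struct vma_stub *v, unsigned long low,
			unsigned long high)
{
	return v->pgoff_start <= high && v->pgoff_end >= low;
}

int main(void)
{
	const struct vma_stub vmas[] = {
		{ 0, 15 },
		{ 20, 29 },
		{ 28, 40 },
	};
	unsigned long low = 25, high = 35;	/* the "dead zone" being shrunk */

	/* vma_interval_tree_foreach() yields exactly the overlapping vmas,
	 * just without having to look at the non-overlapping ones. */
	for (unsigned int i = 0; i < sizeof(vmas) / sizeof(vmas[0]); i++)
		if (vma_overlaps(&vmas[i], low, high))
			printf("vma %u overlaps [%lu, %lu]\n", i, low, high);
	return 0;
}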
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 198600861638..79e0f3e24831 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -428,8 +428,8 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, | |||
428 | { | 428 | { |
429 | task_lock(current); | 429 | task_lock(current); |
430 | pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " | 430 | pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " |
431 | "oom_adj=%d, oom_score_adj=%d\n", | 431 | "oom_score_adj=%d\n", |
432 | current->comm, gfp_mask, order, current->signal->oom_adj, | 432 | current->comm, gfp_mask, order, |
433 | current->signal->oom_score_adj); | 433 | current->signal->oom_score_adj); |
434 | cpuset_print_task_mems_allowed(current); | 434 | cpuset_print_task_mems_allowed(current); |
435 | task_unlock(current); | 435 | task_unlock(current); |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c13ea7538891..bb90971182bd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -558,7 +558,8 @@ static inline void __free_one_page(struct page *page, | |||
558 | if (page_is_guard(buddy)) { | 558 | if (page_is_guard(buddy)) { |
559 | clear_page_guard_flag(buddy); | 559 | clear_page_guard_flag(buddy); |
560 | set_page_private(page, 0); | 560 | set_page_private(page, 0); |
561 | __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); | 561 | __mod_zone_freepage_state(zone, 1 << order, |
562 | migratetype); | ||
562 | } else { | 563 | } else { |
563 | list_del(&buddy->lru); | 564 | list_del(&buddy->lru); |
564 | zone->free_area[order].nr_free--; | 565 | zone->free_area[order].nr_free--; |
@@ -597,17 +598,6 @@ out: | |||
597 | zone->free_area[order].nr_free++; | 598 | zone->free_area[order].nr_free++; |
598 | } | 599 | } |
599 | 600 | ||
600 | /* | ||
601 | * free_page_mlock() -- clean up attempts to free and mlocked() page. | ||
602 | * Page should not be on lru, so no need to fix that up. | ||
603 | * free_pages_check() will verify... | ||
604 | */ | ||
605 | static inline void free_page_mlock(struct page *page) | ||
606 | { | ||
607 | __dec_zone_page_state(page, NR_MLOCK); | ||
608 | __count_vm_event(UNEVICTABLE_MLOCKFREED); | ||
609 | } | ||
610 | |||
611 | static inline int free_pages_check(struct page *page) | 601 | static inline int free_pages_check(struct page *page) |
612 | { | 602 | { |
613 | if (unlikely(page_mapcount(page) | | 603 | if (unlikely(page_mapcount(page) | |
@@ -668,12 +658,17 @@ static void free_pcppages_bulk(struct zone *zone, int count, | |||
668 | batch_free = to_free; | 658 | batch_free = to_free; |
669 | 659 | ||
670 | do { | 660 | do { |
661 | int mt; /* migratetype of the to-be-freed page */ | ||
662 | |||
671 | page = list_entry(list->prev, struct page, lru); | 663 | page = list_entry(list->prev, struct page, lru); |
672 | /* must delete as __free_one_page list manipulates */ | 664 | /* must delete as __free_one_page list manipulates */ |
673 | list_del(&page->lru); | 665 | list_del(&page->lru); |
666 | mt = get_freepage_migratetype(page); | ||
674 | /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ | 667 | /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ |
675 | __free_one_page(page, zone, 0, page_private(page)); | 668 | __free_one_page(page, zone, 0, mt); |
676 | trace_mm_page_pcpu_drain(page, 0, page_private(page)); | 669 | trace_mm_page_pcpu_drain(page, 0, mt); |
670 | if (is_migrate_cma(mt)) | ||
671 | __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); | ||
677 | } while (--to_free && --batch_free && !list_empty(list)); | 672 | } while (--to_free && --batch_free && !list_empty(list)); |
678 | } | 673 | } |
679 | __mod_zone_page_state(zone, NR_FREE_PAGES, count); | 674 | __mod_zone_page_state(zone, NR_FREE_PAGES, count); |
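Because freed pages now carry their own migratetype, the bulk-free loop above can keep NR_FREE_CMA_PAGES in step with NR_FREE_PAGES whenever a CMA page returns to the buddy lists. A toy counter model of that double bookkeeping (the enum values and counter names are illustrative, not the kernel's):

#include <stdio.h>

enum migratetype { MT_UNMOVABLE, MT_MOVABLE, MT_CMA };

struct zone_counters {
	long nr_free;		/* all free pages */
	long nr_free_cma;	/* subset of nr_free living in CMA pageblocks */
};

/* Return one page of the given migratetype to the free counters. */
static void free_one(struct zone_counters *z, enum migratetype mt)
{
	z->nr_free++;
	if (mt == MT_CMA)
		z->nr_free_cma++;
}

/* Hand one page out again, e.g. rmqueue_bulk() pulling from a CMA block. */
static void alloc_one(struct zone_counters *z, enum migratetype mt)
{
	z->nr_free--;
	if (mt == MT_CMA)
		z->nr_free_cma--;
}

int main(void)
{
	struct zone_counters z = { 0, 0 };

	free_one(&z, MT_MOVABLE);
	free_one(&z, MT_CMA);
	free_one(&z, MT_CMA);
	alloc_one(&z, MT_CMA);

	printf("free=%ld free_cma=%ld\n", z.nr_free, z.nr_free_cma);
	return 0;
}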
@@ -688,7 +683,8 @@ static void free_one_page(struct zone *zone, struct page *page, int order, | |||
688 | zone->pages_scanned = 0; | 683 | zone->pages_scanned = 0; |
689 | 684 | ||
690 | __free_one_page(page, zone, order, migratetype); | 685 | __free_one_page(page, zone, order, migratetype); |
691 | __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); | 686 | if (unlikely(migratetype != MIGRATE_ISOLATE)) |
687 | __mod_zone_freepage_state(zone, 1 << order, migratetype); | ||
692 | spin_unlock(&zone->lock); | 688 | spin_unlock(&zone->lock); |
693 | } | 689 | } |
694 | 690 | ||
@@ -721,17 +717,16 @@ static bool free_pages_prepare(struct page *page, unsigned int order) | |||
721 | static void __free_pages_ok(struct page *page, unsigned int order) | 717 | static void __free_pages_ok(struct page *page, unsigned int order) |
722 | { | 718 | { |
723 | unsigned long flags; | 719 | unsigned long flags; |
724 | int wasMlocked = __TestClearPageMlocked(page); | 720 | int migratetype; |
725 | 721 | ||
726 | if (!free_pages_prepare(page, order)) | 722 | if (!free_pages_prepare(page, order)) |
727 | return; | 723 | return; |
728 | 724 | ||
729 | local_irq_save(flags); | 725 | local_irq_save(flags); |
730 | if (unlikely(wasMlocked)) | ||
731 | free_page_mlock(page); | ||
732 | __count_vm_events(PGFREE, 1 << order); | 726 | __count_vm_events(PGFREE, 1 << order); |
733 | free_one_page(page_zone(page), page, order, | 727 | migratetype = get_pageblock_migratetype(page); |
734 | get_pageblock_migratetype(page)); | 728 | set_freepage_migratetype(page, migratetype); |
729 | free_one_page(page_zone(page), page, order, migratetype); | ||
735 | local_irq_restore(flags); | 730 | local_irq_restore(flags); |
736 | } | 731 | } |
737 | 732 | ||
@@ -811,7 +806,8 @@ static inline void expand(struct zone *zone, struct page *page, | |||
811 | set_page_guard_flag(&page[size]); | 806 | set_page_guard_flag(&page[size]); |
812 | set_page_private(&page[size], high); | 807 | set_page_private(&page[size], high); |
813 | /* Guard pages are not available for any usage */ | 808 | /* Guard pages are not available for any usage */ |
814 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << high)); | 809 | __mod_zone_freepage_state(zone, -(1 << high), |
810 | migratetype); | ||
815 | continue; | 811 | continue; |
816 | } | 812 | } |
817 | #endif | 813 | #endif |
@@ -915,7 +911,7 @@ static int fallbacks[MIGRATE_TYPES][4] = { | |||
915 | * Note that start_page and end_page are not aligned on a pageblock | 911 | * Note that start_page and end_page are not aligned on a pageblock |
916 | * boundary. If alignment is required, use move_freepages_block() | 912 | * boundary. If alignment is required, use move_freepages_block() |
917 | */ | 913 | */ |
918 | static int move_freepages(struct zone *zone, | 914 | int move_freepages(struct zone *zone, |
919 | struct page *start_page, struct page *end_page, | 915 | struct page *start_page, struct page *end_page, |
920 | int migratetype) | 916 | int migratetype) |
921 | { | 917 | { |
@@ -951,6 +947,7 @@ static int move_freepages(struct zone *zone, | |||
951 | order = page_order(page); | 947 | order = page_order(page); |
952 | list_move(&page->lru, | 948 | list_move(&page->lru, |
953 | &zone->free_area[order].free_list[migratetype]); | 949 | &zone->free_area[order].free_list[migratetype]); |
950 | set_freepage_migratetype(page, migratetype); | ||
954 | page += 1 << order; | 951 | page += 1 << order; |
955 | pages_moved += 1 << order; | 952 | pages_moved += 1 << order; |
956 | } | 953 | } |
@@ -1135,8 +1132,11 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, | |||
1135 | if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE) | 1132 | if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE) |
1136 | mt = migratetype; | 1133 | mt = migratetype; |
1137 | } | 1134 | } |
1138 | set_page_private(page, mt); | 1135 | set_freepage_migratetype(page, mt); |
1139 | list = &page->lru; | 1136 | list = &page->lru; |
1137 | if (is_migrate_cma(mt)) | ||
1138 | __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, | ||
1139 | -(1 << order)); | ||
1140 | } | 1140 | } |
1141 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); | 1141 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); |
1142 | spin_unlock(&zone->lock); | 1142 | spin_unlock(&zone->lock); |
@@ -1296,16 +1296,13 @@ void free_hot_cold_page(struct page *page, int cold) | |||
1296 | struct per_cpu_pages *pcp; | 1296 | struct per_cpu_pages *pcp; |
1297 | unsigned long flags; | 1297 | unsigned long flags; |
1298 | int migratetype; | 1298 | int migratetype; |
1299 | int wasMlocked = __TestClearPageMlocked(page); | ||
1300 | 1299 | ||
1301 | if (!free_pages_prepare(page, 0)) | 1300 | if (!free_pages_prepare(page, 0)) |
1302 | return; | 1301 | return; |
1303 | 1302 | ||
1304 | migratetype = get_pageblock_migratetype(page); | 1303 | migratetype = get_pageblock_migratetype(page); |
1305 | set_page_private(page, migratetype); | 1304 | set_freepage_migratetype(page, migratetype); |
1306 | local_irq_save(flags); | 1305 | local_irq_save(flags); |
1307 | if (unlikely(wasMlocked)) | ||
1308 | free_page_mlock(page); | ||
1309 | __count_vm_event(PGFREE); | 1306 | __count_vm_event(PGFREE); |
1310 | 1307 | ||
1311 | /* | 1308 | /* |
@@ -1380,20 +1377,16 @@ void split_page(struct page *page, unsigned int order) | |||
1380 | } | 1377 | } |
1381 | 1378 | ||
1382 | /* | 1379 | /* |
1383 | * Similar to split_page except the page is already free. As this is only | 1380 | * Similar to the split_page family of functions except that the page |
1384 | * being used for migration, the migratetype of the block also changes. | 1381 | * is required at the given order and is isolated now to prevent races |
1385 | * As this is called with interrupts disabled, the caller is responsible | 1382 | * with parallel allocators. |
1386 | * for calling arch_alloc_page() and kernel_map_page() after interrupts | ||
1387 | * are enabled. | ||
1388 | * | ||
1389 | * Note: this is probably too low level an operation for use in drivers. | ||
1390 | * Please consult with lkml before using this in your driver. | ||
1391 | */ | 1383 | */ |
1392 | int split_free_page(struct page *page) | 1384 | int capture_free_page(struct page *page, int alloc_order, int migratetype) |
1393 | { | 1385 | { |
1394 | unsigned int order; | 1386 | unsigned int order; |
1395 | unsigned long watermark; | 1387 | unsigned long watermark; |
1396 | struct zone *zone; | 1388 | struct zone *zone; |
1389 | int mt; | ||
1397 | 1390 | ||
1398 | BUG_ON(!PageBuddy(page)); | 1391 | BUG_ON(!PageBuddy(page)); |
1399 | 1392 | ||
@@ -1409,12 +1402,16 @@ int split_free_page(struct page *page) | |||
1409 | list_del(&page->lru); | 1402 | list_del(&page->lru); |
1410 | zone->free_area[order].nr_free--; | 1403 | zone->free_area[order].nr_free--; |
1411 | rmv_page_order(page); | 1404 | rmv_page_order(page); |
1412 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order)); | ||
1413 | 1405 | ||
1414 | /* Split into individual pages */ | 1406 | mt = get_pageblock_migratetype(page); |
1415 | set_page_refcounted(page); | 1407 | if (unlikely(mt != MIGRATE_ISOLATE)) |
1416 | split_page(page, order); | 1408 | __mod_zone_freepage_state(zone, -(1UL << order), mt); |
1417 | 1409 | ||
1410 | if (alloc_order != order) | ||
1411 | expand(zone, page, alloc_order, order, | ||
1412 | &zone->free_area[order], migratetype); | ||
1413 | |||
1414 | /* Set the pageblock if the captured page is at least a pageblock */ | ||
1418 | if (order >= pageblock_order - 1) { | 1415 | if (order >= pageblock_order - 1) { |
1419 | struct page *endpage = page + (1 << order) - 1; | 1416 | struct page *endpage = page + (1 << order) - 1; |
1420 | for (; page < endpage; page += pageblock_nr_pages) { | 1417 | for (; page < endpage; page += pageblock_nr_pages) { |
@@ -1425,7 +1422,35 @@ int split_free_page(struct page *page) | |||
1425 | } | 1422 | } |
1426 | } | 1423 | } |
1427 | 1424 | ||
1428 | return 1 << order; | 1425 | return 1UL << order; |
1426 | } | ||
1427 | |||
1428 | /* | ||
1429 | * Similar to split_page except the page is already free. As this is only | ||
1430 | * being used for migration, the migratetype of the block also changes. | ||
1431 | * As this is called with interrupts disabled, the caller is responsible | ||
1432 | * for calling arch_alloc_page() and kernel_map_page() after interrupts | ||
1433 | * are enabled. | ||
1434 | * | ||
1435 | * Note: this is probably too low level an operation for use in drivers. | ||
1436 | * Please consult with lkml before using this in your driver. | ||
1437 | */ | ||
1438 | int split_free_page(struct page *page) | ||
1439 | { | ||
1440 | unsigned int order; | ||
1441 | int nr_pages; | ||
1442 | |||
1443 | BUG_ON(!PageBuddy(page)); | ||
1444 | order = page_order(page); | ||
1445 | |||
1446 | nr_pages = capture_free_page(page, order, 0); | ||
1447 | if (!nr_pages) | ||
1448 | return 0; | ||
1449 | |||
1450 | /* Split into individual pages */ | ||
1451 | set_page_refcounted(page); | ||
1452 | split_page(page, order); | ||
1453 | return nr_pages; | ||
1429 | } | 1454 | } |
1430 | 1455 | ||
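capture_free_page() keeps only the 1 << alloc_order pages it needs and lets expand() hand the surplus buddies back to the free lists, while the new split_free_page() wrapper preserves the old behaviour. A counting-only sketch of that split (no real buddy lists, just per-order counters; split_back() is an invented name for the expand() step):

#include <stdio.h>

#define MAX_ORDER 11

/* Number of free blocks per order -- a stand-in for zone->free_area[]. */
static unsigned long nr_free[MAX_ORDER];

/* Keep 1 << alloc_order pages out of a captured 1 << order block and return
 * each upper half to the free list of the next lower order. */
static void split_back(unsigned int alloc_order, unsigned int order)
{
	while (order > alloc_order) {
		order--;
		nr_free[order]++;	/* the upper buddy goes back as free */
	}
}

int main(void)
{
	/* Captured an order-5 block (32 pages) but only need order-2 (4 pages). */
	split_back(2, 5);

	for (unsigned int o = 0; o < 6; o++)
		printf("order %u: %lu free block(s)\n", o, nr_free[o]);
	/* Orders 4, 3 and 2 each gain one block: 16 + 8 + 4 = 28 pages returned. */
	return 0;
}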
1431 | /* | 1456 | /* |
@@ -1484,7 +1509,8 @@ again: | |||
1484 | spin_unlock(&zone->lock); | 1509 | spin_unlock(&zone->lock); |
1485 | if (!page) | 1510 | if (!page) |
1486 | goto failed; | 1511 | goto failed; |
1487 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order)); | 1512 | __mod_zone_freepage_state(zone, -(1 << order), |
1513 | get_pageblock_migratetype(page)); | ||
1488 | } | 1514 | } |
1489 | 1515 | ||
1490 | __count_zone_vm_events(PGALLOC, zone, 1 << order); | 1516 | __count_zone_vm_events(PGALLOC, zone, 1 << order); |
@@ -1501,19 +1527,6 @@ failed: | |||
1501 | return NULL; | 1527 | return NULL; |
1502 | } | 1528 | } |
1503 | 1529 | ||
1504 | /* The ALLOC_WMARK bits are used as an index to zone->watermark */ | ||
1505 | #define ALLOC_WMARK_MIN WMARK_MIN | ||
1506 | #define ALLOC_WMARK_LOW WMARK_LOW | ||
1507 | #define ALLOC_WMARK_HIGH WMARK_HIGH | ||
1508 | #define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */ | ||
1509 | |||
1510 | /* Mask to get the watermark bits */ | ||
1511 | #define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1) | ||
1512 | |||
1513 | #define ALLOC_HARDER 0x10 /* try to alloc harder */ | ||
1514 | #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ | ||
1515 | #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ | ||
1516 | |||
1517 | #ifdef CONFIG_FAIL_PAGE_ALLOC | 1530 | #ifdef CONFIG_FAIL_PAGE_ALLOC |
1518 | 1531 | ||
1519 | static struct { | 1532 | static struct { |
@@ -1608,7 +1621,11 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, | |||
1608 | min -= min / 2; | 1621 | min -= min / 2; |
1609 | if (alloc_flags & ALLOC_HARDER) | 1622 | if (alloc_flags & ALLOC_HARDER) |
1610 | min -= min / 4; | 1623 | min -= min / 4; |
1611 | 1624 | #ifdef CONFIG_CMA | |
1625 | /* If allocation can't use CMA areas don't use free CMA pages */ | ||
1626 | if (!(alloc_flags & ALLOC_CMA)) | ||
1627 | free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES); | ||
1628 | #endif | ||
1612 | if (free_pages <= min + lowmem_reserve) | 1629 | if (free_pages <= min + lowmem_reserve) |
1613 | return false; | 1630 | return false; |
1614 | for (o = 0; o < order; o++) { | 1631 | for (o = 0; o < order; o++) { |
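The CONFIG_CMA hunk above subtracts free CMA pages from the watermark calculation whenever the allocation is not allowed to dip into CMA areas. A standalone sketch of just that arithmetic (the per-order loop of the real __zone_watermark_ok() is omitted; ALLOC_HIGH and ALLOC_HARDER reuse the values removed from this file above, while ALLOC_CMA's value is invented here):

#include <stdbool.h>
#include <stdio.h>

#define ALLOC_HARDER	0x10
#define ALLOC_HIGH	0x20
#define ALLOC_CMA	0x80	/* invented bit for the sketch */

/* Decide whether free_pages clears the watermark for this allocation. */
static bool watermark_ok(long free_pages, long free_cma, long mark,
			 long lowmem_reserve, int alloc_flags)
{
	long min = mark;

	if (alloc_flags & ALLOC_HIGH)
		min -= min / 2;
	if (alloc_flags & ALLOC_HARDER)
		min -= min / 4;

	/* Allocations that cannot use CMA must not count free CMA pages. */
	if (!(alloc_flags & ALLOC_CMA))
		free_pages -= free_cma;

	return free_pages > min + lowmem_reserve;
}

int main(void)
{
	/* 1000 free pages, 600 of them in CMA blocks, watermark of 500. */
	printf("movable (may use CMA):      %d\n",
	       watermark_ok(1000, 600, 500, 0, ALLOC_CMA));
	printf("unmovable (CMA off limits): %d\n",
	       watermark_ok(1000, 600, 500, 0, 0));
	return 0;
}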
@@ -1782,6 +1799,22 @@ static void zlc_clear_zones_full(struct zonelist *zonelist) | |||
1782 | bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); | 1799 | bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); |
1783 | } | 1800 | } |
1784 | 1801 | ||
1802 | static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) | ||
1803 | { | ||
1804 | return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes); | ||
1805 | } | ||
1806 | |||
1807 | static void __paginginit init_zone_allows_reclaim(int nid) | ||
1808 | { | ||
1809 | int i; | ||
1810 | |||
1811 | for_each_online_node(i) | ||
1812 | if (node_distance(nid, i) <= RECLAIM_DISTANCE) { | ||
1813 | node_set(i, NODE_DATA(nid)->reclaim_nodes); | ||
1814 | zone_reclaim_mode = 1; | ||
1815 | } | ||
1816 | } | ||
1817 | |||
1785 | #else /* CONFIG_NUMA */ | 1818 | #else /* CONFIG_NUMA */ |
1786 | 1819 | ||
1787 | static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) | 1820 | static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) |
@@ -1802,6 +1835,15 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) | |||
1802 | static void zlc_clear_zones_full(struct zonelist *zonelist) | 1835 | static void zlc_clear_zones_full(struct zonelist *zonelist) |
1803 | { | 1836 | { |
1804 | } | 1837 | } |
1838 | |||
1839 | static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) | ||
1840 | { | ||
1841 | return true; | ||
1842 | } | ||
1843 | |||
1844 | static inline void init_zone_allows_reclaim(int nid) | ||
1845 | { | ||
1846 | } | ||
1805 | #endif /* CONFIG_NUMA */ | 1847 | #endif /* CONFIG_NUMA */ |
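init_zone_allows_reclaim() precomputes, for each node, the set of nodes within RECLAIM_DISTANCE whose zones are worth reclaiming from before spilling further away, and zone_allows_reclaim() reduces to a set-membership test (with the !CONFIG_NUMA stubs always saying yes). A userspace sketch of that precomputation over an invented three-node distance table:

#include <stdbool.h>
#include <stdio.h>

#define NR_NODES		3
#define RECLAIM_DISTANCE	30	/* threshold, as in the hunk above */

/* Invented distance table: node_distance[i][j], 10 meaning "local". */
static const int node_distance[NR_NODES][NR_NODES] = {
	{ 10, 20, 40 },
	{ 20, 10, 40 },
	{ 40, 40, 10 },
};

/* reclaim_nodes[n] is the per-node mask built at init time. */
static bool reclaim_nodes[NR_NODES][NR_NODES];

static void init_reclaim_nodes(int nid)
{
	for (int i = 0; i < NR_NODES; i++)
		if (node_distance[nid][i] <= RECLAIM_DISTANCE)
			reclaim_nodes[nid][i] = true;
}

/* Mirrors node_isset(local, candidate's set) in zone_allows_reclaim(). */
static bool allows_reclaim(int local, int candidate)
{
	return reclaim_nodes[candidate][local];
}

int main(void)
{
	for (int nid = 0; nid < NR_NODES; nid++)
		init_reclaim_nodes(nid);

	printf("reclaim on node 1 for node 0: %d\n", allows_reclaim(0, 1));
	printf("reclaim on node 2 for node 0: %d\n", allows_reclaim(0, 2));
	return 0;
}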
1806 | 1848 | ||
1807 | /* | 1849 | /* |
@@ -1886,7 +1928,8 @@ zonelist_scan: | |||
1886 | did_zlc_setup = 1; | 1928 | did_zlc_setup = 1; |
1887 | } | 1929 | } |
1888 | 1930 | ||
1889 | if (zone_reclaim_mode == 0) | 1931 | if (zone_reclaim_mode == 0 || |
1932 | !zone_allows_reclaim(preferred_zone, zone)) | ||
1890 | goto this_zone_full; | 1933 | goto this_zone_full; |
1891 | 1934 | ||
1892 | /* | 1935 | /* |
@@ -2105,7 +2148,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2105 | bool *contended_compaction, bool *deferred_compaction, | 2148 | bool *contended_compaction, bool *deferred_compaction, |
2106 | unsigned long *did_some_progress) | 2149 | unsigned long *did_some_progress) |
2107 | { | 2150 | { |
2108 | struct page *page; | 2151 | struct page *page = NULL; |
2109 | 2152 | ||
2110 | if (!order) | 2153 | if (!order) |
2111 | return NULL; | 2154 | return NULL; |
@@ -2118,10 +2161,16 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2118 | current->flags |= PF_MEMALLOC; | 2161 | current->flags |= PF_MEMALLOC; |
2119 | *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, | 2162 | *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, |
2120 | nodemask, sync_migration, | 2163 | nodemask, sync_migration, |
2121 | contended_compaction); | 2164 | contended_compaction, &page); |
2122 | current->flags &= ~PF_MEMALLOC; | 2165 | current->flags &= ~PF_MEMALLOC; |
2123 | if (*did_some_progress != COMPACT_SKIPPED) { | ||
2124 | 2166 | ||
2167 | /* If compaction captured a page, prep and use it */ | ||
2168 | if (page) { | ||
2169 | prep_new_page(page, order, gfp_mask); | ||
2170 | goto got_page; | ||
2171 | } | ||
2172 | |||
2173 | if (*did_some_progress != COMPACT_SKIPPED) { | ||
2125 | /* Page migration frees to the PCP lists but we want merging */ | 2174 | /* Page migration frees to the PCP lists but we want merging */ |
2126 | drain_pages(get_cpu()); | 2175 | drain_pages(get_cpu()); |
2127 | put_cpu(); | 2176 | put_cpu(); |
@@ -2131,6 +2180,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2131 | alloc_flags & ~ALLOC_NO_WATERMARKS, | 2180 | alloc_flags & ~ALLOC_NO_WATERMARKS, |
2132 | preferred_zone, migratetype); | 2181 | preferred_zone, migratetype); |
2133 | if (page) { | 2182 | if (page) { |
2183 | got_page: | ||
2184 | preferred_zone->compact_blockskip_flush = false; | ||
2134 | preferred_zone->compact_considered = 0; | 2185 | preferred_zone->compact_considered = 0; |
2135 | preferred_zone->compact_defer_shift = 0; | 2186 | preferred_zone->compact_defer_shift = 0; |
2136 | if (order >= preferred_zone->compact_order_failed) | 2187 | if (order >= preferred_zone->compact_order_failed) |
@@ -2315,7 +2366,10 @@ gfp_to_alloc_flags(gfp_t gfp_mask) | |||
2315 | unlikely(test_thread_flag(TIF_MEMDIE)))) | 2366 | unlikely(test_thread_flag(TIF_MEMDIE)))) |
2316 | alloc_flags |= ALLOC_NO_WATERMARKS; | 2367 | alloc_flags |= ALLOC_NO_WATERMARKS; |
2317 | } | 2368 | } |
2318 | 2369 | #ifdef CONFIG_CMA | |
2370 | if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) | ||
2371 | alloc_flags |= ALLOC_CMA; | ||
2372 | #endif | ||
2319 | return alloc_flags; | 2373 | return alloc_flags; |
2320 | } | 2374 | } |
2321 | 2375 | ||
@@ -2362,9 +2416,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | |||
2362 | goto nopage; | 2416 | goto nopage; |
2363 | 2417 | ||
2364 | restart: | 2418 | restart: |
2365 | if (!(gfp_mask & __GFP_NO_KSWAPD)) | 2419 | wake_all_kswapd(order, zonelist, high_zoneidx, |
2366 | wake_all_kswapd(order, zonelist, high_zoneidx, | 2420 | zone_idx(preferred_zone)); |
2367 | zone_idx(preferred_zone)); | ||
2368 | 2421 | ||
2369 | /* | 2422 | /* |
2370 | * OK, we're below the kswapd watermark and have kicked background | 2423 | * OK, we're below the kswapd watermark and have kicked background |
@@ -2441,7 +2494,7 @@ rebalance: | |||
2441 | * system then fail the allocation instead of entering direct reclaim. | 2494 | * system then fail the allocation instead of entering direct reclaim. |
2442 | */ | 2495 | */ |
2443 | if ((deferred_compaction || contended_compaction) && | 2496 | if ((deferred_compaction || contended_compaction) && |
2444 | (gfp_mask & __GFP_NO_KSWAPD)) | 2497 | (gfp_mask & (__GFP_MOVABLE|__GFP_REPEAT)) == __GFP_MOVABLE) |
2445 | goto nopage; | 2498 | goto nopage; |
2446 | 2499 | ||
2447 | /* Try direct reclaim and then allocating */ | 2500 | /* Try direct reclaim and then allocating */ |
@@ -2541,6 +2594,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
2541 | struct page *page = NULL; | 2594 | struct page *page = NULL; |
2542 | int migratetype = allocflags_to_migratetype(gfp_mask); | 2595 | int migratetype = allocflags_to_migratetype(gfp_mask); |
2543 | unsigned int cpuset_mems_cookie; | 2596 | unsigned int cpuset_mems_cookie; |
2597 | int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET; | ||
2544 | 2598 | ||
2545 | gfp_mask &= gfp_allowed_mask; | 2599 | gfp_mask &= gfp_allowed_mask; |
2546 | 2600 | ||
@@ -2569,9 +2623,13 @@ retry_cpuset: | |||
2569 | if (!preferred_zone) | 2623 | if (!preferred_zone) |
2570 | goto out; | 2624 | goto out; |
2571 | 2625 | ||
2626 | #ifdef CONFIG_CMA | ||
2627 | if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) | ||
2628 | alloc_flags |= ALLOC_CMA; | ||
2629 | #endif | ||
2572 | /* First allocation attempt */ | 2630 | /* First allocation attempt */ |
2573 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, | 2631 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, |
2574 | zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET, | 2632 | zonelist, high_zoneidx, alloc_flags, |
2575 | preferred_zone, migratetype); | 2633 | preferred_zone, migratetype); |
2576 | if (unlikely(!page)) | 2634 | if (unlikely(!page)) |
2577 | page = __alloc_pages_slowpath(gfp_mask, order, | 2635 | page = __alloc_pages_slowpath(gfp_mask, order, |
@@ -2852,7 +2910,8 @@ void show_free_areas(unsigned int filter) | |||
2852 | " unevictable:%lu" | 2910 | " unevictable:%lu" |
2853 | " dirty:%lu writeback:%lu unstable:%lu\n" | 2911 | " dirty:%lu writeback:%lu unstable:%lu\n" |
2854 | " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" | 2912 | " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" |
2855 | " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n", | 2913 | " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" |
2914 | " free_cma:%lu\n", | ||
2856 | global_page_state(NR_ACTIVE_ANON), | 2915 | global_page_state(NR_ACTIVE_ANON), |
2857 | global_page_state(NR_INACTIVE_ANON), | 2916 | global_page_state(NR_INACTIVE_ANON), |
2858 | global_page_state(NR_ISOLATED_ANON), | 2917 | global_page_state(NR_ISOLATED_ANON), |
@@ -2869,7 +2928,8 @@ void show_free_areas(unsigned int filter) | |||
2869 | global_page_state(NR_FILE_MAPPED), | 2928 | global_page_state(NR_FILE_MAPPED), |
2870 | global_page_state(NR_SHMEM), | 2929 | global_page_state(NR_SHMEM), |
2871 | global_page_state(NR_PAGETABLE), | 2930 | global_page_state(NR_PAGETABLE), |
2872 | global_page_state(NR_BOUNCE)); | 2931 | global_page_state(NR_BOUNCE), |
2932 | global_page_state(NR_FREE_CMA_PAGES)); | ||
2873 | 2933 | ||
2874 | for_each_populated_zone(zone) { | 2934 | for_each_populated_zone(zone) { |
2875 | int i; | 2935 | int i; |
@@ -2901,6 +2961,7 @@ void show_free_areas(unsigned int filter) | |||
2901 | " pagetables:%lukB" | 2961 | " pagetables:%lukB" |
2902 | " unstable:%lukB" | 2962 | " unstable:%lukB" |
2903 | " bounce:%lukB" | 2963 | " bounce:%lukB" |
2964 | " free_cma:%lukB" | ||
2904 | " writeback_tmp:%lukB" | 2965 | " writeback_tmp:%lukB" |
2905 | " pages_scanned:%lu" | 2966 | " pages_scanned:%lu" |
2906 | " all_unreclaimable? %s" | 2967 | " all_unreclaimable? %s" |
@@ -2930,6 +2991,7 @@ void show_free_areas(unsigned int filter) | |||
2930 | K(zone_page_state(zone, NR_PAGETABLE)), | 2991 | K(zone_page_state(zone, NR_PAGETABLE)), |
2931 | K(zone_page_state(zone, NR_UNSTABLE_NFS)), | 2992 | K(zone_page_state(zone, NR_UNSTABLE_NFS)), |
2932 | K(zone_page_state(zone, NR_BOUNCE)), | 2993 | K(zone_page_state(zone, NR_BOUNCE)), |
2994 | K(zone_page_state(zone, NR_FREE_CMA_PAGES)), | ||
2933 | K(zone_page_state(zone, NR_WRITEBACK_TEMP)), | 2995 | K(zone_page_state(zone, NR_WRITEBACK_TEMP)), |
2934 | zone->pages_scanned, | 2996 | zone->pages_scanned, |
2935 | (zone->all_unreclaimable ? "yes" : "no") | 2997 | (zone->all_unreclaimable ? "yes" : "no") |
@@ -3328,21 +3390,13 @@ static void build_zonelists(pg_data_t *pgdat) | |||
3328 | j = 0; | 3390 | j = 0; |
3329 | 3391 | ||
3330 | while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { | 3392 | while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { |
3331 | int distance = node_distance(local_node, node); | ||
3332 | |||
3333 | /* | ||
3334 | * If another node is sufficiently far away then it is better | ||
3335 | * to reclaim pages in a zone before going off node. | ||
3336 | */ | ||
3337 | if (distance > RECLAIM_DISTANCE) | ||
3338 | zone_reclaim_mode = 1; | ||
3339 | |||
3340 | /* | 3393 | /* |
3341 | * We don't want to pressure a particular node. | 3394 | * We don't want to pressure a particular node. |
3342 | * So adding penalty to the first node in same | 3395 | * So adding penalty to the first node in same |
3343 | * distance group to make it round-robin. | 3396 | * distance group to make it round-robin. |
3344 | */ | 3397 | */ |
3345 | if (distance != node_distance(local_node, prev_node)) | 3398 | if (node_distance(local_node, node) != |
3399 | node_distance(local_node, prev_node)) | ||
3346 | node_load[node] = load; | 3400 | node_load[node] = load; |
3347 | 3401 | ||
3348 | prev_node = node; | 3402 | prev_node = node; |
@@ -4438,11 +4492,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
4438 | 4492 | ||
4439 | zone->spanned_pages = size; | 4493 | zone->spanned_pages = size; |
4440 | zone->present_pages = realsize; | 4494 | zone->present_pages = realsize; |
4441 | #if defined CONFIG_COMPACTION || defined CONFIG_CMA | ||
4442 | zone->compact_cached_free_pfn = zone->zone_start_pfn + | ||
4443 | zone->spanned_pages; | ||
4444 | zone->compact_cached_free_pfn &= ~(pageblock_nr_pages-1); | ||
4445 | #endif | ||
4446 | #ifdef CONFIG_NUMA | 4495 | #ifdef CONFIG_NUMA |
4447 | zone->node = nid; | 4496 | zone->node = nid; |
4448 | zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) | 4497 | zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) |
@@ -4521,6 +4570,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, | |||
4521 | 4570 | ||
4522 | pgdat->node_id = nid; | 4571 | pgdat->node_id = nid; |
4523 | pgdat->node_start_pfn = node_start_pfn; | 4572 | pgdat->node_start_pfn = node_start_pfn; |
4573 | init_zone_allows_reclaim(nid); | ||
4524 | calculate_node_totalpages(pgdat, zones_size, zholes_size); | 4574 | calculate_node_totalpages(pgdat, zones_size, zholes_size); |
4525 | 4575 | ||
4526 | alloc_node_mem_map(pgdat); | 4576 | alloc_node_mem_map(pgdat); |
@@ -4879,7 +4929,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) | |||
4879 | zone_movable_pfn[i] << PAGE_SHIFT); | 4929 | zone_movable_pfn[i] << PAGE_SHIFT); |
4880 | } | 4930 | } |
4881 | 4931 | ||
4882 | /* Print out the early_node_map[] */ | 4932 | /* Print out the early node map */ |
4883 | printk("Early memory node ranges\n"); | 4933 | printk("Early memory node ranges\n"); |
4884 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) | 4934 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) |
4885 | printk(" node %3d: [mem %#010lx-%#010lx]\n", nid, | 4935 | printk(" node %3d: [mem %#010lx-%#010lx]\n", nid, |
@@ -5619,47 +5669,28 @@ static unsigned long pfn_max_align_up(unsigned long pfn) | |||
5619 | pageblock_nr_pages)); | 5669 | pageblock_nr_pages)); |
5620 | } | 5670 | } |
5621 | 5671 | ||
5622 | static struct page * | ||
5623 | __alloc_contig_migrate_alloc(struct page *page, unsigned long private, | ||
5624 | int **resultp) | ||
5625 | { | ||
5626 | gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; | ||
5627 | |||
5628 | if (PageHighMem(page)) | ||
5629 | gfp_mask |= __GFP_HIGHMEM; | ||
5630 | |||
5631 | return alloc_page(gfp_mask); | ||
5632 | } | ||
5633 | |||
5634 | /* [start, end) must belong to a single zone. */ | 5672 | /* [start, end) must belong to a single zone. */ |
5635 | static int __alloc_contig_migrate_range(unsigned long start, unsigned long end) | 5673 | static int __alloc_contig_migrate_range(struct compact_control *cc, |
5674 | unsigned long start, unsigned long end) | ||
5636 | { | 5675 | { |
5637 | /* This function is based on compact_zone() from compaction.c. */ | 5676 | /* This function is based on compact_zone() from compaction.c. */ |
5638 | 5677 | unsigned long nr_reclaimed; | |
5639 | unsigned long pfn = start; | 5678 | unsigned long pfn = start; |
5640 | unsigned int tries = 0; | 5679 | unsigned int tries = 0; |
5641 | int ret = 0; | 5680 | int ret = 0; |
5642 | 5681 | ||
5643 | struct compact_control cc = { | ||
5644 | .nr_migratepages = 0, | ||
5645 | .order = -1, | ||
5646 | .zone = page_zone(pfn_to_page(start)), | ||
5647 | .sync = true, | ||
5648 | }; | ||
5649 | INIT_LIST_HEAD(&cc.migratepages); | ||
5650 | |||
5651 | migrate_prep_local(); | 5682 | migrate_prep_local(); |
5652 | 5683 | ||
5653 | while (pfn < end || !list_empty(&cc.migratepages)) { | 5684 | while (pfn < end || !list_empty(&cc->migratepages)) { |
5654 | if (fatal_signal_pending(current)) { | 5685 | if (fatal_signal_pending(current)) { |
5655 | ret = -EINTR; | 5686 | ret = -EINTR; |
5656 | break; | 5687 | break; |
5657 | } | 5688 | } |
5658 | 5689 | ||
5659 | if (list_empty(&cc.migratepages)) { | 5690 | if (list_empty(&cc->migratepages)) { |
5660 | cc.nr_migratepages = 0; | 5691 | cc->nr_migratepages = 0; |
5661 | pfn = isolate_migratepages_range(cc.zone, &cc, | 5692 | pfn = isolate_migratepages_range(cc->zone, cc, |
5662 | pfn, end); | 5693 | pfn, end, true); |
5663 | if (!pfn) { | 5694 | if (!pfn) { |
5664 | ret = -EINTR; | 5695 | ret = -EINTR; |
5665 | break; | 5696 | break; |
@@ -5670,12 +5701,16 @@ static int __alloc_contig_migrate_range(unsigned long start, unsigned long end) | |||
5670 | break; | 5701 | break; |
5671 | } | 5702 | } |
5672 | 5703 | ||
5673 | ret = migrate_pages(&cc.migratepages, | 5704 | nr_reclaimed = reclaim_clean_pages_from_list(cc->zone, |
5674 | __alloc_contig_migrate_alloc, | 5705 | &cc->migratepages); |
5706 | cc->nr_migratepages -= nr_reclaimed; | ||
5707 | |||
5708 | ret = migrate_pages(&cc->migratepages, | ||
5709 | alloc_migrate_target, | ||
5675 | 0, false, MIGRATE_SYNC); | 5710 | 0, false, MIGRATE_SYNC); |
5676 | } | 5711 | } |
5677 | 5712 | ||
5678 | putback_lru_pages(&cc.migratepages); | 5713 | putback_lru_pages(&cc->migratepages); |
5679 | return ret > 0 ? 0 : ret; | 5714 | return ret > 0 ? 0 : ret; |
5680 | } | 5715 | } |
5681 | 5716 | ||
@@ -5754,6 +5789,15 @@ int alloc_contig_range(unsigned long start, unsigned long end, | |||
5754 | unsigned long outer_start, outer_end; | 5789 | unsigned long outer_start, outer_end; |
5755 | int ret = 0, order; | 5790 | int ret = 0, order; |
5756 | 5791 | ||
5792 | struct compact_control cc = { | ||
5793 | .nr_migratepages = 0, | ||
5794 | .order = -1, | ||
5795 | .zone = page_zone(pfn_to_page(start)), | ||
5796 | .sync = true, | ||
5797 | .ignore_skip_hint = true, | ||
5798 | }; | ||
5799 | INIT_LIST_HEAD(&cc.migratepages); | ||
5800 | |||
5757 | /* | 5801 | /* |
5758 | * What we do here is we mark all pageblocks in range as | 5802 | * What we do here is we mark all pageblocks in range as |
5759 | * MIGRATE_ISOLATE. Because pageblock and max order pages may | 5803 | * MIGRATE_ISOLATE. Because pageblock and max order pages may |
@@ -5783,7 +5827,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, | |||
5783 | if (ret) | 5827 | if (ret) |
5784 | goto done; | 5828 | goto done; |
5785 | 5829 | ||
5786 | ret = __alloc_contig_migrate_range(start, end); | 5830 | ret = __alloc_contig_migrate_range(&cc, start, end); |
5787 | if (ret) | 5831 | if (ret) |
5788 | goto done; | 5832 | goto done; |
5789 | 5833 | ||
@@ -5832,7 +5876,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, | |||
5832 | __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start); | 5876 | __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start); |
5833 | 5877 | ||
5834 | /* Grab isolated pages from freelists. */ | 5878 | /* Grab isolated pages from freelists. */ |
5835 | outer_end = isolate_freepages_range(outer_start, end); | 5879 | outer_end = isolate_freepages_range(&cc, outer_start, end); |
5836 | if (!outer_end) { | 5880 | if (!outer_end) { |
5837 | ret = -EBUSY; | 5881 | ret = -EBUSY; |
5838 | goto done; | 5882 | goto done; |
@@ -5874,6 +5918,7 @@ static int __meminit __zone_pcp_update(void *data) | |||
5874 | local_irq_save(flags); | 5918 | local_irq_save(flags); |
5875 | if (pcp->count > 0) | 5919 | if (pcp->count > 0) |
5876 | free_pcppages_bulk(zone, pcp->count, pcp); | 5920 | free_pcppages_bulk(zone, pcp->count, pcp); |
5921 | drain_zonestat(zone, pset); | ||
5877 | setup_pageset(pset, batch); | 5922 | setup_pageset(pset, batch); |
5878 | local_irq_restore(flags); | 5923 | local_irq_restore(flags); |
5879 | } | 5924 | } |
@@ -5890,10 +5935,16 @@ void __meminit zone_pcp_update(struct zone *zone) | |||
5890 | void zone_pcp_reset(struct zone *zone) | 5935 | void zone_pcp_reset(struct zone *zone) |
5891 | { | 5936 | { |
5892 | unsigned long flags; | 5937 | unsigned long flags; |
5938 | int cpu; | ||
5939 | struct per_cpu_pageset *pset; | ||
5893 | 5940 | ||
5894 | /* avoid races with drain_pages() */ | 5941 | /* avoid races with drain_pages() */ |
5895 | local_irq_save(flags); | 5942 | local_irq_save(flags); |
5896 | if (zone->pageset != &boot_pageset) { | 5943 | if (zone->pageset != &boot_pageset) { |
5944 | for_each_online_cpu(cpu) { | ||
5945 | pset = per_cpu_ptr(zone->pageset, cpu); | ||
5946 | drain_zonestat(zone, pset); | ||
5947 | } | ||
5897 | free_percpu(zone->pageset); | 5948 | free_percpu(zone->pageset); |
5898 | zone->pageset = &boot_pageset; | 5949 | zone->pageset = &boot_pageset; |
5899 | } | 5950 | } |
@@ -6047,3 +6098,37 @@ void dump_page(struct page *page) | |||
6047 | dump_page_flags(page->flags); | 6098 | dump_page_flags(page->flags); |
6048 | mem_cgroup_print_bad_page(page); | 6099 | mem_cgroup_print_bad_page(page); |
6049 | } | 6100 | } |
6101 | |||
6102 | /* reset zone->present_pages */ | ||
6103 | void reset_zone_present_pages(void) | ||
6104 | { | ||
6105 | struct zone *z; | ||
6106 | int i, nid; | ||
6107 | |||
6108 | for_each_node_state(nid, N_HIGH_MEMORY) { | ||
6109 | for (i = 0; i < MAX_NR_ZONES; i++) { | ||
6110 | z = NODE_DATA(nid)->node_zones + i; | ||
6111 | z->present_pages = 0; | ||
6112 | } | ||
6113 | } | ||
6114 | } | ||
6115 | |||
6116 | /* calculate zone's present pages in buddy system */ | ||
6117 | void fixup_zone_present_pages(int nid, unsigned long start_pfn, | ||
6118 | unsigned long end_pfn) | ||
6119 | { | ||
6120 | struct zone *z; | ||
6121 | unsigned long zone_start_pfn, zone_end_pfn; | ||
6122 | int i; | ||
6123 | |||
6124 | for (i = 0; i < MAX_NR_ZONES; i++) { | ||
6125 | z = NODE_DATA(nid)->node_zones + i; | ||
6126 | zone_start_pfn = z->zone_start_pfn; | ||
6127 | zone_end_pfn = zone_start_pfn + z->spanned_pages; | ||
6128 | |||
6129 | /* if the two regions intersect */ | ||
6130 | if (!(zone_start_pfn >= end_pfn || zone_end_pfn <= start_pfn)) | ||
6131 | z->present_pages += min(end_pfn, zone_end_pfn) - | ||
6132 | max(start_pfn, zone_start_pfn); | ||
6133 | } | ||
6134 | } | ||
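fixup_zone_present_pages() credits a zone with the portion of the freed pfn range that falls inside the zone's span, i.e. the length of the intersection of two half-open ranges. The same arithmetic as a standalone helper (the pfn values in main() are invented samples):

#include <stdio.h>

/* Length of the intersection of the half-open pfn ranges [s1, e1) and [s2, e2). */
static unsigned long range_overlap(unsigned long s1, unsigned long e1,
				   unsigned long s2, unsigned long e2)
{
	unsigned long start = s1 > s2 ? s1 : s2;	/* max of the starts */
	unsigned long end   = e1 < e2 ? e1 : e2;	/* min of the ends */

	return end > start ? end - start : 0;
}

int main(void)
{
	/* A zone spanning pfns [4096, 8192) and a freed range [6000, 10000). */
	unsigned long zone_start = 4096, zone_end = 8192;
	unsigned long free_start = 6000, free_end = 10000;

	printf("pages credited to the zone: %lu\n",
	       range_overlap(zone_start, zone_end, free_start, free_end));
	return 0;
}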
diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 247d1f175739..f2f5b4818e94 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c | |||
@@ -76,8 +76,13 @@ int set_migratetype_isolate(struct page *page) | |||
76 | 76 | ||
77 | out: | 77 | out: |
78 | if (!ret) { | 78 | if (!ret) { |
79 | unsigned long nr_pages; | ||
80 | int migratetype = get_pageblock_migratetype(page); | ||
81 | |||
79 | set_pageblock_isolate(page); | 82 | set_pageblock_isolate(page); |
80 | move_freepages_block(zone, page, MIGRATE_ISOLATE); | 83 | nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE); |
84 | |||
85 | __mod_zone_freepage_state(zone, -nr_pages, migratetype); | ||
81 | } | 86 | } |
82 | 87 | ||
83 | spin_unlock_irqrestore(&zone->lock, flags); | 88 | spin_unlock_irqrestore(&zone->lock, flags); |
@@ -89,12 +94,14 @@ out: | |||
89 | void unset_migratetype_isolate(struct page *page, unsigned migratetype) | 94 | void unset_migratetype_isolate(struct page *page, unsigned migratetype) |
90 | { | 95 | { |
91 | struct zone *zone; | 96 | struct zone *zone; |
92 | unsigned long flags; | 97 | unsigned long flags, nr_pages; |
98 | |||
93 | zone = page_zone(page); | 99 | zone = page_zone(page); |
94 | spin_lock_irqsave(&zone->lock, flags); | 100 | spin_lock_irqsave(&zone->lock, flags); |
95 | if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) | 101 | if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) |
96 | goto out; | 102 | goto out; |
97 | move_freepages_block(zone, page, migratetype); | 103 | nr_pages = move_freepages_block(zone, page, migratetype); |
104 | __mod_zone_freepage_state(zone, nr_pages, migratetype); | ||
98 | restore_pageblock_isolate(page, migratetype); | 105 | restore_pageblock_isolate(page, migratetype); |
99 | out: | 106 | out: |
100 | spin_unlock_irqrestore(&zone->lock, flags); | 107 | spin_unlock_irqrestore(&zone->lock, flags); |
@@ -193,10 +200,25 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) | |||
193 | continue; | 200 | continue; |
194 | } | 201 | } |
195 | page = pfn_to_page(pfn); | 202 | page = pfn_to_page(pfn); |
196 | if (PageBuddy(page)) | 203 | if (PageBuddy(page)) { |
204 | /* | ||
205 | * If race between isolatation and allocation happens, | ||
206 | * some free pages could be in MIGRATE_MOVABLE list | ||
207 | * although pageblock's migratation type of the page | ||
208 | * is MIGRATE_ISOLATE. Catch it and move the page into | ||
209 | * MIGRATE_ISOLATE list. | ||
210 | */ | ||
211 | if (get_freepage_migratetype(page) != MIGRATE_ISOLATE) { | ||
212 | struct page *end_page; | ||
213 | |||
214 | end_page = page + (1 << page_order(page)) - 1; | ||
215 | move_freepages(page_zone(page), page, end_page, | ||
216 | MIGRATE_ISOLATE); | ||
217 | } | ||
197 | pfn += 1 << page_order(page); | 218 | pfn += 1 << page_order(page); |
219 | } | ||
198 | else if (page_count(page) == 0 && | 220 | else if (page_count(page) == 0 && |
199 | page_private(page) == MIGRATE_ISOLATE) | 221 | get_freepage_migratetype(page) == MIGRATE_ISOLATE) |
200 | pfn += 1; | 222 | pfn += 1; |
201 | else | 223 | else |
202 | break; | 224 | break; |
@@ -233,3 +255,14 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) | |||
233 | spin_unlock_irqrestore(&zone->lock, flags); | 255 | spin_unlock_irqrestore(&zone->lock, flags); |
234 | return ret ? 0 : -EBUSY; | 256 | return ret ? 0 : -EBUSY; |
235 | } | 257 | } |
258 | |||
259 | struct page *alloc_migrate_target(struct page *page, unsigned long private, | ||
260 | int **resultp) | ||
261 | { | ||
262 | gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; | ||
263 | |||
264 | if (PageHighMem(page)) | ||
265 | gfp_mask |= __GFP_HIGHMEM; | ||
266 | |||
267 | return alloc_page(gfp_mask); | ||
268 | } | ||
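
The two hunks at the top of this file keep the per-migratetype free-page counters consistent with isolation: move_freepages_block() reports how many free pages it re-filed, and exactly that amount is subtracted from the old migratetype's counter when a block is isolated and added back when it is released. A toy userspace model of that bookkeeping (the block and counter types here are invented for illustration, not kernel structures):

#include <stdio.h>

enum { MOVABLE, ISOLATE, NR_TYPES };

struct fake_block {
	int type;			/* current "pageblock migratetype" */
	long nr_free;			/* free pages sitting in this block */
};

static long nr_freepages[NR_TYPES];	/* allocator-visible counters */

/* Re-file a block's free pages under a new type; returns how many moved. */
static long move_block(struct fake_block *b, int new_type)
{
	b->type = new_type;
	return b->nr_free;
}

static void isolate_block(struct fake_block *b)
{
	int old = b->type;
	long moved = move_block(b, ISOLATE);

	nr_freepages[old] -= moved;	/* isolated pages stop counting as allocatable */
}

static void unisolate_block(struct fake_block *b, int new_type)
{
	long moved = move_block(b, new_type);

	nr_freepages[new_type] += moved; /* and start counting again on release */
}

int main(void)
{
	struct fake_block b = { .type = MOVABLE, .nr_free = 64 };

	nr_freepages[MOVABLE] = 64;
	isolate_block(&b);
	printf("movable free: %ld\n", nr_freepages[MOVABLE]);	/* 0 */
	unisolate_block(&b, MOVABLE);
	printf("movable free: %ld\n", nr_freepages[MOVABLE]);	/* 64 */
	return 0;
}
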
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 74c0ddaa6fa0..e642627da6b7 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c | |||
@@ -120,3 +120,53 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, | |||
120 | } | 120 | } |
121 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | 121 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
122 | #endif | 122 | #endif |
123 | |||
124 | #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT | ||
125 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
126 | void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable) | ||
127 | { | ||
128 | assert_spin_locked(&mm->page_table_lock); | ||
129 | |||
130 | /* FIFO */ | ||
131 | if (!mm->pmd_huge_pte) | ||
132 | INIT_LIST_HEAD(&pgtable->lru); | ||
133 | else | ||
134 | list_add(&pgtable->lru, &mm->pmd_huge_pte->lru); | ||
135 | mm->pmd_huge_pte = pgtable; | ||
136 | } | ||
137 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | ||
138 | #endif | ||
139 | |||
140 | #ifndef __HAVE_ARCH_PGTABLE_WITHDRAW | ||
141 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
142 | /* no "address" argument, so the page coloring of some arches is lost */ | ||
143 | pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm) | ||
144 | { | ||
145 | pgtable_t pgtable; | ||
146 | |||
147 | assert_spin_locked(&mm->page_table_lock); | ||
148 | |||
149 | /* FIFO */ | ||
150 | pgtable = mm->pmd_huge_pte; | ||
151 | if (list_empty(&pgtable->lru)) | ||
152 | mm->pmd_huge_pte = NULL; | ||
153 | else { | ||
154 | mm->pmd_huge_pte = list_entry(pgtable->lru.next, | ||
155 | struct page, lru); | ||
156 | list_del(&pgtable->lru); | ||
157 | } | ||
158 | return pgtable; | ||
159 | } | ||
160 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | ||
161 | #endif | ||
162 | |||
163 | #ifndef __HAVE_ARCH_PMDP_INVALIDATE | ||
164 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
165 | void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, | ||
166 | pmd_t *pmdp) | ||
167 | { | ||
168 | set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(*pmdp)); | ||
169 | flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); | ||
170 | } | ||
171 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | ||
172 | #endif | ||
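
pgtable_trans_huge_deposit()/withdraw() give the THP code a per-mm stash of preallocated page tables: one is deposited when a huge pmd is installed and withdrawn again when that pmd is split or zapped, always under mm->page_table_lock. A simplified userspace model of the stash discipline follows; it uses an explicit singly linked list instead of the kernel's list threaded through struct page->lru, so it does not reproduce the exact ordering of the code above:

#include <assert.h>
#include <stdio.h>

struct fake_pgtable {
	struct fake_pgtable *next;
	int id;
};

struct fake_mm {
	struct fake_pgtable *stash;	/* stand-in for mm->pmd_huge_pte */
};

/* Park a preallocated page table for a later huge-pmd split/zap. */
static void deposit(struct fake_mm *mm, struct fake_pgtable *pg)
{
	pg->next = mm->stash;
	mm->stash = pg;
}

/* Take one previously deposited page table back out of the stash. */
static struct fake_pgtable *withdraw(struct fake_mm *mm)
{
	struct fake_pgtable *pg = mm->stash;

	assert(pg);			/* a deposit must precede every withdraw */
	mm->stash = pg->next;
	return pg;
}

int main(void)
{
	struct fake_mm mm = { NULL };
	struct fake_pgtable a = { .id = 1 }, b = { .id = 2 };

	deposit(&mm, &a);
	deposit(&mm, &b);
	printf("%d\n", withdraw(&mm)->id);	/* 2 */
	printf("%d\n", withdraw(&mm)->id);	/* 1 */
	return 0;
}
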
diff --git a/mm/prio_tree.c b/mm/prio_tree.c deleted file mode 100644 index 799dcfd7cd8c..000000000000 --- a/mm/prio_tree.c +++ /dev/null | |||
@@ -1,208 +0,0 @@ | |||
1 | /* | ||
2 | * mm/prio_tree.c - priority search tree for mapping->i_mmap | ||
3 | * | ||
4 | * Copyright (C) 2004, Rajesh Venkatasubramanian <vrajesh@umich.edu> | ||
5 | * | ||
6 | * This file is released under the GPL v2. | ||
7 | * | ||
8 | * Based on the radix priority search tree proposed by Edward M. McCreight | ||
9 | * SIAM Journal of Computing, vol. 14, no.2, pages 257-276, May 1985 | ||
10 | * | ||
11 | * 02Feb2004 Initial version | ||
12 | */ | ||
13 | |||
14 | #include <linux/mm.h> | ||
15 | #include <linux/prio_tree.h> | ||
16 | #include <linux/prefetch.h> | ||
17 | |||
18 | /* | ||
19 | * See lib/prio_tree.c for details on the general radix priority search tree | ||
20 | * code. | ||
21 | */ | ||
22 | |||
23 | /* | ||
24 | * The following #defines are mirrored from lib/prio_tree.c. They're only used | ||
25 | * for debugging, and should be removed (along with the debugging code using | ||
26 | * them) when switching also VMAs to the regular prio_tree code. | ||
27 | */ | ||
28 | |||
29 | #define RADIX_INDEX(vma) ((vma)->vm_pgoff) | ||
30 | #define VMA_SIZE(vma) (((vma)->vm_end - (vma)->vm_start) >> PAGE_SHIFT) | ||
31 | /* avoid overflow */ | ||
32 | #define HEAP_INDEX(vma) ((vma)->vm_pgoff + (VMA_SIZE(vma) - 1)) | ||
33 | |||
34 | /* | ||
35 | * Radix priority search tree for address_space->i_mmap | ||
36 | * | ||
37 | * For each vma that map a unique set of file pages i.e., unique [radix_index, | ||
38 | * heap_index] value, we have a corresponding priority search tree node. If | ||
39 | * multiple vmas have identical [radix_index, heap_index] value, then one of | ||
40 | * them is used as a tree node and others are stored in a vm_set list. The tree | ||
41 | * node points to the first vma (head) of the list using vm_set.head. | ||
42 | * | ||
43 | * prio_tree_root | ||
44 | * | | ||
45 | * A vm_set.head | ||
46 | * / \ / | ||
47 | * L R -> H-I-J-K-M-N-O-P-Q-S | ||
48 | * ^ ^ <-- vm_set.list --> | ||
49 | * tree nodes | ||
50 | * | ||
51 | * We need some way to identify whether a vma is a tree node, head of a vm_set | ||
52 | * list, or just a member of a vm_set list. We cannot use vm_flags to store | ||
53 | * such information. The reason is, in the above figure, it is possible that | ||
54 | * vm_flags' of R and H are covered by the different mmap_sems. When R is | ||
55 | * removed under R->mmap_sem, H replaces R as a tree node. Since we do not hold | ||
56 | * H->mmap_sem, we cannot use H->vm_flags for marking that H is a tree node now. | ||
57 | * That's why some trick involving shared.vm_set.parent is used for identifying | ||
58 | * tree nodes and list head nodes. | ||
59 | * | ||
60 | * vma radix priority search tree node rules: | ||
61 | * | ||
62 | * vma->shared.vm_set.parent != NULL ==> a tree node | ||
63 | * vma->shared.vm_set.head != NULL ==> list of others mapping same range | ||
64 | * vma->shared.vm_set.head == NULL ==> no others map the same range | ||
65 | * | ||
66 | * vma->shared.vm_set.parent == NULL | ||
67 | * vma->shared.vm_set.head != NULL ==> list head of vmas mapping same range | ||
68 | * vma->shared.vm_set.head == NULL ==> a list node | ||
69 | */ | ||
70 | |||
71 | /* | ||
72 | * Add a new vma known to map the same set of pages as the old vma: | ||
73 | * useful for fork's dup_mmap as well as vma_prio_tree_insert below. | ||
74 | * Note that it just happens to work correctly on i_mmap_nonlinear too. | ||
75 | */ | ||
76 | void vma_prio_tree_add(struct vm_area_struct *vma, struct vm_area_struct *old) | ||
77 | { | ||
78 | /* Leave these BUG_ONs till prio_tree patch stabilizes */ | ||
79 | BUG_ON(RADIX_INDEX(vma) != RADIX_INDEX(old)); | ||
80 | BUG_ON(HEAP_INDEX(vma) != HEAP_INDEX(old)); | ||
81 | |||
82 | vma->shared.vm_set.head = NULL; | ||
83 | vma->shared.vm_set.parent = NULL; | ||
84 | |||
85 | if (!old->shared.vm_set.parent) | ||
86 | list_add(&vma->shared.vm_set.list, | ||
87 | &old->shared.vm_set.list); | ||
88 | else if (old->shared.vm_set.head) | ||
89 | list_add_tail(&vma->shared.vm_set.list, | ||
90 | &old->shared.vm_set.head->shared.vm_set.list); | ||
91 | else { | ||
92 | INIT_LIST_HEAD(&vma->shared.vm_set.list); | ||
93 | vma->shared.vm_set.head = old; | ||
94 | old->shared.vm_set.head = vma; | ||
95 | } | ||
96 | } | ||
97 | |||
98 | void vma_prio_tree_insert(struct vm_area_struct *vma, | ||
99 | struct prio_tree_root *root) | ||
100 | { | ||
101 | struct prio_tree_node *ptr; | ||
102 | struct vm_area_struct *old; | ||
103 | |||
104 | vma->shared.vm_set.head = NULL; | ||
105 | |||
106 | ptr = raw_prio_tree_insert(root, &vma->shared.prio_tree_node); | ||
107 | if (ptr != (struct prio_tree_node *) &vma->shared.prio_tree_node) { | ||
108 | old = prio_tree_entry(ptr, struct vm_area_struct, | ||
109 | shared.prio_tree_node); | ||
110 | vma_prio_tree_add(vma, old); | ||
111 | } | ||
112 | } | ||
113 | |||
114 | void vma_prio_tree_remove(struct vm_area_struct *vma, | ||
115 | struct prio_tree_root *root) | ||
116 | { | ||
117 | struct vm_area_struct *node, *head, *new_head; | ||
118 | |||
119 | if (!vma->shared.vm_set.head) { | ||
120 | if (!vma->shared.vm_set.parent) | ||
121 | list_del_init(&vma->shared.vm_set.list); | ||
122 | else | ||
123 | raw_prio_tree_remove(root, &vma->shared.prio_tree_node); | ||
124 | } else { | ||
125 | /* Leave this BUG_ON till prio_tree patch stabilizes */ | ||
126 | BUG_ON(vma->shared.vm_set.head->shared.vm_set.head != vma); | ||
127 | if (vma->shared.vm_set.parent) { | ||
128 | head = vma->shared.vm_set.head; | ||
129 | if (!list_empty(&head->shared.vm_set.list)) { | ||
130 | new_head = list_entry( | ||
131 | head->shared.vm_set.list.next, | ||
132 | struct vm_area_struct, | ||
133 | shared.vm_set.list); | ||
134 | list_del_init(&head->shared.vm_set.list); | ||
135 | } else | ||
136 | new_head = NULL; | ||
137 | |||
138 | raw_prio_tree_replace(root, &vma->shared.prio_tree_node, | ||
139 | &head->shared.prio_tree_node); | ||
140 | head->shared.vm_set.head = new_head; | ||
141 | if (new_head) | ||
142 | new_head->shared.vm_set.head = head; | ||
143 | |||
144 | } else { | ||
145 | node = vma->shared.vm_set.head; | ||
146 | if (!list_empty(&vma->shared.vm_set.list)) { | ||
147 | new_head = list_entry( | ||
148 | vma->shared.vm_set.list.next, | ||
149 | struct vm_area_struct, | ||
150 | shared.vm_set.list); | ||
151 | list_del_init(&vma->shared.vm_set.list); | ||
152 | node->shared.vm_set.head = new_head; | ||
153 | new_head->shared.vm_set.head = node; | ||
154 | } else | ||
155 | node->shared.vm_set.head = NULL; | ||
156 | } | ||
157 | } | ||
158 | } | ||
159 | |||
160 | /* | ||
161 | * Helper function to enumerate vmas that map a given file page or a set of | ||
162 | * contiguous file pages. The function returns vmas that at least map a single | ||
163 | * page in the given range of contiguous file pages. | ||
164 | */ | ||
165 | struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma, | ||
166 | struct prio_tree_iter *iter) | ||
167 | { | ||
168 | struct prio_tree_node *ptr; | ||
169 | struct vm_area_struct *next; | ||
170 | |||
171 | if (!vma) { | ||
172 | /* | ||
173 | * First call is with NULL vma | ||
174 | */ | ||
175 | ptr = prio_tree_next(iter); | ||
176 | if (ptr) { | ||
177 | next = prio_tree_entry(ptr, struct vm_area_struct, | ||
178 | shared.prio_tree_node); | ||
179 | prefetch(next->shared.vm_set.head); | ||
180 | return next; | ||
181 | } else | ||
182 | return NULL; | ||
183 | } | ||
184 | |||
185 | if (vma->shared.vm_set.parent) { | ||
186 | if (vma->shared.vm_set.head) { | ||
187 | next = vma->shared.vm_set.head; | ||
188 | prefetch(next->shared.vm_set.list.next); | ||
189 | return next; | ||
190 | } | ||
191 | } else { | ||
192 | next = list_entry(vma->shared.vm_set.list.next, | ||
193 | struct vm_area_struct, shared.vm_set.list); | ||
194 | if (!next->shared.vm_set.head) { | ||
195 | prefetch(next->shared.vm_set.list.next); | ||
196 | return next; | ||
197 | } | ||
198 | } | ||
199 | |||
200 | ptr = prio_tree_next(iter); | ||
201 | if (ptr) { | ||
202 | next = prio_tree_entry(ptr, struct vm_area_struct, | ||
203 | shared.prio_tree_node); | ||
204 | prefetch(next->shared.vm_set.head); | ||
205 | return next; | ||
206 | } else | ||
207 | return NULL; | ||
208 | } | ||
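
With mm/prio_tree.c gone, both the file's i_mmap and each anon_vma's rb_root hold interval trees over page offsets: a vma covering nrpages file pages starting at vm_pgoff is stored as the closed interval [vm_pgoff, vm_pgoff + nrpages - 1], and the *_interval_tree_foreach() loops in the rmap hunks below are stabbing queries against those trees. The overlap test such a query reduces to is just the following (a standalone sketch with assumed names, not kernel code):

#include <stdbool.h>
#include <stdio.h>

/*
 * A vma spanning 'nrpages' file pages starting at 'vm_pgoff' overlaps the
 * query interval [first, last] (closed, in page units) iff neither range
 * ends before the other begins.
 */
static bool vma_overlaps(unsigned long vm_pgoff, unsigned long nrpages,
			 unsigned long first, unsigned long last)
{
	unsigned long vm_last = vm_pgoff + nrpages - 1;

	return vm_pgoff <= last && first <= vm_last;
}

int main(void)
{
	/* a 16-page mapping at file offset 0x10 does cover page 0x13 ... */
	printf("%d\n", vma_overlaps(0x10, 16, 0x13, 0x13));	/* 1 */
	/* ... but a mapping starting at 0x20 does not */
	printf("%d\n", vma_overlaps(0x20, 16, 0x13, 0x13));	/* 0 */
	return 0;
}
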
@@ -127,12 +127,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, | |||
127 | avc->vma = vma; | 127 | avc->vma = vma; |
128 | avc->anon_vma = anon_vma; | 128 | avc->anon_vma = anon_vma; |
129 | list_add(&avc->same_vma, &vma->anon_vma_chain); | 129 | list_add(&avc->same_vma, &vma->anon_vma_chain); |
130 | 130 | anon_vma_interval_tree_insert(avc, &anon_vma->rb_root); | |
131 | /* | ||
132 | * It's critical to add new vmas to the tail of the anon_vma, | ||
133 | * see comment in huge_memory.c:__split_huge_page(). | ||
134 | */ | ||
135 | list_add_tail(&avc->same_anon_vma, &anon_vma->head); | ||
136 | } | 131 | } |
137 | 132 | ||
138 | /** | 133 | /** |
@@ -269,51 +264,6 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) | |||
269 | } | 264 | } |
270 | 265 | ||
271 | /* | 266 | /* |
272 | * Some rmap walk that needs to find all ptes/hugepmds without false | ||
273 | * negatives (like migrate and split_huge_page) running concurrent | ||
274 | * with operations that copy or move pagetables (like mremap() and | ||
275 | * fork()) to be safe. They depend on the anon_vma "same_anon_vma" | ||
276 | * list to be in a certain order: the dst_vma must be placed after the | ||
277 | * src_vma in the list. This is always guaranteed by fork() but | ||
278 | * mremap() needs to call this function to enforce it in case the | ||
279 | * dst_vma isn't newly allocated and chained with the anon_vma_clone() | ||
280 | * function but just an extension of a pre-existing vma through | ||
281 | * vma_merge. | ||
282 | * | ||
283 | * NOTE: the same_anon_vma list can still be changed by other | ||
284 | * processes while mremap runs because mremap doesn't hold the | ||
285 | * anon_vma mutex to prevent modifications to the list while it | ||
286 | * runs. All we need to enforce is that the relative order of this | ||
287 | * process vmas isn't changing (we don't care about other vmas | ||
288 | * order). Each vma corresponds to an anon_vma_chain structure so | ||
289 | * there's no risk that other processes calling anon_vma_moveto_tail() | ||
290 | * and changing the same_anon_vma list under mremap() will screw with | ||
291 | * the relative order of this process vmas in the list, because we | ||
292 | * they can't alter the order of any vma that belongs to this | ||
293 | * process. And there can't be another anon_vma_moveto_tail() running | ||
294 | * concurrently with mremap() coming from this process because we hold | ||
295 | * the mmap_sem for the whole mremap(). fork() ordering dependency | ||
296 | * also shouldn't be affected because fork() only cares that the | ||
297 | * parent vmas are placed in the list before the child vmas and | ||
298 | * anon_vma_moveto_tail() won't reorder vmas from either the fork() | ||
299 | * parent or child. | ||
300 | */ | ||
301 | void anon_vma_moveto_tail(struct vm_area_struct *dst) | ||
302 | { | ||
303 | struct anon_vma_chain *pavc; | ||
304 | struct anon_vma *root = NULL; | ||
305 | |||
306 | list_for_each_entry_reverse(pavc, &dst->anon_vma_chain, same_vma) { | ||
307 | struct anon_vma *anon_vma = pavc->anon_vma; | ||
308 | VM_BUG_ON(pavc->vma != dst); | ||
309 | root = lock_anon_vma_root(root, anon_vma); | ||
310 | list_del(&pavc->same_anon_vma); | ||
311 | list_add_tail(&pavc->same_anon_vma, &anon_vma->head); | ||
312 | } | ||
313 | unlock_anon_vma_root(root); | ||
314 | } | ||
315 | |||
316 | /* | ||
317 | * Attach vma to its own anon_vma, as well as to the anon_vmas that | 267 | * Attach vma to its own anon_vma, as well as to the anon_vmas that |
318 | * the corresponding VMA in the parent process is attached to. | 268 | * the corresponding VMA in the parent process is attached to. |
319 | * Returns 0 on success, non-zero on failure. | 269 | * Returns 0 on success, non-zero on failure. |
@@ -381,13 +331,13 @@ void unlink_anon_vmas(struct vm_area_struct *vma) | |||
381 | struct anon_vma *anon_vma = avc->anon_vma; | 331 | struct anon_vma *anon_vma = avc->anon_vma; |
382 | 332 | ||
383 | root = lock_anon_vma_root(root, anon_vma); | 333 | root = lock_anon_vma_root(root, anon_vma); |
384 | list_del(&avc->same_anon_vma); | 334 | anon_vma_interval_tree_remove(avc, &anon_vma->rb_root); |
385 | 335 | ||
386 | /* | 336 | /* |
387 | * Leave empty anon_vmas on the list - we'll need | 337 | * Leave empty anon_vmas on the list - we'll need |
388 | * to free them outside the lock. | 338 | * to free them outside the lock. |
389 | */ | 339 | */ |
390 | if (list_empty(&anon_vma->head)) | 340 | if (RB_EMPTY_ROOT(&anon_vma->rb_root)) |
391 | continue; | 341 | continue; |
392 | 342 | ||
393 | list_del(&avc->same_vma); | 343 | list_del(&avc->same_vma); |
@@ -416,7 +366,7 @@ static void anon_vma_ctor(void *data) | |||
416 | 366 | ||
417 | mutex_init(&anon_vma->mutex); | 367 | mutex_init(&anon_vma->mutex); |
418 | atomic_set(&anon_vma->refcount, 0); | 368 | atomic_set(&anon_vma->refcount, 0); |
419 | INIT_LIST_HEAD(&anon_vma->head); | 369 | anon_vma->rb_root = RB_ROOT; |
420 | } | 370 | } |
421 | 371 | ||
422 | void __init anon_vma_init(void) | 372 | void __init anon_vma_init(void) |
@@ -560,22 +510,26 @@ void page_unlock_anon_vma(struct anon_vma *anon_vma) | |||
560 | 510 | ||
561 | /* | 511 | /* |
562 | * At what user virtual address is page expected in @vma? | 512 | * At what user virtual address is page expected in @vma? |
563 | * Returns virtual address or -EFAULT if page's index/offset is not | ||
564 | * within the range mapped the @vma. | ||
565 | */ | 513 | */ |
566 | inline unsigned long | 514 | static inline unsigned long |
567 | vma_address(struct page *page, struct vm_area_struct *vma) | 515 | __vma_address(struct page *page, struct vm_area_struct *vma) |
568 | { | 516 | { |
569 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 517 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
570 | unsigned long address; | ||
571 | 518 | ||
572 | if (unlikely(is_vm_hugetlb_page(vma))) | 519 | if (unlikely(is_vm_hugetlb_page(vma))) |
573 | pgoff = page->index << huge_page_order(page_hstate(page)); | 520 | pgoff = page->index << huge_page_order(page_hstate(page)); |
574 | address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); | 521 | |
575 | if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { | 522 | return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); |
576 | /* page should be within @vma mapping range */ | 523 | } |
577 | return -EFAULT; | 524 | |
578 | } | 525 | inline unsigned long |
526 | vma_address(struct page *page, struct vm_area_struct *vma) | ||
527 | { | ||
528 | unsigned long address = __vma_address(page, vma); | ||
529 | |||
530 | /* page should be within @vma mapping range */ | ||
531 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); | ||
532 | |||
579 | return address; | 533 | return address; |
580 | } | 534 | } |
581 | 535 | ||
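
The hunk above splits the helper so that the arithmetic (__vma_address) is separate from the sanity check (the VM_BUG_ON in vma_address); callers that can legitimately be handed an out-of-range page now do their own bounds check on __vma_address(), as the next two hunks show. The arithmetic itself is the usual reverse mapping: the page's file offset minus the vma's starting offset, scaled to bytes and added to vm_start. A worked standalone example with made-up numbers:

#include <stdio.h>

#define PAGE_SHIFT 12UL

/* address = vm_start + ((pgoff - vm_pgoff) << PAGE_SHIFT) */
static unsigned long fake_vma_address(unsigned long vm_start,
				      unsigned long vm_pgoff,
				      unsigned long pgoff)
{
	return vm_start + ((pgoff - vm_pgoff) << PAGE_SHIFT);
}

int main(void)
{
	/* vma mapped at 0x400000, starting at file page 0x10; page index 0x13 */
	printf("0x%lx\n", fake_vma_address(0x400000, 0x10, 0x13)); /* 0x403000 */
	return 0;
}
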
@@ -585,6 +539,7 @@ vma_address(struct page *page, struct vm_area_struct *vma) | |||
585 | */ | 539 | */ |
586 | unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) | 540 | unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) |
587 | { | 541 | { |
542 | unsigned long address; | ||
588 | if (PageAnon(page)) { | 543 | if (PageAnon(page)) { |
589 | struct anon_vma *page__anon_vma = page_anon_vma(page); | 544 | struct anon_vma *page__anon_vma = page_anon_vma(page); |
590 | /* | 545 | /* |
@@ -600,7 +555,10 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) | |||
600 | return -EFAULT; | 555 | return -EFAULT; |
601 | } else | 556 | } else |
602 | return -EFAULT; | 557 | return -EFAULT; |
603 | return vma_address(page, vma); | 558 | address = __vma_address(page, vma); |
559 | if (unlikely(address < vma->vm_start || address >= vma->vm_end)) | ||
560 | return -EFAULT; | ||
561 | return address; | ||
604 | } | 562 | } |
605 | 563 | ||
606 | /* | 564 | /* |
@@ -674,8 +632,8 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) | |||
674 | pte_t *pte; | 632 | pte_t *pte; |
675 | spinlock_t *ptl; | 633 | spinlock_t *ptl; |
676 | 634 | ||
677 | address = vma_address(page, vma); | 635 | address = __vma_address(page, vma); |
678 | if (address == -EFAULT) /* out of vma range */ | 636 | if (unlikely(address < vma->vm_start || address >= vma->vm_end)) |
679 | return 0; | 637 | return 0; |
680 | pte = page_check_address(page, vma->vm_mm, address, &ptl, 1); | 638 | pte = page_check_address(page, vma->vm_mm, address, &ptl, 1); |
681 | if (!pte) /* the page is not in this mm */ | 639 | if (!pte) /* the page is not in this mm */ |
@@ -769,6 +727,7 @@ static int page_referenced_anon(struct page *page, | |||
769 | { | 727 | { |
770 | unsigned int mapcount; | 728 | unsigned int mapcount; |
771 | struct anon_vma *anon_vma; | 729 | struct anon_vma *anon_vma; |
730 | pgoff_t pgoff; | ||
772 | struct anon_vma_chain *avc; | 731 | struct anon_vma_chain *avc; |
773 | int referenced = 0; | 732 | int referenced = 0; |
774 | 733 | ||
@@ -777,11 +736,10 @@ static int page_referenced_anon(struct page *page, | |||
777 | return referenced; | 736 | return referenced; |
778 | 737 | ||
779 | mapcount = page_mapcount(page); | 738 | mapcount = page_mapcount(page); |
780 | list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { | 739 | pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
740 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { | ||
781 | struct vm_area_struct *vma = avc->vma; | 741 | struct vm_area_struct *vma = avc->vma; |
782 | unsigned long address = vma_address(page, vma); | 742 | unsigned long address = vma_address(page, vma); |
783 | if (address == -EFAULT) | ||
784 | continue; | ||
785 | /* | 743 | /* |
786 | * If we are reclaiming on behalf of a cgroup, skip | 744 | * If we are reclaiming on behalf of a cgroup, skip |
787 | * counting on behalf of references from different | 745 | * counting on behalf of references from different |
@@ -820,7 +778,6 @@ static int page_referenced_file(struct page *page, | |||
820 | struct address_space *mapping = page->mapping; | 778 | struct address_space *mapping = page->mapping; |
821 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 779 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
822 | struct vm_area_struct *vma; | 780 | struct vm_area_struct *vma; |
823 | struct prio_tree_iter iter; | ||
824 | int referenced = 0; | 781 | int referenced = 0; |
825 | 782 | ||
826 | /* | 783 | /* |
@@ -846,10 +803,8 @@ static int page_referenced_file(struct page *page, | |||
846 | */ | 803 | */ |
847 | mapcount = page_mapcount(page); | 804 | mapcount = page_mapcount(page); |
848 | 805 | ||
849 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 806 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { |
850 | unsigned long address = vma_address(page, vma); | 807 | unsigned long address = vma_address(page, vma); |
851 | if (address == -EFAULT) | ||
852 | continue; | ||
853 | /* | 808 | /* |
854 | * If we are reclaiming on behalf of a cgroup, skip | 809 | * If we are reclaiming on behalf of a cgroup, skip |
855 | * counting on behalf of references from different | 810 | * counting on behalf of references from different |
@@ -929,7 +884,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, | |||
929 | pte_t entry; | 884 | pte_t entry; |
930 | 885 | ||
931 | flush_cache_page(vma, address, pte_pfn(*pte)); | 886 | flush_cache_page(vma, address, pte_pfn(*pte)); |
932 | entry = ptep_clear_flush_notify(vma, address, pte); | 887 | entry = ptep_clear_flush(vma, address, pte); |
933 | entry = pte_wrprotect(entry); | 888 | entry = pte_wrprotect(entry); |
934 | entry = pte_mkclean(entry); | 889 | entry = pte_mkclean(entry); |
935 | set_pte_at(mm, address, pte, entry); | 890 | set_pte_at(mm, address, pte, entry); |
@@ -937,6 +892,9 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, | |||
937 | } | 892 | } |
938 | 893 | ||
939 | pte_unmap_unlock(pte, ptl); | 894 | pte_unmap_unlock(pte, ptl); |
895 | |||
896 | if (ret) | ||
897 | mmu_notifier_invalidate_page(mm, address); | ||
940 | out: | 898 | out: |
941 | return ret; | 899 | return ret; |
942 | } | 900 | } |
@@ -945,17 +903,14 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page) | |||
945 | { | 903 | { |
946 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 904 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
947 | struct vm_area_struct *vma; | 905 | struct vm_area_struct *vma; |
948 | struct prio_tree_iter iter; | ||
949 | int ret = 0; | 906 | int ret = 0; |
950 | 907 | ||
951 | BUG_ON(PageAnon(page)); | 908 | BUG_ON(PageAnon(page)); |
952 | 909 | ||
953 | mutex_lock(&mapping->i_mmap_mutex); | 910 | mutex_lock(&mapping->i_mmap_mutex); |
954 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 911 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { |
955 | if (vma->vm_flags & VM_SHARED) { | 912 | if (vma->vm_flags & VM_SHARED) { |
956 | unsigned long address = vma_address(page, vma); | 913 | unsigned long address = vma_address(page, vma); |
957 | if (address == -EFAULT) | ||
958 | continue; | ||
959 | ret += page_mkclean_one(page, vma, address); | 914 | ret += page_mkclean_one(page, vma, address); |
960 | } | 915 | } |
961 | } | 916 | } |
@@ -1128,7 +1083,7 @@ void page_add_new_anon_rmap(struct page *page, | |||
1128 | else | 1083 | else |
1129 | __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); | 1084 | __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); |
1130 | __page_set_anon_rmap(page, vma, address, 1); | 1085 | __page_set_anon_rmap(page, vma, address, 1); |
1131 | if (page_evictable(page, vma)) | 1086 | if (!mlocked_vma_newpage(vma, page)) |
1132 | lru_cache_add_lru(page, LRU_ACTIVE_ANON); | 1087 | lru_cache_add_lru(page, LRU_ACTIVE_ANON); |
1133 | else | 1088 | else |
1134 | add_page_to_unevictable_list(page); | 1089 | add_page_to_unevictable_list(page); |
@@ -1203,7 +1158,10 @@ void page_remove_rmap(struct page *page) | |||
1203 | } else { | 1158 | } else { |
1204 | __dec_zone_page_state(page, NR_FILE_MAPPED); | 1159 | __dec_zone_page_state(page, NR_FILE_MAPPED); |
1205 | mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED); | 1160 | mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED); |
1161 | mem_cgroup_end_update_page_stat(page, &locked, &flags); | ||
1206 | } | 1162 | } |
1163 | if (unlikely(PageMlocked(page))) | ||
1164 | clear_page_mlock(page); | ||
1207 | /* | 1165 | /* |
1208 | * It would be tidy to reset the PageAnon mapping here, | 1166 | * It would be tidy to reset the PageAnon mapping here, |
1209 | * but that might overwrite a racing page_add_anon_rmap | 1167 | * but that might overwrite a racing page_add_anon_rmap |
@@ -1213,6 +1171,7 @@ void page_remove_rmap(struct page *page) | |||
1213 | * Leaving it set also helps swapoff to reinstate ptes | 1171 | * Leaving it set also helps swapoff to reinstate ptes |
1214 | * faster for those pages still in swapcache. | 1172 | * faster for those pages still in swapcache. |
1215 | */ | 1173 | */ |
1174 | return; | ||
1216 | out: | 1175 | out: |
1217 | if (!anon) | 1176 | if (!anon) |
1218 | mem_cgroup_end_update_page_stat(page, &locked, &flags); | 1177 | mem_cgroup_end_update_page_stat(page, &locked, &flags); |
@@ -1256,7 +1215,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1256 | 1215 | ||
1257 | /* Nuke the page table entry. */ | 1216 | /* Nuke the page table entry. */ |
1258 | flush_cache_page(vma, address, page_to_pfn(page)); | 1217 | flush_cache_page(vma, address, page_to_pfn(page)); |
1259 | pteval = ptep_clear_flush_notify(vma, address, pte); | 1218 | pteval = ptep_clear_flush(vma, address, pte); |
1260 | 1219 | ||
1261 | /* Move the dirty bit to the physical page now the pte is gone. */ | 1220 | /* Move the dirty bit to the physical page now the pte is gone. */ |
1262 | if (pte_dirty(pteval)) | 1221 | if (pte_dirty(pteval)) |
@@ -1318,6 +1277,8 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1318 | 1277 | ||
1319 | out_unmap: | 1278 | out_unmap: |
1320 | pte_unmap_unlock(pte, ptl); | 1279 | pte_unmap_unlock(pte, ptl); |
1280 | if (ret != SWAP_FAIL) | ||
1281 | mmu_notifier_invalidate_page(mm, address); | ||
1321 | out: | 1282 | out: |
1322 | return ret; | 1283 | return ret; |
1323 | 1284 | ||
@@ -1382,6 +1343,8 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
1382 | spinlock_t *ptl; | 1343 | spinlock_t *ptl; |
1383 | struct page *page; | 1344 | struct page *page; |
1384 | unsigned long address; | 1345 | unsigned long address; |
1346 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
1347 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
1385 | unsigned long end; | 1348 | unsigned long end; |
1386 | int ret = SWAP_AGAIN; | 1349 | int ret = SWAP_AGAIN; |
1387 | int locked_vma = 0; | 1350 | int locked_vma = 0; |
@@ -1405,6 +1368,10 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
1405 | if (!pmd_present(*pmd)) | 1368 | if (!pmd_present(*pmd)) |
1406 | return ret; | 1369 | return ret; |
1407 | 1370 | ||
1371 | mmun_start = address; | ||
1372 | mmun_end = end; | ||
1373 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
1374 | |||
1408 | /* | 1375 | /* |
1409 | * If we can acquire the mmap_sem for read, and vma is VM_LOCKED, | 1376 | * If we can acquire the mmap_sem for read, and vma is VM_LOCKED, |
1410 | * keep the sem while scanning the cluster for mlocking pages. | 1377 | * keep the sem while scanning the cluster for mlocking pages. |
@@ -1438,7 +1405,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
1438 | 1405 | ||
1439 | /* Nuke the page table entry. */ | 1406 | /* Nuke the page table entry. */ |
1440 | flush_cache_page(vma, address, pte_pfn(*pte)); | 1407 | flush_cache_page(vma, address, pte_pfn(*pte)); |
1441 | pteval = ptep_clear_flush_notify(vma, address, pte); | 1408 | pteval = ptep_clear_flush(vma, address, pte); |
1442 | 1409 | ||
1443 | /* If nonlinear, store the file page offset in the pte. */ | 1410 | /* If nonlinear, store the file page offset in the pte. */ |
1444 | if (page->index != linear_page_index(vma, address)) | 1411 | if (page->index != linear_page_index(vma, address)) |
@@ -1454,6 +1421,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
1454 | (*mapcount)--; | 1421 | (*mapcount)--; |
1455 | } | 1422 | } |
1456 | pte_unmap_unlock(pte - 1, ptl); | 1423 | pte_unmap_unlock(pte - 1, ptl); |
1424 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
1457 | if (locked_vma) | 1425 | if (locked_vma) |
1458 | up_read(&vma->vm_mm->mmap_sem); | 1426 | up_read(&vma->vm_mm->mmap_sem); |
1459 | return ret; | 1427 | return ret; |
@@ -1492,6 +1460,7 @@ bool is_vma_temporary_stack(struct vm_area_struct *vma) | |||
1492 | static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) | 1460 | static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) |
1493 | { | 1461 | { |
1494 | struct anon_vma *anon_vma; | 1462 | struct anon_vma *anon_vma; |
1463 | pgoff_t pgoff; | ||
1495 | struct anon_vma_chain *avc; | 1464 | struct anon_vma_chain *avc; |
1496 | int ret = SWAP_AGAIN; | 1465 | int ret = SWAP_AGAIN; |
1497 | 1466 | ||
@@ -1499,7 +1468,8 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) | |||
1499 | if (!anon_vma) | 1468 | if (!anon_vma) |
1500 | return ret; | 1469 | return ret; |
1501 | 1470 | ||
1502 | list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { | 1471 | pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
1472 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { | ||
1503 | struct vm_area_struct *vma = avc->vma; | 1473 | struct vm_area_struct *vma = avc->vma; |
1504 | unsigned long address; | 1474 | unsigned long address; |
1505 | 1475 | ||
@@ -1516,8 +1486,6 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) | |||
1516 | continue; | 1486 | continue; |
1517 | 1487 | ||
1518 | address = vma_address(page, vma); | 1488 | address = vma_address(page, vma); |
1519 | if (address == -EFAULT) | ||
1520 | continue; | ||
1521 | ret = try_to_unmap_one(page, vma, address, flags); | 1489 | ret = try_to_unmap_one(page, vma, address, flags); |
1522 | if (ret != SWAP_AGAIN || !page_mapped(page)) | 1490 | if (ret != SWAP_AGAIN || !page_mapped(page)) |
1523 | break; | 1491 | break; |
@@ -1547,7 +1515,6 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1547 | struct address_space *mapping = page->mapping; | 1515 | struct address_space *mapping = page->mapping; |
1548 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 1516 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
1549 | struct vm_area_struct *vma; | 1517 | struct vm_area_struct *vma; |
1550 | struct prio_tree_iter iter; | ||
1551 | int ret = SWAP_AGAIN; | 1518 | int ret = SWAP_AGAIN; |
1552 | unsigned long cursor; | 1519 | unsigned long cursor; |
1553 | unsigned long max_nl_cursor = 0; | 1520 | unsigned long max_nl_cursor = 0; |
@@ -1555,10 +1522,8 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1555 | unsigned int mapcount; | 1522 | unsigned int mapcount; |
1556 | 1523 | ||
1557 | mutex_lock(&mapping->i_mmap_mutex); | 1524 | mutex_lock(&mapping->i_mmap_mutex); |
1558 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 1525 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { |
1559 | unsigned long address = vma_address(page, vma); | 1526 | unsigned long address = vma_address(page, vma); |
1560 | if (address == -EFAULT) | ||
1561 | continue; | ||
1562 | ret = try_to_unmap_one(page, vma, address, flags); | 1527 | ret = try_to_unmap_one(page, vma, address, flags); |
1563 | if (ret != SWAP_AGAIN || !page_mapped(page)) | 1528 | if (ret != SWAP_AGAIN || !page_mapped(page)) |
1564 | goto out; | 1529 | goto out; |
@@ -1576,7 +1541,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1576 | goto out; | 1541 | goto out; |
1577 | 1542 | ||
1578 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, | 1543 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, |
1579 | shared.vm_set.list) { | 1544 | shared.nonlinear) { |
1580 | cursor = (unsigned long) vma->vm_private_data; | 1545 | cursor = (unsigned long) vma->vm_private_data; |
1581 | if (cursor > max_nl_cursor) | 1546 | if (cursor > max_nl_cursor) |
1582 | max_nl_cursor = cursor; | 1547 | max_nl_cursor = cursor; |
@@ -1608,7 +1573,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1608 | 1573 | ||
1609 | do { | 1574 | do { |
1610 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, | 1575 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, |
1611 | shared.vm_set.list) { | 1576 | shared.nonlinear) { |
1612 | cursor = (unsigned long) vma->vm_private_data; | 1577 | cursor = (unsigned long) vma->vm_private_data; |
1613 | while ( cursor < max_nl_cursor && | 1578 | while ( cursor < max_nl_cursor && |
1614 | cursor < vma->vm_end - vma->vm_start) { | 1579 | cursor < vma->vm_end - vma->vm_start) { |
@@ -1631,7 +1596,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1631 | * in locked vmas). Reset cursor on all unreserved nonlinear | 1596 | * in locked vmas). Reset cursor on all unreserved nonlinear |
1632 | * vmas, now forgetting on which ones it had fallen behind. | 1597 | * vmas, now forgetting on which ones it had fallen behind. |
1633 | */ | 1598 | */ |
1634 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) | 1599 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear) |
1635 | vma->vm_private_data = NULL; | 1600 | vma->vm_private_data = NULL; |
1636 | out: | 1601 | out: |
1637 | mutex_unlock(&mapping->i_mmap_mutex); | 1602 | mutex_unlock(&mapping->i_mmap_mutex); |
@@ -1716,6 +1681,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, | |||
1716 | struct vm_area_struct *, unsigned long, void *), void *arg) | 1681 | struct vm_area_struct *, unsigned long, void *), void *arg) |
1717 | { | 1682 | { |
1718 | struct anon_vma *anon_vma; | 1683 | struct anon_vma *anon_vma; |
1684 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
1719 | struct anon_vma_chain *avc; | 1685 | struct anon_vma_chain *avc; |
1720 | int ret = SWAP_AGAIN; | 1686 | int ret = SWAP_AGAIN; |
1721 | 1687 | ||
@@ -1729,11 +1695,9 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, | |||
1729 | if (!anon_vma) | 1695 | if (!anon_vma) |
1730 | return ret; | 1696 | return ret; |
1731 | anon_vma_lock(anon_vma); | 1697 | anon_vma_lock(anon_vma); |
1732 | list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { | 1698 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { |
1733 | struct vm_area_struct *vma = avc->vma; | 1699 | struct vm_area_struct *vma = avc->vma; |
1734 | unsigned long address = vma_address(page, vma); | 1700 | unsigned long address = vma_address(page, vma); |
1735 | if (address == -EFAULT) | ||
1736 | continue; | ||
1737 | ret = rmap_one(page, vma, address, arg); | 1701 | ret = rmap_one(page, vma, address, arg); |
1738 | if (ret != SWAP_AGAIN) | 1702 | if (ret != SWAP_AGAIN) |
1739 | break; | 1703 | break; |
@@ -1748,16 +1712,13 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, | |||
1748 | struct address_space *mapping = page->mapping; | 1712 | struct address_space *mapping = page->mapping; |
1749 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 1713 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
1750 | struct vm_area_struct *vma; | 1714 | struct vm_area_struct *vma; |
1751 | struct prio_tree_iter iter; | ||
1752 | int ret = SWAP_AGAIN; | 1715 | int ret = SWAP_AGAIN; |
1753 | 1716 | ||
1754 | if (!mapping) | 1717 | if (!mapping) |
1755 | return ret; | 1718 | return ret; |
1756 | mutex_lock(&mapping->i_mmap_mutex); | 1719 | mutex_lock(&mapping->i_mmap_mutex); |
1757 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 1720 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { |
1758 | unsigned long address = vma_address(page, vma); | 1721 | unsigned long address = vma_address(page, vma); |
1759 | if (address == -EFAULT) | ||
1760 | continue; | ||
1761 | ret = rmap_one(page, vma, address, arg); | 1722 | ret = rmap_one(page, vma, address, arg); |
1762 | if (ret != SWAP_AGAIN) | 1723 | if (ret != SWAP_AGAIN) |
1763 | break; | 1724 | break; |
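
Several rmap hunks above replace ptep_clear_flush_notify() with a bare ptep_clear_flush() plus an explicit mmu_notifier_invalidate_page() issued only after pte_unmap_unlock(), and try_to_unmap_cluster() now brackets its scan with invalidate_range_start/end. The common shape is: mutate the page table under the page-table lock, remember that something changed, and tell any secondary MMUs only after the lock is dropped. A compressed sketch of that discipline, with every name invented for illustration:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t ptl = PTHREAD_MUTEX_INITIALIZER;	/* "page table lock" */
static unsigned long fake_pte = 0xdeadb000;

static void notify_secondary_mmu(unsigned long address)
{
	/* stand-in for mmu_notifier_invalidate_page() */
	printf("invalidate secondary mappings of %#lx\n", address);
}

static bool clear_one_mapping(unsigned long address)
{
	bool cleared = false;

	pthread_mutex_lock(&ptl);
	if (fake_pte) {			/* mutate while holding the lock */
		fake_pte = 0;
		cleared = true;
	}
	pthread_mutex_unlock(&ptl);

	if (cleared)			/* notify only after the lock is dropped */
		notify_secondary_mmu(address);
	return cleared;
}

int main(void)
{
	clear_one_mapping(0x403000);
	return 0;
}
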
diff --git a/mm/shmem.c b/mm/shmem.c index d3752110c8c7..cc12072f8787 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -1339,7 +1339,6 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) | |||
1339 | { | 1339 | { |
1340 | file_accessed(file); | 1340 | file_accessed(file); |
1341 | vma->vm_ops = &shmem_vm_ops; | 1341 | vma->vm_ops = &shmem_vm_ops; |
1342 | vma->vm_flags |= VM_CAN_NONLINEAR; | ||
1343 | return 0; | 1342 | return 0; |
1344 | } | 1343 | } |
1345 | 1344 | ||
@@ -2643,6 +2642,7 @@ static const struct vm_operations_struct shmem_vm_ops = { | |||
2643 | .set_policy = shmem_set_policy, | 2642 | .set_policy = shmem_set_policy, |
2644 | .get_policy = shmem_get_policy, | 2643 | .get_policy = shmem_get_policy, |
2645 | #endif | 2644 | #endif |
2645 | .remap_pages = generic_file_remap_pages, | ||
2646 | }; | 2646 | }; |
2647 | 2647 | ||
2648 | static struct dentry *shmem_mount(struct file_system_type *fs_type, | 2648 | static struct dentry *shmem_mount(struct file_system_type *fs_type, |
@@ -2836,7 +2836,6 @@ int shmem_zero_setup(struct vm_area_struct *vma) | |||
2836 | fput(vma->vm_file); | 2836 | fput(vma->vm_file); |
2837 | vma->vm_file = file; | 2837 | vma->vm_file = file; |
2838 | vma->vm_ops = &shmem_vm_ops; | 2838 | vma->vm_ops = &shmem_vm_ops; |
2839 | vma->vm_flags |= VM_CAN_NONLINEAR; | ||
2840 | return 0; | 2839 | return 0; |
2841 | } | 2840 | } |
2842 | 2841 | ||
@@ -446,13 +446,22 @@ void mark_page_accessed(struct page *page) | |||
446 | } | 446 | } |
447 | EXPORT_SYMBOL(mark_page_accessed); | 447 | EXPORT_SYMBOL(mark_page_accessed); |
448 | 448 | ||
449 | /* | ||
450 | * Order of operations is important: flush the pagevec when it's already | ||
451 | * full, not when adding the last page, to make sure that last page is | ||
452 | * not added to the LRU directly when passed to this function. Because | ||
453 | * mark_page_accessed() (called after this when writing) only activates | ||
454 | * pages that are on the LRU, linear writes in subpage chunks would see | ||
455 | * every PAGEVEC_SIZE page activated, which is unexpected. | ||
456 | */ | ||
449 | void __lru_cache_add(struct page *page, enum lru_list lru) | 457 | void __lru_cache_add(struct page *page, enum lru_list lru) |
450 | { | 458 | { |
451 | struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru]; | 459 | struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru]; |
452 | 460 | ||
453 | page_cache_get(page); | 461 | page_cache_get(page); |
454 | if (!pagevec_add(pvec, page)) | 462 | if (!pagevec_space(pvec)) |
455 | __pagevec_lru_add(pvec, lru); | 463 | __pagevec_lru_add(pvec, lru); |
464 | pagevec_add(pvec, page); | ||
456 | put_cpu_var(lru_add_pvecs); | 465 | put_cpu_var(lru_add_pvecs); |
457 | } | 466 | } |
458 | EXPORT_SYMBOL(__lru_cache_add); | 467 | EXPORT_SYMBOL(__lru_cache_add); |
@@ -742,7 +751,7 @@ void lru_add_page_tail(struct page *page, struct page *page_tail, | |||
742 | 751 | ||
743 | SetPageLRU(page_tail); | 752 | SetPageLRU(page_tail); |
744 | 753 | ||
745 | if (page_evictable(page_tail, NULL)) { | 754 | if (page_evictable(page_tail)) { |
746 | if (PageActive(page)) { | 755 | if (PageActive(page)) { |
747 | SetPageActive(page_tail); | 756 | SetPageActive(page_tail); |
748 | active = 1; | 757 | active = 1; |
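
The __lru_cache_add() change is about batching order: the per-cpu pagevec is drained when it is already full, before the new page goes in, so the page being added right now is never pushed straight onto the LRU by its own call. A standalone model of that "flush first, then add" batching:

#include <stdio.h>

#define BATCH_SIZE 4		/* stand-in for PAGEVEC_SIZE */

static int batch[BATCH_SIZE];
static int batch_count;

static void drain_batch(void)	/* stand-in for __pagevec_lru_add() */
{
	for (int i = 0; i < batch_count; i++)
		printf("drain page %d\n", batch[i]);
	batch_count = 0;
}

/*
 * Drain *before* adding when the batch is already full: the page passed
 * in on this call always stays in the local batch for at least one more
 * call instead of being drained immediately.
 */
static void batch_add(int page)
{
	if (batch_count == BATCH_SIZE)
		drain_batch();
	batch[batch_count++] = page;
}

int main(void)
{
	for (int p = 0; p < 6; p++)
		batch_add(p);
	drain_batch();		/* flush whatever is left */
	return 0;
}
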
diff --git a/mm/truncate.c b/mm/truncate.c index 75801acdaac7..d51ce92d6e83 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -107,7 +107,6 @@ truncate_complete_page(struct address_space *mapping, struct page *page) | |||
107 | 107 | ||
108 | cancel_dirty_page(page, PAGE_CACHE_SIZE); | 108 | cancel_dirty_page(page, PAGE_CACHE_SIZE); |
109 | 109 | ||
110 | clear_page_mlock(page); | ||
111 | ClearPageMappedToDisk(page); | 110 | ClearPageMappedToDisk(page); |
112 | delete_from_page_cache(page); | 111 | delete_from_page_cache(page); |
113 | return 0; | 112 | return 0; |
@@ -132,7 +131,6 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) | |||
132 | if (page_has_private(page) && !try_to_release_page(page, 0)) | 131 | if (page_has_private(page) && !try_to_release_page(page, 0)) |
133 | return 0; | 132 | return 0; |
134 | 133 | ||
135 | clear_page_mlock(page); | ||
136 | ret = remove_mapping(mapping, page); | 134 | ret = remove_mapping(mapping, page); |
137 | 135 | ||
138 | return ret; | 136 | return ret; |
@@ -398,7 +396,6 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) | |||
398 | if (PageDirty(page)) | 396 | if (PageDirty(page)) |
399 | goto failed; | 397 | goto failed; |
400 | 398 | ||
401 | clear_page_mlock(page); | ||
402 | BUG_ON(page_has_private(page)); | 399 | BUG_ON(page_has_private(page)); |
403 | __delete_from_page_cache(page); | 400 | __delete_from_page_cache(page); |
404 | spin_unlock_irq(&mapping->tree_lock); | 401 | spin_unlock_irq(&mapping->tree_lock); |
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 2bb90b1d241c..78e08300db21 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -2163,8 +2163,7 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, | |||
2163 | usize -= PAGE_SIZE; | 2163 | usize -= PAGE_SIZE; |
2164 | } while (usize > 0); | 2164 | } while (usize > 0); |
2165 | 2165 | ||
2166 | /* Prevent "things" like memory migration? VM_flags need a cleanup... */ | 2166 | vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; |
2167 | vma->vm_flags |= VM_RESERVED; | ||
2168 | 2167 | ||
2169 | return 0; | 2168 | return 0; |
2170 | } | 2169 | } |
@@ -2572,7 +2571,7 @@ static int s_show(struct seq_file *m, void *p) | |||
2572 | { | 2571 | { |
2573 | struct vm_struct *v = p; | 2572 | struct vm_struct *v = p; |
2574 | 2573 | ||
2575 | seq_printf(m, "0x%p-0x%p %7ld", | 2574 | seq_printf(m, "0x%pK-0x%pK %7ld", |
2576 | v->addr, v->addr + v->size, v->size); | 2575 | v->addr, v->addr + v->size, v->size); |
2577 | 2576 | ||
2578 | if (v->caller) | 2577 | if (v->caller) |
diff --git a/mm/vmscan.c b/mm/vmscan.c index 99b434b674c0..2624edcfb420 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -553,7 +553,7 @@ void putback_lru_page(struct page *page) | |||
553 | redo: | 553 | redo: |
554 | ClearPageUnevictable(page); | 554 | ClearPageUnevictable(page); |
555 | 555 | ||
556 | if (page_evictable(page, NULL)) { | 556 | if (page_evictable(page)) { |
557 | /* | 557 | /* |
558 | * For evictable pages, we can use the cache. | 558 | * For evictable pages, we can use the cache. |
559 | * In event of a race, worst case is we end up with an | 559 | * In event of a race, worst case is we end up with an |
@@ -587,7 +587,7 @@ redo: | |||
587 | * page is on unevictable list, it never be freed. To avoid that, | 587 | * page is on unevictable list, it never be freed. To avoid that, |
588 | * check after we added it to the list, again. | 588 | * check after we added it to the list, again. |
589 | */ | 589 | */ |
590 | if (lru == LRU_UNEVICTABLE && page_evictable(page, NULL)) { | 590 | if (lru == LRU_UNEVICTABLE && page_evictable(page)) { |
591 | if (!isolate_lru_page(page)) { | 591 | if (!isolate_lru_page(page)) { |
592 | put_page(page); | 592 | put_page(page); |
593 | goto redo; | 593 | goto redo; |
@@ -674,8 +674,10 @@ static enum page_references page_check_references(struct page *page, | |||
674 | static unsigned long shrink_page_list(struct list_head *page_list, | 674 | static unsigned long shrink_page_list(struct list_head *page_list, |
675 | struct zone *zone, | 675 | struct zone *zone, |
676 | struct scan_control *sc, | 676 | struct scan_control *sc, |
677 | enum ttu_flags ttu_flags, | ||
677 | unsigned long *ret_nr_dirty, | 678 | unsigned long *ret_nr_dirty, |
678 | unsigned long *ret_nr_writeback) | 679 | unsigned long *ret_nr_writeback, |
680 | bool force_reclaim) | ||
679 | { | 681 | { |
680 | LIST_HEAD(ret_pages); | 682 | LIST_HEAD(ret_pages); |
681 | LIST_HEAD(free_pages); | 683 | LIST_HEAD(free_pages); |
@@ -689,10 +691,10 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
689 | 691 | ||
690 | mem_cgroup_uncharge_start(); | 692 | mem_cgroup_uncharge_start(); |
691 | while (!list_empty(page_list)) { | 693 | while (!list_empty(page_list)) { |
692 | enum page_references references; | ||
693 | struct address_space *mapping; | 694 | struct address_space *mapping; |
694 | struct page *page; | 695 | struct page *page; |
695 | int may_enter_fs; | 696 | int may_enter_fs; |
697 | enum page_references references = PAGEREF_RECLAIM_CLEAN; | ||
696 | 698 | ||
697 | cond_resched(); | 699 | cond_resched(); |
698 | 700 | ||
@@ -707,7 +709,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
707 | 709 | ||
708 | sc->nr_scanned++; | 710 | sc->nr_scanned++; |
709 | 711 | ||
710 | if (unlikely(!page_evictable(page, NULL))) | 712 | if (unlikely(!page_evictable(page))) |
711 | goto cull_mlocked; | 713 | goto cull_mlocked; |
712 | 714 | ||
713 | if (!sc->may_unmap && page_mapped(page)) | 715 | if (!sc->may_unmap && page_mapped(page)) |
@@ -758,7 +760,9 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
758 | wait_on_page_writeback(page); | 760 | wait_on_page_writeback(page); |
759 | } | 761 | } |
760 | 762 | ||
761 | references = page_check_references(page, sc); | 763 | if (!force_reclaim) |
764 | references = page_check_references(page, sc); | ||
765 | |||
762 | switch (references) { | 766 | switch (references) { |
763 | case PAGEREF_ACTIVATE: | 767 | case PAGEREF_ACTIVATE: |
764 | goto activate_locked; | 768 | goto activate_locked; |
@@ -788,7 +792,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
788 | * processes. Try to unmap it here. | 792 | * processes. Try to unmap it here. |
789 | */ | 793 | */ |
790 | if (page_mapped(page) && mapping) { | 794 | if (page_mapped(page) && mapping) { |
791 | switch (try_to_unmap(page, TTU_UNMAP)) { | 795 | switch (try_to_unmap(page, ttu_flags)) { |
792 | case SWAP_FAIL: | 796 | case SWAP_FAIL: |
793 | goto activate_locked; | 797 | goto activate_locked; |
794 | case SWAP_AGAIN: | 798 | case SWAP_AGAIN: |
@@ -960,6 +964,33 @@ keep: | |||
960 | return nr_reclaimed; | 964 | return nr_reclaimed; |
961 | } | 965 | } |
962 | 966 | ||
967 | unsigned long reclaim_clean_pages_from_list(struct zone *zone, | ||
968 | struct list_head *page_list) | ||
969 | { | ||
970 | struct scan_control sc = { | ||
971 | .gfp_mask = GFP_KERNEL, | ||
972 | .priority = DEF_PRIORITY, | ||
973 | .may_unmap = 1, | ||
974 | }; | ||
975 | unsigned long ret, dummy1, dummy2; | ||
976 | struct page *page, *next; | ||
977 | LIST_HEAD(clean_pages); | ||
978 | |||
979 | list_for_each_entry_safe(page, next, page_list, lru) { | ||
980 | if (page_is_file_cache(page) && !PageDirty(page)) { | ||
981 | ClearPageActive(page); | ||
982 | list_move(&page->lru, &clean_pages); | ||
983 | } | ||
984 | } | ||
985 | |||
986 | ret = shrink_page_list(&clean_pages, zone, &sc, | ||
987 | TTU_UNMAP|TTU_IGNORE_ACCESS, | ||
988 | &dummy1, &dummy2, true); | ||
989 | list_splice(&clean_pages, page_list); | ||
990 | __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); | ||
991 | return ret; | ||
992 | } | ||
993 | |||
963 | /* | 994 | /* |
964 | * Attempt to remove the specified page from its LRU. Only take this page | 995 | * Attempt to remove the specified page from its LRU. Only take this page |
965 | * if it is of the appropriate PageActive status. Pages which are being | 996 | * if it is of the appropriate PageActive status. Pages which are being |
@@ -978,8 +1009,8 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode) | |||
978 | if (!PageLRU(page)) | 1009 | if (!PageLRU(page)) |
979 | return ret; | 1010 | return ret; |
980 | 1011 | ||
981 | /* Do not give back unevictable pages for compaction */ | 1012 | /* Compaction should not handle unevictable pages but CMA can do so */ |
982 | if (PageUnevictable(page)) | 1013 | if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE)) |
983 | return ret; | 1014 | return ret; |
984 | 1015 | ||
985 | ret = -EBUSY; | 1016 | ret = -EBUSY; |
@@ -1186,7 +1217,7 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) | |||
1186 | 1217 | ||
1187 | VM_BUG_ON(PageLRU(page)); | 1218 | VM_BUG_ON(PageLRU(page)); |
1188 | list_del(&page->lru); | 1219 | list_del(&page->lru); |
1189 | if (unlikely(!page_evictable(page, NULL))) { | 1220 | if (unlikely(!page_evictable(page))) { |
1190 | spin_unlock_irq(&zone->lru_lock); | 1221 | spin_unlock_irq(&zone->lru_lock); |
1191 | putback_lru_page(page); | 1222 | putback_lru_page(page); |
1192 | spin_lock_irq(&zone->lru_lock); | 1223 | spin_lock_irq(&zone->lru_lock); |
@@ -1278,8 +1309,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, | |||
1278 | if (nr_taken == 0) | 1309 | if (nr_taken == 0) |
1279 | return 0; | 1310 | return 0; |
1280 | 1311 | ||
1281 | nr_reclaimed = shrink_page_list(&page_list, zone, sc, | 1312 | nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP, |
1282 | &nr_dirty, &nr_writeback); | 1313 | &nr_dirty, &nr_writeback, false); |
1283 | 1314 | ||
1284 | spin_lock_irq(&zone->lru_lock); | 1315 | spin_lock_irq(&zone->lru_lock); |
1285 | 1316 | ||
@@ -1439,7 +1470,7 @@ static void shrink_active_list(unsigned long nr_to_scan, | |||
1439 | page = lru_to_page(&l_hold); | 1470 | page = lru_to_page(&l_hold); |
1440 | list_del(&page->lru); | 1471 | list_del(&page->lru); |
1441 | 1472 | ||
1442 | if (unlikely(!page_evictable(page, NULL))) { | 1473 | if (unlikely(!page_evictable(page))) { |
1443 | putback_lru_page(page); | 1474 | putback_lru_page(page); |
1444 | continue; | 1475 | continue; |
1445 | } | 1476 | } |
@@ -1729,6 +1760,28 @@ static bool in_reclaim_compaction(struct scan_control *sc) | |||
1729 | return false; | 1760 | return false; |
1730 | } | 1761 | } |
1731 | 1762 | ||
1763 | #ifdef CONFIG_COMPACTION | ||
1764 | /* | ||
1765 | * If compaction is deferred for sc->order then scale the number of pages | ||
1766 | * reclaimed based on the number of consecutive allocation failures | ||
1767 | */ | ||
1768 | static unsigned long scale_for_compaction(unsigned long pages_for_compaction, | ||
1769 | struct lruvec *lruvec, struct scan_control *sc) | ||
1770 | { | ||
1771 | struct zone *zone = lruvec_zone(lruvec); | ||
1772 | |||
1773 | if (zone->compact_order_failed <= sc->order) | ||
1774 | pages_for_compaction <<= zone->compact_defer_shift; | ||
1775 | return pages_for_compaction; | ||
1776 | } | ||
1777 | #else | ||
1778 | static unsigned long scale_for_compaction(unsigned long pages_for_compaction, | ||
1779 | struct lruvec *lruvec, struct scan_control *sc) | ||
1780 | { | ||
1781 | return pages_for_compaction; | ||
1782 | } | ||
1783 | #endif | ||
1784 | |||
1732 | /* | 1785 | /* |
1733 | * Reclaim/compaction is used for high-order allocation requests. It reclaims | 1786 | * Reclaim/compaction is used for high-order allocation requests. It reclaims |
1734 | * order-0 pages before compacting the zone. should_continue_reclaim() returns | 1787 | * order-0 pages before compacting the zone. should_continue_reclaim() returns |
@@ -1776,6 +1829,9 @@ static inline bool should_continue_reclaim(struct lruvec *lruvec, | |||
1776 | * inactive lists are large enough, continue reclaiming | 1829 | * inactive lists are large enough, continue reclaiming |
1777 | */ | 1830 | */ |
1778 | pages_for_compaction = (2UL << sc->order); | 1831 | pages_for_compaction = (2UL << sc->order); |
1832 | |||
1833 | pages_for_compaction = scale_for_compaction(pages_for_compaction, | ||
1834 | lruvec, sc); | ||
1779 | inactive_lru_pages = get_lru_size(lruvec, LRU_INACTIVE_FILE); | 1835 | inactive_lru_pages = get_lru_size(lruvec, LRU_INACTIVE_FILE); |
1780 | if (nr_swap_pages > 0) | 1836 | if (nr_swap_pages > 0) |
1781 | inactive_lru_pages += get_lru_size(lruvec, LRU_INACTIVE_ANON); | 1837 | inactive_lru_pages += get_lru_size(lruvec, LRU_INACTIVE_ANON); |
@@ -2839,6 +2895,14 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) | |||
2839 | */ | 2895 | */ |
2840 | set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); | 2896 | set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); |
2841 | 2897 | ||
2898 | /* | ||
2899 | * Compaction records what page blocks it recently failed to | ||
2900 | * isolate pages from and skips them in future scans. | ||
2901 | * When kswapd is going to sleep, it is reasonable to assume | ||
2902 | * that allocations and compaction may succeed, so reset the cache. | ||
2903 | */ | ||
2904 | reset_isolation_suitable(pgdat); | ||
2905 | |||
2842 | if (!kthread_should_stop()) | 2906 | if (!kthread_should_stop()) |
2843 | schedule(); | 2907 | schedule(); |
2844 | 2908 | ||
@@ -3101,9 +3165,9 @@ int kswapd_run(int nid) | |||
3101 | if (IS_ERR(pgdat->kswapd)) { | 3165 | if (IS_ERR(pgdat->kswapd)) { |
3102 | /* failure at boot is fatal */ | 3166 | /* failure at boot is fatal */ |
3103 | BUG_ON(system_state == SYSTEM_BOOTING); | 3167 | BUG_ON(system_state == SYSTEM_BOOTING); |
3104 | printk("Failed to start kswapd on node %d\n",nid); | ||
3105 | pgdat->kswapd = NULL; | 3168 | pgdat->kswapd = NULL; |
3106 | ret = -1; | 3169 | pr_err("Failed to start kswapd on node %d\n", nid); |
3170 | ret = PTR_ERR(pgdat->kswapd); | ||
3107 | } | 3171 | } |
3108 | return ret; | 3172 | return ret; |
3109 | } | 3173 | } |
@@ -3350,27 +3414,18 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
3350 | /* | 3414 | /* |
3351 | * page_evictable - test whether a page is evictable | 3415 | * page_evictable - test whether a page is evictable |
3352 | * @page: the page to test | 3416 | * @page: the page to test |
3353 | * @vma: the VMA in which the page is or will be mapped, may be NULL | ||
3354 | * | 3417 | * |
3355 | * Test whether page is evictable--i.e., should be placed on active/inactive | 3418 | * Test whether page is evictable--i.e., should be placed on active/inactive |
3356 | * lists vs unevictable list. The vma argument is !NULL when called from the | 3419 | * lists vs unevictable list. |
3357 | * fault path to determine how to instantiate a new page. | ||
3358 | * | 3420 | * |
3359 | * Reasons page might not be evictable: | 3421 | * Reasons page might not be evictable: |
3360 | * (1) page's mapping marked unevictable | 3422 | * (1) page's mapping marked unevictable |
3361 | * (2) page is part of an mlocked VMA | 3423 | * (2) page is part of an mlocked VMA |
3362 | * | 3424 | * |
3363 | */ | 3425 | */ |
3364 | int page_evictable(struct page *page, struct vm_area_struct *vma) | 3426 | int page_evictable(struct page *page) |
3365 | { | 3427 | { |
3366 | 3428 | return !mapping_unevictable(page_mapping(page)) && !PageMlocked(page); | |
3367 | if (mapping_unevictable(page_mapping(page))) | ||
3368 | return 0; | ||
3369 | |||
3370 | if (PageMlocked(page) || (vma && mlocked_vma_newpage(vma, page))) | ||
3371 | return 0; | ||
3372 | |||
3373 | return 1; | ||
3374 | } | 3429 | } |
3375 | 3430 | ||
3376 | #ifdef CONFIG_SHMEM | 3431 | #ifdef CONFIG_SHMEM |
@@ -3408,7 +3463,7 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages) | |||
3408 | if (!PageLRU(page) || !PageUnevictable(page)) | 3463 | if (!PageLRU(page) || !PageUnevictable(page)) |
3409 | continue; | 3464 | continue; |
3410 | 3465 | ||
3411 | if (page_evictable(page, NULL)) { | 3466 | if (page_evictable(page)) { |
3412 | enum lru_list lru = page_lru_base_type(page); | 3467 | enum lru_list lru = page_lru_base_type(page); |
3413 | 3468 | ||
3414 | VM_BUG_ON(PageActive(page)); | 3469 | VM_BUG_ON(PageActive(page)); |
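With the vma argument gone, page_evictable() reduces to the two remaining conditions: the page's mapping is not marked unevictable and the page is not mlocked. The toy model below mirrors that predicate and the rescue loop in check_move_unevictable_pages() that returns pages to a normal LRU once they become evictable again; all struct and function names here are illustrative, not the kernel's.

    #include <stdbool.h>
    #include <stdio.h>

    /* Toy model: a page stays off the normal LRU lists if its mapping is
     * marked unevictable or the page itself is mlocked. */
    struct toy_mapping { bool unevictable; };
    struct toy_page {
            struct toy_mapping *mapping;
            bool mlocked;
            bool on_unevictable_list;
    };

    static bool toy_page_evictable(const struct toy_page *page)
    {
            return !(page->mapping && page->mapping->unevictable) && !page->mlocked;
    }

    /* Analogue of the rescue loop: pages found on the unevictable list that
     * have since become evictable are moved back to a normal LRU. */
    static void toy_check_move_unevictable(struct toy_page **pages, int nr)
    {
            for (int i = 0; i < nr; i++) {
                    struct toy_page *page = pages[i];

                    if (page->on_unevictable_list && toy_page_evictable(page))
                            page->on_unevictable_list = false;      /* back to LRU */
            }
    }

    int main(void)
    {
            struct toy_mapping shm = { .unevictable = false };
            struct toy_page p = { .mapping = &shm, .mlocked = false,
                                  .on_unevictable_list = true };
            struct toy_page *batch[] = { &p };

            toy_check_move_unevictable(batch, 1);
            printf("still unevictable: %d\n", p.on_unevictable_list);      /* 0 */
            return 0;
    }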
diff --git a/mm/vmstat.c b/mm/vmstat.c index b3e3b9d525d0..c7370579111b 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -495,6 +495,18 @@ void refresh_cpu_vm_stats(int cpu) | |||
495 | atomic_long_add(global_diff[i], &vm_stat[i]); | 495 | atomic_long_add(global_diff[i], &vm_stat[i]); |
496 | } | 496 | } |
497 | 497 | ||
498 | void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset) | ||
499 | { | ||
500 | int i; | ||
501 | |||
502 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) | ||
503 | if (pset->vm_stat_diff[i]) { | ||
504 | int v = pset->vm_stat_diff[i]; | ||
505 | pset->vm_stat_diff[i] = 0; | ||
506 | atomic_long_add(v, &zone->vm_stat[i]); | ||
507 | atomic_long_add(v, &vm_stat[i]); | ||
508 | } | ||
509 | } | ||
498 | #endif | 510 | #endif |
499 | 511 | ||
500 | #ifdef CONFIG_NUMA | 512 | #ifdef CONFIG_NUMA |
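drain_zonestat() folds every non-zero per-CPU vm_stat_diff delta into both the zone counter and the global counter and then clears the delta, so a per-cpu pageset can be drained (for instance on memory hot-remove) without losing counts. Below is a self-contained sketch of the same fold using C11 atomics in place of the kernel's atomic_long_t; the type names and the three-item stat array are illustrative.

    #include <stdatomic.h>
    #include <stdio.h>

    #define NR_STAT_ITEMS 3 /* illustrative; the kernel tracks many more items */

    /* Per-CPU deltas accumulated locally to avoid atomics on hot paths. */
    struct toy_pageset {
            int vm_stat_diff[NR_STAT_ITEMS];
    };

    struct toy_zone {
            atomic_long vm_stat[NR_STAT_ITEMS];
    };

    static struct toy_zone zone;                        /* zero-initialized */
    static atomic_long global_vm_stat[NR_STAT_ITEMS];   /* zero-initialized */

    /* Analogue of drain_zonestat(): push every non-zero per-CPU delta into
     * the zone counter and the global counter, then clear the delta. */
    static void toy_drain_zonestat(struct toy_zone *z, struct toy_pageset *pset)
    {
            for (int i = 0; i < NR_STAT_ITEMS; i++) {
                    if (pset->vm_stat_diff[i]) {
                            int v = pset->vm_stat_diff[i];

                            pset->vm_stat_diff[i] = 0;
                            atomic_fetch_add(&z->vm_stat[i], v);
                            atomic_fetch_add(&global_vm_stat[i], v);
                    }
            }
    }

    int main(void)
    {
            struct toy_pageset pset = { .vm_stat_diff = { 5, 0, -2 } };

            toy_drain_zonestat(&zone, &pset);
            printf("zone[0]=%ld global[2]=%ld pset[0]=%d\n",
                   atomic_load(&zone.vm_stat[0]),
                   atomic_load(&global_vm_stat[2]),
                   pset.vm_stat_diff[0]);       /* 5 -2 0 */
            return 0;
    }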
@@ -722,6 +734,7 @@ const char * const vmstat_text[] = { | |||
722 | "numa_other", | 734 | "numa_other", |
723 | #endif | 735 | #endif |
724 | "nr_anon_transparent_hugepages", | 736 | "nr_anon_transparent_hugepages", |
737 | "nr_free_cma", | ||
725 | "nr_dirty_threshold", | 738 | "nr_dirty_threshold", |
726 | "nr_dirty_background_threshold", | 739 | "nr_dirty_background_threshold", |
727 | 740 | ||
@@ -781,7 +794,6 @@ const char * const vmstat_text[] = { | |||
781 | "unevictable_pgs_munlocked", | 794 | "unevictable_pgs_munlocked", |
782 | "unevictable_pgs_cleared", | 795 | "unevictable_pgs_cleared", |
783 | "unevictable_pgs_stranded", | 796 | "unevictable_pgs_stranded", |
784 | "unevictable_pgs_mlockfreed", | ||
785 | 797 | ||
786 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 798 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
787 | "thp_fault_alloc", | 799 | "thp_fault_alloc", |